Skip to content

Commit

Permalink
EVA3454 - Retrieve and Update Md5checksum for assemblies (#121)
Browse files Browse the repository at this point in the history
* created an end-point as well as a scheduler for retrieving and saving md5 checksum
  • Loading branch information
nitin-ebi authored Jan 23, 2024
1 parent 2464759 commit 66350a3
Show file tree
Hide file tree
Showing 16 changed files with 356 additions and 28 deletions.
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@
<dependency>
<groupId>org.springframework.retry</groupId>
<artifactId>spring-retry</artifactId>
<version>1.2.5.RELEASE</version>
<version>1.3.1</version>
</dependency>

</dependencies>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@
import org.springframework.boot.web.servlet.support.SpringBootServletInitializer;
import org.springframework.hateoas.config.EnableHypermediaSupport;
import org.springframework.retry.annotation.EnableRetry;
import org.springframework.scheduling.annotation.EnableScheduling;

@EnableScheduling
@SpringBootApplication
@EnableRetry
@EnableHypermediaSupport(type = EnableHypermediaSupport.HypermediaType.HAL)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package uk.ac.ebi.eva.contigalias.conf;

import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.web.client.RestTemplate;

@Configuration
public class ContigAliasConfiguration {

@Bean
public RestTemplate getRestTemplate() {
return new RestTemplate();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,20 @@
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.DeleteMapping;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.PutMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import uk.ac.ebi.eva.contigalias.exception.AssemblyNotFoundException;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

@RequestMapping("/v1/admin")
@RestController
Expand Down Expand Up @@ -87,6 +91,36 @@ public ResponseEntity<?> fetchAndInsertAssemblyByAccession(
return new ResponseEntity<>("Accession Processing Result : " + accessionResult, HttpStatus.MULTI_STATUS);
}

@ApiOperation(value = "Given an assembly accession, retrieve MD5 checksum for all chromosomes belonging to assembly and update")
@PutMapping(value = "assemblies/{accession}/md5checksum")
public ResponseEntity<String> retrieveAndInsertMd5ChecksumForAssembly(@PathVariable(name = "accession")
@ApiParam(value = "INSDC or RefSeq assembly accession. Eg: " +
"GCA_000001405.10") String asmAccession) {
try {
handler.getAssemblyByAccession(asmAccession);
handler.retrieveAndInsertMd5ChecksumForAssembly(asmAccession);
return ResponseEntity.ok("A task has been submitted for updating md5checksum for all chromosomes " +
"in assembly " + asmAccession + ". Depending upon the number of chromosomes present in assembly, " +
"this might take some time to complete");
} catch (AssemblyNotFoundException e) {
return ResponseEntity.ok("Could not find assembly " + asmAccession +
". Please insert the assembly first (md5checksum will be updated as part of the insertion process");
}
}

@ApiOperation(value = "Retrieve list of assemblies for which MD5 Checksum updates are running/going-to-run ")
@GetMapping(value = "assemblies/md5checksum/status")
public ResponseEntity<String> getMD5ChecksumUpdateTaskStatus() {
Map<String, Set<String>> md5ChecksumUpdateTasks = handler.getMD5ChecksumUpdateTaskStatus();
Set<String> runningTasks = md5ChecksumUpdateTasks.get("running");
Set<String> scheduledTasks = md5ChecksumUpdateTasks.get("scheduled");
String runningTaskRes = runningTasks == null || runningTasks.isEmpty() ? "No running MD5 checksum update tasks" :
runningTasks.stream().collect(Collectors.joining(","));
String scheduledTaskRes = scheduledTasks == null || scheduledTasks.isEmpty() ? "No scheduled MD5 checksum update tasks" :
scheduledTasks.stream().collect(Collectors.joining(","));
return ResponseEntity.ok("running: " + runningTaskRes + "\nscheduled: " + scheduledTaskRes);
}

// This endpoint can be enabled in the future when checksums for assemblies are added to the project.
// @ApiOperation(value = "Add MD5 and TRUNC512 checksums to an assembly by accession.",
// notes = "Given an INSDC or RefSeq accession along with a MD5 or a TRUNC512 checksum, this endpoint will
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;

@Service
public class AdminHandler {
Expand All @@ -46,6 +48,10 @@ public AdminHandler(AssemblyService assemblyService,
this.assemblyAssembler = assemblyAssembler;
}

public Optional<AssemblyEntity> getAssemblyByAccession(String accession) {
return assemblyService.getAssemblyByAccession(accession);
}

public void fetchAndInsertAssemblyByAccession(String accession) throws IOException {
assemblyService.fetchAndInsertAssembly(accession);
}
Expand All @@ -54,6 +60,14 @@ public Map<String, List<String>> fetchAndInsertAssemblyByAccession(List<String>
return assemblyService.fetchAndInsertAssembly(accessions);
}

public void retrieveAndInsertMd5ChecksumForAssembly(String accession) {
assemblyService.retrieveAndInsertMd5ChecksumForAssembly(accession);
}

public Map<String, Set<String>> getMD5ChecksumUpdateTaskStatus() {
return assemblyService.getMD5ChecksumUpdateTaskStatus();
}

public void deleteAssemblyByAccession(String accession) {
assemblyService.deleteAssemblyByAccession(accession);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,20 +123,17 @@ public Optional<Path> downloadAssemblyReport(ENABrowser enaBrowser, String acces
* @param optional {@link AssemblyEntity} to add ENA sequence names to
* @throws IOException Passes IOException thrown by {@link #getAssemblyByAccession(String)}
*/
public void addENASequenceNamesToAssembly(Optional<AssemblyEntity> optional) throws IOException {
if (optional.isPresent()) {
AssemblyEntity targetAssembly = optional.get();
if (!hasAllEnaSequenceNames(targetAssembly)) {
String insdcAccession = targetAssembly.getInsdcAccession();
Optional<AssemblyEntity> enaAssembly = getAssemblyByAccession(insdcAccession);

if (enaAssembly.isPresent()) {
AssemblyEntity sourceAssembly = enaAssembly.get();
addENASequenceNames(Objects.nonNull(sourceAssembly.getChromosomes()) ?
sourceAssembly.getChromosomes() : Collections.emptyList(),
Objects.nonNull(targetAssembly.getChromosomes()) ?
targetAssembly.getChromosomes() : Collections.emptyList());
}
public void addENASequenceNamesToAssembly(AssemblyEntity targetAssembly) throws IOException {
if (!hasAllEnaSequenceNames(targetAssembly)) {
String insdcAccession = targetAssembly.getInsdcAccession();
Optional<AssemblyEntity> enaAssembly = getAssemblyByAccession(insdcAccession);

if (enaAssembly.isPresent()) {
AssemblyEntity sourceAssembly = enaAssembly.get();
addENASequenceNames(Objects.nonNull(sourceAssembly.getChromosomes()) ?
sourceAssembly.getChromosomes() : Collections.emptyList(),
Objects.nonNull(targetAssembly.getChromosomes()) ?
targetAssembly.getChromosomes() : Collections.emptyList());
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,16 @@
import org.springframework.data.domain.Page;
import org.springframework.data.domain.Pageable;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.data.jpa.repository.Modifying;
import org.springframework.data.jpa.repository.Query;
import org.springframework.data.repository.query.Param;
import org.springframework.stereotype.Repository;

import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity;
import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity;

import java.util.List;

@Repository
public interface ChromosomeRepository extends JpaRepository<ChromosomeEntity, Long> {

Expand All @@ -35,6 +40,16 @@ public interface ChromosomeRepository extends JpaRepository<ChromosomeEntity, Lo

Page<ChromosomeEntity> findChromosomeEntitiesByAssembly_InsdcAccession(String asmInsdcAccession, Pageable request);

@Query("SELECT c FROM ChromosomeEntity c WHERE c.assembly.insdcAccession = :asmInsdcAccession AND (c.md5checksum IS NULL OR c.md5checksum = '')")
Page<ChromosomeEntity> findChromosomeEntitiesByAssembly_InsdcAccessionAndMd5checksumIsNullOrEmpty(@Param("asmInsdcAccession") String asmInsdcAccession, Pageable pageable);

@Query("SELECT distinct c.assembly.insdcAccession FROM ChromosomeEntity c WHERE c.md5checksum IS NULL OR c.md5checksum = ''")
List<String> findAssembliesWhereChromosomeMd5checksumIsNullOrEmpty();

@Modifying
@Query("UPDATE ChromosomeEntity c SET c.md5checksum = :md5Checksum WHERE c.assembly.insdcAccession= :asmInsdcAccession AND c.insdcAccession = :insdcAccession")
void updateMd5ChecksumByInsdcAccession(@Param("asmInsdcAccession") String asmInsdcAccession, @Param("insdcAccession") String insdcAccession, @Param("md5Checksum") String md5Checksum);

Page<ChromosomeEntity> findChromosomeEntitiesByAssembly_Refseq(String asmRefseq, Pageable request);

Page<ChromosomeEntity> findChromosomeEntitiesByGenbankSequenceNameAndAssembly_Taxid(String genbankName, long asmTaxid, Pageable request);
Expand Down
123 changes: 123 additions & 0 deletions src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChecksumSetter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
package uk.ac.ebi.eva.contigalias.scheduler;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.PageRequest;
import org.springframework.data.domain.Pageable;
import org.springframework.data.domain.Slice;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity;
import uk.ac.ebi.eva.contigalias.service.ChromosomeService;

import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.stream.Collectors;

@Component
public class ChecksumSetter {
private final Logger logger = LoggerFactory.getLogger(ChecksumSetter.class);
private final Map<String, CompletableFuture<Void>> runningMD5ChecksumUpdateTasks = new ConcurrentHashMap<>();
private Set<String> scheduledToRunMD5ChecksumUpdateTasks = new HashSet<>();
private int DEFAULT_PAGE_SIZE = 10000;
private ChromosomeService chromosomeService;
private Md5ChecksumRetriever md5ChecksumRetriever;

@Autowired
public ChecksumSetter(ChromosomeService chromosomeService, Md5ChecksumRetriever md5ChecksumRetriever) {
this.chromosomeService = chromosomeService;
this.md5ChecksumRetriever = md5ChecksumRetriever;
}

@Scheduled(cron = "0 0 0 ? * TUE")
public void updateMd5CheckSumForAllAssemblies() {
scheduledToRunMD5ChecksumUpdateTasks = new HashSet<>();
List<String> assemblyList = chromosomeService.getAssembliesWhereChromosomeMd5ChecksumIsNull();
logger.info("List of assemblies to be updated for MD5 Checksum: " + assemblyList);
scheduledToRunMD5ChecksumUpdateTasks.addAll(assemblyList.stream().collect(Collectors.toSet()));

for (String assembly : assemblyList) {
CompletableFuture<Void> future = updateMd5CheckSumForAssemblyAsync(assembly);
try {
future.get();
} catch (InterruptedException | ExecutionException e) {
logger.error("Encountered an error when running MD5Checksum update for assembly: " + assembly);
} finally {
scheduledToRunMD5ChecksumUpdateTasks.remove(assembly);
}
}
}

public CompletableFuture<Void> updateMd5CheckSumForAssemblyAsync(String assembly) {
logger.info("Submitted job for updating MD5 Checksum for assembly (asynchronously)");
// Check if the async task for this assembly is already running
CompletableFuture<Void> existingTask = runningMD5ChecksumUpdateTasks.get(assembly);
if (existingTask != null && !existingTask.isDone()) {
logger.info("Async task is still running for assembly: " + assembly);
return existingTask;
}
// Start the async task (removing existing run if present)
runningMD5ChecksumUpdateTasks.remove(assembly);
CompletableFuture<Void> future = CompletableFuture.runAsync(() -> {
updateMD5ChecksumForAllChromosomesInAssembly(assembly);
});
// Store the future in the map for the given assembly
runningMD5ChecksumUpdateTasks.put(assembly, future);

// check the status of task upon completion and remove from running tasks
future.whenComplete((result, exception) -> {
if (exception != null) {
logger.error("Async task (MD5Checksum setter) failed for assembly: " + assembly, exception);
} else {
logger.info("Async task (MD5Checksum setter) completed successfully for assembly: " + assembly);
}
runningMD5ChecksumUpdateTasks.remove(assembly);
});

return future;
}

public void updateMD5ChecksumForAllChromosomesInAssembly(String assembly) {
logger.info("Trying to update md5checksum for assembly: " + assembly);
Slice<ChromosomeEntity> chrSlice;
Pageable pageable = PageRequest.of(0, DEFAULT_PAGE_SIZE);
long chromosomeUpdated = 0;
do {
chrSlice = chromosomeService.getChromosomesByAssemblyInsdcAccessionWhereMd5ChecksumIsNull(assembly, pageable);
List<ChromosomeEntity> chromosomeEntityList = chrSlice.getContent();
updateMd5ChecksumForChromosome(chromosomeEntityList);

chromosomeUpdated += chromosomeEntityList.size();
logger.info("Chromosomes Updated till now: " + chromosomeUpdated);
} while (chrSlice.hasNext());

logger.info("Updating md5checksum for assembly " + assembly + " completed");
}

public void updateMd5ChecksumForChromosome(List<ChromosomeEntity> chromosomesList) {
chromosomesList.parallelStream().forEach(chromosome -> {
try {
String md5Checksum = md5ChecksumRetriever.retrieveMd5Checksum(chromosome.getInsdcAccession());
chromosome.setMd5checksum(md5Checksum);
} catch (Exception e) {
logger.info("Could not retrieve md5Checksum for insdc accession: " + chromosome.getInsdcAccession());
}
});

chromosomeService.updateMd5ChecksumForAll(chromosomesList);
}

public Map<String, Set<String>> getMD5ChecksumUpdateTaskStatus() {
Map<String, Set<String>> taskStatus = new HashMap<>();
taskStatus.put("running", runningMD5ChecksumUpdateTasks.keySet());
taskStatus.put("scheduled", scheduledToRunMD5ChecksumUpdateTasks);
return taskStatus;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package uk.ac.ebi.eva.contigalias.scheduler;

import com.fasterxml.jackson.databind.JsonNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.retry.annotation.Backoff;
import org.springframework.retry.annotation.Retryable;
import org.springframework.stereotype.Component;
import org.springframework.web.client.RestTemplate;

@Component
public class Md5ChecksumRetriever {
private final Logger logger = LoggerFactory.getLogger(Md5ChecksumRetriever.class);
private String INSDC_ACCESSION_PLACE_HOLDER = "INSDC_ACCESSION_PLACE_HOLDER";
private String INSDC_CHECKSUM_URL = "https://www.ebi.ac.uk/ena/cram/sequence/insdc:" + INSDC_ACCESSION_PLACE_HOLDER + "/metadata";

private RestTemplate restTemplate;

@Autowired
public Md5ChecksumRetriever(RestTemplate restTemplate) {
this.restTemplate = restTemplate;
}

@Retryable(value = Exception.class, maxAttempts = 5, backoff = @Backoff(delay = 2000, multiplier = 2))
public String retrieveMd5Checksum(String insdcAccession) {
String apiURL = INSDC_CHECKSUM_URL.replace(INSDC_ACCESSION_PLACE_HOLDER, insdcAccession);
JsonNode jsonResponse = restTemplate.getForObject(apiURL, JsonNode.class);
String md5Checksum = jsonResponse.get("metadata").get("md5").asText();
return md5Checksum;
}
}
Loading

0 comments on commit 66350a3

Please sign in to comment.