From 840687a8b8414612b993f319c5b64dcc7bc87159 Mon Sep 17 00:00:00 2001 From: waterflow80 Date: Tue, 30 May 2023 14:04:11 +0100 Subject: [PATCH 1/7] retrieve fna file function --- .../AssemblySequenceDataSource.java | 12 ++ .../NCBIAssemblySequenceDataSource.java | 105 ++++++++++++++++++ .../ebi/eva/contigalias/dus/NCBIBrowser.java | 11 ++ .../dus2/AssemblySequenceReader.java | 39 +++++++ .../dus2/NCBIAssemblySequenceReader.java | 27 +++++ .../NCBIAssemblySequenceReaderFactory.java | 18 +++ .../entities/AssemblySequenceEntity.java | 23 ++++ .../AssemblySequenceNotFoundException.java | 7 ++ .../DuplicateAssemblySequenceException.java | 8 ++ .../repo/AssemblySequenceRepository.java | 14 +++ .../service/AssemblySequenceService.java | 80 +++++++++++++ .../eva/contigalias/utils/GzipCompress.java | 52 +++++++++ 12 files changed, 396 insertions(+) create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequenceDataSource.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequenceDataSource.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequenceReader.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReader.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReaderFactory.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequenceEntity.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/exception/AssemblySequenceNotFoundException.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/exception/DuplicateAssemblySequenceException.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequenceRepository.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequenceService.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/utils/GzipCompress.java diff --git 
a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequenceDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequenceDataSource.java new file mode 100644 index 00000000..3a4d5b46 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequenceDataSource.java @@ -0,0 +1,12 @@ +package uk.ac.ebi.eva.contigalias.datasource; + +import java.io.IOException; +import java.util.Optional; + +import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity; + +public interface AssemblySequenceDataSource { + + Optional getAssemblySequenceByAccession(String accession) throws IOException; + +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequenceDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequenceDataSource.java new file mode 100644 index 00000000..d76741cf --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequenceDataSource.java @@ -0,0 +1,105 @@ +package uk.ac.ebi.eva.contigalias.datasource; + +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Optional; + +import org.apache.commons.net.ftp.FTPFile; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.retry.annotation.Backoff; +import org.springframework.retry.annotation.Retryable; +import org.springframework.stereotype.Repository; +import uk.ac.ebi.eva.contigalias.dus2.NCBIAssemblySequenceReader; +import uk.ac.ebi.eva.contigalias.dus2.NCBIAssemblySequenceReaderFactory; +import uk.ac.ebi.eva.contigalias.dus.NCBIBrowser; +import uk.ac.ebi.eva.contigalias.dus.NCBIBrowserFactory; +import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity; +import uk.ac.ebi.eva.contigalias.utils.GzipCompress; + 
+@Repository("NCBISequenceDataSource") +public class NCBIAssemblySequenceDataSource implements AssemblySequenceDataSource{ + + private final Logger logger = LoggerFactory.getLogger(NCBIAssemblySequenceDataSource.class); + + private final NCBIBrowserFactory factory; + + private final NCBIAssemblySequenceReaderFactory readerFactory; + + @Value("${asm.file.download.dir}") + private String asmFileDownloadDir; + + @Autowired + public NCBIAssemblySequenceDataSource(NCBIBrowserFactory factory, + NCBIAssemblySequenceReaderFactory readerFactory){ + this.factory = factory; + this.readerFactory = readerFactory; + } + + @Override + public Optional getAssemblySequenceByAccession(String accession) throws IOException, IllegalArgumentException { + NCBIBrowser ncbiBrowser = factory.build(); + ncbiBrowser.connect(); + GzipCompress gzipCompress = new GzipCompress(); + + Optional downloadFilePath = downloadAssemblySequence(accession, ncbiBrowser); + if (!downloadFilePath.isPresent()) { + return Optional.empty(); + } + logger.info("Assembly sequence _fna.gz file downloaded successfully in: " + downloadFilePath); + // Uncompress the .gz file + Optional uncompressedFilePath = gzipCompress.unzip(downloadFilePath.get().toString(), asmFileDownloadDir); + if (!uncompressedFilePath.isPresent()){ + return Optional.empty(); + } + + AssemblySequenceEntity assemblySequenceEntity; + try (InputStream stream = new FileInputStream(uncompressedFilePath.get().toFile())){ + NCBIAssemblySequenceReader reader = readerFactory.build(stream); + assemblySequenceEntity = reader.getAssemblySequenceEntity(); + //TODO : The logger info will be canged when we add more attributes to the entity and we parse the whole file info + logger.info("NCBI: Name of the sequence in " + accession + " : " + assemblySequenceEntity.getName()); + } finally { + try { + ncbiBrowser.disconnect(); + //Files.deleteIfExists(downloadFilePath.get()); + } catch (IOException e) { + logger.warn("Error while trying to disconnect - ncbiBrowser 
(assembly: " + accession + ")"); + } + } + return Optional.of(assemblySequenceEntity); + } + + + /** + * Download the assembly fna/fasta file given the accession and save it to /tmp + * After this method is called, the file will be downloaded, and the path to this file + * on your local computer will be returned*/ + @Retryable(value = Exception.class, maxAttempts = 5, backoff = @Backoff(delay = 2000, multiplier = 2)) + public Optional downloadAssemblySequence(String accession, NCBIBrowser ncbiBrowser) throws IOException { + // The same directory as the report file + Optional directory = ncbiBrowser.getGenomeReportDirectory(accession); + + if (!directory.isPresent()) { + return Optional.empty(); + } + + logger.info("NCBI directory for assembly genomic.fna download: " + directory.get()); + FTPFile ftpFile = ncbiBrowser.getAssemblyGenomicFnaFile(directory.get()); + String ftpFilePath = directory.get() + ftpFile.getName(); + Path downloadFilePath = Paths.get(asmFileDownloadDir, ftpFile.getName()); + boolean success = ncbiBrowser.downloadFTPFile(ftpFilePath, downloadFilePath, ftpFile.getSize()); + if (success) { + logger.info("NCBI assembly genomic.fna downloaded successfully (" + ftpFile.getName() + ")"); + return Optional.of(downloadFilePath); + } else { + logger.error("NCBI assembly genomic.fna could not be downloaded successfully(" + ftpFile.getName() + ")"); + return Optional.empty(); + } + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIBrowser.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIBrowser.java index 30ea4f73..fcb1f8e7 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIBrowser.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIBrowser.java @@ -38,6 +38,7 @@ public class NCBIBrowser extends PassiveAnonymousFTPClient { public static final String PATH_GENOMES_ALL = "/genomes/all/"; + private String ftpProxyHost; private Integer ftpProxyPort; @@ -148,4 +149,14 @@ public FTPFile getNCBIAssemblyReportFile(String 
directoryPath) throws IOExceptio return assemblyReport.orElseThrow(() -> new AssemblyNotFoundException("Assembly Report File not present in given directory: " + directoryPath)); } + /** + * Return the fna/fasta file that will be downloaded (a pointer to that FtpFile)*/ + public FTPFile getAssemblyGenomicFnaFile(String directoryPath) throws IOException { + Stream ftpFileStream = Arrays.stream(super.listFiles(directoryPath)); + Stream assemblyReportFilteredStream = ftpFileStream.filter(f -> f.getName().contains("genomic.fna.gz") && !f.getName().contains("from")); + Optional assemblyReport = assemblyReportFilteredStream.findFirst(); + + return assemblyReport.orElseThrow(() -> new AssemblyNotFoundException("Assembly Genomic Fna (Fasta) File not present in given directory: " + directoryPath)); + } + } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequenceReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequenceReader.java new file mode 100644 index 00000000..0b107042 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequenceReader.java @@ -0,0 +1,39 @@ +package uk.ac.ebi.eva.contigalias.dus2; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; + +import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity; + +public abstract class AssemblySequenceReader { + + protected final BufferedReader reader; + + protected AssemblySequenceEntity assemblySequenceEntity; + + protected boolean fileParsed = false; + + + public AssemblySequenceReader(InputStreamReader inputStreamReader){ + this.reader = new BufferedReader(inputStreamReader); + } + + public AssemblySequenceEntity getAssemblySequenceEntity() throws IOException { + if(!fileParsed || assemblySequenceEntity == null){ + parseFile(); + } + return assemblySequenceEntity; + } + + protected abstract void parseFile() throws IOException, NullPointerException; + + + protected abstract void 
parseAssemblySequenceEntity(String line); + + + + public boolean ready() throws IOException { + return reader.ready(); + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReader.java new file mode 100644 index 00000000..12e01689 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReader.java @@ -0,0 +1,27 @@ +package uk.ac.ebi.eva.contigalias.dus2; + +import java.io.IOException; +import java.io.InputStreamReader; + +public class NCBIAssemblySequenceReader extends AssemblySequenceReader{ + + public NCBIAssemblySequenceReader(InputStreamReader inputStreamReader){ + super(inputStreamReader); + } + + @Override + protected void parseFile() throws IOException, NullPointerException { + if (reader == null){ + throw new NullPointerException("Cannot use AssemblySequenceReader without having a valid InputStreamReader."); + } + // TODO: HERE WE'LL EXTARACT THE .gz FILE AND PARSE THE fna FILE + } + + @Override + // Parsing a line of the file + protected void parseAssemblySequenceEntity(String line) { + // TODO: HERE WE'LL PARSE A LINE OF THE FILE (AN ENTRY) + // TODO: NOTE: THIS METHOD MIGHT NOT BE COMPLETELY USEFUL SINCE THE FILE CONTAINS ONLY + // TODO: TEXT AND A '>' SEPARATORS TO SEPARATE SEQUENCES FROM ONE ANOTHER + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReaderFactory.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReaderFactory.java new file mode 100644 index 00000000..06867aba --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReaderFactory.java @@ -0,0 +1,18 @@ +package uk.ac.ebi.eva.contigalias.dus2; + +import java.io.InputStream; +import java.io.InputStreamReader; + +import org.springframework.stereotype.Component; + +@Component +public class NCBIAssemblySequenceReaderFactory { + + public NCBIAssemblySequenceReader 
build(InputStream inputStream){ + return new NCBIAssemblySequenceReader(new InputStreamReader(inputStream)); + } + + public NCBIAssemblySequenceReader build(InputStreamReader inputStreamReader){ + return new NCBIAssemblySequenceReader(inputStreamReader); + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequenceEntity.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequenceEntity.java new file mode 100644 index 00000000..c1a58894 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequenceEntity.java @@ -0,0 +1,23 @@ +package uk.ac.ebi.eva.contigalias.entities; + +import javax.persistence.Column; +import javax.persistence.Entity; +import javax.persistence.Id; +import javax.persistence.Table; + +import lombok.Getter; +import lombok.Setter; + +@Setter +@Getter +@Table(name = "AssemblySequence") +@Entity +public class AssemblySequenceEntity { + + @Id + @Column(nullable = false) + private String accession; + + @Column(nullable = false) + private String name; +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/exception/AssemblySequenceNotFoundException.java b/src/main/java/uk/ac/ebi/eva/contigalias/exception/AssemblySequenceNotFoundException.java new file mode 100644 index 00000000..03deecb9 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/exception/AssemblySequenceNotFoundException.java @@ -0,0 +1,7 @@ +package uk.ac.ebi.eva.contigalias.exception; + +public class AssemblySequenceNotFoundException extends RuntimeException{ + public AssemblySequenceNotFoundException(String accession) { + super("No assembly sequence corresponding to accession " + accession + " could be found"); + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/exception/DuplicateAssemblySequenceException.java b/src/main/java/uk/ac/ebi/eva/contigalias/exception/DuplicateAssemblySequenceException.java new file mode 100644 index 00000000..f382e62f --- /dev/null +++ 
b/src/main/java/uk/ac/ebi/eva/contigalias/exception/DuplicateAssemblySequenceException.java @@ -0,0 +1,8 @@ +package uk.ac.ebi.eva.contigalias.exception; + +public class DuplicateAssemblySequenceException extends RuntimeException{ + + public DuplicateAssemblySequenceException(String msg){ + super(msg); + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequenceRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequenceRepository.java new file mode 100644 index 00000000..6eb6fa01 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequenceRepository.java @@ -0,0 +1,14 @@ +package uk.ac.ebi.eva.contigalias.repo; + +import java.util.Optional; + +import org.springframework.data.jpa.repository.JpaRepository; +import org.springframework.stereotype.Repository; +import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity; + +@Repository +public interface AssemblySequenceRepository extends JpaRepository { + Optional findAssemblySequenceEntityByAccession(String accession); + + +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequenceService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequenceService.java new file mode 100644 index 00000000..5dfd917a --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequenceService.java @@ -0,0 +1,80 @@ +package uk.ac.ebi.eva.contigalias.service; + +import java.io.IOException; +import java.util.Optional; + +import javax.transaction.Transactional; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.stereotype.Service; +import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblySequenceDataSource; +import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity; +import uk.ac.ebi.eva.contigalias.exception.AssemblySequenceNotFoundException; +import uk.ac.ebi.eva.contigalias.exception.DuplicateAssemblySequenceException; +import 
uk.ac.ebi.eva.contigalias.repo.AssemblySequenceRepository; + +@Service +public class AssemblySequenceService { + + private final AssemblySequenceRepository repository; + + private final NCBIAssemblySequenceDataSource ncbiSequenceDataSource; + + private final Logger logger = LoggerFactory.getLogger(AssemblyService.class); + + + public AssemblySequenceService( + AssemblySequenceRepository repository, NCBIAssemblySequenceDataSource ncbiSequenceDataSource){ + this.repository = repository; + this.ncbiSequenceDataSource = ncbiSequenceDataSource; + } + + public void fetchAndInsertAssemblySequence(String accession) throws IOException { + Optional entity = repository.findAssemblySequenceEntityByAccession(accession); + if(entity.isPresent()) + throw duplicateAssemblySequenceInsertionException(accession, entity.get()); + Optional fetchAssembly = ncbiSequenceDataSource.getAssemblySequenceByAccession(accession); + if(!fetchAssembly.isPresent()){ + throw new AssemblySequenceNotFoundException(accession); + } + if (fetchAssembly.get().getName() != null){ // This condition is only for testing, it'll change as soon as we add more attributes to the entity + insertAssemblySequence(fetchAssembly.get()); + logger.info("Successfully inserted assembly for accession " + accession); + }else { + logger.error("Skipping inserting assembly sequence : No name in assembly : " + accession); + } + } + + @Transactional + public void insertAssemblySequence(AssemblySequenceEntity entity) { + if (isEntityPresent(entity)) { + throw duplicateAssemblySequenceInsertionException(null, entity); + } else { + repository.save(entity); + } + } + + private boolean isEntityPresent(AssemblySequenceEntity entity) { + // TODO: THE CONDITIONS IN THIS METHOD WILL BE CHANGED WHEN WE ADD MORE ATTRIBUTES TO THE ENTITY + Optional existingAssembly = repository.findAssemblySequenceEntityByAccession(entity.getAccession()); + return existingAssembly.isPresent(); + } + + private DuplicateAssemblySequenceException 
duplicateAssemblySequenceInsertionException(String accession, AssemblySequenceEntity present) { + StringBuilder exception = new StringBuilder("A similar assembly Sequence already exists"); + if (accession != null){ + exception.append("\n"); + exception.append("Assembly Sequence trying to insert:"); + exception.append("\t"); + exception.append(accession); + } + if (present != null){ + exception.append("\n"); + exception.append("Assembly Sequence already present"); + exception.append("\t"); + exception.append(present); + } + return new DuplicateAssemblySequenceException(exception.toString()); + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/utils/GzipCompress.java b/src/main/java/uk/ac/ebi/eva/contigalias/utils/GzipCompress.java new file mode 100644 index 00000000..455c0582 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/utils/GzipCompress.java @@ -0,0 +1,52 @@ +package uk.ac.ebi.eva.contigalias.utils; + +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Optional; +import java.util.zip.GZIPInputStream; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +public class GzipCompress { + + private final Logger logger = LoggerFactory.getLogger(GzipCompress.class); + + /** + * Decompress (Unzip) a .gz file and save the output file in the same + * input file's location. 
+ * The output file's name will be the same as the input's but without '.gz' + * @return The output (decompressed) file path*/ + public Optional unzip(String compressedFilePath, String outputDirPath) { + String outputFileName = "genome_sequence.fna"; + String decompressedFilePath = outputDirPath + "/" + outputFileName; + + byte[] buffer = new byte[1024]; + + try { + FileInputStream fileIn = new FileInputStream(compressedFilePath); + GZIPInputStream gzipInputStream = new GZIPInputStream(fileIn); + FileOutputStream fileOutputStream = new FileOutputStream(decompressedFilePath); + + int bytes_read; + + while ((bytes_read = gzipInputStream.read(buffer)) > 0) { + fileOutputStream.write(buffer, 0, bytes_read); + } + gzipInputStream.close(); + fileOutputStream.close(); + logger.info("File " + compressedFilePath + " was decompressed successfully"); + Path outputFilePath = Paths.get(outputDirPath, outputFileName); + return Optional.of(outputFilePath); + } catch ( + IOException e) { + logger.error("Could not find or read file !!"); + return Optional.empty(); + } + + } +} From 724b91a9dc4e72ab6ed67470f3b218afea5bf42f Mon Sep 17 00:00:00 2001 From: waterflow80 Date: Tue, 30 May 2023 14:12:29 +0100 Subject: [PATCH 2/7] adding pom.xml --- pom.xml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pom.xml b/pom.xml index f63b0907..689a2619 100644 --- a/pom.xml +++ b/pom.xml @@ -35,6 +35,8 @@ 8 + + org.springframework.boot @@ -147,6 +149,13 @@ 1.2.5.RELEASE + + org.projectlombok + lombok + 1.18.28 + provided + + From acfad8a3a3b82ca004a37151c81baad50063df02 Mon Sep 17 00:00:00 2001 From: waterflow80 Date: Thu, 1 Jun 2023 12:04:41 +0100 Subject: [PATCH 3/7] assembly-sequences-fasta --- .../AssemblySequencesDataSource.java | 13 +++ .../NCBIAssemblySequencesDataSource.java | 107 ++++++++++++++++++ .../dus2/AssemblySequencesReader.java | 44 +++++++ .../dus2/NCBIAssemblySequencesReader.java | 61 ++++++++++ .../NCBIAssemblySequencesReaderFactory.java | 18 +++ 
.../entities/AssemblySequencesEntity.java | 37 ++++++ .../eva/contigalias/entities/Sequence.java | 34 ++++++ .../repo/AssemblySequencesRepository.java | 14 +++ .../contigalias/repo/SequenceRepository.java | 7 ++ .../service/AssemblySequencesService.java | 81 +++++++++++++ .../eva/contigalias/utils/GzipCompress.java | 2 +- .../ebi/eva/contigalias/utils/MD5Digest.java | 20 ++++ .../NCBIAssemblySequencesDataSourceTest.java | 55 +++++++++ .../dus2/NCBIAssemblySequencesReaderTest.java | 67 +++++++++++ .../service/AssemblySequencesServiceTest.java | 44 +++++++ .../contigalias/utils/GzipCompressTest.java | 18 +++ .../eva/contigalias/utils/MD5DigestTest.java | 18 +++ 17 files changed, 639 insertions(+), 1 deletion(-) create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequencesDataSource.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSource.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequencesReader.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReader.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderFactory.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequencesEntity.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/entities/Sequence.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequencesRepository.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/repo/SequenceRepository.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/utils/MD5Digest.java create mode 100644 src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSourceTest.java create mode 100644 src/test/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderTest.java create mode 100644 
src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesServiceTest.java create mode 100644 src/test/java/uk/ac/ebi/eva/contigalias/utils/GzipCompressTest.java create mode 100644 src/test/java/uk/ac/ebi/eva/contigalias/utils/MD5DigestTest.java diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequencesDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequencesDataSource.java new file mode 100644 index 00000000..f3a12e03 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequencesDataSource.java @@ -0,0 +1,13 @@ +package uk.ac.ebi.eva.contigalias.datasource; + +import java.io.IOException; +import java.security.NoSuchAlgorithmException; +import java.util.Optional; + +import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity; + +public interface AssemblySequencesDataSource { + + Optional getAssemblySequencesByAccession(String accession) throws IOException, NoSuchAlgorithmException; + +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSource.java new file mode 100644 index 00000000..211ab422 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSource.java @@ -0,0 +1,107 @@ +package uk.ac.ebi.eva.contigalias.datasource; + +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.security.NoSuchAlgorithmException; +import java.util.Optional; + +import org.apache.commons.net.ftp.FTPFile; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.retry.annotation.Backoff; +import org.springframework.retry.annotation.Retryable; 
+import org.springframework.stereotype.Repository; +import uk.ac.ebi.eva.contigalias.dus2.NCBIAssemblySequencesReader; +import uk.ac.ebi.eva.contigalias.dus2.NCBIAssemblySequencesReaderFactory; +import uk.ac.ebi.eva.contigalias.dus.NCBIBrowser; +import uk.ac.ebi.eva.contigalias.dus.NCBIBrowserFactory; +import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity; +import uk.ac.ebi.eva.contigalias.utils.GzipCompress; + +@Repository("NCBISequenceDataSource") +public class NCBIAssemblySequencesDataSource implements AssemblySequencesDataSource { + + private final Logger logger = LoggerFactory.getLogger(NCBIAssemblySequencesDataSource.class); + + private final NCBIBrowserFactory factory; + + private final NCBIAssemblySequencesReaderFactory readerFactory; + + @Value("${asm.file.download.dir}") + private String asmFileDownloadDir; + + @Autowired + public NCBIAssemblySequencesDataSource(NCBIBrowserFactory factory, + NCBIAssemblySequencesReaderFactory readerFactory){ + this.factory = factory; + this.readerFactory = readerFactory; + } + + @Override + public Optional getAssemblySequencesByAccession(String accession) throws IOException, IllegalArgumentException, NoSuchAlgorithmException { + NCBIBrowser ncbiBrowser = factory.build(); + ncbiBrowser.connect(); + GzipCompress gzipCompress = new GzipCompress(); + + Optional downloadFilePath = downloadAssemblySequences(accession, ncbiBrowser); + if (!downloadFilePath.isPresent()) { + return Optional.empty(); + } + logger.info("Assembly sequence _fna.gz file downloaded successfully in: " + downloadFilePath); + // Uncompress the .gz file + Optional compressedFilePath = gzipCompress.unzip(downloadFilePath.get().toString(), asmFileDownloadDir); + if (!compressedFilePath.isPresent()){ + return Optional.empty(); + } + + AssemblySequencesEntity assemblySequencesEntity; + try (InputStream stream = new FileInputStream(compressedFilePath.get().toFile())){ + NCBIAssemblySequencesReader reader = readerFactory.build(stream, accession); + 
assemblySequencesEntity = reader.getAssemblySequenceEntity(); + logger.info("NCBI: Assembly sequences' fasta file with accession " + accession + " has been parsed successfully" ); + } finally { + try { + ncbiBrowser.disconnect(); + Files.deleteIfExists(downloadFilePath.get()); + Files.deleteIfExists(compressedFilePath.get()); // Deleting the fasta file + } catch (IOException e) { + logger.warn("Error while trying to disconnect - ncbiBrowser (assembly: " + accession + ")"); + } + } + return Optional.of(assemblySequencesEntity); + } + + + /** + * Download the assembly fna/fasta file given the accession and save it to /tmp + * After this method is called, the file will be downloaded, and the path to this file + * on your local computer will be returned*/ + @Retryable(value = Exception.class, maxAttempts = 5, backoff = @Backoff(delay = 2000, multiplier = 2)) + public Optional downloadAssemblySequences(String accession, NCBIBrowser ncbiBrowser) throws IOException { + // The same directory as the report file + Optional directory = ncbiBrowser.getGenomeReportDirectory(accession); + + if (!directory.isPresent()) { + return Optional.empty(); + } + + logger.info("NCBI directory for assembly genomic.fna download: " + directory.get()); + FTPFile ftpFile = ncbiBrowser.getAssemblyGenomicFnaFile(directory.get()); + String ftpFilePath = directory.get() + ftpFile.getName(); + Path downloadFilePath = Paths.get(asmFileDownloadDir, ftpFile.getName()); + boolean success = ncbiBrowser.downloadFTPFile(ftpFilePath, downloadFilePath, ftpFile.getSize()); + if (success) { + logger.info("NCBI assembly genomic.fna downloaded successfully (" + ftpFile.getName() + ")"); + return Optional.of(downloadFilePath); + } else { + logger.error("NCBI assembly genomic.fna could not be downloaded successfully(" + ftpFile.getName() + ")"); + return Optional.empty(); + } + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequencesReader.java 
b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequencesReader.java new file mode 100644 index 00000000..c7a974bb --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequencesReader.java @@ -0,0 +1,44 @@ +package uk.ac.ebi.eva.contigalias.dus2; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.security.NoSuchAlgorithmException; + +import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity; + +public abstract class AssemblySequencesReader { + + protected final BufferedReader reader; + + protected final String accession; + + protected AssemblySequencesEntity assemblySequencesEntity; + + + protected boolean fileParsed = false; + + + public AssemblySequencesReader(InputStreamReader inputStreamReader, String accession){ + this.reader = new BufferedReader(inputStreamReader); + this.accession = accession; + } + + public AssemblySequencesEntity getAssemblySequenceEntity() throws IOException, NoSuchAlgorithmException { + if(!fileParsed || assemblySequencesEntity == null){ + parseFile(); + } + return assemblySequencesEntity; + } + + protected abstract void parseFile() throws IOException, NullPointerException, NoSuchAlgorithmException; + + + protected abstract void parseAssemblySequenceEntity(String line); + + + + public boolean ready() throws IOException { + return reader.ready(); + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReader.java new file mode 100644 index 00000000..b979a8eb --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReader.java @@ -0,0 +1,61 @@ +package uk.ac.ebi.eva.contigalias.dus2; + +import java.io.IOException; +import java.io.InputStreamReader; +import java.security.NoSuchAlgorithmException; +import java.util.LinkedList; +import java.util.List; + +import 
uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity; +import uk.ac.ebi.eva.contigalias.entities.Sequence; +import uk.ac.ebi.eva.contigalias.utils.MD5Digest; + +public class NCBIAssemblySequencesReader extends AssemblySequencesReader { + + public NCBIAssemblySequencesReader(InputStreamReader inputStreamReader, String accession){ + super(inputStreamReader, accession); + } + + @Override + protected void parseFile() throws IOException, NullPointerException, NoSuchAlgorithmException { + if (reader == null){ + throw new NullPointerException("Cannot use AssemblySequenceReader without having a valid InputStreamReader."); + } + MD5Digest md5Digest = new MD5Digest(); + if (assemblySequencesEntity == null){ + assemblySequencesEntity = new AssemblySequencesEntity(); + } + // Setting the accession of the whole assembly file + assemblySequencesEntity.setInsdcAccession(accession); + List sequences = new LinkedList<>(); + String line = reader.readLine(); + while (line != null){ + if (line.startsWith(">")){ + Sequence sequence = new Sequence(); + String refSeq = line.substring(1, line.indexOf(' ')); + sequence.setRefseq(refSeq); + line = reader.readLine(); + StringBuilder sequenceValue = new StringBuilder(); + while (line != null && !line.startsWith(">")){ + // Looking for the sequence lines for this refseq + sequenceValue.append(line); + line = reader.readLine(); + } + String md5checksum = md5Digest.hash(sequenceValue.toString()); + sequence.setSequenceMD5(md5checksum); + sequences.add(sequence); + } + } + assemblySequencesEntity.setSequences(sequences); + fileParsed = true; + reader.close(); + } + + @Override + // Parsing a line of the file + protected void parseAssemblySequenceEntity(String line) { + // TODO: HERE WE'LL PARSE A LINE OF THE FILE (AN ENTRY) + // TODO: NOTE: THIS METHOD MIGHT NOT BE COMPLETELY USEFUL SINCE THE FILE CONTAINS ONLY + // TODO: TEXT AND A '>' SEPARATORS TO SEPARATE SEQUENCES FROM ONE ANOTHER + } +} diff --git 
a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderFactory.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderFactory.java new file mode 100644 index 00000000..a727bea1 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderFactory.java @@ -0,0 +1,18 @@ +package uk.ac.ebi.eva.contigalias.dus2; + +import java.io.InputStream; +import java.io.InputStreamReader; + +import org.springframework.stereotype.Component; + +@Component +public class NCBIAssemblySequencesReaderFactory { + + public NCBIAssemblySequencesReader build(InputStream inputStream, String accession){ + return new NCBIAssemblySequencesReader(new InputStreamReader(inputStream), accession); + } + + public NCBIAssemblySequencesReader build(InputStreamReader inputStreamReader, String accession){ + return new NCBIAssemblySequencesReader(inputStreamReader, accession); + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequencesEntity.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequencesEntity.java new file mode 100644 index 00000000..5875b48d --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequencesEntity.java @@ -0,0 +1,37 @@ +package uk.ac.ebi.eva.contigalias.entities; + + +import java.util.List; + +import javax.persistence.CascadeType; +import javax.persistence.Column; +import javax.persistence.Entity; +import javax.persistence.Id; +import javax.persistence.JoinColumn; +import javax.persistence.OneToMany; +import javax.persistence.Table; + +import io.swagger.annotations.ApiModelProperty; +import lombok.Getter; +import lombok.Setter; +import org.hibernate.annotations.LazyCollection; +import org.hibernate.annotations.LazyCollectionOption; + +@Setter +@Getter +@Table(name = "AssemblySequences") +@Entity +public class AssemblySequencesEntity { + + @Id + @Column(nullable = false) + @ApiModelProperty(value = "Assembly's INSDC accession. 
/**
 * JPA entity stored in the "Sequence" table: one sequence identified by its
 * Refseq accession, together with the MD5 checksum of its content.
 * Getters and setters are generated by Lombok.
 */
@Getter
@Setter
@Entity
@Table(name = "Sequence")
public class Sequence {

    // Primary key of the table.
    // NOTE(review): refseq alone is the identifier, so a given refseq value can
    // appear only once in the whole table — confirm this is intended.
    @Id
    @Column(nullable = false)
    @ApiModelProperty(value = "Assembly's Refseq accession.")
    private String refseq;

    // MD5 checksum of the sequence content; the column is nullable.
    @Column
    @ApiModelProperty(value = "Sequence's MD5 checksum value.")
    private String sequenceMD5;
}
interface AssemblySequencesRepository extends JpaRepository { + Optional findAssemblySequenceEntityByInsdcAccession(String accession); + + +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/SequenceRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/SequenceRepository.java new file mode 100644 index 00000000..ba9164b0 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/repo/SequenceRepository.java @@ -0,0 +1,7 @@ +package uk.ac.ebi.eva.contigalias.repo; + +import org.springframework.data.jpa.repository.JpaRepository; +import uk.ac.ebi.eva.contigalias.entities.Sequence; + +public interface SequenceRepository extends JpaRepository { +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java new file mode 100644 index 00000000..5da37b27 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java @@ -0,0 +1,81 @@ +package uk.ac.ebi.eva.contigalias.service; + +import java.io.IOException; +import java.security.NoSuchAlgorithmException; +import java.util.Optional; + +import javax.transaction.Transactional; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.stereotype.Service; +import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblySequencesDataSource; +import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity; +import uk.ac.ebi.eva.contigalias.exception.AssemblySequenceNotFoundException; +import uk.ac.ebi.eva.contigalias.exception.DuplicateAssemblySequenceException; +import uk.ac.ebi.eva.contigalias.repo.AssemblySequencesRepository; + +@Service +public class AssemblySequencesService { + + private final AssemblySequencesRepository repository; + + private final NCBIAssemblySequencesDataSource ncbiSequenceDataSource; + + private final Logger logger = LoggerFactory.getLogger(AssemblyService.class); + + + public AssemblySequencesService( + 
AssemblySequencesRepository repository, NCBIAssemblySequencesDataSource ncbiSequenceDataSource){ + this.repository = repository; + this.ncbiSequenceDataSource = ncbiSequenceDataSource; + } + + public void fetchAndInsertAssemblySequence(String accession) throws IOException, NoSuchAlgorithmException { + Optional entity = repository.findAssemblySequenceEntityByInsdcAccession(accession); + if(entity.isPresent()) + throw duplicateAssemblySequenceInsertionException(accession, entity.get()); + Optional fetchAssembly = ncbiSequenceDataSource.getAssemblySequencesByAccession(accession); + if(!fetchAssembly.isPresent()){ + throw new AssemblySequenceNotFoundException(accession); + } + if (fetchAssembly.get().getInsdcAccession() != null){ // This condition is only for testing, it'll change as soon as we add more attributes to the entity + insertAssemblySequence(fetchAssembly.get()); + logger.info("Successfully inserted assembly for accession " + accession); + }else { + logger.error("Skipping inserting assembly sequence : No name in assembly : " + accession); + } + } + + @Transactional + public void insertAssemblySequence(AssemblySequencesEntity entity) { + if (isEntityPresent(entity)) { + throw duplicateAssemblySequenceInsertionException(null, entity); + } else { + repository.save(entity); + } + } + + private boolean isEntityPresent(AssemblySequencesEntity entity) { + // TODO: THE CONDITIONS IN THIS METHOD WILL BE CHANGED WHEN WE ADD MORE ATTRIBUTES TO THE ENTITY + Optional existingAssembly = repository.findAssemblySequenceEntityByInsdcAccession(entity.getInsdcAccession()); + return existingAssembly.isPresent(); + } + + private DuplicateAssemblySequenceException duplicateAssemblySequenceInsertionException(String accession, AssemblySequencesEntity present) { + StringBuilder exception = new StringBuilder("A similar assembly Sequence already exists"); + if (accession != null){ + exception.append("\n"); + exception.append("Assembly Sequence trying to insert:"); + 
/**
 * Computes MD5 digests of text and renders them as lowercase hexadecimal.
 */
public class MD5Digest {

    /**
     * Returns the MD5 digest of {@code text} as a 32-character lowercase
     * hexadecimal string.
     *
     * <p>The text is encoded as UTF-8 before hashing, so the result is the same
     * on every platform. The hex string is built by hand because
     * {@code javax.xml.bind.DatatypeConverter} is not part of the JDK from
     * Java 11 onward.
     *
     * @param text the text to hash; must not be {@code null}
     * @return the lowercase hexadecimal MD5 digest
     * @throws NoSuchAlgorithmException if no MD5 implementation is available
     */
    public String hash(String text) throws NoSuchAlgorithmException {
        MessageDigest md = MessageDigest.getInstance("MD5");
        byte[] digest = md.digest(text.getBytes(java.nio.charset.StandardCharsets.UTF_8));

        StringBuilder hex = new StringBuilder(digest.length * 2);
        for (byte b : digest) {
            // Two hex digits per byte, high nibble first; forDigit yields lowercase.
            hex.append(Character.forDigit((b >> 4) & 0xF, 16));
            hex.append(Character.forDigit(b & 0xF, 16));
        }
        return hex.toString();
    }
}
a/src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSourceTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSourceTest.java new file mode 100644 index 00000000..d1305371 --- /dev/null +++ b/src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSourceTest.java @@ -0,0 +1,55 @@ +package uk.ac.ebi.eva.contigalias.datasource; + +import java.io.IOException; +import java.security.NoSuchAlgorithmException; +import java.util.Optional; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity; +import uk.ac.ebi.eva.contigalias.entities.Sequence; + +import static org.junit.jupiter.api.Assertions.*; +@SpringBootTest +class NCBIAssemblySequencesDataSourceTest { + + + @Autowired + NCBIAssemblySequencesDataSource dataSource; + + @BeforeEach + void setUp() { + } + + @AfterEach + void tearDown() { + } + + @Test + void getAssemblySequenceByAccession() throws IOException, NoSuchAlgorithmException, InterruptedException { + + + String accession = "GCF_000001765.3"; + //String accession2 = "GCF_000001405.31"; + Optional entity = dataSource.getAssemblySequencesByAccession(accession); + //displayAssemblySequencesEntityContent(entity.get()); + assertEquals(accession, entity.get().getInsdcAccession()); + } + + void displayAssemblySequencesEntityContent(AssemblySequencesEntity entity) throws InterruptedException { + System.out.println("ACCESSION: " + entity.getInsdcAccession()); + System.out.println("TOTAL NUMBER OF SEQUENCES: " + entity.getSequences().size()); + for (Sequence s: entity.getSequences()){ + System.out.print("REFSEQ: " + s.getRefseq() + " | "); + System.out.println("SEQUENCE_MD5: " + s.getSequenceMD5()); + Thread.sleep(1000); // Just for 
lazy and fun display :) + } + } + + @Test + void downloadAssemblySequence() { + } +} \ No newline at end of file diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderTest.java new file mode 100644 index 00000000..b652ea13 --- /dev/null +++ b/src/test/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderTest.java @@ -0,0 +1,67 @@ +package uk.ac.ebi.eva.contigalias.dus2; + +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.security.NoSuchAlgorithmException; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity; +import uk.ac.ebi.eva.contigalias.entities.Sequence; + +import static org.junit.jupiter.api.Assertions.*; + +@SpringBootTest +class NCBIAssemblySequencesReaderTest { + + private static final String ACCESSION = "GCF_000001765.3"; + + private static final String FASTA_FILE_PATH = "/tmp/genome_sequence.fna"; + private InputStreamReader streamReader; + + private InputStream stream; + + @Autowired + private NCBIAssemblySequencesReaderFactory readerFactory; + + private NCBIAssemblySequencesReader reader; + + @BeforeEach + void setUp() throws FileNotFoundException { + stream = new FileInputStream(FASTA_FILE_PATH); + streamReader = new InputStreamReader(stream); + reader = readerFactory.build(streamReader, ACCESSION); + } + + @AfterEach + void tearDown() throws IOException { + stream.close(); + streamReader.close(); + } + + @Test + void getAssemblySequencesReader() throws IOException { + assertTrue(reader.ready()); + } + + @Test + void assertParsedFastaFileValid() 
throws IOException, NoSuchAlgorithmException { + reader.parseFile(); + displayAssemblySequencesEntityContent(reader.assemblySequencesEntity); + assertEquals(ACCESSION, reader.assemblySequencesEntity.getInsdcAccession()); + } + + void displayAssemblySequencesEntityContent(AssemblySequencesEntity entity){ + System.out.println("ACCESSION: " + entity.getInsdcAccession()); + for (Sequence s: entity.getSequences()){ + System.out.print("REFSEQ: " + s.getRefseq() + " | "); + System.out.println("SEQUENCE_MD5: " + s.getSequenceMD5()); + } + } +} \ No newline at end of file diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesServiceTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesServiceTest.java new file mode 100644 index 00000000..53ba6296 --- /dev/null +++ b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesServiceTest.java @@ -0,0 +1,44 @@ +package uk.ac.ebi.eva.contigalias.service; + +import java.io.IOException; +import java.security.NoSuchAlgorithmException; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import uk.ac.ebi.eva.contigalias.repo.AssemblySequencesRepository; + +import static org.junit.jupiter.api.Assertions.*; + +@SpringBootTest +class AssemblySequencesServiceTest { + + + @Autowired + private AssemblySequencesService assemblySequencesService; + + @Autowired + private AssemblySequencesRepository assemblySequencesRepository; + + @BeforeEach + void setUp() { + } + + @AfterEach + void tearDown() { + } + + @Test + void fetchAndInsertAssemblySequence() throws IOException, NoSuchAlgorithmException { + String accession = "GCF_000001765.3"; + assemblySequencesService.fetchAndInsertAssemblySequence(accession); + 
assertNotNull(assemblySequencesRepository.findAssemblySequenceEntityByInsdcAccession(accession)); + assertEquals(accession, assemblySequencesRepository.findAssemblySequenceEntityByInsdcAccession(accession).get()); + } + + @Test + void insertAssemblySequence() { + } +} \ No newline at end of file diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/utils/GzipCompressTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/utils/GzipCompressTest.java new file mode 100644 index 00000000..a2ea9f99 --- /dev/null +++ b/src/test/java/uk/ac/ebi/eva/contigalias/utils/GzipCompressTest.java @@ -0,0 +1,18 @@ +package uk.ac.ebi.eva.contigalias.utils; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +class GzipCompressTest { + + @Test + void unzip() { + String compressedFilePath = "/tmp/GCF_000001765.3_Dpse_3.0_genomic.fna.gz"; + String outputDirPath = "/tmp"; + GzipCompress gzipCompress = new GzipCompress(); + + + assertEquals("/tmp/genome_sequence.fna", gzipCompress.unzip(compressedFilePath, outputDirPath).get().toString()); + } +} \ No newline at end of file diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/utils/MD5DigestTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/utils/MD5DigestTest.java new file mode 100644 index 00000000..1676e77d --- /dev/null +++ b/src/test/java/uk/ac/ebi/eva/contigalias/utils/MD5DigestTest.java @@ -0,0 +1,18 @@ +package uk.ac.ebi.eva.contigalias.utils; + +import java.security.NoSuchAlgorithmException; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +class MD5DigestTest { + + @Test + void hash() throws NoSuchAlgorithmException { + MD5Digest md5Digest = new MD5Digest(); + String toBeHashed = "AAA"; + String MD5Digest = "8880cd8c1fb402585779766f681b868b"; + assertEquals(MD5Digest,md5Digest.hash(toBeHashed)); + } +} \ No newline at end of file From 0a8c1e34da43877504c556b0482fd669264a0a43 Mon Sep 17 00:00:00 2001 From: Haroun 
<82417779+waterflow80@users.noreply.github.com> Date: Thu, 1 Jun 2023 12:29:32 +0100 Subject: [PATCH 4/7] Delete NCBIAssemblySequencesDataSourceTest.java --- .../NCBIAssemblySequencesDataSourceTest.java | 55 ------------------- 1 file changed, 55 deletions(-) delete mode 100644 src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSourceTest.java diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSourceTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSourceTest.java deleted file mode 100644 index d1305371..00000000 --- a/src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSourceTest.java +++ /dev/null @@ -1,55 +0,0 @@ -package uk.ac.ebi.eva.contigalias.datasource; - -import java.io.IOException; -import java.security.NoSuchAlgorithmException; -import java.util.Optional; - -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.boot.test.context.SpringBootTest; -import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity; -import uk.ac.ebi.eva.contigalias.entities.Sequence; - -import static org.junit.jupiter.api.Assertions.*; -@SpringBootTest -class NCBIAssemblySequencesDataSourceTest { - - - @Autowired - NCBIAssemblySequencesDataSource dataSource; - - @BeforeEach - void setUp() { - } - - @AfterEach - void tearDown() { - } - - @Test - void getAssemblySequenceByAccession() throws IOException, NoSuchAlgorithmException, InterruptedException { - - - String accession = "GCF_000001765.3"; - //String accession2 = "GCF_000001405.31"; - Optional entity = dataSource.getAssemblySequencesByAccession(accession); - //displayAssemblySequencesEntityContent(entity.get()); - assertEquals(accession, entity.get().getInsdcAccession()); - } - - void displayAssemblySequencesEntityContent(AssemblySequencesEntity 
entity) throws InterruptedException { - System.out.println("ACCESSION: " + entity.getInsdcAccession()); - System.out.println("TOTAL NUMBER OF SEQUENCES: " + entity.getSequences().size()); - for (Sequence s: entity.getSequences()){ - System.out.print("REFSEQ: " + s.getRefseq() + " | "); - System.out.println("SEQUENCE_MD5: " + s.getSequenceMD5()); - Thread.sleep(1000); // Just for lazy and fun display :) - } - } - - @Test - void downloadAssemblySequence() { - } -} \ No newline at end of file From dc012d91ea3bcdd4d14f0cfa0235c6faf940b813 Mon Sep 17 00:00:00 2001 From: Haroun <82417779+waterflow80@users.noreply.github.com> Date: Thu, 1 Jun 2023 12:32:22 +0100 Subject: [PATCH 5/7] Delete NCBIAssemblySequencesReaderTest.java --- .../dus2/NCBIAssemblySequencesReaderTest.java | 67 ------------------- 1 file changed, 67 deletions(-) delete mode 100644 src/test/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderTest.java diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderTest.java deleted file mode 100644 index b652ea13..00000000 --- a/src/test/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderTest.java +++ /dev/null @@ -1,67 +0,0 @@ -package uk.ac.ebi.eva.contigalias.dus2; - -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.security.NoSuchAlgorithmException; - -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.boot.test.context.SpringBootTest; -import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity; -import uk.ac.ebi.eva.contigalias.entities.Sequence; - -import static org.junit.jupiter.api.Assertions.*; - -@SpringBootTest -class 
NCBIAssemblySequencesReaderTest { - - private static final String ACCESSION = "GCF_000001765.3"; - - private static final String FASTA_FILE_PATH = "/tmp/genome_sequence.fna"; - private InputStreamReader streamReader; - - private InputStream stream; - - @Autowired - private NCBIAssemblySequencesReaderFactory readerFactory; - - private NCBIAssemblySequencesReader reader; - - @BeforeEach - void setUp() throws FileNotFoundException { - stream = new FileInputStream(FASTA_FILE_PATH); - streamReader = new InputStreamReader(stream); - reader = readerFactory.build(streamReader, ACCESSION); - } - - @AfterEach - void tearDown() throws IOException { - stream.close(); - streamReader.close(); - } - - @Test - void getAssemblySequencesReader() throws IOException { - assertTrue(reader.ready()); - } - - @Test - void assertParsedFastaFileValid() throws IOException, NoSuchAlgorithmException { - reader.parseFile(); - displayAssemblySequencesEntityContent(reader.assemblySequencesEntity); - assertEquals(ACCESSION, reader.assemblySequencesEntity.getInsdcAccession()); - } - - void displayAssemblySequencesEntityContent(AssemblySequencesEntity entity){ - System.out.println("ACCESSION: " + entity.getInsdcAccession()); - for (Sequence s: entity.getSequences()){ - System.out.print("REFSEQ: " + s.getRefseq() + " | "); - System.out.println("SEQUENCE_MD5: " + s.getSequenceMD5()); - } - } -} \ No newline at end of file From 61e166bf121940d2227543067f2e913ca5c17483 Mon Sep 17 00:00:00 2001 From: Haroun <82417779+waterflow80@users.noreply.github.com> Date: Thu, 1 Jun 2023 12:33:17 +0100 Subject: [PATCH 6/7] Delete GzipCompressTest.java --- .../contigalias/utils/GzipCompressTest.java | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 src/test/java/uk/ac/ebi/eva/contigalias/utils/GzipCompressTest.java diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/utils/GzipCompressTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/utils/GzipCompressTest.java deleted file mode 100644 
index a2ea9f99..00000000 --- a/src/test/java/uk/ac/ebi/eva/contigalias/utils/GzipCompressTest.java +++ /dev/null @@ -1,18 +0,0 @@ -package uk.ac.ebi.eva.contigalias.utils; - -import org.junit.jupiter.api.Test; - -import static org.junit.jupiter.api.Assertions.*; - -class GzipCompressTest { - - @Test - void unzip() { - String compressedFilePath = "/tmp/GCF_000001765.3_Dpse_3.0_genomic.fna.gz"; - String outputDirPath = "/tmp"; - GzipCompress gzipCompress = new GzipCompress(); - - - assertEquals("/tmp/genome_sequence.fna", gzipCompress.unzip(compressedFilePath, outputDirPath).get().toString()); - } -} \ No newline at end of file From 64a1ea2ae6458a99ec2c1a12d74f706ddcce2063 Mon Sep 17 00:00:00 2001 From: Haroun <82417779+waterflow80@users.noreply.github.com> Date: Thu, 1 Jun 2023 12:33:36 +0100 Subject: [PATCH 7/7] Delete MD5DigestTest.java --- .../eva/contigalias/utils/MD5DigestTest.java | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 src/test/java/uk/ac/ebi/eva/contigalias/utils/MD5DigestTest.java diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/utils/MD5DigestTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/utils/MD5DigestTest.java deleted file mode 100644 index 1676e77d..00000000 --- a/src/test/java/uk/ac/ebi/eva/contigalias/utils/MD5DigestTest.java +++ /dev/null @@ -1,18 +0,0 @@ -package uk.ac.ebi.eva.contigalias.utils; - -import java.security.NoSuchAlgorithmException; - -import org.junit.jupiter.api.Test; - -import static org.junit.jupiter.api.Assertions.*; - -class MD5DigestTest { - - @Test - void hash() throws NoSuchAlgorithmException { - MD5Digest md5Digest = new MD5Digest(); - String toBeHashed = "AAA"; - String MD5Digest = "8880cd8c1fb402585779766f681b868b"; - assertEquals(MD5Digest,md5Digest.hash(toBeHashed)); - } -} \ No newline at end of file