Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Seqcol #110

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
<java.version>8</java.version>
</properties>



<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
Expand Down Expand Up @@ -147,6 +149,13 @@
<version>1.2.5.RELEASE</version>
</dependency>

<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.28</version>
<scope>provided</scope>
</dependency>

</dependencies>

<build>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package uk.ac.ebi.eva.contigalias.datasource;

import java.io.IOException;
import java.util.Optional;

import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity;

public interface AssemblySequenceDataSource {

Optional<AssemblySequenceEntity> getAssemblySequenceByAccession(String accession) throws IOException;

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package uk.ac.ebi.eva.contigalias.datasource;

import java.io.IOException;
import java.security.NoSuchAlgorithmException;
import java.util.Optional;

import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity;

public interface AssemblySequencesDataSource {

Optional<AssemblySequencesEntity> getAssemblySequencesByAccession(String accession) throws IOException, NoSuchAlgorithmException;

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
package uk.ac.ebi.eva.contigalias.datasource;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Optional;

import org.apache.commons.net.ftp.FTPFile;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.retry.annotation.Backoff;
import org.springframework.retry.annotation.Retryable;
import org.springframework.stereotype.Repository;
import uk.ac.ebi.eva.contigalias.dus2.NCBIAssemblySequenceReader;
import uk.ac.ebi.eva.contigalias.dus2.NCBIAssemblySequenceReaderFactory;
import uk.ac.ebi.eva.contigalias.dus.NCBIBrowser;
import uk.ac.ebi.eva.contigalias.dus.NCBIBrowserFactory;
import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity;
import uk.ac.ebi.eva.contigalias.utils.GzipCompress;

@Repository("NCBISequenceDataSource")
public class NCBIAssemblySequenceDataSource implements AssemblySequenceDataSource{

private final Logger logger = LoggerFactory.getLogger(NCBIAssemblySequenceDataSource.class);

private final NCBIBrowserFactory factory;

private final NCBIAssemblySequenceReaderFactory readerFactory;

@Value("${asm.file.download.dir}")
private String asmFileDownloadDir;

@Autowired
public NCBIAssemblySequenceDataSource(NCBIBrowserFactory factory,
NCBIAssemblySequenceReaderFactory readerFactory){
this.factory = factory;
this.readerFactory = readerFactory;
}

@Override
public Optional<AssemblySequenceEntity> getAssemblySequenceByAccession(String accession) throws IOException, IllegalArgumentException {
NCBIBrowser ncbiBrowser = factory.build();
ncbiBrowser.connect();
GzipCompress gzipCompress = new GzipCompress();

Optional<Path> downloadFilePath = downloadAssemblySequence(accession, ncbiBrowser);
if (!downloadFilePath.isPresent()) {
return Optional.empty();
}
logger.info("Assembly sequence _fna.gz file downloaded successfully in: " + downloadFilePath);
// Uncompress the .gz file
Optional<Path> uncompressedFilePath = gzipCompress.unzip(downloadFilePath.get().toString(), asmFileDownloadDir);
if (!uncompressedFilePath.isPresent()){
return Optional.empty();
}

AssemblySequenceEntity assemblySequenceEntity;
try (InputStream stream = new FileInputStream(uncompressedFilePath.get().toFile())){
NCBIAssemblySequenceReader reader = readerFactory.build(stream);
assemblySequenceEntity = reader.getAssemblySequenceEntity();
//TODO : The logger info will be canged when we add more attributes to the entity and we parse the whole file info
logger.info("NCBI: Name of the sequence in " + accession + " : " + assemblySequenceEntity.getName());
} finally {
try {
ncbiBrowser.disconnect();
//Files.deleteIfExists(downloadFilePath.get());
} catch (IOException e) {
logger.warn("Error while trying to disconnect - ncbiBrowser (assembly: " + accession + ")");
}
}
return Optional.of(assemblySequenceEntity);
}


/**
* Download the assembly fna/fasta file given the accession and save it to /tmp
* After this method is called, the file will be downloaded, and the path to this file
* on your local computer will be returned*/
@Retryable(value = Exception.class, maxAttempts = 5, backoff = @Backoff(delay = 2000, multiplier = 2))
public Optional<Path> downloadAssemblySequence(String accession, NCBIBrowser ncbiBrowser) throws IOException {
// The same directory as the report file
Optional<String> directory = ncbiBrowser.getGenomeReportDirectory(accession);

if (!directory.isPresent()) {
return Optional.empty();
}

logger.info("NCBI directory for assembly genomic.fna download: " + directory.get());
FTPFile ftpFile = ncbiBrowser.getAssemblyGenomicFnaFile(directory.get());
String ftpFilePath = directory.get() + ftpFile.getName();
Path downloadFilePath = Paths.get(asmFileDownloadDir, ftpFile.getName());
boolean success = ncbiBrowser.downloadFTPFile(ftpFilePath, downloadFilePath, ftpFile.getSize());
if (success) {
logger.info("NCBI assembly genomic.fna downloaded successfully (" + ftpFile.getName() + ")");
return Optional.of(downloadFilePath);
} else {
logger.error("NCBI assembly genomic.fna could not be downloaded successfully(" + ftpFile.getName() + ")");
return Optional.empty();
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
package uk.ac.ebi.eva.contigalias.datasource;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.security.NoSuchAlgorithmException;
import java.util.Optional;

import org.apache.commons.net.ftp.FTPFile;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.retry.annotation.Backoff;
import org.springframework.retry.annotation.Retryable;
import org.springframework.stereotype.Repository;
import uk.ac.ebi.eva.contigalias.dus2.NCBIAssemblySequencesReader;
import uk.ac.ebi.eva.contigalias.dus2.NCBIAssemblySequencesReaderFactory;
import uk.ac.ebi.eva.contigalias.dus.NCBIBrowser;
import uk.ac.ebi.eva.contigalias.dus.NCBIBrowserFactory;
import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity;
import uk.ac.ebi.eva.contigalias.utils.GzipCompress;

@Repository("NCBISequenceDataSource")
public class NCBIAssemblySequencesDataSource implements AssemblySequencesDataSource {

private final Logger logger = LoggerFactory.getLogger(NCBIAssemblySequencesDataSource.class);

private final NCBIBrowserFactory factory;

private final NCBIAssemblySequencesReaderFactory readerFactory;

@Value("${asm.file.download.dir}")
private String asmFileDownloadDir;

@Autowired
public NCBIAssemblySequencesDataSource(NCBIBrowserFactory factory,
NCBIAssemblySequencesReaderFactory readerFactory){
this.factory = factory;
this.readerFactory = readerFactory;
}

@Override
public Optional<AssemblySequencesEntity> getAssemblySequencesByAccession(String accession) throws IOException, IllegalArgumentException, NoSuchAlgorithmException {
NCBIBrowser ncbiBrowser = factory.build();
ncbiBrowser.connect();
GzipCompress gzipCompress = new GzipCompress();

Optional<Path> downloadFilePath = downloadAssemblySequences(accession, ncbiBrowser);
if (!downloadFilePath.isPresent()) {
return Optional.empty();
}
logger.info("Assembly sequence _fna.gz file downloaded successfully in: " + downloadFilePath);
// Uncompress the .gz file
Optional<Path> compressedFilePath = gzipCompress.unzip(downloadFilePath.get().toString(), asmFileDownloadDir);
if (!compressedFilePath.isPresent()){
return Optional.empty();
}

AssemblySequencesEntity assemblySequencesEntity;
try (InputStream stream = new FileInputStream(compressedFilePath.get().toFile())){
NCBIAssemblySequencesReader reader = readerFactory.build(stream, accession);
assemblySequencesEntity = reader.getAssemblySequenceEntity();
logger.info("NCBI: Assembly sequences' fasta file with accession " + accession + " has been parsed successfully" );
} finally {
try {
ncbiBrowser.disconnect();
Files.deleteIfExists(downloadFilePath.get());
Files.deleteIfExists(compressedFilePath.get()); // Deleting the fasta file
} catch (IOException e) {
logger.warn("Error while trying to disconnect - ncbiBrowser (assembly: " + accession + ")");
}
}
return Optional.of(assemblySequencesEntity);
}


/**
* Download the assembly fna/fasta file given the accession and save it to /tmp
* After this method is called, the file will be downloaded, and the path to this file
* on your local computer will be returned*/
@Retryable(value = Exception.class, maxAttempts = 5, backoff = @Backoff(delay = 2000, multiplier = 2))
public Optional<Path> downloadAssemblySequences(String accession, NCBIBrowser ncbiBrowser) throws IOException {
// The same directory as the report file
Optional<String> directory = ncbiBrowser.getGenomeReportDirectory(accession);

if (!directory.isPresent()) {
return Optional.empty();
}

logger.info("NCBI directory for assembly genomic.fna download: " + directory.get());
FTPFile ftpFile = ncbiBrowser.getAssemblyGenomicFnaFile(directory.get());
String ftpFilePath = directory.get() + ftpFile.getName();
Path downloadFilePath = Paths.get(asmFileDownloadDir, ftpFile.getName());
boolean success = ncbiBrowser.downloadFTPFile(ftpFilePath, downloadFilePath, ftpFile.getSize());
if (success) {
logger.info("NCBI assembly genomic.fna downloaded successfully (" + ftpFile.getName() + ")");
return Optional.of(downloadFilePath);
} else {
logger.error("NCBI assembly genomic.fna could not be downloaded successfully(" + ftpFile.getName() + ")");
return Optional.empty();
}
}
}
11 changes: 11 additions & 0 deletions src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIBrowser.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ public class NCBIBrowser extends PassiveAnonymousFTPClient {

public static final String PATH_GENOMES_ALL = "/genomes/all/";


private String ftpProxyHost;

private Integer ftpProxyPort;
Expand Down Expand Up @@ -148,4 +149,14 @@ public FTPFile getNCBIAssemblyReportFile(String directoryPath) throws IOExceptio
return assemblyReport.orElseThrow(() -> new AssemblyNotFoundException("Assembly Report File not present in given directory: " + directoryPath));
}

/**
* Return the fna/fasta file that will be downloaded (a pointer to that FtpFile)*/
public FTPFile getAssemblyGenomicFnaFile(String directoryPath) throws IOException {
Stream<FTPFile> ftpFileStream = Arrays.stream(super.listFiles(directoryPath));
Stream<FTPFile> assemblyReportFilteredStream = ftpFileStream.filter(f -> f.getName().contains("genomic.fna.gz") && !f.getName().contains("from"));
Optional<FTPFile> assemblyReport = assemblyReportFilteredStream.findFirst();

return assemblyReport.orElseThrow(() -> new AssemblyNotFoundException("Assembly Genomic Fna (Fasta) File not present in given directory: " + directoryPath));
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package uk.ac.ebi.eva.contigalias.dus2;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity;

public abstract class AssemblySequenceReader {

protected final BufferedReader reader;

protected AssemblySequenceEntity assemblySequenceEntity;

protected boolean fileParsed = false;


public AssemblySequenceReader(InputStreamReader inputStreamReader){
this.reader = new BufferedReader(inputStreamReader);
}

public AssemblySequenceEntity getAssemblySequenceEntity() throws IOException {
if(!fileParsed || assemblySequenceEntity == null){
parseFile();
}
return assemblySequenceEntity;
}

protected abstract void parseFile() throws IOException, NullPointerException;


protected abstract void parseAssemblySequenceEntity(String line);



public boolean ready() throws IOException {
return reader.ready();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package uk.ac.ebi.eva.contigalias.dus2;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.security.NoSuchAlgorithmException;

import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity;

public abstract class AssemblySequencesReader {

protected final BufferedReader reader;

protected final String accession;

protected AssemblySequencesEntity assemblySequencesEntity;


protected boolean fileParsed = false;


public AssemblySequencesReader(InputStreamReader inputStreamReader, String accession){
this.reader = new BufferedReader(inputStreamReader);
this.accession = accession;
}

public AssemblySequencesEntity getAssemblySequenceEntity() throws IOException, NoSuchAlgorithmException {
if(!fileParsed || assemblySequencesEntity == null){
parseFile();
}
return assemblySequencesEntity;
}

protected abstract void parseFile() throws IOException, NullPointerException, NoSuchAlgorithmException;


protected abstract void parseAssemblySequenceEntity(String line);



public boolean ready() throws IOException {
return reader.ready();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package uk.ac.ebi.eva.contigalias.dus2;

import java.io.IOException;
import java.io.InputStreamReader;

public class NCBIAssemblySequenceReader extends AssemblySequenceReader{

public NCBIAssemblySequenceReader(InputStreamReader inputStreamReader){
super(inputStreamReader);
}

@Override
protected void parseFile() throws IOException, NullPointerException {
if (reader == null){
throw new NullPointerException("Cannot use AssemblySequenceReader without having a valid InputStreamReader.");
}
// TODO: HERE WE'LL EXTARACT THE .gz FILE AND PARSE THE fna FILE
}

@Override
// Parsing a line of the file
protected void parseAssemblySequenceEntity(String line) {
// TODO: HERE WE'LL PARSE A LINE OF THE FILE (AN ENTRY)
// TODO: NOTE: THIS METHOD MIGHT NOT BE COMPLETELY USEFUL SINCE THE FILE CONTAINS ONLY
// TODO: TEXT AND A '>' SEPARATORS TO SEPARATE SEQUENCES FROM ONE ANOTHER
}
}
Loading