-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfasta_get.py
executable file
·144 lines (107 loc) · 4.62 KB
/
fasta_get.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#! /usr/bin/python
## fasta_get.py
## Sandip Chatterjee
## v3, January 27, 2014
##
## Usage:
## for new download:
## >>python fasta-get.py
##
## for resuming directory download (from previous run):
## >>python fasta-get.py [FILENAME].log
##
## for obtaining FASTA protein sequence files from the NCBI RefSeq complete database, located at ftp://ftp.ncbi.nlm.nih.gov/refseq/release/complete/
## 1) downloads FASTA.faa.gz files
## 2) extracts .gz files
## 3) concatenates all files into one large file and removes .gz files (order of files concatenated is not necessarily maintained)
import gzip
from ftplib import FTP
import os
import time
import sys
import shutil
def main():
startTime = time.time()
if len(sys.argv) == 1:
resumeFlag = False
elif len(sys.argv) == 2:
resumeFlag = True
if ".log" not in sys.argv[1]:
print 'Proper usage: >>fasta_get.py [*.log] ## resume from last downloaded file in this log file'
sys.exit()
else:
print "Wrong number of arguments"
print 'Proper usage: >>fasta-get.py ## run without resuming'
print 'Proper usage: >>fasta-get.py *.log ## resume from last downloaded file in this log file'
sys.exit()
########################################################################
####### Change this variable to change RefSeq download directory #######
########################################################################
refseq_paths = []
refseq_paths = [
'viral',
'fungi',
'mitochondrion',
'plasmid',
'protozoa',
'microbial', ##soon to be replaced by the following 2 terms
# 'archaea', ##available May 2014
# 'bacteria', ##available May 2014
]
########################################################################
##### ^ Change this variable to change RefSeq download directory ^ #####
########################################################################
refseq_paths_str = '_'.join(refseq_paths)
refseq_paths = ['/'+x+'/' for x in refseq_paths]
file_paths = []
ftp = FTP('ftp.ncbi.nlm.nih.gov')
ftp.login()
for refseq_path in refseq_paths:
print 'Accessing NCBI RefSeq'+refseq_path+' at ftp://ftp.ncbi.nlm.nih.gov/refseq/release'+refseq_path
ftp.cwd('/refseq/release'+refseq_path)
directory_listing = []
ftp.retrlines("LIST",directory_listing.append) ## retrieve directory, append to list dirListing
for directory_line in directory_listing:
file_paths.append('/refseq/release'+refseq_path+directory_line.split(None, 8)[-1])
file_paths = [file_path for file_path in file_paths if '.faa.gz' in file_path]
fasta_master_file = time.strftime("%m%d%y_%H%M%S")+'_refseq_'+refseq_paths_str+'.fasta'
with open(fasta_master_file, 'a') as extracted_file, open(fasta_master_file+'.log', 'a') as extracted_file_log:
num_files_processed = 0
for count, file_path in enumerate(file_paths):
file_name = file_path.split('/')[-1] ## temporary gzip file
with open(file_name, 'wb') as temp_gz_file:
ftp.retrbinary('RETR '+file_path, temp_gz_file.write)
gzipFile = gzip.open(file_name, 'rb')
extracted_file.write(gzipFile.read())
gzipFile.close()
os.remove(file_name) ## delete temporary gzip file
num_files_processed = count+1
extracted_file_log.write(file_name+" - Item #"+str(num_files_processed)+' of '+str(len(file_paths))+' downloaded, extracted, and appended to master file\n')
end_status = "Downloaded, extracted, and concatenated "+str(num_files_processed)+" files in "+str((time.time()-startTime)/60)+" minutes\n"
extracted_file_log.write(end_status)
extracted_file_log.write("(DATE_CURRENTTIME_refseq_*.fasta)")
print end_status
print "Saved compiled data to "+fasta_master_file
print "(DATE_CURRENTTIME_refseq_*.fasta)"
ftp.quit()
if __name__ == '__main__':
main()
# if resumeFlag:
# ## read in file names from given file log
# logfileNameList = []
# fileLog = open(sys.argv[1], 'rb')
# for line in fileLog:
# logfileNameList.append(line.split()[0])
# logfileNameList = filter(lambda x: '.protein.faa.gz' in x,logfileNameList) ## only keep lines that contain valid filenames of interest...
# fileLog.close()
# ## concatenate old fasta file and new (empty) fasta file
# oldFastaFile = open(sys.argv[1].strip('.log'),'rb')
# shutil.copyfileobj(oldFastaFile,extractedFile)
# oldFastaFile.close()
# ## update log file
# fileLog = open(sys.argv[1], 'rb')
# with open(newFileName+'.log', 'wb') as extractedFileLog:
# shutil.copyfileobj(fileLog,extractedFileLog)
# fileLog.close()
# ## remove previously downloaded filenames from new filename (removes order from list due to set operations)
# fileNameList = list(set(fileNameList)-set(logfileNameList))