-
Notifications
You must be signed in to change notification settings - Fork 1
/
get_stats_per_stage.py
executable file
·78 lines (65 loc) · 2.27 KB
/
get_stats_per_stage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import pandas as pd
from io import StringIO
from subprocess import Popen, PIPE, check_output
from difflib import SequenceMatcher
from glob import glob
from joblib import Parallel, delayed
from tqdm import tqdm
import sys
import os
import gzip
import mimetypes
COLS = 'prefix type num_seqs min_len avg_len max_len'.split()
def get_prefix(lis):
first_el = lis.pop(0)
for i in lis:
match = SequenceMatcher(None, first_el, i)
match = match.find_longest_match(0,len(first_el), 0, len(i))
first_el = first_el[match.a: match.a + match.size]
return first_el
def loop(d):
line = 'seqkit stats %s*.fast*' % d
#seqkit = Popen(line, shell=True, stdout=PIPE)
#o, e = seqkit.communicate()
files = glob('%s/*.fast*' % d)
o = check_output(['seqkit', 'stats'] + [f for f in files if gz_size(f)])
df = pd.read_table(StringIO(o.decode()), delim_whitespace=True,
thousands=',')
prefix = get_prefix(df.file)
df['prefix'] = prefix
df['type'] = df.file.apply(lambda x: x.replace(prefix, ''))
df = df.reindex(columns=COLS)
df = pd.pivot_table(df, values=COLS[2:], index=['prefix'], columns=['type']
)
return df
def writedf(df, col):
df.loc[:,col].to_csv('%s_%s.tsv' % (sys.argv[1], col), sep='\t')
def gz_size(fname):
"""
Taken and modified from https://stackoverflow.com/questions/37874936/how-to-check-empty-gzip-file-in-python
:param fname:
:return:
"""
type1, type2 = mimetypes.guess_type(fname)
if type2 == 'gzip':
with gzip.open(fname, 'rb') as f:
try:
size = f.seek(0, whence=2)
except EOFError:
with open('EOFERROR.txt', 'a') as A:
A.write('%s\n' % fname)
size = 0
else:
size = os.stat(fname).st_size
if size == 0:
with open('emptyfiles.txt', 'a') as A:
A.write('%s\n' % fname)
return size != 0
dirs = glob('%s*/' % sys.argv[1])
cpus = int(sys.argv[2])
alldfs = Parallel(n_jobs=cpus)(delayed(loop)(d) for d in tqdm(
dirs, desc='Getting stats'))
df = pd.concat(alldfs)
cols = 'min_len avg_len max_len num_seqs'.split()
Parallel(n_jobs=cpus)(delayed(writedf)(df, col) for col in tqdm(
cols, desc='Writing to file'))