improve comparison between metadata_files.
pchaumeil committed Feb 20, 2020
1 parent e4cc4e7 commit cdb325a
Showing 5 changed files with 160 additions and 42 deletions.
33 changes: 26 additions & 7 deletions bin/gtdb_migration_tk
@@ -57,12 +57,13 @@ def print_help():
Information from Nomenclatural resources:
lpsn -> Process steps for LPSN. [In Dev]
lpsn -> Process steps for LPSN.
bacdive -> Process steps for BacDive. [In Dev]
strains -> Set of tools to combine information from LPSN, DSMZ and Straininfo.
Test suite for data validation:
overview -> Compare the Metadata file from the previous version with the new one.
overview -> Compare the Metadata file from the previous version with the new one.
compare_field -> Compare a specific metadata field between two metadata files.
Use: gtdb_migration_tk <command> -h for command specific help.
@@ -89,7 +90,7 @@ if __name__ == '__main__':
lpsn_parser_lpsn_wf = lpsn_subparser.add_parser('lpsn_wf',
add_help=False,
formatter_class=CustomHelpFormatter,
help='Full Pipeline Pull HTML -> Parse HTML')
help='Full Pipeline Pull HTML and Parse HTML')
lpsn_parser_lpsn_wf.add_argument(
'output_dir', help='Output directory.')
lpsn_parser_lpsn_wf.add_argument(
@@ -110,7 +111,7 @@ if __name__ == '__main__':
help='Parse HTML files.')

lpsn_parser_parse_html.add_argument(
'input directory', help='Directory containing all genus HTML files.')
'input_dir', help='Directory containing all genus HTML files.')
lpsn_parser_parse_html.add_argument(
'output_dir', help='Output directory.')
lpsn_parser_parse_html.add_argument(
@@ -189,12 +190,30 @@ if __name__ == '__main__':

overview_parser = subparsers.add_parser('overview',
formatter_class=CustomHelpFormatter,
description='Compare the Metadata file from the previous version with the new one.')
help='Compare the Metadata file from the previous version with the new one.')
overview_parser.add_argument(
'--previous_metadata_file', help='file indicating metadata of each genome in previous GTDB version.')
'--previous_metadata_file', help='file indicating metadata of each genome in previous GTDB version.',required=True)
overview_parser.add_argument(
'--new_metadata_file', help='file indicating metadata of each genome in latest GTDB version.')
'--new_metadata_file', help='file indicating metadata of each genome in latest GTDB version.',required=True)
overview_parser.add_argument(
'--only_ncbi', help='Only compare genomes with NCBI accessions (skip user genomes).', action='store_true')
overview_parser.add_argument(
'--silent', help="suppress output", action='store_true')

metafield_parser = subparsers.add_parser('compare_field',
formatter_class=CustomHelpFormatter,
help='Compare a specific metadata field between two metadata files.')
metafield_parser.add_argument(
'--previous_metadata_file', help='file indicating metadata of each genome in previous GTDB version.',required=True)
metafield_parser.add_argument(
'--new_metadata_file', help='file indicating metadata of each genome in latest GTDB version.',required=True)
metafield_parser.add_argument(
'--field_of_interest', help='common field to compare between files.',required=True)
metafield_parser.add_argument(
'--output_file', help='Output file.',required=True)
metafield_parser.add_argument(
'--only_ncbi', help='Only compare genomes with NCBI accessions (skip user genomes).', action='store_true')
metafield_parser.add_argument(
'--silent', help="suppress output", action='store_true')

# get and check options
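For reference, an invocation of the new compare_field subcommand, based on the argument definitions above, would look something like this (the metadata file names and the field name are illustrative):

    gtdb_migration_tk compare_field \
        --previous_metadata_file gtdb_metadata_prev.tsv \
        --new_metadata_file gtdb_metadata_new.tsv \
        --field_of_interest checkm_completeness \
        --output_file field_comparison.tsv \
        --only_ncbi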
35 changes: 31 additions & 4 deletions gtdb_migration_tk/bacdive.py
@@ -22,6 +22,7 @@
import io
import re
import logging
import time

from requests.auth import HTTPBasicAuth
from unidecode import unidecode
@@ -52,8 +53,21 @@ def getGenera(self, outfile, urlreq=None):
response = requests.get(
'https://bacdive.dsmz.de/api/pnu/genus/', headers=self.headers, auth=self.credentials)
else:
response = requests.get(
urlreq, headers=self.headers, auth=self.credentials)
while True:
    try:
        print(urlreq)
        response = requests.get(
            urlreq, headers=self.headers, auth=self.credentials)
    except requests.exceptions.ConnectionError:
        print('Connection error for {}, retrying in 10s.'.format(urlreq))
        time.sleep(10)
        continue
    except Exception as e:
        print(e)
        print('Request failed for {}, retrying in 10s.'.format(urlreq))
        time.sleep(10)
        continue
    break

if response.status_code == 200:
results = response.json()
@@ -87,8 +101,21 @@ def getSpecies(self, outfile_species, outfile_strains, dictgenus, urlreq=None):
response = requests.get(
'https://bacdive.dsmz.de/api/pnu/species/', headers=self.headers, auth=self.credentials)
else:
response = requests.get(
urlreq, headers=self.headers, auth=self.credentials)
while True:
    try:
        print(urlreq)
        response = requests.get(
            urlreq, headers=self.headers, auth=self.credentials)
    except requests.exceptions.ConnectionError:
        print('Connection error for {}, retrying in 10s.'.format(urlreq))
        time.sleep(10)
        continue
    except Exception as e:
        print(e)
        print('Request failed for {}, retrying in 10s.'.format(urlreq))
        time.sleep(10)
        continue
    break

if response.status_code == 200:
results = response.json()
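The retry loop is duplicated verbatim in getGenera and getSpecies, so a small helper could factor it out. A minimal sketch, assuming the same headers and credentials attributes as above (like the original, it retries indefinitely):

    import time
    import requests

    def get_with_retry(url, headers, credentials, delay=10):
        # Retry a GET until it returns a response; sleep between attempts.
        while True:
            try:
                return requests.get(url, headers=headers, auth=credentials)
            except requests.exceptions.ConnectionError:
                print('Connection error for {}, retrying in {}s.'.format(url, delay))
            except Exception as e:
                print('{}; retrying {} in {}s.'.format(e, url, delay))
            time.sleep(delay)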
12 changes: 6 additions & 6 deletions gtdb_migration_tk/lpsn.py
@@ -148,13 +148,13 @@ def download_lpsn_html(self):

# Download pages listing all genus in LPSN
print('Beginning file download lpsn ...')
url = 'http://www.bacterio.net/-ac.html'
url = 'http://www.bacterio.net/archive/-ac.html'
urllib.request.urlretrieve(url, os.path.join(self.outdir, 'ac.html'))
url = 'http://www.bacterio.net/-dl.html'
url = 'http://www.bacterio.net/archive/-dl.html'
urllib.request.urlretrieve(url, os.path.join(self.outdir, 'dl.html'))
url = 'http://www.bacterio.net/-mr.html'
url = 'http://www.bacterio.net/archive/-mr.html'
urllib.request.urlretrieve(url, os.path.join(self.outdir, 'mr.html'))
url = 'http://www.bacterio.net/-sz.html'
url = 'http://www.bacterio.net/archive/-sz.html'
urllib.request.urlretrieve(url, os.path.join(self.outdir, 'sz.html'))

# Parse html pages lising all genus
@@ -187,9 +187,9 @@ def download_lpsn_html(self):
for line in gsl:
genus = line.strip()
try:
print(os.path.join('http://www.bacterio.net', genus))
print(os.path.join('http://www.bacterio.net/archive/', genus))
urllib.request.urlretrieve(os.path.join(
'http://www.bacterio.net', genus), os.path.join(self.outdir, 'genus_html', genus))
'http://www.bacterio.net/archive/', genus), os.path.join(self.outdir, 'genus_html', genus))
except Exception:
failed_html_file.write('{}\n'.format(genus))
failed_html_file.close()
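One caveat in the genus download loop: os.path.join is used to build URLs, which happens to work on POSIX systems but would emit backslashes on Windows. urllib.parse.urljoin is the portable alternative; a small sketch (the genus file name is illustrative):

    from urllib.parse import urljoin

    base = 'http://www.bacterio.net/archive/'
    genus = 'abiotrophia.html'  # illustrative; read from the genus list in practice
    url = urljoin(base, genus)  # http://www.bacterio.net/archive/abiotrophia.html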
14 changes: 12 additions & 2 deletions gtdb_migration_tk/main.py
@@ -46,7 +46,7 @@ def pull_html(self, options):
"""Pull all genus.html files."""
make_sure_path_exists(options.output_dir)
p = LPSN(options.output_dir)
p.pull_html()
p.download_lpsn_html()

def parse_html(self, options):
"""Parse all html files."""
@@ -80,7 +80,15 @@ def generate_type_table(self, options):
def compare_metadata(self, options):
p = Tools()
p.compare_metadata(options.previous_metadata_file,
options.new_metadata_file)
options.new_metadata_file,
options.only_ncbi)

def compare_selected_data(self, options):
p = Tools()
p.compare_selected_data(options.previous_metadata_file,
options.new_metadata_file,
options.field_of_interest,
options.output_file, options.only_ncbi)

def parse_options(self, options):
"""Parse user options and call the correct pipeline(s)"""
@@ -104,6 +112,8 @@ def parse_options(self, options):
self.generate_type_table(options)
elif options.subparser_name == 'overview':
self.compare_metadata(options)
elif options.subparser_name == 'compare_field':
self.compare_selected_data(options)
else:
self.logger.error('Unknown command: ' +
options.subparser_name + '\n')
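As subcommands accumulate, the if/elif chain in parse_options grows with each one; a dictionary dispatch is a common alternative. A sketch only, not the project's actual code:

    def parse_options(self, options):
        handlers = {
            'overview': self.compare_metadata,
            'compare_field': self.compare_selected_data,
            # ... remaining subcommands registered the same way
        }
        handler = handlers.get(options.subparser_name)
        if handler is None:
            self.logger.error('Unknown command: ' + options.subparser_name + '\n')
        else:
            handler(options)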
108 changes: 85 additions & 23 deletions gtdb_migration_tk/tools.py
@@ -48,56 +48,65 @@ def __init__(self):
"""Initialization."""
self.logger = logging.getLogger()

def compare_metadata(self, old_meta_file, new_meta_file):

old_delimiter = None
def select_delimiter(self, metafile):
# Parse TSV or CSV file
for line in open(old_meta_file):
for line in open(metafile):
if len(line.split('\t')) >= len(line.split(',')):
old_delimiter = '\t'
break
return '\t'
else:
old_delimiter = ','
break
return ','

def compare_metadata(self, old_meta_file, new_meta_file, only_ncbi=False):

old_delimiter = self.select_delimiter(old_meta_file)

old_nested_dict = {}
with open(old_meta_file, 'r') as omf:
old_headers = omf.readline().split(old_delimiter)
if old_delimiter == ',':
for line in csv.reader(omf):
old_nested_dict[line[0]] = {}
for i, j in enumerate(line):
old_nested_dict[line[0]][old_headers[i]] = str(j)
if (only_ncbi and not line[0].startswith('U_')) or not only_ncbi:
old_nested_dict[line[0]] = {}
for i, j in enumerate(line):
old_nested_dict[line[0]][old_headers[i]] = str(j)
else:
for raw_line in omf:
line = raw_line.strip('\n').split('\t')
old_nested_dict[line[0]] = {}
for i, j in enumerate(line):
old_nested_dict[line[0]][old_headers[i]] = str(j)
if (only_ncbi and not line[0].startswith('U_')) or not only_ncbi:
old_nested_dict[line[0]] = {}
for i, j in enumerate(line):
old_nested_dict[line[0]][old_headers[i]] = str(j)

self.logger.info('{} parsed'.format(old_meta_file))

new_delimiter = None
# Parse TSV or CSV file
for line in open(old_meta_file):
if len(line.split('\t')) >= len(line.split(',')):
new_delimiter = '\t'
break
else:
new_delimiter = ','
break
new_delimiter = self.select_delimiter(new_meta_file)

header_summary = {}
new_nested_dict = {}
# In the new metadata file
# we check that the genome id and the column names exist.
# For each common column name we compare the value for each common
# genome and add 1 if they differ.
number_of_genomes = 0
with open(new_meta_file, 'r') as nmf:
new_headers = nmf.readline().split(new_delimiter)
if new_delimiter == ',':
for line in csv.reader(nmf):

if line[0] in old_nested_dict:
number_of_genomes += 1
for i, j in enumerate(line):
if new_headers[i] in old_headers:
if str(j) != old_nested_dict.get(line[0]).get(new_headers[i]):
header_summary.setdefault(
new_headers[i], []).append(1)
else:
header_summary.setdefault(
new_headers[i], []).append(0)
for raw_line in nmf:
line = raw_line.strip('\n').split('\t')
if line[0] in old_nested_dict:
number_of_genomes += 1
for i, j in enumerate(line):
if new_headers[i] in old_headers:
if str(j) != old_nested_dict.get(line[0]).get(new_headers[i]):
@@ -124,10 +133,63 @@ def compare_metadata(self, old_meta_file, new_meta_file):
set(new_headers)
new_columns = set(new_headers) - set(old_headers)

print("Based on {} common genomes.".format(number_of_genomes))

print("Deprecated columns:")
for removed_column in removed_columns:
print("\t- {}".format(removed_column))

print("New columns:")
for new_column in new_columns:
print("\t- {}".format(new_column))

def compare_selected_data(self, old_meta_file, new_meta_file, metafield, output_file, only_ncbi=False):
old_delimiter = self.select_delimiter(old_meta_file)
old_nested_dict = {}
with open(old_meta_file, 'r') as omf:
old_headers = omf.readline().strip('\n').split(old_delimiter)
if metafield not in old_headers:
self.logger.error(f'{metafield} is not in {old_meta_file}')
sys.exit()

if old_delimiter == ',':
for line in csv.reader(omf):
if (only_ncbi and not line[0].startswith('U_')) or not only_ncbi:
old_nested_dict[line[0]] = str(
line[old_headers.index(metafield)])
else:
for raw_line in omf:
line = raw_line.strip('\n').split('\t')
if (only_ncbi and not line[0].startswith('U_')) or not only_ncbi:
old_nested_dict[line[0]] = str(
line[old_headers.index(metafield)])

new_delimiter = self.select_delimiter(new_meta_file)
new_nested_dict = {}
with open(new_meta_file, 'r') as nmf:
new_headers = nmf.readline().strip('\n').split(new_delimiter)
if metafield not in new_headers:
self.logger.error(f'{metafield} is not in {new_meta_file}')
sys.exit()
if new_delimiter == ',':
for line in csv.reader(nmf):
if line[0] in old_nested_dict:
new_nested_dict[line[0]] = str(
line[new_headers.index(metafield)])
else:
    for raw_line in nmf:
        line = raw_line.strip('\n').split('\t')
        if line[0] in old_nested_dict:
            new_nested_dict[line[0]] = str(
                line[new_headers.index(metafield)])

outf = open(output_file, 'w')
outf.write('genome_id\told_value\tnew_value\tsimilarity\n')
for k, v in old_nested_dict.items():
    similarity = 'Identical'
    if v != new_nested_dict.get(k):
        similarity = 'Different'
    outf.write('{}\n'.format(
        '\t'.join([k, str(v), str(new_nested_dict.get(k)), similarity])))
outf.close()

self.logger.info('Comparison written to {}'.format(output_file))
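select_delimiter inspects only the header line, counting tabs against commas. The standard library's csv.Sniffer offers an equivalent guess; a sketch of a drop-in helper (behavior on ambiguous headers may differ from the hand-rolled heuristic):

    import csv

    def select_delimiter(metafile):
        # Guess the delimiter from the header line using csv.Sniffer.
        with open(metafile) as f:
            header = f.readline()
        try:
            return csv.Sniffer().sniff(header, delimiters='\t,').delimiter
        except csv.Error:
            return '\t'  # fall back to TSV, matching the original's tie-breaking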
