diff --git a/bin/gtdb_migration_tk b/bin/gtdb_migration_tk
index 84378bd..f745064 100755
--- a/bin/gtdb_migration_tk
+++ b/bin/gtdb_migration_tk
@@ -57,12 +57,13 @@ def print_help():

   Information from Nomenclatural resources:
-    lpsn -> Process steps for LPSN. [In Dev]
+    lpsn -> Process steps for LPSN.
     bacdive -> Process steps for BacDive. [In Dev]
     strains -> Set of tools to combined information from LPSN,DSMZ and Straininfo.

   Test suite for data validation:
-    overview -> Compare the Metadata file from the previous version with the new one.
+    overview -> Compare the Metadata file from the previous version with the new one.
+    compare_field -> Compare a specific metadata field between two metadata files.

   Use: gtdb_migration_tk -h for command specific help.
@@ -89,7 +90,7 @@ if __name__ == '__main__':
         lpsn_parser_lpsn_wf = lpsn_subparser.add_parser('lpsn_wf',
                                                         add_help=False,
                                                         formatter_class=CustomHelpFormatter,
-                                                        help='Full Pipeline Pull HTML -> Parse HTML')
+                                                        help='Full Pipeline Pull HTML and Parse HTML')
         lpsn_parser_lpsn_wf.add_argument(
             'output_dir', help='Output directory.')
         lpsn_parser_lpsn_wf.add_argument(
@@ -110,7 +111,7 @@ if __name__ == '__main__':
             help='Parse HTML files.')

         lpsn_parser_parse_html.add_argument(
-            'input directory', help='Directory containing all genus HTML files.')
+            'input_dir', help='Directory containing all genus HTML files.')
         lpsn_parser_parse_html.add_argument(
             'output_dir', help='Output directory.')
         lpsn_parser_parse_html.add_argument(
@@ -189,12 +190,30 @@ if __name__ == '__main__':

         overview_parser = subparsers.add_parser('overview',
                                                 formatter_class=CustomHelpFormatter,
-                                                description='Compare the Metadata file from the previous version with the new one.')
+                                                help='Compare the Metadata file from the previous version with the new one.')
         overview_parser.add_argument(
-            '--previous_metadata_file', help='file indicating metadata of each genome in previous GTDB version.')
+            '--previous_metadata_file', help='file indicating metadata of each genome in previous GTDB version.', required=True)
         overview_parser.add_argument(
-            '--new_metadata_file', help='file indicating metadata of each genome in latest GTDB version.')
+            '--new_metadata_file', help='file indicating metadata of each genome in latest GTDB version.', required=True)
         overview_parser.add_argument(
+            '--only_ncbi', help='only compare NCBI genomes (skip user genomes with U_ identifiers).', action='store_true')
+        overview_parser.add_argument(
+            '--silent', help="suppress output", action='store_true')
+
+        metafield_parser = subparsers.add_parser('compare_field',
+                                                 formatter_class=CustomHelpFormatter,
+                                                 help='Compare a specific metadata field between two metadata files.')
+        metafield_parser.add_argument(
+            '--previous_metadata_file', help='file indicating metadata of each genome in previous GTDB version.', required=True)
+        metafield_parser.add_argument(
+            '--new_metadata_file', help='file indicating metadata of each genome in latest GTDB version.', required=True)
+        metafield_parser.add_argument(
+            '--field_of_interest', help='common field to compare between files.', required=True)
+        metafield_parser.add_argument(
+            '--output_file', help='Output file.', required=True)
+        metafield_parser.add_argument(
+            '--only_ncbi', help='only compare NCBI genomes (skip user genomes with U_ identifiers).', action='store_true')
+        metafield_parser.add_argument(
             '--silent', help="suppress output", action='store_true')

         # get and check options
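For context on how these subparsers are reached: main.py dispatches on options.subparser_name, so registering compare_field is all the CLI wiring the new command needs. A minimal, self-contained sketch of that dispatch pattern, assuming the real CLI creates its subparsers with dest='subparser_name'; the file and field names in the example invocation are illustrative only:

```python
import argparse

# Sketch of the subcommand dispatch used by gtdb_migration_tk; assumes
# the real CLI creates its subparsers with dest='subparser_name'.
parser = argparse.ArgumentParser(prog='gtdb_migration_tk')
subparsers = parser.add_subparsers(dest='subparser_name')

metafield_parser = subparsers.add_parser('compare_field')
metafield_parser.add_argument('--previous_metadata_file', required=True)
metafield_parser.add_argument('--new_metadata_file', required=True)
metafield_parser.add_argument('--field_of_interest', required=True)
metafield_parser.add_argument('--output_file', required=True)
metafield_parser.add_argument('--only_ncbi', action='store_true')

# Example invocation (file and field names are hypothetical).
options = parser.parse_args(['compare_field',
                             '--previous_metadata_file', 'metadata_old.tsv',
                             '--new_metadata_file', 'metadata_new.tsv',
                             '--field_of_interest', 'checkm_completeness',
                             '--output_file', 'field_diff.tsv'])
print(options.subparser_name)  # -> 'compare_field', routed in main.py
```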
diff --git a/gtdb_migration_tk/bacdive.py b/gtdb_migration_tk/bacdive.py
index 3573dea..094dfb9 100644
--- a/gtdb_migration_tk/bacdive.py
+++ b/gtdb_migration_tk/bacdive.py
@@ -22,6 +22,7 @@
 import io
 import re
 import logging
+import time

 from requests.auth import HTTPBasicAuth
 from unidecode import unidecode
@@ -52,8 +53,21 @@ def getGenera(self, outfile, urlreq=None):
             response = requests.get(
                 'https://bacdive.dsmz.de/api/pnu/genus/', headers=self.headers, auth=self.credentials)
         else:
-            response = requests.get(
-                urlreq, headers=self.headers, auth=self.credentials)
+            while True:
+                try:
+                    print(urlreq)
+                    response = requests.get(
+                        urlreq, headers=self.headers, auth=self.credentials)
+                except requests.exceptions.ConnectionError:
+                    print('Max retries for {}'.format(urlreq))
+                    time.sleep(10)
+                    continue
+                except Exception as e:
+                    print(e)
+                    print('Max retries for {}'.format(urlreq))
+                    time.sleep(10)
+                    continue
+                break

         if response.status_code == 200:
             results = response.json()
@@ -87,8 +101,21 @@ def getSpecies(self, outfile_species, outfile_strains, dictgenus, urlreq=None):
             response = requests.get(
                 'https://bacdive.dsmz.de/api/pnu/species/', headers=self.headers, auth=self.credentials)
         else:
-            response = requests.get(
-                urlreq, headers=self.headers, auth=self.credentials)
+            while True:
+                try:
+                    print(urlreq)
+                    response = requests.get(
+                        urlreq, headers=self.headers, auth=self.credentials)
+                except requests.exceptions.ConnectionError:
+                    print('Max retries for {}'.format(urlreq))
+                    time.sleep(10)
+                    continue
+                except Exception as e:
+                    print(e)
+                    print('Max retries for {}'.format(urlreq))
+                    time.sleep(10)
+                    continue
+                break

         if response.status_code == 200:
             results = response.json()
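The retry loops added to bacdive.py retry indefinitely: any failure sleeps ten seconds and tries again. If bounded retries were preferred, a sketch along these lines would cap the attempts; get_with_retries, max_retries and delay are hypothetical names, not part of this patch:

```python
import time
import requests

def get_with_retries(url, headers=None, auth=None, max_retries=5, delay=10):
    """Bounded-retry GET. A sketch only: it caps attempts so a
    permanently failing URL cannot spin forever, unlike a bare
    while True retry loop."""
    for attempt in range(1, max_retries + 1):
        try:
            return requests.get(url, headers=headers, auth=auth)
        except requests.exceptions.RequestException as e:
            print('Attempt {}/{} failed for {}: {}'.format(
                attempt, max_retries, url, e))
            time.sleep(delay)
    raise RuntimeError('Giving up on {} after {} attempts'.format(
        url, max_retries))
```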
diff --git a/gtdb_migration_tk/lpsn.py b/gtdb_migration_tk/lpsn.py
index ffdd5a1..d38b0c5 100644
--- a/gtdb_migration_tk/lpsn.py
+++ b/gtdb_migration_tk/lpsn.py
@@ -148,13 +148,13 @@ def download_lpsn_html(self):
         # Download pages listing all genus in LPSN
         print('Beginning file download lpsn ...')
-        url = 'http://www.bacterio.net/-ac.html'
+        url = 'http://www.bacterio.net/archive/-ac.html'
         urllib.request.urlretrieve(url, os.path.join(self.outdir, 'ac.html'))
-        url = 'http://www.bacterio.net/-dl.html'
+        url = 'http://www.bacterio.net/archive/-dl.html'
         urllib.request.urlretrieve(url, os.path.join(self.outdir, 'dl.html'))
-        url = 'http://www.bacterio.net/-mr.html'
+        url = 'http://www.bacterio.net/archive/-mr.html'
         urllib.request.urlretrieve(url, os.path.join(self.outdir, 'mr.html'))
-        url = 'http://www.bacterio.net/-sz.html'
+        url = 'http://www.bacterio.net/archive/-sz.html'
         urllib.request.urlretrieve(url, os.path.join(self.outdir, 'sz.html'))

         # Parse html pages lising all genus
@@ -187,9 +187,9 @@ def download_lpsn_html(self):
             for line in gsl:
                 genus = line.strip()
                 try:
-                    print(os.path.join('http://www.bacterio.net', genus))
+                    print(os.path.join('http://www.bacterio.net/archive/', genus))
                     urllib.request.urlretrieve(os.path.join(
-                        'http://www.bacterio.net', genus), os.path.join(self.outdir, 'genus_html', genus))
+                        'http://www.bacterio.net/archive/', genus), os.path.join(self.outdir, 'genus_html', genus))
                 except:
                     failed_html_file.write('{}\n'.format(genus))
             failed_html_file.close()
diff --git a/gtdb_migration_tk/main.py b/gtdb_migration_tk/main.py
index e629934..0a97ef8 100644
--- a/gtdb_migration_tk/main.py
+++ b/gtdb_migration_tk/main.py
@@ -46,7 +46,7 @@ def pull_html(self, options):
         """Pull all genus.html files."""
         make_sure_path_exists(options.output_dir)
         p = LPSN(options.output_dir)
-        p.pull_html()
+        p.download_lpsn_html()

     def parse_html(self, options):
         """Parse all html files."""
@@ -80,7 +80,15 @@ def generate_type_table(self, options):

     def compare_metadata(self, options):
         p = Tools()
         p.compare_metadata(options.previous_metadata_file,
-                           options.new_metadata_file)
+                           options.new_metadata_file,
+                           options.only_ncbi)
+
+    def compare_selected_data(self, options):
+        p = Tools()
+        p.compare_selected_data(options.previous_metadata_file,
+                                options.new_metadata_file,
+                                options.field_of_interest,
+                                options.output_file, options.only_ncbi)

     def parse_options(self, options):
         """Parse user options and call the correct pipeline(s)"""
@@ -104,6 +112,8 @@ def parse_options(self, options):
             self.generate_type_table(options)
         elif options.subparser_name == 'overview':
             self.compare_metadata(options)
+        elif options.subparser_name == 'compare_field':
+            self.compare_selected_data(options)
         else:
             self.logger.error('Unknown command: ' +
                               options.subparser_name + '\n')
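A side note on the URL handling in lpsn.py above: os.path.join happens to build valid URLs on POSIX systems but would insert backslashes on Windows. urllib.parse.urljoin is the portable standard-library alternative; the genus file name below is hypothetical:

```python
from urllib.parse import urljoin

# urljoin keeps forward slashes on every platform, unlike os.path.join.
base = 'http://www.bacterio.net/archive/'
print(urljoin(base, 'abiotrophia.html'))
# -> http://www.bacterio.net/archive/abiotrophia.html
```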
diff --git a/gtdb_migration_tk/tools.py b/gtdb_migration_tk/tools.py
index bd7a536..6c63d34 100644
--- a/gtdb_migration_tk/tools.py
+++ b/gtdb_migration_tk/tools.py
@@ -48,44 +48,38 @@ def __init__(self):
         """Initialization."""
         self.logger = logging.getLogger()

-    def compare_metadata(self, old_meta_file, new_meta_file):
-
-        old_delimiter = None
+    def select_delimiter(self, metafile):
         # Parse TSV or CSV file
-        for line in open(old_meta_file):
+        for line in open(metafile):
             if len(line.split('\t')) >= len(line.split(',')):
-                old_delimiter = '\t'
-                break
+                return '\t'
             else:
-                old_delimiter = ','
-                break
+                return ','
+
+    def compare_metadata(self, old_meta_file, new_meta_file, only_ncbi=False):
+
+        old_delimiter = self.select_delimiter(old_meta_file)

         old_nested_dict = {}
         with open(old_meta_file, 'r') as omf:
             old_headers = omf.readline().split(old_delimiter)
             if old_delimiter == ',':
                 for line in csv.reader(omf):
-                    old_nested_dict[line[0]] = {}
-                    for i, j in enumerate(line):
-                        old_nested_dict[line[0]][old_headers[i]] = str(j)
+                    if (only_ncbi and not line[0].startswith('U_')) or not only_ncbi:
+                        old_nested_dict[line[0]] = {}
+                        for i, j in enumerate(line):
+                            old_nested_dict[line[0]][old_headers[i]] = str(j)
             else:
                 for raw_line in omf:
                     line = raw_line.strip('\n').split('\t')
-                    old_nested_dict[line[0]] = {}
-                    for i, j in enumerate(line):
-                        old_nested_dict[line[0]][old_headers[i]] = str(j)
+                    if (only_ncbi and not line[0].startswith('U_')) or not only_ncbi:
+                        old_nested_dict[line[0]] = {}
+                        for i, j in enumerate(line):
+                            old_nested_dict[line[0]][old_headers[i]] = str(j)

         self.logger.info('{} parsed'.format(old_meta_file))

-        new_delimiter = None
-        # Parse TSV or CSV file
-        for line in open(old_meta_file):
-            if len(line.split('\t')) >= len(line.split(',')):
-                new_delimiter = '\t'
-                break
-            else:
-                new_delimiter = ','
-                break
+        new_delimiter = self.select_delimiter(new_meta_file)

         header_summary = {}
         new_nested_dict = {}
@@ -93,11 +87,27 @@ def compare_metadata(self, old_meta_file, new_meta_file):
         # we check if the genome id exists, and the columns names exist
         # for each common column name we compare the value for each common
         # genomes and add 1 if they are different
+        number_of_genomes = 0
         with open(new_meta_file, 'r') as nmf:
             new_headers = nmf.readline().split(new_delimiter)
             if new_delimiter == ',':
                 for line in csv.reader(nmf):
+                    if line[0] in old_nested_dict:
+                        number_of_genomes += 1
+                        for i, j in enumerate(line):
+                            if new_headers[i] in old_headers:
+                                if str(j) != old_nested_dict.get(line[0]).get(new_headers[i]):
+                                    header_summary.setdefault(
+                                        new_headers[i], []).append(1)
+                                else:
+                                    header_summary.setdefault(
+                                        new_headers[i], []).append(0)
+            else:
+                for raw_line in nmf:
+                    line = raw_line.strip('\n').split('\t')
+                    if line[0] in old_nested_dict:
+                        number_of_genomes += 1
                         for i, j in enumerate(line):
                             if new_headers[i] in old_headers:
                                 if str(j) != old_nested_dict.get(line[0]).get(new_headers[i]):
@@ -124,6 +134,8 @@ def compare_metadata(self, old_meta_file, new_meta_file):
             set(new_headers)
         new_columns = set(new_headers) - set(old_headers)

+        print("Based on {} common genomes.".format(number_of_genomes))
+
         print("Deprecated columns:")
         for removed_column in removed_columns:
             print("\t- {}".format(removed_column))
@@ -131,3 +143,55 @@
         print("New columns:")
         for new_column in new_columns:
             print("\t- {}".format(new_column))
+
+    def compare_selected_data(self, old_meta_file, new_meta_file, metafield, output_file, only_ncbi=False):
+        old_delimiter = self.select_delimiter(old_meta_file)
+        old_nested_dict = {}
+        with open(old_meta_file, 'r') as omf:
+            old_headers = omf.readline().split(old_delimiter)
+            if metafield not in old_headers:
+                self.logger.error(f'{metafield} is not in {old_meta_file}')
+                sys.exit()
+
+            if old_delimiter == ',':
+                for line in csv.reader(omf):
+                    if (only_ncbi and not line[0].startswith('U_')) or not only_ncbi:
+                        old_nested_dict[line[0]] = str(
+                            line[old_headers.index(metafield)])
+            else:
+                for raw_line in omf:
+                    line = raw_line.strip('\n').split('\t')
+                    if (only_ncbi and not line[0].startswith('U_')) or not only_ncbi:
+                        old_nested_dict[line[0]] = str(
+                            line[old_headers.index(metafield)])
+
+        new_delimiter = self.select_delimiter(new_meta_file)
+        new_nested_dict = {}
+        with open(new_meta_file, 'r') as nmf:
+            new_headers = nmf.readline().split(new_delimiter)
+            if metafield not in new_headers:
+                self.logger.error(f'{metafield} is not in {new_meta_file}')
+                sys.exit()
+            if new_delimiter == ',':
+                for line in csv.reader(nmf):
+                    if line[0] in old_nested_dict:
+                        new_nested_dict[line[0]] = str(
+                            line[new_headers.index(metafield)])
+            else:
+                for raw_line in nmf:
+                    line = raw_line.strip('\n').split('\t')
+                    if line[0] in old_nested_dict:
+                        new_nested_dict[line[0]] = str(
+                            line[new_headers.index(metafield)])
+
+        outf = open(output_file, 'w')
+        outf.write('genome_id\told_value\tnew_value\tsimilarity\n')
+        for k, v in old_nested_dict.items():
+            similarity = 'Identical'
+            if v != new_nested_dict.get(k):
+                similarity = "Different"
+            outf.write('{}\n'.format(
+                '\t'.join([k, str(v), str(new_nested_dict.get(k)), similarity])))
+        outf.close()
+
+        self.logger.info('Comparison written to {}'.format(output_file))
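select_delimiter infers the separator by splitting the first line on tab and on comma and keeping whichever yields more fields. The standard library's csv.Sniffer offers an equivalent; a sketch of that variant, not used by the toolkit:

```python
import csv

def select_delimiter_sniffed(metafile):
    # Infer '\t' vs ',' from the header line using csv.Sniffer,
    # restricted to the two delimiters the metadata files use.
    with open(metafile, newline='') as f:
        sample = f.readline()
    return csv.Sniffer().sniff(sample, delimiters='\t,').delimiter
```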