improve comparison between metadata_files.
pchaumeil committed Feb 20, 2020
1 parent e4cc4e7 commit cdb325a
Showing 5 changed files with 160 additions and 42 deletions.
33 changes: 26 additions & 7 deletions bin/gtdb_migration_tk
@@ -57,12 +57,13 @@ def print_help():
Information from Nomenclatural resources:
lpsn -> Process steps for LPSN. [In Dev]
lpsn -> Process steps for LPSN.
bacdive -> Process steps for BacDive. [In Dev]
strains -> Set of tools to combine information from LPSN, DSMZ and Straininfo.
Test suite for data validation:
overview -> Compare the Metadata file from the previous version with the new one.
overview -> Compare the Metadata file from the previous version with the new one.
compare_field -> Compare a specific metadata field between two metadata files.
Use: gtdb_migration_tk <command> -h for command specific help.
@@ -89,7 +90,7 @@ if __name__ == '__main__':
lpsn_parser_lpsn_wf = lpsn_subparser.add_parser('lpsn_wf',
add_help=False,
formatter_class=CustomHelpFormatter,
help='Full Pipeline Pull HTML -> Parse HTML')
help='Full Pipeline Pull HTML and Parse HTML')
lpsn_parser_lpsn_wf.add_argument(
'output_dir', help='Output directory.')
lpsn_parser_lpsn_wf.add_argument(
@@ -110,7 +111,7 @@ if __name__ == '__main__':
help='Parse HTML files.')

lpsn_parser_parse_html.add_argument(
'input directory', help='Directory containing all genus HTML files.')
'input_dir', help='Directory containing all genus HTML files.')
lpsn_parser_parse_html.add_argument(
'output_dir', help='Output directory.')
lpsn_parser_parse_html.add_argument(
@@ -189,12 +190,30 @@ if __name__ == '__main__':

overview_parser = subparsers.add_parser('overview',
formatter_class=CustomHelpFormatter,
description='Compare the Metadata file from the previous version with the new one.')
help='Compare the Metadata file from the previous version with the new one.')
overview_parser.add_argument(
'--previous_metadata_file', help='file indicating metadata of each genome in previous GTDB version.')
'--previous_metadata_file', help='file indicating metadata of each genome in previous GTDB version.',required=True)
overview_parser.add_argument(
'--new_metadata_file', help='file indicating metadata of each genome in latest GTDB version.')
'--new_metadata_file', help='file indicating metadata of each genome in latest GTDB version.',required=True)
overview_parser.add_argument(
'--only_ncbi', help='Only compare genomes with NCBI accessions (skip user genomes).', action='store_true')
overview_parser.add_argument(
'--silent', help="suppress output", action='store_true')

metafield_parser = subparsers.add_parser('compare_field',
formatter_class=CustomHelpFormatter,
help='Compare a specific metadata field between two metadata files.')
metafield_parser.add_argument(
'--previous_metadata_file', help='file indicating metadata of each genome in previous GTDB version.',required=True)
metafield_parser.add_argument(
'--new_metadata_file', help='file indicating metadata of each genome in latest GTDB version.',required=True)
metafield_parser.add_argument(
'--field_of_interest', help='common field to compare between files.',required=True)
metafield_parser.add_argument(
'--output_file', help='Output file.',required=True)
metafield_parser.add_argument(
'--only_ncbi', help='Only compare genomes with NCBI accessions (skip user genomes).', action='store_true')
metafield_parser.add_argument(
'--silent', help="suppress output", action='store_true')

# get and check options
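For reference, an invocation of the new compare_field subcommand, based on the argument definitions above, would look something like this (the metadata file names and the field name are illustrative):

    gtdb_migration_tk compare_field \
        --previous_metadata_file gtdb_metadata_prev.tsv \
        --new_metadata_file gtdb_metadata_new.tsv \
        --field_of_interest checkm_completeness \
        --output_file field_comparison.tsv \
        --only_ncbi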
35 changes: 31 additions & 4 deletions gtdb_migration_tk/bacdive.py
@@ -22,6 +22,7 @@
import io
import re
import logging
import time

from requests.auth import HTTPBasicAuth
from unidecode import unidecode
@@ -52,8 +53,21 @@ def getGenera(self, outfile, urlreq=None):
response = requests.get(
'https://bacdive.dsmz.de/api/pnu/genus/', headers=self.headers, auth=self.credentials)
else:
response = requests.get(
urlreq, headers=self.headers, auth=self.credentials)
while True:
    try:
        print(urlreq)
        response = requests.get(
            urlreq, headers=self.headers, auth=self.credentials)
    except requests.exceptions.ConnectionError:
        print('Connection error for {}, retrying in 10s.'.format(urlreq))
        time.sleep(10)
        continue
    except Exception as e:
        print(e)
        print('Request failed for {}, retrying in 10s.'.format(urlreq))
        time.sleep(10)
        continue
    break

if response.status_code == 200:
results = response.json()
@@ -87,8 +101,21 @@ def getSpecies(self, outfile_species, outfile_strains, dictgenus, urlreq=None):
response = requests.get(
'https://bacdive.dsmz.de/api/pnu/species/', headers=self.headers, auth=self.credentials)
else:
response = requests.get(
urlreq, headers=self.headers, auth=self.credentials)
while True:
    try:
        print(urlreq)
        response = requests.get(
            urlreq, headers=self.headers, auth=self.credentials)
    except requests.exceptions.ConnectionError:
        print('Connection error for {}, retrying in 10s.'.format(urlreq))
        time.sleep(10)
        continue
    except Exception as e:
        print(e)
        print('Request failed for {}, retrying in 10s.'.format(urlreq))
        time.sleep(10)
        continue
    break

if response.status_code == 200:
results = response.json()
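The retry loop is duplicated verbatim in getGenera and getSpecies, so a small helper could factor it out. A minimal sketch, assuming the same headers and credentials attributes as above (like the original, it retries indefinitely):

    import time
    import requests

    def get_with_retry(url, headers, credentials, delay=10):
        # Retry a GET until it returns a response; sleep between attempts.
        while True:
            try:
                return requests.get(url, headers=headers, auth=credentials)
            except requests.exceptions.ConnectionError:
                print('Connection error for {}, retrying in {}s.'.format(url, delay))
            except Exception as e:
                print('{}; retrying {} in {}s.'.format(e, url, delay))
            time.sleep(delay)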
12 changes: 6 additions & 6 deletions gtdb_migration_tk/lpsn.py
@@ -148,13 +148,13 @@ def download_lpsn_html(self):

# Download pages listing all genus in LPSN
print('Beginning file download lpsn ...')
url = 'http://www.bacterio.net/-ac.html'
url = 'http://www.bacterio.net/archive/-ac.html'
urllib.request.urlretrieve(url, os.path.join(self.outdir, 'ac.html'))
url = 'http://www.bacterio.net/-dl.html'
url = 'http://www.bacterio.net/archive/-dl.html'
urllib.request.urlretrieve(url, os.path.join(self.outdir, 'dl.html'))
url = 'http://www.bacterio.net/-mr.html'
url = 'http://www.bacterio.net/archive/-mr.html'
urllib.request.urlretrieve(url, os.path.join(self.outdir, 'mr.html'))
url = 'http://www.bacterio.net/-sz.html'
url = 'http://www.bacterio.net/archive/-sz.html'
urllib.request.urlretrieve(url, os.path.join(self.outdir, 'sz.html'))

# Parse html pages lising all genus
@@ -187,9 +187,9 @@ def download_lpsn_html(self):
for line in gsl:
genus = line.strip()
try:
print(os.path.join('http://www.bacterio.net', genus))
print(os.path.join('http://www.bacterio.net/archive/', genus))
urllib.request.urlretrieve(os.path.join(
'http://www.bacterio.net', genus), os.path.join(self.outdir, 'genus_html', genus))
'http://www.bacterio.net/archive/', genus), os.path.join(self.outdir, 'genus_html', genus))
except Exception:
failed_html_file.write('{}\n'.format(genus))
failed_html_file.close()
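One caveat in the genus download loop: os.path.join is used to build URLs, which happens to work on POSIX systems but would emit backslashes on Windows. urllib.parse.urljoin is the portable alternative; a small sketch (the genus file name is illustrative):

    from urllib.parse import urljoin

    base = 'http://www.bacterio.net/archive/'
    genus = 'abiotrophia.html'  # illustrative; read from the genus list in practice
    url = urljoin(base, genus)  # http://www.bacterio.net/archive/abiotrophia.html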
14 changes: 12 additions & 2 deletions gtdb_migration_tk/main.py
@@ -46,7 +46,7 @@ def pull_html(self, options):
"""Pull all genus.html files."""
make_sure_path_exists(options.output_dir)
p = LPSN(options.output_dir)
p.pull_html()
p.download_lpsn_html()

def parse_html(self, options):
"""Parse all html files."""
@@ -80,7 +80,15 @@ def generate_type_table(self, options):
def compare_metadata(self, options):
p = Tools()
p.compare_metadata(options.previous_metadata_file,
options.new_metadata_file)
options.new_metadata_file,
options.only_ncbi)

def compare_selected_data(self, options):
p = Tools()
p.compare_selected_data(options.previous_metadata_file,
options.new_metadata_file,
options.field_of_interest,
options.output_file, options.only_ncbi)

def parse_options(self, options):
"""Parse user options and call the correct pipeline(s)"""
@@ -104,6 +112,8 @@ def parse_options(self, options):
self.generate_type_table(options)
elif options.subparser_name == 'overview':
self.compare_metadata(options)
elif options.subparser_name == 'compare_field':
self.compare_selected_data(options)
else:
self.logger.error('Unknown command: ' +
options.subparser_name + '\n')
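As subcommands accumulate, the if/elif chain in parse_options grows with each one; a dictionary dispatch is a common alternative. A sketch only, not the project's actual code:

    def parse_options(self, options):
        handlers = {
            'overview': self.compare_metadata,
            'compare_field': self.compare_selected_data,
            # ... remaining subcommands registered the same way
        }
        handler = handlers.get(options.subparser_name)
        if handler is None:
            self.logger.error('Unknown command: ' + options.subparser_name + '\n')
        else:
            handler(options)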
108 changes: 85 additions & 23 deletions gtdb_migration_tk/tools.py
@@ -48,56 +48,65 @@ def __init__(self):
"""Initialization."""
self.logger = logging.getLogger()

def compare_metadata(self, old_meta_file, new_meta_file):

old_delimiter = None
def select_delimiter(self, metafile):
# Parse TSV or CSV file
for line in open(old_meta_file):
for line in open(metafile):
if len(line.split('\t')) >= len(line.split(',')):
old_delimiter = '\t'
break
return '\t'
else:
old_delimiter = ','
break
return ','

def compare_metadata(self, old_meta_file, new_meta_file, only_ncbi=False):

old_delimiter = self.select_delimiter(old_meta_file)

old_nested_dict = {}
with open(old_meta_file, 'r') as omf:
old_headers = omf.readline().split(old_delimiter)
if old_delimiter == ',':
for line in csv.reader(omf):
old_nested_dict[line[0]] = {}
for i, j in enumerate(line):
old_nested_dict[line[0]][old_headers[i]] = str(j)
if (only_ncbi and not line[0].startswith('U_')) or not only_ncbi:
old_nested_dict[line[0]] = {}
for i, j in enumerate(line):
old_nested_dict[line[0]][old_headers[i]] = str(j)
else:
for raw_line in omf:
line = raw_line.strip('\n').split('\t')
old_nested_dict[line[0]] = {}
for i, j in enumerate(line):
old_nested_dict[line[0]][old_headers[i]] = str(j)
if (only_ncbi and not line[0].startswith('U_')) or not only_ncbi:
old_nested_dict[line[0]] = {}
for i, j in enumerate(line):
old_nested_dict[line[0]][old_headers[i]] = str(j)

self.logger.info('{} parsed'.format(old_meta_file))

new_delimiter = None
# Parse TSV or CSV file
for line in open(old_meta_file):
if len(line.split('\t')) >= len(line.split(',')):
new_delimiter = '\t'
break
else:
new_delimiter = ','
break
new_delimiter = self.select_delimiter(new_meta_file)

header_summary = {}
new_nested_dict = {}
# In the new metadata file
# we check that the genome id and the column names exist.
# For each common column name we compare the value for each common
# genome and add 1 if they differ.
number_of_genomes = 0
with open(new_meta_file, 'r') as nmf:
new_headers = nmf.readline().split(new_delimiter)
if new_delimiter == ',':
for line in csv.reader(nmf):

if line[0] in old_nested_dict:
number_of_genomes += 1
for i, j in enumerate(line):
if new_headers[i] in old_headers:
if str(j) != old_nested_dict.get(line[0]).get(new_headers[i]):
header_summary.setdefault(
new_headers[i], []).append(1)
else:
header_summary.setdefault(
new_headers[i], []).append(0)
for raw_line in nmf:
line = raw_line.strip('\n').split('\t')
if line[0] in old_nested_dict:
number_of_genomes += 1
for i, j in enumerate(line):
if new_headers[i] in old_headers:
if str(j) != old_nested_dict.get(line[0]).get(new_headers[i]):
@@ -124,10 +133,63 @@ def compare_metadata(self, old_meta_file, new_meta_file):
set(new_headers)
new_columns = set(new_headers) - set(old_headers)

print("Based on {} common genomes.".format(number_of_genomes))

print("Deprecated columns:")
for removed_column in removed_columns:
print("\t- {}".format(removed_column))

print("New columns:")
for new_column in new_columns:
print("\t- {}".format(new_column))

def compare_selected_data(self, old_meta_file, new_meta_file, metafield, output_file, only_ncbi=False):
old_delimiter = self.select_delimiter(old_meta_file)
old_nested_dict = {}
with open(old_meta_file, 'r') as omf:
old_headers = omf.readline().strip('\n').split(old_delimiter)
if metafield not in old_headers:
self.logger.error(f'{metafield} is not in {old_meta_file}')
sys.exit()

if old_delimiter == ',':
for line in csv.reader(omf):
if (only_ncbi and not line[0].startswith('U_')) or not only_ncbi:
old_nested_dict[line[0]] = str(
line[old_headers.index(metafield)])
else:
for raw_line in omf:
line = raw_line.strip('\n').split('\t')
if (only_ncbi and not line[0].startswith('U_')) or not only_ncbi:
old_nested_dict[line[0]] = str(
line[old_headers.index(metafield)])

new_delimiter = self.select_delimiter(new_meta_file)
new_nested_dict = {}
with open(new_meta_file, 'r') as nmf:
new_headers = nmf.readline().strip('\n').split(new_delimiter)
if metafield not in new_headers:
self.logger.error(f'{metafield} is not in {new_meta_file}')
sys.exit()
if new_delimiter == ',':
for line in csv.reader(nmf):
if line[0] in old_nested_dict:
new_nested_dict[line[0]] = str(
line[new_headers.index(metafield)])
else:
    for raw_line in nmf:
        line = raw_line.strip('\n').split('\t')
        if line[0] in old_nested_dict:
            new_nested_dict[line[0]] = str(
                line[new_headers.index(metafield)])

outf = open(output_file, 'w')
outf.write('genome_id\told_value\tnew_value\tsimilarity\n')
for k, v in old_nested_dict.items():
    similarity = 'Identical'
    if v != new_nested_dict.get(k):
        similarity = 'Different'
    outf.write('{}\n'.format(
        '\t'.join([k, str(v), str(new_nested_dict.get(k)), similarity])))
outf.close()

self.logger.info('Comparison written to {}'.format(output_file))
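select_delimiter inspects only the header line, counting tabs against commas. The standard library's csv.Sniffer offers an equivalent guess; a sketch of a drop-in helper (behavior on ambiguous headers may differ from the hand-rolled heuristic):

    import csv

    def select_delimiter(metafile):
        # Guess the delimiter from the header line using csv.Sniffer.
        with open(metafile) as f:
            header = f.readline()
        try:
            return csv.Sniffer().sniff(header, delimiters='\t,').delimiter
        except csv.Error:
            return '\t'  # fall back to TSV, matching the original's tie-breaking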
