From 5bd8a2b7bce9d651e4233650cced8ce577a06539 Mon Sep 17 00:00:00 2001
From: smlmbrt <sam.a.lambert@gmail.com>
Date: Fri, 6 Jan 2023 17:54:47 +0000
Subject: [PATCH] Revert "Allow matches to be filtered to a set of variant IDs
 (e.g. ones that are shared across datasets)"

This reverts commit 8f4f1b9b2c7d60d20dfb58351daba29caafc930e.
---
 pgscatalog_utils/match/combine_matches.py |  3 ---
 pgscatalog_utils/match/label.py           | 28 ++---------------------
 pgscatalog_utils/match/log.py             |  8 +++----
 pgscatalog_utils/match/match_variants.py  |  3 ---
 4 files changed, 6 insertions(+), 36 deletions(-)
diff --git a/pgscatalog_utils/match/combine_matches.py b/pgscatalog_utils/match/combine_matches.py
index b533936..0e4ea17 100644
--- a/pgscatalog_utils/match/combine_matches.py
+++ b/pgscatalog_utils/match/combine_matches.py
@@ -58,9 +58,6 @@ def _parse_args(args=None):
                         help='<Required> List of match files')
     parser.add_argument('--min_overlap', dest='min_overlap', required=True,
                         type=float, help='<Required> Minimum proportion of variants to match before error')
-    parser.add_argument('-IDs', '--filter_IDs', dest='filter',
-                        help='<Optional> Path to file containing list of variant IDs that can be included in the final scorefile.'
-                             '[useful for limiting scoring files to variants present in multiple datasets]')
     parser = add_match_args(parser) # params for labelling matches
     parser.add_argument('--outdir', dest='outdir', required=True,
                         help='<Required> Output directory')
diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py
index 6183373..d1c1417 100644
--- a/pgscatalog_utils/match/label.py
+++ b/pgscatalog_utils/match/label.py
@@ -9,17 +9,10 @@
 
 def make_params_dict(args) -> dict[str, bool]:
     """ Make a dictionary with parameters that control labelling match candidates """
-    filter_IDs = []
-    if args.filter:
-        logger.debug("Reading filter file (variant IDs)")
-        with open(args.filter, 'r') as f:
-            filter_IDs = [line.strip() for line in f]
-
     return {'keep_first_match': args.keep_first_match,
             'remove_ambiguous': args.remove_ambiguous,
             'skip_flip': args.skip_flip,
-            'remove_multiallelic': args.remove_multiallelic,
-            'filter_IDs': filter_IDs}
+            'remove_multiallelic': args.remove_multiallelic}
 
 
 def label_matches(df: pl.LazyFrame, params: dict[str, bool]) -> pl.LazyFrame:
@@ -30,7 +23,7 @@ def label_matches(df: pl.LazyFrame, params: dict[str, bool]) -> pl.LazyFrame:
     - duplicate: True if more than one best match exists for the same accession and ID
     - ambiguous: True if ambiguous
     """
-    assert set(params.keys()) == {'keep_first_match', 'remove_ambiguous', 'remove_multiallelic', 'skip_flip', 'filter_IDs'}
+    assert set(params.keys()) == {'keep_first_match', 'remove_ambiguous', 'remove_multiallelic', 'skip_flip'}
     labelled = (df.with_column(pl.lit(False).alias('exclude'))  # set up dummy exclude column for _label_*
                 .pipe(_label_best_match)
                 .pipe(_label_duplicate_best_match)
@@ -38,7 +31,6 @@ def label_matches(df: pl.LazyFrame, params: dict[str, bool]) -> pl.LazyFrame:
                 .pipe(_label_biallelic_ambiguous, params['remove_ambiguous'])
                 .pipe(_label_multiallelic, params['remove_multiallelic'])
                 .pipe(_label_flips, params['skip_flip'])
-                .pipe(_label_filter, params['filter_IDs'])
                 .with_column(pl.lit(True).alias('match_candidate')))
 
     return _encode_match_priority(labelled)
@@ -223,19 +215,3 @@ def _label_flips(df: pl.LazyFrame, skip_flip: bool) -> pl.LazyFrame:
     else:
         logger.debug("Not excluding flipped matches")
         return df
-
-
-def _label_filter(df: pl.LazyFrame, filter_IDs: list) -> pl.LazyFrame:
-    nIDs = len(filter_IDs)
-    if nIDs > 0:
-        logger.debug("Excluding variants that are not in ID list (read {} IDs)".format(nIDs))
-        df = df.with_column(pl.when(pl.col('ID').is_in(filter_IDs))
-                            .then(pl.lit(True))
-                            .otherwise(pl.lit(False))
-                            .alias('match_IDs'))
-        return df.with_column(pl.when(pl.col('match_IDs') == False)
-                              .then(True)
-                              .otherwise(pl.col('exclude'))
-                              .alias('exclude'))
-    else:
-        return df.with_column((pl.lit('NA')).alias('match_IDs'))
\ No newline at end of file
diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py
index 1ab6764..a456d07 100644
--- a/pgscatalog_utils/match/log.py
+++ b/pgscatalog_utils/match/log.py
@@ -22,8 +22,8 @@ def make_summary_log(match_candidates: pl.LazyFrame, scorefile: pl.LazyFrame, fi
             .select(pl.exclude("^.*_right$"))
             .with_columns([pl.col('match_status').fill_null(value='unmatched'),
                            pl.lit(dataset).alias('dataset')])  # fill in unmatched variants
-            .groupby(['dataset', 'accession', 'match_IDs', 'ambiguous', 'is_multiallelic', 'match_flipped',
-                      'duplicate_best_match', 'duplicate_ID', 'match_status'])
+            .groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'match_flipped',
+                      'duplicate_best_match', 'duplicate_ID'])
             .agg(pl.count())
             .join(filter_summary, how='left', on='accession')
             .pipe(_prettify_summary))
@@ -45,7 +45,7 @@ def check_log_count(scorefile: pl.LazyFrame, summary_log: pl.LazyFrame) -> None:
 
 def _prettify_summary(df: pl.LazyFrame) -> pl.LazyFrame:
     keep_cols = ["dataset", "accession", "score_pass", "match_status", "ambiguous", "is_multiallelic",
-                 "duplicate_best_match", "duplicate_ID", "match_IDs", "count", "percent"]
+                 "duplicate_best_match", "duplicate_ID", "count", "percent"]
     return (df.with_column((pl.col("count") / pl.sum("count") * 100)
                            .over(["dataset", "accession"])
                            .alias("percent"))
@@ -55,7 +55,7 @@ def _prettify_summary(df: pl.LazyFrame) -> pl.LazyFrame:
 def _prettify_log(df: pl.LazyFrame) -> pl.LazyFrame:
     keep_cols = ["row_nr", "accession", "chr_name", "chr_position", "effect_allele", "other_allele", "effect_weight",
                  "effect_type", "ID", "REF", "ALT", "matched_effect_allele", "match_type", "is_multiallelic",
-                 "ambiguous", "match_flipped", "best_match", "exclude", "duplicate_best_match", "duplicate_ID", "match_IDs",
+                 "ambiguous", "match_flipped", "best_match", "exclude", "duplicate_best_match", "duplicate_ID",
                  "match_status", "dataset"]
     pretty_df = (df.select(keep_cols)
                  .select(pl.exclude("^.*_right"))
diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py
index e5b43ee..8e0b446 100644
--- a/pgscatalog_utils/match/match_variants.py
+++ b/pgscatalog_utils/match/match_variants.py
@@ -210,9 +210,6 @@ def _parse_args(args=None):
                         help="<Optional> Only match, then write intermediate files, don't make scoring files")
     parser.add_argument('--min_overlap', dest='min_overlap', required=False,
                         type=float, help='<Optional> Minimum proportion of variants to match before error')
-    parser.add_argument('-IDs', '--filter_IDs', dest='filter',
-                        help='<Optional> Path to file containing list of variant IDs that can be included in the final scorefile.'
-                             '[useful for limiting scoring files to variants present in multiple datasets]')
     parser = add_match_args(parser) # params for labelling matches
     parser.add_argument('--outdir', dest='outdir', required=True,
                         help='<Required> Output directory')