From 5bd8a2b7bce9d651e4233650cced8ce577a06539 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Fri, 6 Jan 2023 17:54:47 +0000 Subject: [PATCH] Revert "Allow matches to be filtered to a set of variant IDs (e.g. ones that are shared across datasets)" This reverts commit 8f4f1b9b2c7d60d20dfb58351daba29caafc930e. --- pgscatalog_utils/match/combine_matches.py | 3 --- pgscatalog_utils/match/label.py | 28 ++--------------------- pgscatalog_utils/match/log.py | 8 +++---- pgscatalog_utils/match/match_variants.py | 3 --- 4 files changed, 6 insertions(+), 36 deletions(-) diff --git a/pgscatalog_utils/match/combine_matches.py b/pgscatalog_utils/match/combine_matches.py index b533936..0e4ea17 100644 --- a/pgscatalog_utils/match/combine_matches.py +++ b/pgscatalog_utils/match/combine_matches.py @@ -58,9 +58,6 @@ def _parse_args(args=None): help=' List of match files') parser.add_argument('--min_overlap', dest='min_overlap', required=True, type=float, help=' Minimum proportion of variants to match before error') - parser.add_argument('-IDs', '--filter_IDs', dest='filter', - help=' Path to file containing list of variant IDs that can be included in the final scorefile.' - '[useful for limiting scoring files to variants present in multiple datasets]') parser = add_match_args(parser) # params for labelling matches parser.add_argument('--outdir', dest='outdir', required=True, help=' Output directory') diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index 6183373..d1c1417 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -9,17 +9,10 @@ def make_params_dict(args) -> dict[str, bool]: """ Make a dictionary with parameters that control labelling match candidates """ - filter_IDs = [] - if args.filter: - logger.debug("Reading filter file (variant IDs)") - with open(args.filter, 'r') as f: - filter_IDs = [line.strip() for line in f] - return {'keep_first_match': args.keep_first_match, 'remove_ambiguous': args.remove_ambiguous, 'skip_flip': args.skip_flip, - 'remove_multiallelic': args.remove_multiallelic, - 'filter_IDs': filter_IDs} + 'remove_multiallelic': args.remove_multiallelic} def label_matches(df: pl.LazyFrame, params: dict[str, bool]) -> pl.LazyFrame: @@ -30,7 +23,7 @@ def label_matches(df: pl.LazyFrame, params: dict[str, bool]) -> pl.LazyFrame: - duplicate: True if more than one best match exists for the same accession and ID - ambiguous: True if ambiguous """ - assert set(params.keys()) == {'keep_first_match', 'remove_ambiguous', 'remove_multiallelic', 'skip_flip', 'filter_IDs'} + assert set(params.keys()) == {'keep_first_match', 'remove_ambiguous', 'remove_multiallelic', 'skip_flip'} labelled = (df.with_column(pl.lit(False).alias('exclude')) # set up dummy exclude column for _label_* .pipe(_label_best_match) .pipe(_label_duplicate_best_match) @@ -38,7 +31,6 @@ def label_matches(df: pl.LazyFrame, params: dict[str, bool]) -> pl.LazyFrame: .pipe(_label_biallelic_ambiguous, params['remove_ambiguous']) .pipe(_label_multiallelic, params['remove_multiallelic']) .pipe(_label_flips, params['skip_flip']) - .pipe(_label_filter, params['filter_IDs']) .with_column(pl.lit(True).alias('match_candidate'))) return _encode_match_priority(labelled) @@ -223,19 +215,3 @@ def _label_flips(df: pl.LazyFrame, skip_flip: bool) -> pl.LazyFrame: else: logger.debug("Not excluding flipped matches") return df - - -def _label_filter(df: pl.LazyFrame, filter_IDs: list) -> pl.LazyFrame: - nIDs = len(filter_IDs) - if nIDs > 0: - logger.debug("Excluding variants that are not in ID list (read {} IDs)".format(nIDs)) - df = df.with_column(pl.when(pl.col('ID').is_in(filter_IDs)) - .then(pl.lit(True)) - .otherwise(pl.lit(False)) - .alias('match_IDs')) - return df.with_column(pl.when(pl.col('match_IDs') == False) - .then(True) - .otherwise(pl.col('exclude')) - .alias('exclude')) - else: - return df.with_column((pl.lit('NA')).alias('match_IDs')) \ No newline at end of file diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index 1ab6764..a456d07 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -22,8 +22,8 @@ def make_summary_log(match_candidates: pl.LazyFrame, scorefile: pl.LazyFrame, fi .select(pl.exclude("^.*_right$")) .with_columns([pl.col('match_status').fill_null(value='unmatched'), pl.lit(dataset).alias('dataset')]) # fill in unmatched variants - .groupby(['dataset', 'accession', 'match_IDs', 'ambiguous', 'is_multiallelic', 'match_flipped', - 'duplicate_best_match', 'duplicate_ID', 'match_status']) + .groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'match_flipped', + 'duplicate_best_match', 'duplicate_ID']) .agg(pl.count()) .join(filter_summary, how='left', on='accession') .pipe(_prettify_summary)) @@ -45,7 +45,7 @@ def check_log_count(scorefile: pl.LazyFrame, summary_log: pl.LazyFrame) -> None: def _prettify_summary(df: pl.LazyFrame) -> pl.LazyFrame: keep_cols = ["dataset", "accession", "score_pass", "match_status", "ambiguous", "is_multiallelic", - "duplicate_best_match", "duplicate_ID", "match_IDs", "count", "percent"] + "duplicate_best_match", "duplicate_ID", "count", "percent"] return (df.with_column((pl.col("count") / pl.sum("count") * 100) .over(["dataset", "accession"]) .alias("percent")) @@ -55,7 +55,7 @@ def _prettify_summary(df: pl.LazyFrame) -> pl.LazyFrame: def _prettify_log(df: pl.LazyFrame) -> pl.LazyFrame: keep_cols = ["row_nr", "accession", "chr_name", "chr_position", "effect_allele", "other_allele", "effect_weight", "effect_type", "ID", "REF", "ALT", "matched_effect_allele", "match_type", "is_multiallelic", - "ambiguous", "match_flipped", "best_match", "exclude", "duplicate_best_match", "duplicate_ID", "match_IDs", + "ambiguous", "match_flipped", "best_match", "exclude", "duplicate_best_match", "duplicate_ID", "match_status", "dataset"] pretty_df = (df.select(keep_cols) .select(pl.exclude("^.*_right")) diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index e5b43ee..8e0b446 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -210,9 +210,6 @@ def _parse_args(args=None): help=" Only match, then write intermediate files, don't make scoring files") parser.add_argument('--min_overlap', dest='min_overlap', required=False, type=float, help=' Minimum proportion of variants to match before error') - parser.add_argument('-IDs', '--filter_IDs', dest='filter', - help=' Path to file containing list of variant IDs that can be included in the final scorefile.' - '[useful for limiting scoring files to variants present in multiple datasets]') parser = add_match_args(parser) # params for labelling matches parser.add_argument('--outdir', dest='outdir', required=True, help=' Output directory')