From dfcd6712069baa09038122ab77c5768d24273aea Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Tue, 19 Nov 2024 17:05:49 -0800
Subject: [PATCH] Add rule to annotate GIHSN samples

Adds a new `gihsn_sample` column to the metadata to indicate whether
"GIHSN" was found in the `strain` as proxy for whether the sample came
from the Global Influenza Hospital Surveillance Network (GIHSN).

Follows the existing pattern of using `True` and `False` boolean values.

The rule also redirects csvtk's stderr to its declared log file.

Resolves
---
 workflow/snakemake_rules/select_strains.smk | 23 ++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/workflow/snakemake_rules/select_strains.smk b/workflow/snakemake_rules/select_strains.smk
index 2a7cb3c..6ef89e7 100644
--- a/workflow/snakemake_rules/select_strains.smk
+++ b/workflow/snakemake_rules/select_strains.smk
@@ -70,6 +70,27 @@ rule join_metadata:
             --output {output.metadata:q} 2>&1 | tee {log}
         """
 
+# Add a `gihsn_sample` column ("True"/"False") that flags strains whose name
+# contains "GIHSN", a proxy for collection through the Global Influenza
+# Hospital Surveillance Network (GIHSN). csvtk's stderr is written to the log.
+rule annotate_metadata_with_gihsn:
+    input:
+        metadata="data/{lineage}/metadata_joined.tsv",
+    output:
+        metadata="data/{lineage}/metadata_with_gihsn.tsv",
+    conda: "../envs/nextstrain.yaml"
+    benchmark:
+        "benchmarks/annotate_metadata_with_gihsn_{lineage}.txt"
+    log:
+        "logs/annotate_metadata_with_gihsn_{lineage}.txt"
+    shell:
+        """
+        csvtk --tabs mutate2 \
+            --expression '${{strain}}=~"(GIHSN)" ? "True" : "False"' \
+            --name gihsn_sample \
+            {input.metadata:q} > {output.metadata:q} 2> {log:q}
+        """
+
 rule build_reference_strains_table:
     input:
         references="config/{lineage}/reference_strains.txt",
@@ -96,7 +117,7 @@ rule build_reference_strains_table:
 # later.
 rule annotate_metadata_with_reference_strains:
     input:
-        metadata="data/{lineage}/metadata_joined.tsv",
+        metadata="data/{lineage}/metadata_with_gihsn.tsv",
         references="data/{lineage}/reference_strains.tsv",
     output:
         metadata="data/{lineage}/metadata.tsv",