diff --git a/.gitignore b/.gitignore index ad73646b..1fd136a5 100644 --- a/.gitignore +++ b/.gitignore @@ -98,6 +98,8 @@ data_flu/** data_gisaid_flu/** data_gisaid_rsv/** data_genbank_rsv/** +data_6month +data_6month/** # Ignore server passwords server/htpasswd @@ -108,7 +110,8 @@ daily_update.sh daily_update_new.sh daily_update_rsv.sh daily_update_sars2_genbank.sh -daily_update_sars2_gisaid.sh +daily_update_sars2_gisaid_full.sh +daily_update_sars2_gisaid_6month.sh update_gisaid.sh update_genbank.sh filter_list.txt @@ -143,4 +146,4 @@ data_flu_small/** workflow_main/notebooks/** # SnapGene - temp files -static_data/flu/alignments/.sglock/** \ No newline at end of file +static_data/flu/alignments/.sglock/** diff --git a/config/config_flu_genbank.yaml b/config/config_flu_genbank.yaml index 2f348759..f60d061c 100644 --- a/config/config_flu_genbank.yaml +++ b/config/config_flu_genbank.yaml @@ -36,6 +36,20 @@ chunk_size: 10000 # ANALYSIS # -------------------- +# Don't process sequences prior to this date +# Leave empty to ignore +start_date_cutoff: +# Don't process sequences after this date +# Leave empty to ignore +end_date_cutoff: + +# Don't process sequences prior to X days ago +# Leave empty to ignore +start_date_cutoff_days_ago: +# Don't process sequences after X days ago +# Leave empty to ignore +end_date_cutoff_days_ago: + segments: ["1", "2", "3", "4", "5", "6", "7", "8"] # Insertions or deletions with more than this difference in bases between the diff --git a/config/config_flu_gisaid.yaml b/config/config_flu_gisaid.yaml index d7308579..76b9626c 100644 --- a/config/config_flu_gisaid.yaml +++ b/config/config_flu_gisaid.yaml @@ -32,6 +32,20 @@ chunk_size: 10000 # ANALYSIS # -------------------- +# Don't process sequences prior to this date +# Leave empty to ignore +start_date_cutoff: +# Don't process sequences after this date +# Leave empty to ignore +end_date_cutoff: + +# Don't process sequences prior to X days ago +# Leave empty to ignore 
+start_date_cutoff_days_ago: +# Don't process sequences after X days ago +# Leave empty to ignore +end_date_cutoff_days_ago: + segments: ["1", "2", "3", "4", "5", "6", "7", "8"] # Insertions or deletions with more than this difference in bases between the diff --git a/config/config_flu_gisaid_dev.yaml b/config/config_flu_gisaid_dev.yaml index c0ec3ba8..721bb1a9 100644 --- a/config/config_flu_gisaid_dev.yaml +++ b/config/config_flu_gisaid_dev.yaml @@ -32,6 +32,20 @@ chunk_size: 10000 # ANALYSIS # -------------------- +# Don't process sequences prior to this date +# Leave empty to ignore +start_date_cutoff: +# Don't process sequences after this date +# Leave empty to ignore +end_date_cutoff: + +# Don't process sequences prior to X days ago +# Leave empty to ignore +start_date_cutoff_days_ago: +# Don't process sequences after X days ago +# Leave empty to ignore +end_date_cutoff_days_ago: + segments: ["1", "2", "3", "4", "5", "6", "7", "8"] # Insertions or deletions with more than this difference in bases between the diff --git a/config/config_rsv_custom.yaml b/config/config_rsv_custom.yaml index 98dc19ff..bcb7b458 100644 --- a/config/config_rsv_custom.yaml +++ b/config/config_rsv_custom.yaml @@ -31,6 +31,20 @@ chunk_size: 100000 # ANALYSIS # -------------------- +# Don't process sequences prior to this date +# Leave empty to ignore +start_date_cutoff: +# Don't process sequences after this date +# Leave empty to ignore +end_date_cutoff: + +# Don't process sequences prior to X days ago +# Leave empty to ignore +start_date_cutoff_days_ago: +# Don't process sequences after X days ago +# Leave empty to ignore +end_date_cutoff_days_ago: + segments: ["1"] # Insertions or deletions with more than this difference in bases between the diff --git a/config/config_rsv_genbank.yaml b/config/config_rsv_genbank.yaml index d09163f4..1f1b7072 100644 --- a/config/config_rsv_genbank.yaml +++ b/config/config_rsv_genbank.yaml @@ -36,6 +36,20 @@ chunk_size: 100000 # ANALYSIS # 
-------------------- +# Don't process sequences prior to this date +# Leave empty to ignore +start_date_cutoff: +# Don't process sequences after this date +# Leave empty to ignore +end_date_cutoff: + +# Don't process sequences prior to X days ago +# Leave empty to ignore +start_date_cutoff_days_ago: +# Don't process sequences after X days ago +# Leave empty to ignore +end_date_cutoff_days_ago: + segments: ["1"] # Insertions or deletions with more than this difference in bases between the diff --git a/config/config_sars2_alpha.yaml b/config/config_sars2_alpha.yaml index 08e96ea7..75aabd27 100644 --- a/config/config_sars2_alpha.yaml +++ b/config/config_sars2_alpha.yaml @@ -31,6 +31,20 @@ chunk_size: 100000 # ANALYSIS # -------------------- +# Don't process sequences prior to this date +# Leave empty to ignore +start_date_cutoff: +# Don't process sequences after this date +# Leave empty to ignore +end_date_cutoff: + +# Don't process sequences prior to X days ago +# Leave empty to ignore +start_date_cutoff_days_ago: +# Don't process sequences after X days ago +# Leave empty to ignore +end_date_cutoff_days_ago: + segments: ["1"] # Insertions or deletions with more than this difference in bases between the diff --git a/config/config_sars2_custom.yaml b/config/config_sars2_custom.yaml index 35c16ade..a095f6f5 100644 --- a/config/config_sars2_custom.yaml +++ b/config/config_sars2_custom.yaml @@ -31,6 +31,20 @@ chunk_size: 100000 # ANALYSIS # -------------------- +# Don't process sequences prior to this date +# Leave empty to ignore +start_date_cutoff: +# Don't process sequences after this date +# Leave empty to ignore +end_date_cutoff: + +# Don't process sequences prior to X days ago +# Leave empty to ignore +start_date_cutoff_days_ago: +# Don't process sequences after X days ago +# Leave empty to ignore +end_date_cutoff_days_ago: + segments: ["1"] # Insertions or deletions with more than this difference in bases between the diff --git a/config/config_sars2_genbank.yaml 
b/config/config_sars2_genbank.yaml index d693194c..1a5ac359 100644 --- a/config/config_sars2_genbank.yaml +++ b/config/config_sars2_genbank.yaml @@ -35,6 +35,20 @@ chunk_size: 100000 # ANALYSIS # -------------------- +# Don't process sequences prior to this date +# Leave empty to ignore +start_date_cutoff: +# Don't process sequences after this date +# Leave empty to ignore +end_date_cutoff: + +# Don't process sequences prior to X days ago +# Leave empty to ignore +start_date_cutoff_days_ago: +# Don't process sequences after X days ago +# Leave empty to ignore +end_date_cutoff_days_ago: + segments: ["1"] # Insertions or deletions with more than this difference in bases between the diff --git a/config/config_sars2_genbank_dev.yaml b/config/config_sars2_genbank_dev.yaml index fb1a02eb..1cc0119c 100644 --- a/config/config_sars2_genbank_dev.yaml +++ b/config/config_sars2_genbank_dev.yaml @@ -36,6 +36,20 @@ chunk_size: 100000 # ANALYSIS # -------------------- +# Don't process sequences prior to this date +# Leave empty to ignore +start_date_cutoff: +# Don't process sequences after this date +# Leave empty to ignore +end_date_cutoff: + +# Don't process sequences prior to X days ago +# Leave empty to ignore +start_date_cutoff_days_ago: +# Don't process sequences after X days ago +# Leave empty to ignore +end_date_cutoff_days_ago: + segments: ["1"] # Insertions or deletions with more than this difference in bases between the diff --git a/config/config_sars2_gisaid.yaml b/config/config_sars2_gisaid.yaml index 345c2efa..0dff084c 100644 --- a/config/config_sars2_gisaid.yaml +++ b/config/config_sars2_gisaid.yaml @@ -31,6 +31,20 @@ chunk_size: 100000 # ANALYSIS # -------------------- +# Don't process sequences prior to this date +# Leave empty to ignore +start_date_cutoff: +# Don't process sequences after this date +# Leave empty to ignore +end_date_cutoff: + +# Don't process sequences prior to X days ago +# Leave empty to ignore +start_date_cutoff_days_ago: +# Don't process 
sequences after X days ago +# Leave empty to ignore +end_date_cutoff_days_ago: + segments: ["1"] # Insertions or deletions with more than this difference in bases between the diff --git a/config/config_sars2_gisaid_6month.yaml b/config/config_sars2_gisaid_6month.yaml new file mode 100644 index 00000000..1f08b1cb --- /dev/null +++ b/config/config_sars2_gisaid_6month.yaml @@ -0,0 +1,210 @@ +# ------------------ +# GLOBAL +# ------------------ + +# Virus this config is written for +virus: "sars2" + +# Path to folder with downloaded and processed data +# This path is relative to the project root +data_folder: "data_6month" + +# Path to folder with genome information (reference.fasta, genes.json, proteins.json) +# This path is relative to the project root +static_data_folder: "static_data/sars2" + +# Path to folder with data to use in development +# This path is relative to the project root +example_data_folder: "data" + +# Database for this virus +postgres_db: "cg_gisaid_6month" + +# ------------------ +# INGEST +# ------------------ + +# Number of genomes to load into memory before flushing to disk +chunk_size: 100000 + +# -------------------- +# ANALYSIS +# -------------------- + +# Don't process sequences prior to this date +# Leave empty to ignore +start_date_cutoff: +# Don't process sequences after this date +# Leave empty to ignore +end_date_cutoff: + +# Don't process sequences prior to X days ago +# Leave empty to ignore +start_date_cutoff_days_ago: 180 +# Don't process sequences after X days ago +# Leave empty to ignore +end_date_cutoff_days_ago: + +segments: ["1"] + +# Insertions or deletions with more than this difference in bases between the +# ref and the alt will be discarded (NT level only) +max_indel_length: 100 + +# Mutations with less than this number of global occurrences will be ignored +mutation_count_threshold: 3 + +# Threshold of prevalence to report a mutation as being a consensus +# mutation for a group (e.g., clade, lineage) 
+consensus_fraction: 0.9 + +# Threshold of prevalence to report a mutation as being associated +# with a group (e.g., clade, lineage) +min_reporting_fraction: 0.05 + +metadata_cols: + host: + title: "Host" + gender: + title: "Gender" + patient_status: + title: "Patient Status" + passage: + title: "Passage" + disabled: true + specimen: + title: "Specimen" + sequencing_tech: + title: "Sequencing" + assembly_method: + title: "Assembly" + comment_type: + title: "Flag" + authors: + title: "Authors" + originating_lab: + title: "Originating lab" + submitting_lab: + title: "Submitting lab" + # PANGO metadata + conflict: + title: "PANGO conflict" + ambiguity_score: + title: "PANGO ambiguity score" + scorpio_call: + title: "scorpio call" + scorpio_support: + title: "scorpio support" + scorpio_conflict: + title: "scorpio conflict" + scorpio_notes: + title: "scorpio notes" + pangolin_is_designated: + title: "PANGO is_designated" + pangolin_qc_status: + title: "PANGO QC status" + pangolin_qc_notes: + title: "PANGO QC notes" + pangolin_note: + title: "pangolin note" + +group_cols: + lineage: + name: "lineage" + title: "PANGO Lineage" + description: "" + link: + title: "(Lineage Descriptions)" + href: "https://cov-lineages.org/descriptions.html" + show_collapse_options: true + gisaid_lineage: + name: "gisaid_lineage" + title: "PANGO Lineage (GISAID)" + description: "PANGO assignments from GISAID" + link: + title: "(Lineage Descriptions)" + href: "https://cov-lineages.org/descriptions.html" + clade: + name: "clade" + title: "Clade" + description: "For more information about clade and lineage nomenclature, visit this:" + link: + title: "[GISAID note]" + href: "https://www.gisaid.org/references/statements-clarifications/clade-and-lineage-nomenclature-aids-in-genomic-epidemiology-of-active-hcov-19-viruses/" + show_collapse_options: false + +# AZ report options +report_gene: "S" +report_group_col: "lineage" + +# Surveillance plot options +# see: workflow_main/scripts/surveillance.py 
+surv_group_col: "lineage" +surv_period: "W" +surv_min_combo_count: 50 +surv_min_single_count: 50 +surv_start_date_days_ago: 90 +surv_end_date_days_ago: 30 + +# --------------- +# DATABASE +# --------------- + +# Split mutation table partitions into periods of this length +# See: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases +# Common options: +# 'D' calendar day frequency +# 'W' weekly frequency +# 'M' month end frequency +mutation_partition_break: "M" + +# --------------- +# SERVER +# --------------- + +# Require a login for accessing the website +# Users are provided to the app via the "LOGINS" environment variable, +# which is structured as "user1:pass1,user2:pass2,..." +login_required: false + +dev_hostname: "http://localhost:5001" +prod_hostname: + - "https://covidcg.org" + - "https://sars2.pathmut.org" + - "https://sars2.gisaid.pathmut.org" + +# ---------------------- +# VISUALIZATION +# ---------------------- + +site_title: "COVID CG" +data_provider: "GISAID" + +# Default references for each subtype +default_references: + SARS-CoV-2: WIV04 + +# Home page +show_home_banner: true +show_walkthroughs: true +show_surveillance: true +show_global_seq_plot: true + +show_reports_tab: true +show_global_sequencing_tab: true +show_methods_tab: true +show_related_projects_tab: true + +default_gene: S +default_protein: nsp12 - RdRp + +min_date: "2019-12-01" + +show_logos: + GISAID: true + GenBank: false + +# Allow downloads of sequence metadata (before aggregation) +allow_metadata_download: false +# Allow downloads of raw genomes +allow_genome_download: false diff --git a/config/config_sars2_gisaid_private.yaml b/config/config_sars2_gisaid_private.yaml index 0f7b2c8c..175f2ee5 100644 --- a/config/config_sars2_gisaid_private.yaml +++ b/config/config_sars2_gisaid_private.yaml @@ -31,6 +31,20 @@ chunk_size: 100000 # ANALYSIS # -------------------- +# Don't process sequences prior to this date +# Leave empty to ignore 
+start_date_cutoff: +# Don't process sequences after this date +# Leave empty to ignore +end_date_cutoff: + +# Don't process sequences prior to X days ago +# Leave empty to ignore +start_date_cutoff_days_ago: +# Don't process sequences after X days ago +# Leave empty to ignore +end_date_cutoff_days_ago: + segments: ["1"] # Insertions or deletions with more than this difference in bases between the diff --git a/services/server/cg_server/db_seed/seed.py b/services/server/cg_server/db_seed/seed.py index c0036769..281ff6e0 100644 --- a/services/server/cg_server/db_seed/seed.py +++ b/services/server/cg_server/db_seed/seed.py @@ -298,6 +298,12 @@ def seed_database(conn, schema="public"): isolate_df["collection_date"] = pd.to_datetime(isolate_df["collection_date"]) isolate_df["submission_date"] = pd.to_datetime(isolate_df["submission_date"]) # print(isolate_df.columns) + if 'gisaid_lineage' in isolate_df.columns: + isolate_df['gisaid_lineage'] = isolate_df['gisaid_lineage'].fillna('Unassigned') + if 'lineage' in isolate_df.columns: + isolate_df['lineage'] = isolate_df['lineage'].fillna('Unassigned') + if 'clade' in isolate_df.columns: + isolate_df['clade'] = isolate_df['clade'].fillna('Unassigned') # Make a column for each metadata field metadata_cols = [] diff --git a/workflow_main/Snakefile b/workflow_main/Snakefile index 52b41ddc..c7a7a6d3 100644 --- a/workflow_main/Snakefile +++ b/workflow_main/Snakefile @@ -3,6 +3,7 @@ """Main data processing workflow from ingested data $ snakemake --configfile ../config/config_sars2_gisaid.yaml -j6 +$ snakemake --configfile ../config/config_sars2_gisaid_6month.yaml -j6 -R sequence_manifest $ snakemake --configfile ../config/config_sars2_genbank_dev.yaml -j6 $ snakemake --configfile ../config/config_rsv_genbank.yaml -j6 $ snakemake --configfile ../config/config_flu_genbank.yaml -j6 @@ -35,6 +36,8 @@ import datetime import os from pathlib import Path +import pandas as pd + data_folder = os.path.join("..", config["data_folder"]) 
static_data_folder = os.path.join("..", config["static_data_folder"]) @@ -46,6 +49,36 @@ CHUNKS, = glob_wildcards(os.path.join( data_folder, "fasta_raw", "{chunk}.fa.gz" )) +start_date_cutoff = None +end_date_cutoff = None + +# Resolve the optional date window: an absolute date takes precedence over a +# relative "days ago" value, and a missing or empty key disables that cutoff +if config.get("start_date_cutoff") is not None: + start_date_cutoff = pd.to_datetime(config["start_date_cutoff"]) +elif config.get("start_date_cutoff_days_ago") is not None: + start_date_cutoff = ( + pd.to_datetime(today_str) - + pd.Timedelta(days=config["start_date_cutoff_days_ago"]) + ) + +if config.get("end_date_cutoff") is not None: + end_date_cutoff = pd.to_datetime(config["end_date_cutoff"]) +elif config.get("end_date_cutoff_days_ago") is not None: + end_date_cutoff = ( + pd.to_datetime(today_str) - + pd.Timedelta(days=config["end_date_cutoff_days_ago"]) + ) + +# Filter chunks by the date parsed from the third "_"-separated field of the chunk name +if start_date_cutoff is not None: + print(f"Filtering out sequences from before {start_date_cutoff.isoformat()}") + CHUNKS = [c for c in CHUNKS if pd.to_datetime(c.split('_')[2]) > start_date_cutoff] + +if end_date_cutoff is not None: + print(f"Filtering out sequences from after {end_date_cutoff.isoformat()}") + CHUNKS = [c for c in CHUNKS if pd.to_datetime(c.split('_')[2]) < end_date_cutoff] + SEGMENTS = config["segments"] SUBTYPES = [ d.name for d in sorted( @@ -211,13 +244,17 @@ rule sequence_manifest: output: manifest = os.path.join(data_folder, "sequence_manifest.csv") params: - processed_fasta_files = os.path.join(data_folder, "fasta_processed") + processed_fasta_files = os.path.join(data_folder, "fasta_processed"), + start_date_cutoff = f"--start-date-cutoff {start_date_cutoff.date().isoformat()}" if start_date_cutoff is not None else "", + end_date_cutoff = f"--end-date-cutoff {end_date_cutoff.date().isoformat()}" if end_date_cutoff is not None else "" shell: """ python3 scripts/sequence_manifest.py \ --reference {input.reference} \ --fasta {params.processed_fasta_files} \ - --out {output.manifest} + --out {output.manifest} \ + {params.start_date_cutoff} \ + 
{params.end_date_cutoff} """ diff --git a/workflow_main/scripts/sequence_manifest.py b/workflow_main/scripts/sequence_manifest.py index 50a87268..5eb35a39 100644 --- a/workflow_main/scripts/sequence_manifest.py +++ b/workflow_main/scripts/sequence_manifest.py @@ -97,6 +97,20 @@ def main(): required=True, help="Path to reference JSON file", ) + parser.add_argument( + "--start-date-cutoff", + type=str, + required=False, + default=None, + help="Filter out sequences prior to this date" + ) + parser.add_argument( + "--end-date-cutoff", + type=str, + required=False, + default=None, + help="Filter out sequences after this date" + ) parser.add_argument( "--out", type=str, required=True, help="Output manifest CSV file" ) @@ -141,6 +155,20 @@ def main(): reference=lambda x: x["subtype"].map(subtype_refs), ) + # Filter by date + pruned_manifest["date_obj"] = pd.to_datetime(pruned_manifest["date"]) + if args.start_date_cutoff is not None: + print(f"Filtering out sequences from before {args.start_date_cutoff}") + pruned_manifest = pruned_manifest.loc[ + pruned_manifest["date_obj"] > pd.to_datetime(args.start_date_cutoff) + ] + if args.end_date_cutoff is not None: + print(f"Filtering out sequences after {args.end_date_cutoff}") + pruned_manifest = pruned_manifest.loc[ + pruned_manifest["date_obj"] < pd.to_datetime(args.end_date_cutoff) + ] + pruned_manifest.drop(columns=['date_obj'], inplace=True) + pruned_manifest = pruned_manifest.explode("reference") pruned_manifest.to_csv(args.out, index=False)