Add sequence quality as metadata (#636)
* Catch more valid passage values

* sequence quality tracking

* Adjust pangolin cores, add update tasks and explicit tempdir

* fix typo

* fix typo - write quality in plaintext, no gzip

* Add additional PANGO columns

* Disable SARS2 preprocessing at the sequence copying step

* Calculate ambiguity percentage, add quality metadata columns

* Disable filtering out sequences without a lineage

* Fix typos, package versions, bug in percent_ambiguous

* Add new metadata cols to alpha config

* Fix edge case where FASTA file ends on an empty entry that's still named

* Filter out empty sequences

* Fix typos in additional pango columns

* Fix edge case where sequences don't have any feature coverage

* Don't push empty sequences to DB

* Sequence quality for SARS2 genbank workflow

* Fix bug - no more encoding of quality stats

* Fix bug - no more encoding of quality stats

* UI and server accommodation for new quality filtering

* Ignore flu genbank data

* Bump version to v2.7.6-qual-rc1

* Set default GISAID lineage to unassigned -- avoid triggering non-null constraints on the DB

* Fix missing fields in alpha config

* Fill in missing clade values
atc3 authored Jan 15, 2024
1 parent 6c13d01 commit c389d1e
Showing 40 changed files with 1,041 additions and 220 deletions.
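
The headline change is per-sequence quality tracking: each sequence gets a `length` and a `percent_ambiguous` value that flow from ingest through the database into the UI filters. The workflow code that computes these stats is not part of the diff shown here, so the following is only a minimal sketch of how such an ambiguity percentage could be derived; the function name and the exact definition of "ambiguous" (any base outside A/C/G/T) are assumptions:

```python
# Sketch: compute (length, percent_ambiguous) quality stats for a sequence.
# Assumes "ambiguous" means any IUPAC code other than A/C/G/T (N, R, Y, ...).
UNAMBIGUOUS = frozenset("ACGTacgt")

def quality_stats(seq: str) -> tuple[int, float]:
    """Return (length, percent_ambiguous) for a nucleotide sequence."""
    seq = seq.replace("-", "")  # assumption: ignore alignment gaps, if present
    if not seq:
        return 0, 0.0  # empty sequences are filtered out upstream anyway
    n_ambiguous = sum(1 for base in seq if base not in UNAMBIGUOUS)
    return len(seq), 100.0 * n_ambiguous / len(seq)

print(quality_stats("ACGTNNACGT"))  # -> (10, 20.0)
```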
1 change: 1 addition & 0 deletions .dockerignore
@@ -4,6 +4,7 @@ data
data_genbank
data_flu
data_flu_small
data_flu_genbank
data_gisaid_flu
data_gisaid_rsv
data_genbank_rsv
1 change: 1 addition & 0 deletions .gcloudignore
@@ -23,6 +23,7 @@ data_az
data_ma
data_flu
data_flu_small
data_flu_genbank
data_gisaid_flu
data_gisaid_rsv
dist
1 change: 1 addition & 0 deletions .gitignore
@@ -133,6 +133,7 @@ example_data_genbank/*/lineage_treetime/*.pdf

data
data_genbank
data_flu_genbank
example_data_genbank/rsv/**
example_data_genbank/flu/**
example_data_genbank/sars2/**
38 changes: 35 additions & 3 deletions config/config_sars2_alpha.yaml
@@ -15,10 +15,10 @@ static_data_folder: "static_data/sars2"

# Path to folder with data to use in development
# This path is relative to the project root
example_data_folder: "example_data_genbank"
example_data_folder: "data"

# Database for this virus
postgres_db: "cg_alpha"
postgres_db: "cg_gisaid"

# ------------------
# INGEST
@@ -72,16 +72,44 @@ metadata_cols:
title: "Originating lab"
submitting_lab:
title: "Submitting lab"
# PANGO metadata
conflict:
title: "PANGO conflict"
ambiguity_score:
title: "PANGO ambiguity score"
scorpio_call:
title: "scorpio call"
scorpio_support:
title: "scorpio support"
scorpio_conflict:
title: "scorpio conflict"
scorpio_notes:
title: "scorpio notes"
pangolin_is_designated:
title: "PANGO is_designated"
pangolin_qc_status:
title: "PANGO QC status"
pangolin_qc_notes:
title: "PANGO QC notes"
pangolin_note:
title: "pangolin note"

group_cols:
lineage:
name: "lineage"
title: "Lineage"
title: "PANGO Lineage"
description: ""
link:
title: "(Lineage Descriptions)"
href: "https://cov-lineages.org/descriptions.html"
show_collapse_options: true
gisaid_lineage:
name: "gisaid_lineage"
title: "PANGO Lineage (GISAID)"
description: "PANGO assignments from GISAID"
link:
title: "(Lineage Descriptions)"
href: "https://cov-lineages.org/descriptions.html"
clade:
name: "clade"
title: "Clade"
@@ -91,6 +119,10 @@ group_cols:
href: "https://www.gisaid.org/references/statements-clarifications/clade-and-lineage-nomenclature-aids-in-genomic-epidemiology-of-active-hcov-19-viruses/"
show_collapse_options: false

# AZ report options
report_gene: "S"
report_group_col: "lineage"

# Surveillance plot options
# see: workflow_main/scripts/surveillance.py
surv_group_col: "lineage"
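
The same PANGO metadata block is repeated in the two GISAID configs below. For orientation, a hedged sketch of how a covidcg-style config like this might be consumed downstream; the file path and the consuming code are illustrative, not taken from this commit:

```python
# Sketch: read metadata_cols / group_cols out of a covidcg-style YAML config.
import yaml  # PyYAML

with open("config/config_sars2_alpha.yaml") as f:
    config = yaml.safe_load(f)

# Map each metadata field to its display title,
# e.g. {"conflict": "PANGO conflict", "scorpio_call": "scorpio call", ...}
metadata_titles = {
    field: opts["title"] for field, opts in config["metadata_cols"].items()
}

# group_cols entries carry richer options (name, title, description, link, ...)
for key, opts in config["group_cols"].items():
    print(key, "->", opts["title"])
```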
21 changes: 21 additions & 0 deletions config/config_sars2_gisaid.yaml
@@ -72,6 +72,27 @@ metadata_cols:
title: "Originating lab"
submitting_lab:
title: "Submitting lab"
# PANGO metadata
conflict:
title: "PANGO conflict"
ambiguity_score:
title: "PANGO ambiguity score"
scorpio_call:
title: "scorpio call"
scorpio_support:
title: "scorpio support"
scorpio_conflict:
title: "scorpio conflict"
scorpio_notes:
title: "scorpio notes"
pangolin_is_designated:
title: "PANGO is_designated"
pangolin_qc_status:
title: "PANGO QC status"
pangolin_qc_notes:
title: "PANGO QC notes"
pangolin_note:
title: "pangolin note"

group_cols:
lineage:
21 changes: 21 additions & 0 deletions config/config_sars2_gisaid_private.yaml
@@ -72,6 +72,27 @@ metadata_cols:
title: "Originating lab"
submitting_lab:
title: "Submitting lab"
# PANGO metadata
conflict:
title: "PANGO conflict"
ambiguity_score:
title: "PANGO ambiguity score"
scorpio_call:
title: "scorpio call"
scorpio_support:
title: "scorpio support"
scorpio_conflict:
title: "scorpio conflict"
scorpio_notes:
title: "scorpio notes"
pangolin_is_designated:
title: "PANGO is_designated"
pangolin_qc_status:
title: "PANGO QC status"
pangolin_qc_notes:
title: "PANGO QC notes"
pangolin_note:
title: "pangolin note"

group_cols:
lineage:
2 changes: 1 addition & 1 deletion package-lock.json

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
{
"name": "covidcg",
"version": "2.7.5-pango",
"version": "2.7.6-qual-rc1",
"description": "",
"engines": {
"node": ">=8",
8 changes: 5 additions & 3 deletions services/server/cg_server/db_seed/insert_sequences.py
@@ -76,7 +76,6 @@ def insert_sequences(conn, data_path, schema="public"):
fasta_path = Path(data_path) / "fasta_processed"

with conn.cursor() as cur:

cur.execute(sql.SQL("SET search_path TO {};").format(sql.Identifier(schema)))

# The SARS-CoV-2 genomes are ~30kb, which seems like it
@@ -112,7 +111,6 @@ def insert_sequences(conn, data_path, schema="public"):
chunk_size = 10_000

for i, fasta_file in enumerate(sorted(fasta_path.iterdir())):

# if fasta_file.name in loaded_files:
# print("Skipping: ", fasta_file.name)
# continue
@@ -136,6 +134,10 @@ def insert_sequences(conn, data_path, schema="public"):
if not line and i < (len(lines) - 1):
continue

# If we're on the last line, but the entry is empty, then skip
if not line and i == (len(lines) - 1) and not cur_seq:
continue

# If not the name of an entry, add this line to the current sequence
# (some FASTA files will have multiple lines per sequence)
if line[0] != ">":
@@ -144,7 +146,7 @@ def insert_sequences(conn, data_path, schema="public"):
# Start of another entry = end of the previous entry
if line[0] == ">" or i == (len(lines) - 1):
# Avoid capturing the first one and pushing an empty sequence
if cur_entry:
if cur_entry and cur_seq:
buffer.write(
cur_entry + "," + cur_seq + "," + fasta_file.name + "\n"
)
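
Several commits above harden this FASTA reader against named-but-empty entries (a file that ends on a bare `>name` line, or sequences with no content). The same guards, pulled out into a self-contained sketch; the real code streams CSV rows into a COPY buffer rather than returning a dict:

```python
# Sketch: parse a FASTA string, skipping entries that have a name but no sequence.
def parse_fasta(text: str) -> dict[str, str]:
    entries: dict[str, str] = {}
    cur_name, cur_seq = None, ""
    for line in text.splitlines():
        line = line.strip()
        if line.startswith(">"):
            # Start of a new entry ends the previous one; drop empty sequences
            if cur_name and cur_seq:
                entries[cur_name] = cur_seq
            cur_name, cur_seq = line[1:], ""
        elif line:
            cur_seq += line  # sequences may span multiple lines
    if cur_name and cur_seq:  # flush the final entry, unless it is empty
        entries[cur_name] = cur_seq
    return entries

# A file ending on a named-but-empty entry no longer produces an empty record:
print(parse_fasta(">a\nACGT\nACGT\n>b\n"))  # -> {'a': 'ACGTACGT'}
```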
38 changes: 21 additions & 17 deletions services/server/cg_server/db_seed/seed.py
@@ -66,7 +66,6 @@ def df_to_sql(cur, df, table, index_label=None):

def seed_database(conn, schema="public"):
with conn.cursor() as cur:

cur.execute(sql.SQL("SET search_path TO {};").format(sql.Identifier(schema)))

cur.execute("DROP EXTENSION IF EXISTS intarray;")
@@ -195,7 +194,6 @@ def seed_database(conn, schema="public"):

mutation_fields = ["dna", "gene_aa", "protein_aa"]
for grouping in group_mutation_frequencies.keys():

# Get references
reference_names = sorted(group_mutation_frequencies[grouping].keys())

@@ -256,7 +254,6 @@ def seed_database(conn, schema="public"):

# Build colormaps
for grouping in config["group_cols"].keys():

# Collect unique group names
group_names = []
for reference in global_group_counts.keys():
@@ -296,12 +293,27 @@ def seed_database(conn, schema="public"):
print("done")

print("Writing sequence metadata...", end="", flush=True)

isolate_df = pd.read_json(data_path / "isolate_data.json")
isolate_df["collection_date"] = pd.to_datetime(isolate_df["collection_date"])
isolate_df["submission_date"] = pd.to_datetime(isolate_df["submission_date"])
# print(isolate_df.columns)

# Make a column for each metadata field
metadata_cols = []
metadata_col_defs = []
for field in list(config["metadata_cols"].keys()) + loc_levels:
metadata_col_defs.append(sql.SQL(f"{field} INTEGER NOT NULL"))
metadata_cols.append(field)

# Make columns for sequence metadata, if they exist
if "length" in isolate_df.columns:
metadata_cols.append("length")
metadata_col_defs.append(sql.SQL("length INTEGER NOT NULL"))
if "percent_ambiguous" in isolate_df.columns:
metadata_cols.append("percent_ambiguous")
metadata_col_defs.append(sql.SQL("percent_ambiguous REAL NOT NULL"))

metadata_col_defs = sql.SQL(",\n").join(metadata_col_defs)

# Make a column for each grouping
@@ -338,11 +350,6 @@ def seed_database(conn, schema="public"):
)
)

isolate_df = pd.read_json(data_path / "isolate_data.json")
isolate_df["collection_date"] = pd.to_datetime(isolate_df["collection_date"])
isolate_df["submission_date"] = pd.to_datetime(isolate_df["submission_date"])
# print(isolate_df.columns)

# Partition settings
min_date = isolate_df["collection_date"].min()
# Round latest sequence to the nearest partition break
@@ -423,8 +430,7 @@ def seed_database(conn, schema="public"):
"segments",
"accession_ids",
]
+ list(config["metadata_cols"].keys())
+ loc_levels
+ metadata_cols
+ list(
filter(lambda x: x != "subtype", config["group_cols"].keys())
) # Avoid duplicate subtype index
@@ -480,7 +486,9 @@ def seed_database(conn, schema="public"):
# Clean up the reference name as a SQL ident - no dots
reference_name_sql = reference_name.replace(".", "_")

reference_partition_name = f"seqmut_{mutation_field}_{reference_name_sql}"
reference_partition_name = (
f"seqmut_{mutation_field}_{reference_name_sql}"
)

# Create reference partition
cur.execute(
@@ -555,13 +563,11 @@ def seed_database(conn, schema="public"):
"subtype",
"reference",
]
+ list(config["metadata_cols"].keys())
+ loc_levels
+ metadata_cols
+ list(
filter(lambda x: x != "subtype", config["group_cols"].keys())
) # Avoid duplicate subtype index
):

cur.execute(
sql.SQL(
"CREATE INDEX {index_name} ON {table_name}({field});"
@@ -756,13 +762,11 @@ def seed_database(conn, schema="public"):
"subtype",
"reference",
]
+ list(config["metadata_cols"].keys())
+ loc_levels
+ metadata_cols
+ list(
filter(lambda x: x != "subtype", config["group_cols"].keys())
) # Avoid duplicate subtype index
):

cur.execute(
sql.SQL(
"CREATE INDEX {index_name} ON {table_name}({field});"
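
The pattern in seed.py is to build the metadata column list once and reuse it for both the table definition and the `CREATE INDEX` loops, appending the quality columns only when the ingest workflow actually produced them. Roughly, in isolation (a sketch assuming psycopg2 and a pandas DataFrame of isolate metadata; the committed code interpolates field names with an f-string, whereas this variant quotes them with `sql.Identifier`):

```python
# Sketch of the conditional quality-column logic from seed.py above.
import pandas as pd
from psycopg2 import sql

def build_metadata_cols(isolate_df: pd.DataFrame, base_fields: list[str]):
    """Return (column names, composed column definitions) for CREATE TABLE."""
    metadata_cols, metadata_col_defs = [], []
    for field in base_fields:
        metadata_cols.append(field)
        metadata_col_defs.append(
            sql.SQL("{} INTEGER NOT NULL").format(sql.Identifier(field))
        )
    # Quality columns exist only if the upstream workflow wrote them
    if "length" in isolate_df.columns:
        metadata_cols.append("length")
        metadata_col_defs.append(sql.SQL("length INTEGER NOT NULL"))
    if "percent_ambiguous" in isolate_df.columns:
        metadata_cols.append("percent_ambiguous")
        metadata_col_defs.append(sql.SQL("percent_ambiguous REAL NOT NULL"))
    return metadata_cols, sql.SQL(",\n").join(metadata_col_defs)
```

Reusing `metadata_cols` this way is what lets the index-creation loops further down pick up `length` and `percent_ambiguous` automatically, which is why the diff replaces the inline `config["metadata_cols"] + loc_levels` expressions with the shared list.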
2 changes: 2 additions & 0 deletions services/server/cg_server/download/genomes.py
@@ -33,6 +33,8 @@ def download_genomes(conn, req):
req.get("selected_metadata_fields", None),
req.get("selected_group_fields", None),
req.get("selected_reference", None),
req.get("sequence_length", None),
req.get("percent_ambiguous", None),
)

cur.execute(
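
Both download endpoints now forward two optional request fields into the sequence WHERE-filter builder. A sketch of how such range filters could be composed with psycopg2; the `[min, max]` payload shape is an assumption, and the real logic lives in `build_sequence_location_where_filter`, which is not shown in this diff:

```python
# Sketch: turn optional [min, max] quality bounds into SQL conditions.
from psycopg2 import sql

def quality_filter_clauses(sequence_length=None, percent_ambiguous=None):
    clauses = []
    for column, bounds in [
        ("length", sequence_length),
        ("percent_ambiguous", percent_ambiguous),
    ]:
        if not bounds:
            continue
        lo, hi = bounds
        if lo is not None:
            clauses.append(
                sql.SQL("{} >= {}").format(sql.Identifier(column), sql.Literal(lo))
            )
        if hi is not None:
            clauses.append(
                sql.SQL("{} <= {}").format(sql.Identifier(column), sql.Literal(hi))
            )
    return clauses  # to be AND-ed into the sequence WHERE filter
```

Under that assumption, a request carrying `"sequence_length": [29000, 30500]` and `"percent_ambiguous": [None, 5]` would keep near-full-length genomes with at most 5% ambiguous bases.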
8 changes: 6 additions & 2 deletions services/server/cg_server/download/metadata.py
@@ -16,7 +16,6 @@


def download_metadata(conn, req):

selected_reference = req.get("selected_reference", None)
if not selected_reference:
raise Exception("No reference specified")
@@ -34,6 +33,8 @@ def download_metadata(conn, req):
req.get("subm_end_date", None),
req.get("selected_metadata_fields", None),
req.get("selected_group_fields", None),
req.get("sequence_length", None),
req.get("percent_ambiguous", None),
selected_reference,
)

@@ -179,7 +180,10 @@ def download_metadata(conn, req):

cur.execute(query)

res_df = pd.DataFrame.from_records(cur.fetchall(), columns=sequence_cols,)
res_df = pd.DataFrame.from_records(
cur.fetchall(),
columns=sequence_cols,
)

# Replace mutation IDs with names
for mutation_field in ["dna", "gene_aa", "protein_aa"]:
15 changes: 11 additions & 4 deletions services/server/cg_server/query/group_mutation_frequencies.py
@@ -36,7 +36,9 @@ def query_group_mutation_frequencies(conn, req):
"mutation_str",
]
if mutation_type == "gene_aa" or mutation_type == "protein_aa":
mutation_cols = ["feature",] + mutation_cols
mutation_cols = [
"feature",
] + mutation_cols

mutation_cols_expr = sql.SQL(",\n").join(
[
@@ -94,10 +96,14 @@ def query_group_mutation_frequencies_dynamic(conn, req):
mutation_table = "dna_mutation"
elif mutation_type == "gene_aa":
mutation_table = "gene_aa_mutation"
mutation_cols = ["feature",] + mutation_cols
mutation_cols = [
"feature",
] + mutation_cols
elif mutation_type == "protein_aa":
mutation_table = "protein_aa_mutation"
mutation_cols = ["feature",] + mutation_cols
mutation_cols = [
"feature",
] + mutation_cols

sequence_where_filter = build_sequence_location_where_filter(
group_key,
@@ -108,6 +114,8 @@ def query_group_mutation_frequencies_dynamic(conn, req):
req.get("subm_end_date", None),
req.get("selected_metadata_fields", None),
req.get("selected_group_fields", None),
req.get("sequence_length", None),
req.get("percent_ambiguous", None),
selected_reference,
)
sequence_mutation_table = "sequence_" + mutation_table
@@ -167,4 +175,3 @@
)

return res.to_json(orient="records")
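
For reference, a hypothetical request body exercising the new keys alongside the existing filters; every value here is illustrative, not taken from the commit:

```python
# Hypothetical payload for the dynamic mutation-frequency query.
req = {
    "group_key": "lineage",
    "mutation_type": "gene_aa",
    "selected_reference": "WIV04",
    "start_date": "2023-01-01",
    "end_date": "2023-12-31",
    "sequence_length": [29000, 30500],  # keep near-full-length genomes
    "percent_ambiguous": [0, 5],        # at most 5% ambiguous bases
}
```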
