Skip to content

Commit

Permalink
Flu Genbank release (#644)
Browse files Browse the repository at this point in the history
* Fix various metadata bugs

* Fix issues with fasta file parsing

* Remove concurrent index building - can't do it in the middle of a transaction

* More verbose sequence processing to identify filtering bottlenecks

* year partition breaks for all flu sites

* Add build scripts for flu genbank site

* Add dev environment for flu genbank

* formatting

* Fix gzip bug
  • Loading branch information
atc3 authored Apr 4, 2024
1 parent 67e1c70 commit 8ae0710
Show file tree
Hide file tree
Showing 23 changed files with 269 additions and 73 deletions.
3 changes: 2 additions & 1 deletion build/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,5 @@ gcloud builds submit --config build/cloudbuild.yaml --substitutions=_TARGET="rsv

# FLU

gcloud builds submit --config build/cloudbuild.yaml --substitutions=_TARGET="flu-gisaid",_CONFIGFILE="config/config_flu_gisaid.yaml",_TAG_NAME="${CG_VERSION}" .
gcloud builds submit --config build/cloudbuild.yaml --substitutions=_TARGET="flu-gisaid",_CONFIGFILE="config/config_flu_gisaid.yaml",_TAG_NAME="${CG_VERSION}" .
gcloud builds submit --config build/cloudbuild.yaml --substitutions=_TARGET="flu-genbank",_CONFIGFILE="config/config_flu_genbank.yaml",_TAG_NAME="${CG_VERSION}" .
1 change: 1 addition & 0 deletions build/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,4 @@ gcloud run deploy rsv-genbank --image "gcr.io/${PROJECT_ID}/rsv:${CG_VERSION}"
# FLU

gcloud run deploy flu-gisaid --image "gcr.io/${PROJECT_ID}/flu-gisaid:${CG_VERSION}"
gcloud run deploy flu-genbank --image "gcr.io/${PROJECT_ID}/flu-genbank:${CG_VERSION}"
6 changes: 2 additions & 4 deletions config/config_flu_genbank.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -117,11 +117,9 @@ surv_group_references:

# Split mutation table partitions into periods of this length
# See: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
# Common options:
# 'D' calendar day frequency
# 'W' weekly frequency
# 'M' month end frequency
mutation_partition_break: "M"
# 'Y' year end frequency
mutation_partition_break: "Y"

# ---------------
# SERVER
Expand Down
181 changes: 181 additions & 0 deletions config/config_flu_genbank_dev.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
# ------------------
# GLOBAL
# ------------------

# Virus this config is written for
virus: "flu"

# Path to folder with downloaded and processed data
# This path is relative to the project root
data_folder: "data_flu_small"

# Path to folder with genome information (reference.fasta, genes.json, proteins.json)
# This path is relative to the project root
static_data_folder: "static_data/flu"

# Database for this virus
postgres_db: "flu_genbank_dev"

# ------------------
# INGEST
# ------------------

# Download sequences from GenBank (NCBI Virus vvsearch) in chunks of this size
# D = day, W = week, M = month, Y = year
dl_chunk_period: "Y"

# Number of genomes to load into memory before flushing to disk
chunk_size: 10000

# --------------------
# ANALYSIS
# --------------------

# Don't process sequences prior to this date
# Leave empty to ignore
start_date_cutoff:
# Don't process sequences after this date
# Leave empty to ignore
end_date_cutoff:

# Don't process sequences prior to X days ago
# Leave empty to ignore
start_date_cutoff_days_ago:
# Don't process sequences after X days ago
# Leave empty to ignore
end_date_cutoff_days_ago:

# Genome segment identifiers (kept as quoted strings, not integers)
segments: ["1", "2", "3", "4", "5", "6", "7", "8"]

# Insertions or deletions with more than this difference in bases between the
# ref and the alt will be discarded (NT level only)
max_indel_length: 100

# Mutations with less than this number of global occurrences will be ignored
mutation_count_threshold: 3

# Threshold of prevalence to report a mutation as being a consensus
# mutation for a group (e.g., clade, lineage)
consensus_fraction: 0.9

# Threshold of prevalence to report a mutation as being associated
# with a group (e.g., clade, lineage)
min_reporting_fraction: 0.05

# Metadata columns, each with its display title
metadata_cols:
  database:
    title: "Database"
  host:
    title: "Host"
  isolation_source:
    title: "Isolation source"
  authors:
    title: "Authors"
  publications:
    title: "Publications"

# Grouping columns, each with its name, display title and description
group_cols:
  serotype:
    name: "serotype"
    title: "Serotype"
    description: ""
    show_collapse_options: false

# AZ report options
report_gene: HA
report_group_col: serotype
# Reference for each group in the report (one entry per serotype)
report_group_references:
  B-vic: B-Austria-1359417-2021
  B-yam: B-Phuket-3073-2013
  H1N1: A-Wisconsin-67-2022
  H3N2: A-Darwin-6-2021
  H5NX: A-Goose-Guangdong-1-96
  H7NX: A-Shanghai-02-2013
  H9NX: A-Hong-Kong-1073-99
  H10NX: A-Jiangsu-428-2021

# Surveillance plot options
# see: workflow_main/scripts/surveillance.py
surv_group_col: "serotype"
surv_start_date: "1956-01-01"
surv_period: "Y"
surv_min_combo_count: 50
surv_min_single_count: 50
surv_start_date_days_ago: 90
surv_end_date_days_ago: 30
# Reference for each group in the surveillance plot (one entry per serotype)
surv_group_references:
  B-vic: B-Austria-1359417-2021
  B-yam: B-Phuket-3073-2013
  H1N1: A-Wisconsin-67-2022
  H3N2: A-Darwin-6-2021
  H5NX: A-Goose-Guangdong-1-96
  H7NX: A-Shanghai-02-2013
  H9NX: A-Hong-Kong-1073-99
  H10NX: A-Jiangsu-428-2021

# ---------------
# DATABASE
# ---------------

# Split mutation table partitions into periods of this length
# See: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
# 'M' month end frequency
# 'Y' year end frequency
mutation_partition_break: "Y"

# ---------------
# SERVER
# ---------------

# Require a login for accessing the website
# Users are provided to the app via the "LOGINS" environment variable,
# which is structured as "user1:pass1,user2:pass2,..."
login_required: false

dev_hostname: "http://localhost:5003"
prod_hostname:
  - "https://flu.genbank.pathmut.org"

# ----------------------
# VISUALIZATION
# ----------------------

site_title: "Flu PathMut"
data_provider: "NCBI GenBank"
motd_url: "https://storage.googleapis.com/ve-public/MOTD.html"

# Default references for each subtype
default_references:
  B-vic: B-Austria-1359417-2021
  B-yam: B-Phuket-3073-2013
  H1N1: A-Wisconsin-67-2022
  H3N2: A-Darwin-6-2021
  H5NX: A-Goose-Guangdong-1-96
  H7NX: A-Shanghai-02-2013
  H9NX: A-Hong-Kong-1073-99
  H10NX: A-Jiangsu-428-2021

# Home page
show_home_banner: false
show_walkthroughs: false
show_surveillance: true
show_global_seq_plot: false

show_reports_tab: false
show_global_sequencing_tab: false
show_methods_tab: false
show_related_projects_tab: false

default_gene: HA
default_protein: HA

min_date: "1956-01-01"

# Which data-provider logos to show
show_logos:
  GISAID: false
  GenBank: true

# Allow downloads of sequence metadata (before aggregation)
allow_metadata_download: true
# Allow downloads of raw genomes
allow_genome_download: true
4 changes: 1 addition & 3 deletions config/config_flu_gisaid.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -145,10 +145,8 @@ surv_group_references:

# Split mutation table partitions into periods of this length
# See: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
# Common options:
# 'D' calendar day frequency
# 'W' weekly frequency
# 'M' month end frequency
# 'Y' year end frequency
mutation_partition_break: "Y"

# ---------------
Expand Down
4 changes: 1 addition & 3 deletions config/config_flu_gisaid_dev.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -150,10 +150,8 @@ surv_group_references:

# Split mutation table partitions into periods of this length
# See: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
# Common options:
# 'D' calendar day frequency
# 'W' weekly frequency
# 'M' month end frequency
# 'Y' year end frequency
mutation_partition_break: "Y"

# ---------------
Expand Down
4 changes: 1 addition & 3 deletions config/config_rsv_custom.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -100,10 +100,8 @@ surv_group_references:

# Split mutation table partitions into periods of this length
# See: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
# Common options:
# 'D' calendar day frequency
# 'W' weekly frequency
# 'M' month end frequency
# 'Y' year end frequency
mutation_partition_break: "Y"

# ---------------
Expand Down
4 changes: 1 addition & 3 deletions config/config_rsv_genbank.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,8 @@ surv_group_references:

# Split mutation table partitions into periods of this length
# See: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
# Common options:
# 'D' calendar day frequency
# 'W' weekly frequency
# 'M' month end frequency
# 'Y' year end frequency
mutation_partition_break: "Y"

# ---------------
Expand Down
4 changes: 1 addition & 3 deletions config/config_sars2_alpha.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -152,10 +152,8 @@ surv_end_date_days_ago: 30

# Split mutation table partitions into periods of this length
# See: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
# Common options:
# 'D' calendar day frequency
# 'W' weekly frequency
# 'M' month end frequency
# 'Y' year end frequency
mutation_partition_break: "M"

# ---------------
Expand Down
4 changes: 1 addition & 3 deletions config/config_sars2_custom.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,8 @@ surv_end_date_days_ago: 30

# Split mutation table partitions into periods of this length
# See: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
# Common options:
# 'D' calendar day frequency
# 'W' weekly frequency
# 'M' month end frequency
# 'Y' year end frequency
mutation_partition_break: "M"

# ---------------
Expand Down
4 changes: 1 addition & 3 deletions config/config_sars2_genbank.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -105,10 +105,8 @@ surv_end_date: "2020-03-01" # Overrides days ago

# Split mutation table partitions into periods of this length
# See: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
# Common options:
# 'D' calendar day frequency
# 'W' weekly frequency
# 'M' month end frequency
# 'Y' year end frequency
mutation_partition_break: "M"

# ---------------
Expand Down
4 changes: 1 addition & 3 deletions config/config_sars2_genbank_dev.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,8 @@ surv_end_date: "2020-03-01" # Overrides days ago

# Split mutation table partitions into periods of this length
# See: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
# Common options:
# 'D' calendar day frequency
# 'W' weekly frequency
# 'M' month end frequency
# 'Y' year end frequency
mutation_partition_break: "M"

# ---------------
Expand Down
4 changes: 1 addition & 3 deletions config/config_sars2_gisaid.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -148,10 +148,8 @@ surv_end_date_days_ago: 30

# Split mutation table partitions into periods of this length
# See: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
# Common options:
# 'D' calendar day frequency
# 'W' weekly frequency
# 'M' month end frequency
# 'Y' year end frequency
mutation_partition_break: "M"

# ---------------
Expand Down
4 changes: 1 addition & 3 deletions config/config_sars2_gisaid_6month.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -152,10 +152,8 @@ surv_end_date_days_ago: 30

# Split mutation table partitions into periods of this length
# See: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
# Common options:
# 'D' calendar day frequency
# 'W' weekly frequency
# 'M' month end frequency
# 'Y' year end frequency
mutation_partition_break: "M"

# ---------------
Expand Down
4 changes: 1 addition & 3 deletions config/config_sars2_gisaid_private.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -148,10 +148,8 @@ surv_end_date_days_ago: 30

# Split mutation table partitions into periods of this length
# See: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
# Common options:
# 'D' calendar day frequency
# 'W' weekly frequency
# 'M' month end frequency
# 'Y' year end frequency
mutation_partition_break: "M"

# ---------------
Expand Down
8 changes: 5 additions & 3 deletions docker-compose.flu.genbank.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ services:
- LOGINS=user1:pass1,user2:pass2
- FLASK_APP=cg_server/main.py
- FLASK_ENV=development
- CONFIGFILE=/opt/config/config_flu_genbank.yaml
- CONFIGFILE=/opt/config/config_flu_genbank_dev.yaml
- CONSTANTSFILE=/opt/constants/defs.json
- DATA_PATH=/data
- STATIC_DATA_PATH=/opt/static_data
Expand All @@ -30,12 +30,13 @@ services:
working_dir: /app
volumes:
- ./services/server:/app:cached # Mount the server python code at run-time, so that the flask development server can refresh on changes
- ./example_data_genbank/flu:/data:cached # Mount the data at run-time (for database seeding only). Should prevent sending all the data over unnecessarily at build-time
- ./data_flu_genbank:/data:cached # Mount the data at run-time (for database seeding only). Should prevent sending all the data over unnecessarily at build-time
- ./src/constants:/opt/constants:cached
- ./config:/opt/config:cached
- ./static_data/flu:/opt/static_data:cached
depends_on:
- db

db:
image: postgres:12-alpine
environment:
Expand All @@ -51,13 +52,14 @@ services:
- ./services/postgres/postgres.conf:/etc/postgresql/postgresql.conf # Custom config
ports: # Expose 5432 so we can use DB administration tools
- 5432:5432

frontend:
restart: always
build:
context: ./
dockerfile: ./services/frontend/Dockerfile
environment:
CONFIGFILE: /app/config/config_flu_genbank.yaml
CONFIGFILE: /app/config/config_flu_genbank_dev.yaml
working_dir: /app
volumes:
- ./src:/app/src:cached # Mount the JS code at run-time, so the babel server can recompile the app on file changes
Expand Down
Loading

0 comments on commit 8ae0710

Please sign in to comment.