Build fixes for sparse data (#245)
* Add link to preprint, bump version and data date

* Allow builds without acknowledgements files

* Add documentation for the nextmeta file

* Add more descriptive warning when nextmeta file is not found

* Fix pipeline crashes on sparse metadata

* Remove underscore calls, return no data when no locations are selected

* Remove initially selected location nodes if they don't exist in the data
atc3 authored Oct 5, 2020
1 parent 591eed4 commit ee23da3
Showing 17 changed files with 249 additions and 144 deletions.
18 changes: 17 additions & 1 deletion README.md
@@ -42,7 +42,23 @@ The `data` folder requires three folders to be populated with raw data from GISAID

3. `seq_meta`: Sequencing technology metadata. These files can be downloaded by selecting "Sequencing technology metadata" from the download dialog when browsing sequences in the EpiCov™ Browse Tab.

In addition to the files above, the pipeline currently requires a "nextmeta" file, which can be downloaded from the "Downloads" dialog box on the EpiCov website:

![](https://github.com/vector-engineering/covidcg/raw/master/src/assets/images/download_nextmeta.png)

Unzip the downloaded file, rename it to `nextmeta_[date].tsv` (replacing `[date]` with the current date), and then move it into the `data` folder (a scripted version of this step is sketched below).

The data folder should then look like this (prior to running the snakemake pipeline):

```
data/
| fasta_raw/
| patient_meta/
| seq_meta/
| nextmeta_20201005.tsv
```

_Note that as of 2020-06-05 only 10,000 sequences can be downloaded from the EpiCov™ Browse Tab at one time. Please filter your searches so that you select and download no more than 10,000 sequences at a time. We select data daily by filtering by "Submission date"._
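A minimal sketch of the rename-and-move step in Python. The source path and the unzipped filename (`metadata.tsv`) are assumptions; adjust them to wherever you unzipped the download:

```
# Sketch only: assumes the nextmeta download was unzipped to ~/Downloads/metadata.tsv
from datetime import date
from pathlib import Path
import shutil

src = Path.home() / "Downloads" / "metadata.tsv"
dst = Path("data") / f"nextmeta_{date.today():%Y%m%d}.tsv"
shutil.move(str(src), str(dst))
print(f"Moved {src} -> {dst}")
```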

### Javascript

98 changes: 78 additions & 20 deletions Snakefile
@@ -5,7 +5,6 @@ import numpy as np
from cg_scripts.fasta import read_fasta_file
from cg_scripts.get_aa_snps import get_aa_snps
from cg_scripts.get_dna_snps import get_dna_snps
from cg_scripts.process_ack import process_ack
from cg_scripts.process_artic_primers import process_artic_primers
from cg_scripts.process_lineages import get_consensus_snps
from cg_scripts.process_locations import process_location_metadata, build_select_tree
@@ -234,29 +233,28 @@ rule process_seq_metadata:
seq_meta_df.to_csv(output.seq_meta)


ACK_FILES, = glob_wildcards("data/acknowledgements/{ack_file}.xls")

rule process_acknowledgements:
input:
ack = expand(
"data/acknowledgements/{ack_file}.xls",
ack_file=ACK_FILES
)
output:
ack_meta = data_folder + "/ack_meta.csv",
ack_map = data_folder + "/ack_map.json"
run:
ack_df, ack_map = process_ack(input.ack)
ack_df.to_csv(output.ack_meta)
ack_map.to_json(output.ack_map, orient="index")
# ACK_FILES, = glob_wildcards("data/acknowledgements/{ack_file}.xls")
# rule process_acknowledgements:
# input:
# ack = expand(
# "data/acknowledgements/{ack_file}.xls",
# ack_file=ACK_FILES
# )
# output:
# ack_meta = data_folder + "/ack_meta.csv",
# ack_map = data_folder + "/ack_map.json"
# run:
# ack_df, ack_map = process_ack(input.ack)
# ack_df.to_csv(output.ack_meta)
# ack_map.to_json(output.ack_map, orient="index")

# Main rule for generating the data files for the browser
# Mostly just a bunch of joins
rule generate_ui_data:
input:
patient_meta = data_folder + "/patient_meta.csv",
seq_meta = data_folder + "/seq_meta.csv",
ack_meta = data_folder + "/ack_meta.csv",
# ack_meta = data_folder + "/ack_meta.csv",
dna_snp_group = data_folder + "/dna_snp_group.csv",
gene_aa_snp_group = data_folder + "/gene_aa_snp_group.csv",
protein_aa_snp_group = data_folder + "/protein_aa_snp_group.csv",
@@ -272,7 +270,7 @@ rule generate_ui_data:
run:
patient_meta_df = pd.read_csv(input.patient_meta, index_col="Accession ID")
seq_meta_df = pd.read_csv(input.seq_meta, index_col="Accession ID")
ack_meta_df = pd.read_csv(input.ack_meta, index_col="Accession ID")
# ack_meta_df = pd.read_csv(input.ack_meta, index_col="Accession ID")

dna_snp_group_df = pd.read_csv(input.dna_snp_group, index_col="Accession ID")
gene_aa_snp_group_df = pd.read_csv(input.gene_aa_snp_group, index_col="Accession ID")
@@ -289,9 +287,9 @@
df = df.loc[~pd.isnull(df['clade']), :]

# Join acknowledgement IDs onto main metadata dataframe
df = df.join(ack_meta_df, on="Accession ID", how="left", sort=False)
# df = df.join(ack_meta_df, on="Accession ID", how="left", sort=False)
# Replace missing acknowledgement IDs with -1, then cast to integer
df["ack_id"] = df["ack_id"].fillna(-1).astype(int)
# df["ack_id"] = df["ack_id"].fillna(-1).astype(int)

# Join SNPs to main dataframe
# inner join to exclude filtered out sequences
@@ -519,6 +517,10 @@ rule process_artic_primers:


NEXTMETA, = glob_wildcards(data_folder + "/nextmeta_{nextmeta}.tsv")

if len(NEXTMETA) == 0:
raise Exception('nextmeta file not found. Please see https://github.com/vector-engineering/covidcg#data-requirements for how to obtain the nextmeta file')

latest_nextmeta_file = data_folder + '/nextmeta_' + sorted(NEXTMETA)[-1] + '.tsv'
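# NOTE (editor): sorted()[-1] picks the newest file only because the embedded
# date is formatted YYYYMMDD, which sorts lexicographically in chronological
# order, e.g. sorted(["20200915", "20201005"])[-1] == "20201005". A filename
# date in any other format would break this "latest file" selection.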

rule calc_global_sequencing_efforts:
@@ -701,6 +703,62 @@ rule calc_global_sequencing_efforts:
with open(output.country_seq_stats, 'w') as fp:
fp.write(country_df_str)

rule assemble_data_package:
input:
case_data = data_folder + '/case_data.json',
# ack_map = data_folder + '/ack_map.json',
clade_snp = data_folder + '/clade_snp.json',
country_score = data_folder + '/country_score.json',
dna_snp_map = data_folder + '/dna_snp_map.json',
gene_aa_snp_map = data_folder + '/gene_aa_snp_map.json',
geo_select_tree = data_folder + '/geo_select_tree.json',
global_group_counts = data_folder + '/global_group_counts.json',
lineage_snp = data_folder + '/lineage_snp.json',
location_map = data_folder + '/location_map.json',
metadata_map = data_folder + '/metadata_map.json',
protein_aa_snp_map = data_folder + '/protein_aa_snp_map.json'
output:
data_package = data_folder + '/data_package.json'
run:
data_package = {}
with open(input.case_data, 'r') as fp:
data_package['case_data'] = json.loads(fp.read())
# with open(input.ack_map, 'r') as fp:
# data_package['ack_map'] = json.loads(fp.read())
with open(input.clade_snp, 'r') as fp:
data_package['clade_snp'] = json.loads(fp.read())
with open(input.country_score, 'r') as fp:
data_package['country_score'] = json.loads(fp.read())
with open(input.dna_snp_map, 'r') as fp:
data_package['dna_snp_map'] = json.loads(fp.read())
with open(input.gene_aa_snp_map, 'r') as fp:
data_package['gene_aa_snp_map'] = json.loads(fp.read())
with open(input.geo_select_tree, 'r') as fp:
data_package['geo_select_tree'] = json.loads(fp.read())
with open(input.global_group_counts, 'r') as fp:
data_package['global_group_counts'] = json.loads(fp.read())
with open(input.lineage_snp, 'r') as fp:
data_package['lineage_snp'] = json.loads(fp.read())
with open(input.location_map, 'r') as fp:
data_package['location_map'] = json.loads(fp.read())
with open(input.metadata_map, 'r') as fp:
data_package['metadata_map'] = json.loads(fp.read())
with open(input.protein_aa_snp_map, 'r') as fp:
data_package['protein_aa_snp_map'] = json.loads(fp.read())

with open(output.data_package, 'w') as fp:
fp.write(json.dumps(data_package))
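# NOTE (editor, sketch): the repeated open/json.loads blocks above could be
# collapsed using snakemake's named inputs (assuming `input.items()` is
# available in this snakemake version):
#   data_package = {key: json.load(open(path)) for key, path in input.items()}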

rule compress_data_package:
input:
data_package = data_folder + '/data_package.json'
output:
data_package = data_folder + '/data_package.json.gz'
shell:
'''
gzip -9 -k {input.data_package} -c > {output.data_package}
'''
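# NOTE (editor): with `-c` gzip writes to stdout and leaves the input file in
# place, so the `-k` (keep) flag above is redundant; harmless, but one of the
# two flags can be dropped.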


# # This is only for site maintainers
# if "upload_hashmap" in config and config["upload_hashmap"]:
2 changes: 2 additions & 0 deletions cg_scripts/process_ack.py
@@ -3,6 +3,8 @@

"""Process acknowledgement files
2020-10-05: DEPRECATED. No longer bundling in acknowledgements
Author: Albert Chen (Deverman Lab - Broad Institute)
"""

8 changes: 8 additions & 0 deletions cg_scripts/process_locations.py
@@ -34,6 +34,14 @@ def process_location_metadata(case_df):
.fillna(-1)
)

# With sparse data, sometimes none of the sequences fed in will have a
# "division" or "location" entry.
# Make them manually now, if they don't already exist
if "division" not in location_df.columns:
location_df["division"] = -1
if "location" not in location_df.columns:
location_df["location"] = -1

# Clean location data
location_df = clean_location_data(location_df)

23 changes: 15 additions & 8 deletions cg_scripts/process_patient_metadata.py
@@ -17,7 +17,7 @@ def clean_gender_metadata(patient_meta_df):
print("Cleaning patient gender metadata...", end="", flush=True)

# Make a copy, strip whitespace
patient_meta_df["gender"] = patient_meta_df["Gender"].str.strip()
patient_meta_df["gender"] = patient_meta_df["Gender"].astype(str).str.strip()

replace_map = [
(r"^female", "Female", False),
@@ -78,7 +78,7 @@ def clean_age_metadata(patient_meta_df):
print("Cleaning patient age metadata...", end="", flush=True)

# Do some basic cleanup before we start
patient_meta_df["age_clean"] = patient_meta_df["Patient age"]
patient_meta_df["age_clean"] = patient_meta_df["Patient age"].astype(str)
patient_meta_df["age_clean"] = patient_meta_df["age_clean"].fillna("Unknown")
patient_meta_df["age_clean"] = patient_meta_df["age_clean"].str.strip()

@@ -206,6 +206,7 @@ def clean_age_metadata(patient_meta_df):
for x in patient_meta_df["age_clean"][
pd.isnull(patient_meta_df["age_start"])
]
.astype(str)
.str.strip()
.unique()
.astype(str)
@@ -222,7 +223,9 @@ def clean_patient_status_metadata(patient_meta_df):
print("Cleaning patient status metadata...", end="", flush=True)

# Strip whitespace
patient_meta_df["patient_status"] = patient_meta_df["Patient status"].str.strip()
patient_meta_df["patient_status"] = (
patient_meta_df["Patient status"].astype(str).str.strip()
)

replace_map = [
(r"hospitalized", "Hospitalized", False),
@@ -348,7 +351,7 @@ def clean_passage_metadata(patient_meta_df):
print("Cleaning cell passage metadata...", end="", flush=True)

# Basic cleaning
patient_meta_df["passage"] = patient_meta_df["Passage"].str.strip()
patient_meta_df["passage"] = patient_meta_df["Passage"].astype(str).str.strip()

passage_key_map = {
"Original": [
@@ -435,6 +438,7 @@ def clean_passage_metadata(patient_meta_df):
for x in patient_meta_df["Passage"][
pd.isnull(patient_meta_df["passage"])
]
.astype(str)
.str.strip()
.unique()
.astype(str)
@@ -456,7 +460,7 @@ def clean_specimen_metadata(patient_meta_df):
print("Cleaning specimen metadata...", end="", flush=True)

# Basic cleanup
patient_meta_df["specimen"] = patient_meta_df["Specimen"].str.strip()
patient_meta_df["specimen"] = patient_meta_df["Specimen"].astype(str).str.strip()

specimen_key_map = {
"Alveolar lavage fluid": [],
@@ -729,6 +733,7 @@ def clean_specimen_metadata(patient_meta_df):
for x in patient_meta_df["Specimen"][
pd.isnull(patient_meta_df["specimen"])
]
.astype(str)
.str.strip()
.unique()
.astype(str)
@@ -744,7 +749,9 @@

def clean_collection_date_metadata(patient_meta_df):

patient_meta_df["collection_date"] = patient_meta_df["Collection date"].str.strip()
patient_meta_df["collection_date"] = (
patient_meta_df["Collection date"].astype(str).str.strip()
)

# Filter out really unspecific collection dates
# If the date is 4 characters or less (a year, like "2019", or "2020"), then remove it
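# (the filter itself is collapsed in this diff; a hypothetical sketch of the idea:
#   too_vague = patient_meta_df["collection_date"].str.len() <= 4
#   patient_meta_df.loc[too_vague, "collection_date"] = None
# where `too_vague` is an illustrative name, not the actual code)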
@@ -765,12 +772,12 @@


def clean_lineage_metadata(patient_meta_df):
patient_meta_df["lineage"] = patient_meta_df["Lineage"].str.strip()
patient_meta_df["lineage"] = patient_meta_df["Lineage"].astype(str).str.strip()
return patient_meta_df


def clean_clade_metadata(patient_meta_df):
patient_meta_df["clade"] = patient_meta_df["Clade"].str.strip()
patient_meta_df["clade"] = patient_meta_df["Clade"].astype(str).str.strip()
return patient_meta_df


12 changes: 7 additions & 5 deletions cg_scripts/process_seq_metadata.py
@@ -17,7 +17,9 @@ def clean_seq_tech_metadata(seq_meta_df):
print("Cleaning sequencing technology metadata...", end="", flush=True)

# Basic cleaning
seq_meta_df["sequencing_tech"] = seq_meta_df["Sequencing technology"].str.strip()
seq_meta_df["sequencing_tech"] = (
seq_meta_df["Sequencing technology"].astype(str).str.strip()
)

replace_map = [
(r"illumina", "Illumina", False),
@@ -123,7 +125,9 @@

print("Cleaning assembly method metadata...", end="", flush=True)

seq_meta_df["assembly_method"] = seq_meta_df["Assembly method"].str.strip()
seq_meta_df["assembly_method"] = (
seq_meta_df["Assembly method"].astype(str).str.strip()
)

replace_map = [
# Aliases
@@ -203,10 +207,8 @@


def clean_comment_type_metadata(seq_meta_df):
seq_meta_df["comment_type"] = seq_meta_df["Comment type"].str.strip()

seq_meta_df["comment_type"] = seq_meta_df["Comment type"].astype(str).str.strip()
seq_meta_df["comment_type"] = seq_meta_df["comment_type"].fillna("None")

return seq_meta_df


Binary file added src/assets/images/download_nextmeta.png
8 changes: 7 additions & 1 deletion src/components/Pages/ExampleTab.js
@@ -517,7 +517,7 @@ const ExampleTab = observer(() => {
</ul>
</TOC>

<ExampleHeader>
<ExampleHeader style={{ marginBottom: '10px' }}>
<a id="introduction" />
<ExampleTitle>Introduction</ExampleTitle>
<a href="#" onClick={scrollToRef.bind(this, 'getting-started-top')}>
@@ -526,6 +526,12 @@
</ExampleHeader>

<ExampleTutorial>
<b style={{ fontSize: '1.4em', marginTop: '10px' }}>
Preprint:{' '}
<ExternalLink href="https://www.biorxiv.org/content/10.1101/2020.09.23.310565v2">
https://www.biorxiv.org/content/10.1101/2020.09.23.310565v2
</ExternalLink>
</b>
<p>
<b>
The COVID-19 CoV Genetics browser was designed to empower diverse
5 changes: 3 additions & 2 deletions src/components/Table/AcknowledgementsTable.js
@@ -2,7 +2,7 @@ import React, { useState, useEffect } from 'react';
import styled from 'styled-components';
import { observer } from 'mobx-react';
import { useStores } from '../../stores/connect';
import { getAckTextsFromAckIds } from '../../utils/acknowledgements';
// import { getAckTextsFromAckIds } from '../../utils/acknowledgements';
import _ from 'underscore';

import { ASYNC_STATES } from '../../constants/UI';
@@ -55,7 +55,8 @@

// Get the list of selected Accession IDs, and map to
// acknowledgement texts
let ackTexts = getAckTextsFromAckIds(ackIds);
// let ackTexts = getAckTextsFromAckIds(ackIds);
let ackTexts = [];
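// NOTE (editor): with ackTexts stubbed to an empty array, the loop below is a
// no-op; it is left in place for when acknowledgements are bundled again.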
// Set the acknowledgement ID for each acknowledgement object
for (let i = 0; i < ackTexts.length; i++) {
ackTexts[i]['ack_id'] = ackIds[i];
5 changes: 4 additions & 1 deletion src/stores/configStore.js
@@ -59,9 +59,12 @@ export const initialConfigValues = {

selectTree: initialSelectTree,
selectedLocationNodes: [
// NOTE: comment out these lines if you are working with a custom dataset that
// doesn't have any sequences from these locations
// Maybe should move these settings into a more easily-editable YAML file...
getLocationByNameAndLevel(initialSelectTree, 'USA', 'country', true)[0],
getLocationByNameAndLevel(initialSelectTree, 'Canada', 'country', true)[0],
],
].filter((node) => node !== undefined),
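// NOTE (editor): getLocationByNameAndLevel(...)[0] is undefined when the node
// is absent from the tree, so this filter drops missing defaults instead of
// letting them crash the UI downstream.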

hoverGroup: null,
selectedGroups: [],
Expand Down