Skip to content

Commit

Permalink
Mutation Frequency Report (#615)
Browse files Browse the repository at this point in the history
* Add mutation frequency report script

* fix typo
  • Loading branch information
atc3 authored Apr 19, 2023
1 parent 9925dcb commit 81e48f1
Show file tree
Hide file tree
Showing 3 changed files with 234 additions and 5 deletions.
34 changes: 30 additions & 4 deletions workflow_main/analyses/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,27 @@ rule consensus_mutations:
"""


rule mutation_frequency_report:
input:
metadata_map = os.path.join(data_folder, "metadata_map.json"),
group_mutation_frequencies = rules.consensus_mutations.output.group_mutation_frequencies
output:
mutation_frequency_json = os.path.join(data_folder, "group_mutation_frequencies_annotated.json"),
mutation_frequency_dna_csv = os.path.join(data_folder, "group_dna_mutation_frequencies.csv"),
mutation_frequency_gene_aa_csv = os.path.join(data_folder, "group_gene_aa_mutation_frequencies.csv"),
mutation_frequency_protein_aa_csv = os.path.join(data_folder, "group_protein_aa_mutation_frequencies.csv")
shell:
"""
python3 analyses/scripts/mutation_frequency_report.py \
--group-mutation-frequencies {input.group_mutation_frequencies} \
--metadata-map {input.metadata_map} \
--output-json {output.mutation_frequency_json} \
--output-dna-csv {output.mutation_frequency_dna_csv} \
--output-gene-aa-csv {output.mutation_frequency_gene_aa_csv} \
--output-protein-aa-csv {output.mutation_frequency_protein_aa_csv}
"""


rule additional_analyses:
input:
# AZ REPORTS
Expand All @@ -199,16 +220,21 @@ rule additional_analyses:
# Surveillance data
rules.surveillance_data.output.group_counts,
rules.surveillance_data.output.group_regression,
# # Old global sequencing plot
# Old global sequencing plot
rules.create_standalone_map_spec.output.standalone_spec,
# # New global sequencing data
# New global sequencing data
rules.global_seq_data.output.case_count,
rules.global_seq_data.output.sequences_per_month,
rules.global_seq_data.output.turnaround_per_month,
rules.global_seq_data.output.iso_lookup,
# # Consensus Mutations
# Consensus Mutations
rules.consensus_mutations.output.group_consensus_mutations,
rules.consensus_mutations.output.group_mutation_frequencies
rules.consensus_mutations.output.group_mutation_frequencies,
# Annotated mutation frequencies
rules.mutation_frequency_report.output.mutation_frequency_json,
rules.mutation_frequency_report.output.mutation_frequency_dna_csv,
rules.mutation_frequency_report.output.mutation_frequency_gene_aa_csv,
rules.mutation_frequency_report.output.mutation_frequency_protein_aa_csv

output:
done = touch(os.path.join(
Expand Down
2 changes: 1 addition & 1 deletion workflow_main/analyses/scripts/az_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def main():

isolate_df = isolate_df.loc[valid_group_reference_pair, :]

# Load DNA mutation ID map
# Load mutation ID maps
with open(args.metadata_map, "r") as fp:
metadata_map = json.loads(fp.read())

Expand Down
203 changes: 203 additions & 0 deletions workflow_main/analyses/scripts/mutation_frequency_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
# coding: utf-8

"""Add mutation metadata to group mutation frequencies file
Also spit out more readable CSVs for easier ingestion
Author: Albert Chen ([email protected])
"""

import argparse
import json

import pandas as pd


def dna_mutation_to_name(mut_str):
split = mut_str.split("|")
# REF POS ALT
return split[2] + split[1] + split[3]


def aa_mutation_to_name(mut_str):
split = mut_str.split("|")
# REF POS ALT
return split[2] + split[1] + split[3]


def main():
"""Command-line entry point"""

parser = argparse.ArgumentParser()
parser.add_argument(
"--group-mutation-frequencies",
type=str,
required=True,
help="Path to group mutation frequencies JSON file",
)
parser.add_argument(
"--metadata-map", type=str, required=True, help="Metadata map JSON file"
)
parser.add_argument(
"--output-json", type=str, required=True, help="Path to output JSON file"
)
parser.add_argument(
"--output-dna-csv", type=str, required=True, help="Path to output DNA CSV file"
)
parser.add_argument(
"--output-gene-aa-csv",
type=str,
required=True,
help="Path to output gene AA CSV file",
)
parser.add_argument(
"--output-protein-aa-csv",
type=str,
required=True,
help="Path to output protein AA CSV file",
)
args = parser.parse_args()

# Open mutation frequency data
with open(args.group_mutation_frequencies, "r") as fp:
group_mutation_frequencies = json.loads(fp.read())

# Load mutation ID maps
with open(args.metadata_map, "r") as fp:
metadata_map = json.loads(fp.read())

id_to_dna_mutation = {v: k for k, v in metadata_map["dna_mutation"].items()}
id_to_gene_aa_mutation = {v: k for k, v in metadata_map["gene_aa_mutation"].items()}
id_to_protein_aa_mutation = {
v: k for k, v in metadata_map["protein_aa_mutation"].items()
}

# -----------------------------
# MUTATION DEFINITIONS
# -----------------------------

dna_mutation_def = (
pd.Series(id_to_dna_mutation)
.rename("mutation_str")
.rename_axis("mutation_id")
.to_frame()
)
dna_mutation_def.insert(
0, "segment", dna_mutation_def["mutation_str"].apply(lambda x: x.split("|")[0])
)
dna_mutation_def.insert(
1, "pos", dna_mutation_def["mutation_str"].apply(lambda x: int(x.split("|")[1]))
)
dna_mutation_def.insert(
2, "ref", dna_mutation_def["mutation_str"].apply(lambda x: x.split("|")[2])
)
dna_mutation_def.insert(
3, "alt", dna_mutation_def["mutation_str"].apply(lambda x: x.split("|")[3])
)
dna_mutation_def.insert(
0,
"mutation_name",
dna_mutation_def["mutation_str"].apply(dna_mutation_to_name),
)

gene_aa_mutation_def = (
pd.Series(id_to_gene_aa_mutation)
.rename("mutation_str")
.rename_axis("mutation_id")
.to_frame()
)
gene_aa_mutation_def.insert(
1, "gene", gene_aa_mutation_def["mutation_str"].apply(lambda x: x.split("|")[0])
)
gene_aa_mutation_def.insert(
2,
"pos",
gene_aa_mutation_def["mutation_str"].apply(lambda x: int(x.split("|")[1])),
)
gene_aa_mutation_def.insert(
3, "ref", gene_aa_mutation_def["mutation_str"].apply(lambda x: x.split("|")[2])
)
gene_aa_mutation_def.insert(
4, "alt", gene_aa_mutation_def["mutation_str"].apply(lambda x: x.split("|")[3])
)
gene_aa_mutation_def.insert(
0,
"mutation_name",
gene_aa_mutation_def["mutation_str"].apply(aa_mutation_to_name),
)

protein_aa_mutation_def = (
pd.Series(id_to_protein_aa_mutation)
.rename("mutation_str")
.rename_axis("mutation_id")
.to_frame()
)
protein_aa_mutation_def.insert(
1,
"protein",
protein_aa_mutation_def["mutation_str"].apply(lambda x: x.split("|")[0]),
)
protein_aa_mutation_def.insert(
2,
"pos",
protein_aa_mutation_def["mutation_str"].apply(lambda x: int(x.split("|")[1])),
)
protein_aa_mutation_def.insert(
3,
"ref",
protein_aa_mutation_def["mutation_str"].apply(lambda x: x.split("|")[2]),
)
protein_aa_mutation_def.insert(
4,
"alt",
protein_aa_mutation_def["mutation_str"].apply(lambda x: x.split("|")[3]),
)
protein_aa_mutation_def.insert(
0,
"mutation_name",
protein_aa_mutation_def["mutation_str"].apply(aa_mutation_to_name),
)

mutation_defs = {
"dna": dna_mutation_def.to_dict(orient="index"),
"gene_aa": gene_aa_mutation_def.to_dict(orient="index"),
"protein_aa": protein_aa_mutation_def.to_dict(orient="index"),
}

out_dfs = {"dna": [], "gene_aa": [], "protein_aa": []}

for group in group_mutation_frequencies.keys():
for reference_name in group_mutation_frequencies[group].keys():
for mutation_type in group_mutation_frequencies[group][
reference_name
].keys():
mutations = group_mutation_frequencies[group][reference_name][
mutation_type
]
defs = [
mutation_defs[mutation_type][mutation["mutation_id"]]
for mutation in mutations
]
group_mutation_frequencies[group][reference_name][mutation_type] = [
{**a, **b} for a, b in zip(mutations, defs)
]
mutation_df = pd.DataFrame.from_dict(
group_mutation_frequencies[group][reference_name][mutation_type]
)
mutation_df.insert(1, "reference", reference_name)
out_dfs[mutation_type].append(mutation_df)

for mutation_type in ["dna", "gene_aa", "protein_aa"]:
out_dfs[mutation_type] = pd.concat(
out_dfs[mutation_type], axis=0, ignore_index=True
)

with open(args.output_json, "w") as fp:
fp.write(json.dumps(group_mutation_frequencies))

out_dfs["dna"].to_csv(args.output_dna_csv, index=False)
out_dfs["gene_aa"].to_csv(args.output_gene_aa_csv, index=False)
out_dfs["protein_aa"].to_csv(args.output_protein_aa_csv, index=False)


if __name__ == "__main__":
main()

0 comments on commit 81e48f1

Please sign in to comment.