Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

snp test #1

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions tests/data/snp/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*/
55 changes: 55 additions & 0 deletions tests/data/snp/kidney_eqtl.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
##fileformat=VCFv4.2
chr1 16693559 chr1_16693559_C_T_b38 C T . .
chr1 25235176 chr1_25235176_G_A_b38 G A . .
chr1 121041638 chr1_121041638_C_T_b38 C T . .
chr2 48595242 chr2_48595242_G_A_b38 G A . .
chr2 62506118 chr2_62506118_C_T_b38 C T . .
chr2 74958729 chr2_74958729_G_T_b38 G T . .
chr2 108288574 chr2_108288574_C_T_b38 C T . .
chr2 110607190 chr2_110607190_G_A_b38 G A . .
chr2 127648067 chr2_127648067_C_T_b38 C T . .
chr2 131682500 chr2_131682500_G_C_b38 C G . .
chr2 161244750 chr2_161244750_G_A_b38 G A . .
chr3 69105192 chr3_69105192_C_G_b38 G C . .
chr5 5078200 chr5_5078200_C_T_b38 C T . .
chr5 71044045 chr5_71044045_G_T_b38 G T . .
chr5 181003797 chr5_181003797_T_C_b38 T C . .
chr6 85678170 chr6_85678170_G_T_b38 T G . .
chr6 106629627 chr6_106629627_G_A_b38 G A . .
chr6 111592059 chr6_111592059_C_T_b38 C T . .
chr6 130053316 chr6_130053316_T_A_b38 A T . .
chr6 135302695 chr6_135302695_C_T_b38 C T . .
chr7 30612322 chr7_30612322_G_A_b38 A G . .
chr7 95596945 chr7_95596945_T_C_b38 T C . .
chr9 62885682 chr9_62885682_A_G_b38 A G . .
chr10 45832504 chr10_45832504_G_C_b38 G C . .
chr10 48022458 chr10_48022458_G_A_b38 G A . .
chr10 45947840 chr10_45947840_C_T_b38 C T . .
chr11 1941470 chr11_1941470_C_T_b38 C T . .
chr14 73787509 chr14_73787509_G_A_b38 G A . .
chr14 94129652 chr14_94129652_T_C_b38 T C . .
chr15 95130166 chr15_95130166_G_A_b38 A G . .
chr15 99733402 chr15_99733402_G_C_b38 C G . .
chr15 100574435 chr15_100574435_T_C_b38 T C . .
chr16 1215891 chr16_1215891_G_A_b38 G A . .
chr16 2936322 chr16_2936322_T_C_b38 C T . .
chr16 11976565 chr16_11976565_T_C_b38 C T . .
chr16 14912068 chr16_14912068_C_T_b38 C T . .
chr16 18339330 chr16_18339330_G_C_b38 G C . .
chr16 70251998 chr16_70251998_A_G_b38 A G . .
chr16 84180359 chr16_84180359_G_A_b38 G A . .
chr17 4443684 chr17_4443684_T_C_b38 C T . .
chr17 6700205 chr17_6700205_T_C_b38 T C . .
chr17 15976596 chr17_15976596_C_T_b38 T C . .
chr17 40720995 chr17_40720995_T_C_b38 T C . .
chr17 50547162 chr17_50547162_A_C_b38 A C . .
chr19 44662645 chr19_44662645_A_C_b38 C A . .
chr19 44718265 chr19_44718265_T_C_b38 C T . .
chr19 57438168 chr19_57438168_T_C_b38 C T . .
chr22 17199629 chr22_17199629_A_C_b38 A C . .
chr22 18531430 chr22_18531430_G_A_b38 A G . .
chr22 21468436 chr22_21468436_T_C_b38 C T . .
chr22 21980728 chr22_21980728_T_C_b38 T C . .
chr22 24002141 chr22_24002141_T_G_b38 G T . .
chr22 50562842 chr22_50562842_G_A_b38 G A . .
chrX 153060001 chrX_153060001_G_A_b38 G A . .
92 changes: 92 additions & 0 deletions tests/test_snp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import os
import subprocess

import h5py
import numpy as np
import pandas as pd

from baskerville.dataset import targets_prep_strand

params_file = "tests/data/eval/params.json"
model_file = "tests/data/eval/model.h5"
targets_file = "tests/data/tiny/hg38/targets.txt"
snp_dir = "tests/data/snp"
vcf_file = f"{snp_dir}/kidney_eqtl.vcf"
fasta_file = "%s/assembly/ucsc/hg38.fa" % os.environ["HG38"]
stat_keys = ["logSAD", "logD2"]

def test_snp():
test_out_dir = f"{snp_dir}/full"
scores_file = f"{test_out_dir}/scores.h5"
if os.path.isfile(scores_file):
os.remove(scores_file)

cmd = [
"src/baskerville/scripts/hound_snp.py",
"-f",
fasta_file,
"-o",
test_out_dir,
"--stats",
",".join(stat_keys),
"--rc",
"-t",
targets_file,
params_file,
model_file,
vcf_file
]
print(" ".join(cmd))
subprocess.run(cmd, check=True)

with h5py.File(scores_file, "r") as scores_h5:
for sk in stat_keys:
score = scores_h5[sk][:]
score_var = score.var(axis=0, dtype='float32')
assert (score_var> 0).all()


def test_slice():
test_full_dir = f"{snp_dir}/full"
test_slice_dir = f"{snp_dir}/sub"
os.makedirs(test_slice_dir, exist_ok=True)
scores_file = f"{test_slice_dir}/scores.h5"
if os.path.isfile(scores_file):
os.remove(scores_file)

# slice targets
targets_df = pd.read_csv(targets_file, sep="\t", index_col=0)
rna_mask = np.array([desc.startswith("RNA") for desc in targets_df.description])
targets_rna_df = targets_df[rna_mask]
targets_rna_file = f"{test_slice_dir}/targets_rna.txt"
targets_rna_df.to_csv(targets_rna_file, sep="\t")

cmd = [
"src/baskerville/scripts/hound_snp.py",
"-f",
fasta_file,
"-o",
test_slice_dir,
"--rc",
"--stats",
",".join(stat_keys),
"-t",
targets_rna_file,
params_file,
model_file,
vcf_file
]
print(" ".join(cmd))
subprocess.run(cmd, check=True)

# stranded mask
targets_strand_df = targets_prep_strand(targets_df)
rna_strand_mask = np.array([desc.startswith("RNA") for desc in targets_strand_df.description])

for sk in stat_keys:
with h5py.File(f"{test_full_dir}/scores.h5", "r") as scores_h5:
score_full = scores_h5[sk][:].astype('float32')
score_full = score_full[...,rna_strand_mask]
with h5py.File(f"{test_slice_dir}/scores.h5", "r") as scores_h5:
score_slice = scores_h5[sk][:].astype('float32')
assert np.allclose(score_full, score_slice)
Loading