Skip to content

Commit

Permalink
Phasing and variant calling improvements (#4)
Browse files Browse the repository at this point in the history
* improve phasing and handle homozygous case
* better handle homozygous case
* update rccx
  • Loading branch information
xiao-chen-xc authored Apr 7, 2023
1 parent 075730f commit 775c1d1
Show file tree
Hide file tree
Showing 23 changed files with 2,413 additions and 1,022 deletions.
2 changes: 1 addition & 1 deletion paraphase/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
name = "paraphase"
__version__ = "2.1.0"
7 changes: 2 additions & 5 deletions paraphase/data/ncf1/ncf1_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,11 @@ data:
coordinates:
hg38:
nchr: "chr7"
nchr_old: "chr7_74768800_74792800"
nchr_old: "chr7_74760000_74820000"
nchr_length: 159345973
extract_region1: "chr7:74769011-74792315"
extract_region2: "chr7:73215625-73238945 chr7:75153639-75176964"

pivot_site: 74777266
pivot_site: 74777265
left_boundary: 74769011
right_boundary: 74792315

#mutli-alleleic site
noisy_region: [[74783765, 74783765], [74781743, 74781743], [74776715, 74776718], [74780432, 74780432], [74778377, 74778377]]
1,404 changes: 1,002 additions & 402 deletions paraphase/data/ncf1/ref.fa

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion paraphase/data/ncf1/ref.fa.fai
Original file line number Diff line number Diff line change
@@ -1 +1 @@
chr7_74768800_74792800 24001 24 60 61
chr7_74760000_74820000 60001 24 60 61
2 changes: 1 addition & 1 deletion paraphase/data/neb/neb_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ coordinates:
left_boundary: 151578800
right_boundary: 151588500

noisy_region: [[151584195, 151584196], [151578903, 151578903], [151579992, 151580022]]
noisy_region: [[151579992, 151580022]]
2 changes: 1 addition & 1 deletion paraphase/data/pms2/pms2_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ data:
coordinates:
hg38:
nchr: "chr7"
nchr_old: "chr7_5967000_5992500"
nchr_old: "chr7_5957000_6010000"
nchr_length: 159345973
extract_region1: "chr7:5970000-5989062"
extract_region2: "chr7:6735630-6754792"
Expand Down
1,312 changes: 885 additions & 427 deletions paraphase/data/pms2/pms2_ref.fa

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion paraphase/data/pms2/pms2_ref.fa.fai
Original file line number Diff line number Diff line change
@@ -1 +1 @@
chr7_5967000_5992500 25501 22 60 61
chr7_5957000_6010000 53001 22 60 61
2 changes: 0 additions & 2 deletions paraphase/data/rccx/rccx_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@ coordinates:
deletion1_size: 6367
deletion2_size: 120

noisy_region: [[32029159, 32029159], [32022483, 32022483]]

left_boundary: 32013300
right_boundary: 32046000

Expand Down
2 changes: 2 additions & 0 deletions paraphase/data/strc/strc_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ coordinates:

depth_region: [[43610000, 43660000]]

pivot_site: 43602487

left_boundary: 43599500
right_boundary: 43619600

Expand Down
5 changes: 1 addition & 4 deletions paraphase/genes/cfc1_phaser.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,10 @@ def call(self):
return None
self.get_homopolymer()
self.get_candidate_pos()
# add pivot site
if "130593061_A_G" not in self.candidate_pos:
self.candidate_pos.add("130593061_A_G")
self.het_sites = sorted(list(self.candidate_pos))
self.remove_noisy_sites()

raw_read_haps = self.get_haplotypes_from_reads(self.het_sites)
raw_read_haps = self.get_haplotypes_from_reads(add_sites=["130593061_A_G"])

(
ass_haps,
Expand Down
4 changes: 3 additions & 1 deletion paraphase/genes/f8_phaser.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,9 @@ def call(self):
self.het_sites = sorted(list(self.candidate_pos))
self.remove_noisy_sites()

raw_read_haps = self.get_haplotypes_from_reads(self.het_sites, check_clip=True)
raw_read_haps = self.get_haplotypes_from_reads(
check_clip=True, kept_sites=["155386300_A_C", "155386860_C_G"]
)

(
ass_haps,
Expand Down
49 changes: 30 additions & 19 deletions paraphase/genes/ikbkg_phaser.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def call(self):

self.get_candidate_pos(min_vaf=0.095)

# add these sites for duplication/deletion calling
var_found = False
for var in self.candidate_pos:
pos = int(var.split("_")[0])
Expand All @@ -68,9 +69,10 @@ def call(self):
self.remove_noisy_sites()

raw_read_haps = self.get_haplotypes_from_reads(
self.het_sites,
check_clip=True,
partial_deletion_reads=self.del1_reads_partial,
kept_sites=["154569800_T_G"],
add_sites=["154555882_C_G"],
)
het_sites = self.het_sites
if self.del1_reads_partial != set():
Expand Down Expand Up @@ -101,25 +103,26 @@ def call(self):
pseudo_counter = 0
dup_counter = 0
deletion_haplotypes = []
for i, hap in enumerate(ass_haps):
nsite = min(len(hap), 10)
start_seq = hap[:nsite]
if start_seq.startswith("0") is False:
if start_seq.count("1") >= start_seq.count("2"):
gene_counter += 1
hap_name = f"ikbkg_hap{gene_counter}"
tmp.setdefault(hap, hap_name)
if "3" in hap:
deletion_haplotypes.append(hap_name)
pivot_index, index_found = self.get_pivot_site_index()
if index_found is True:
for i, hap in enumerate(ass_haps):
start_seq = hap[: pivot_index + 1]
if start_seq.startswith("0") is False:
if start_seq.count("1") >= start_seq.count("2"):
gene_counter += 1
hap_name = f"ikbkg_hap{gene_counter}"
tmp.setdefault(hap, hap_name)
if "3" in hap:
deletion_haplotypes.append(hap_name)
else:
pseudo_counter += 1
hap_name = f"pseudo_hap{pseudo_counter}"
tmp.setdefault(hap, hap_name)
if "3" in hap:
deletion_haplotypes.append(hap_name)
else:
pseudo_counter += 1
hap_name = f"pseudo_hap{pseudo_counter}"
tmp.setdefault(hap, hap_name)
if "3" in hap:
deletion_haplotypes.append(hap_name)
else:
dup_counter += 1
tmp.setdefault(hap, f"dup_hap{dup_counter}")
dup_counter += 1
tmp.setdefault(hap, f"dup_hap{dup_counter}")
ass_haps = tmp

haplotypes = None
Expand Down Expand Up @@ -161,6 +164,14 @@ def call(self):
new_allele.append(ass_haps[hap])
new_alleles.append(new_allele)

if gene_counter == 0 or pseudo_counter == 0:
total_cn = None
gene_counter = None
# homozygous case
if total_cn == 0:
total_cn = None
gene_counter = None

self.close_handle()

return self.GeneCall(
Expand Down
11 changes: 6 additions & 5 deletions paraphase/genes/ncf1_phaser.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def call(self):
return None
pivot_site = self.pivot_site
for pileupcolumn in self._bamh.pileup(
self.nchr, pivot_site - 1, pivot_site, truncate=True
self.nchr, pivot_site, pivot_site + 1, truncate=True
):
bases = [
a.upper() for a in pileupcolumn.get_query_sequences(add_indels=True)
Expand All @@ -37,13 +37,10 @@ def call(self):

self.get_homopolymer()
self.get_candidate_pos()
# add pivot site
if "74777266_G_A" not in self.candidate_pos:
self.candidate_pos.add("74777266_G_A")
self.het_sites = sorted(list(self.candidate_pos))
self.remove_noisy_sites()

raw_read_haps = self.get_haplotypes_from_reads(self.het_sites)
raw_read_haps = self.get_haplotypes_from_reads(add_sites=["74777265_A_T"])

(
ass_haps,
Expand Down Expand Up @@ -115,6 +112,10 @@ def call(self):
counter_gene = None
total_cn = None

# homozygous case
if total_cn == 0:
total_cn = None

self.close_handle()

return self.GeneCall(
Expand Down
6 changes: 5 additions & 1 deletion paraphase/genes/neb_phaser.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def call(self):
self.het_sites = sorted(list(self.candidate_pos))
self.remove_noisy_sites()

raw_read_haps = self.get_haplotypes_from_reads(self.het_sites)
raw_read_haps = self.get_haplotypes_from_reads()

(
ass_haps,
Expand Down Expand Up @@ -127,6 +127,10 @@ def call(self):
total_cn = None
new_alleles = []

# homozygous case
if total_cn == 0:
total_cn = None

self.close_handle()

return self.GeneCall(
Expand Down
11 changes: 8 additions & 3 deletions paraphase/genes/pms2_phaser.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ def call(self):
self.het_sites = sorted(list(self.candidate_pos))
self.remove_noisy_sites()
# for distinguishing pms2 from pms2cl
self.het_sites.append("5989137_G_A")

raw_read_haps = self.get_haplotypes_from_reads(self.het_sites, check_clip=True)
raw_read_haps = self.get_haplotypes_from_reads(
check_clip=True, add_sites=["5989137_G_A"]
)

(
ass_haps,
Expand Down Expand Up @@ -77,6 +77,11 @@ def call(self):
# bigger cnvs are not handled here yet
if pms2_cn != 2:
pms2_cn = None

# homozygous case
if total_cn == 0:
total_cn = None

self.close_handle()

return self.GeneCall(
Expand Down
13 changes: 8 additions & 5 deletions paraphase/genes/rccx_phaser.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ def annotate_var(self, allele_var):
elif abs(len(tmp[1]) - len(tmp[2])) <= 1 and len(tmp[2]) >= 6:
annotated_allele = "pseudogene_duplication"
else:
annotated_allele = "duplicaton_WT_plus_" + ",".join(tmp[1])
annotated_allele = "duplication_WT_plus_" + ",".join(tmp[1])
else:
if abs(len(tmp[1]) - len(tmp[2])) <= 1 and len(tmp[2]) >= 6:
annotated_allele = ",".join(tmp[0]) + "_pseudogene_duplication"
Expand Down Expand Up @@ -367,7 +367,7 @@ def update_alleles(
successful_phasing = True

# add missing links when there is no two-cp haplotypes
if two_cp_haplotypes == []:
if two_cp_haplotypes == [] and len(ending_copies) <= 2:
# add the missing link in cn=4
if (
len(final_haps) in [3, 4]
Expand Down Expand Up @@ -509,11 +509,14 @@ def call(self):

self.het_sites = sorted(list(self.candidate_pos))
self.remove_noisy_sites()
het_sites = self.het_sites

raw_read_haps = self.get_haplotypes_from_reads(
het_sites, check_clip=True, partial_deletion_reads=self.del1_reads_partial
check_clip=True,
partial_deletion_reads=self.del1_reads_partial,
kept_sites=["32046300_G_A", "32013265_A_T"],
)

het_sites = self.het_sites
if self.del2_reads_partial != set():
raw_read_haps, het_sites = self.update_reads_for_deletions(
raw_read_haps,
Expand Down Expand Up @@ -614,7 +617,7 @@ def call(self):
if ass_haps == [] and self.het_sites == []:
# homozygous, feed all reads to call variants
total_cn = 2
if total_cn < 2:
if total_cn < 2 or len(ending_copies) > 2:
total_cn = None

annotated_alleles = self.annotate_alleles(
Expand Down
8 changes: 3 additions & 5 deletions paraphase/genes/smn1_phaser.py
Original file line number Diff line number Diff line change
Expand Up @@ -511,14 +511,12 @@ def call(self):
[self.del1_5p_pos1, self.del1_5p_pos2],
]
self.get_candidate_pos(regions_to_check=regions_to_check)
# always add splice site
if self.candidate_pos != set():
self.candidate_pos.add("70951946_C_T")

het_sites = sorted(list(self.candidate_pos))
raw_read_haps = self.get_haplotypes_from_reads(het_sites)
self.het_sites = sorted(list(self.candidate_pos))
raw_read_haps = self.get_haplotypes_from_reads(add_sites=["70951946_C_T"])

# update reads for those overlapping known deletions
het_sites = self.het_sites
if self.smn2_del_reads_partial != set():
raw_read_haps, het_sites = self.update_reads_for_deletions(
raw_read_haps,
Expand Down
Loading

0 comments on commit 775c1d1

Please sign in to comment.