Commit

Merge pull request #22 from JianjunJin/new_search
New search
JianjunJin authored Nov 6, 2023
2 parents 4c8c76b + 3d04476 commit 803ae6e
Showing 11 changed files with 732 additions and 526 deletions.
42 changes: 28 additions & 14 deletions traversome/Assembly.py
@@ -10,9 +10,9 @@
from collections import OrderedDict
from loguru import logger
from typing import Union
from traversome.AssemblySimple import AssemblySimple #, VertexMergingHistory, VertexEditHistory
from traversome.AssemblySimple import AssemblySimple #, VertexMergingHistory, VertexEditHistory
# from traversome.PathGeneratorGraphOnly import PathGeneratorGraphOnly
from traversome.PathGenerator import PathGenerator
# from traversome.VariantGenerator import VariantGenerator
# from traversome.EstMultiplicityFromCov import EstMultiplicityFromCov
# from traversome.EstMultiplicityPrecise import EstMultiplicityPrecise
from traversome.utils import (
@@ -955,7 +955,7 @@ def reduce_to_subgraph(self,
# :param force_circular
# :param hetero_chromosome
# """
# generator = PathGenerator(
# generator = VariantGenerator(
# assembly_graph_obj=self,
# graph_alignment=graph_alignment,
# min_valid_search=min_valid_search,
@@ -999,6 +999,7 @@ def is_fully_covered_by(self, input_path):
len(graph_set), len(path_set), sorted(set(self.vertex_info) - set([_n_ for _n_, _e_ in input_path]))))
return graph_set == path_set

# TODO can be cached
def get_path_length(self, input_path, check_valid=True, adjust_for_cyclic=False):
# uni_overlap = self.__uni_overlap if self.__uni_overlap else 0
# circular_len = sum([self.vertex_info[name].len - uni_overlap for name, strand in input_path])
@@ -1018,18 +1019,31 @@ def get_path_length(self, input_path, check_valid=True, adjust_for_cyclic=False)
else:
return sum(accumulated_lens) + last_v_info.len

def get_path_internal_length(self, input_path, keep_terminal_overlaps=True):
assert len(input_path) > 1, f"input path len cannot be <= 1, this path is {input_path}"
# internal_len is allowed to be negative when this_overlap > 0 and len(the_repeat_path) == 2
if keep_terminal_overlaps:
ovl_list = [self.vertex_info[_n1].connections[_e1][(_n2, not _e2)]
for (_n1, _e1), (_n2, _e2) in zip(input_path[1:-2], input_path[2:-1])]
# TODO can be cached
def get_path_internal_length(self, input_path, keep_terminal_overlaps=True) -> int:
"""
For instance,
given path [a, b, c, d] below, It and I are the internal length with and without terminal overlaps.
a -----------
b -----------
c -----------
d ----------
It --------------------
I --------------
"""
if len(input_path) < 3:
return 0
else:
ovl_list = [self.vertex_info[_n1].connections[_e1][(_n2, not _e2)]
for (_n1, _e1), (_n2, _e2) in zip(input_path[:-1], input_path[1:])]
inter_ls = [self.vertex_info[_n1].len
for (_n1, _e1) in input_path[1:-1]]
return sum(inter_ls) - sum(ovl_list)
if keep_terminal_overlaps:
ovl_list = [self.vertex_info[_n1].connections[_e1][(_n2, not _e2)]
for (_n1, _e1), (_n2, _e2) in zip(input_path[1:-2], input_path[2:-1])]
else:
ovl_list = [self.vertex_info[_n1].connections[_e1][(_n2, not _e2)]
for (_n1, _e1), (_n2, _e2) in zip(input_path[:-1], input_path[1:])]
inter_ls = [self.vertex_info[_n1].len
for (_n1, _e1) in input_path[1:-1]]
return max(sum(inter_ls) - sum(ovl_list), 0)
# uni_overlap = self.__uni_overlap if self.__uni_overlap else 0
# internal_len = -uni_overlap
# for seg_name, seg_strand in input_path[1:-1]:
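Note: the reworked get_path_internal_length above can be illustrated with a toy stand-in (the dict-based graph and all lengths/overlaps below are hypothetical, not the project's Assembly class): the internal length is the summed length of the internal vertices minus the relevant junction overlaps, clamped at zero.

# Hypothetical stand-in for the assembly graph: vertex lengths plus the
# overlap shared by each pair of adjacent vertices along the example path.
VERTEX_LEN = {"a": 11, "b": 11, "c": 11, "d": 10}
OVERLAP = {("a", "b"): 3, ("b", "c"): 3, ("c", "d"): 3}  # made-up toy values


def path_internal_length(path, keep_terminal_overlaps=True):
    """Length spanned by the internal vertices of `path` (a list of names).

    Mirrors the diff above: paths with fewer than three vertices have no
    internal region; with terminal overlaps kept, only overlaps between
    internal vertices are subtracted, otherwise every junction's overlap is.
    """
    if len(path) < 3:
        return 0
    if keep_terminal_overlaps:
        junctions = zip(path[1:-2], path[2:-1])  # internal-internal junctions only
    else:
        junctions = zip(path[:-1], path[1:])     # every junction, incl. terminal ones
    overlaps = sum(OVERLAP[(u, v)] for u, v in junctions)
    internal = sum(VERTEX_LEN[v] for v in path[1:-1])
    return max(internal - overlaps, 0)


print(path_internal_length(["a", "b", "c", "d"]))                               # 19
print(path_internal_length(["a", "b", "c", "d"], keep_terminal_overlaps=False))  # 13

Clamping at zero replaces the old behaviour, where the commented-out code explicitly allowed a negative internal length for short repeat paths with large overlaps.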
34 changes: 22 additions & 12 deletions traversome/GraphAlignRecords.py
@@ -15,6 +15,7 @@
import sympy
from itertools import product
from multiprocessing import Manager, Pool
import gzip
# import dill


@@ -403,7 +404,10 @@ def __init__(
self.read_records = OrderedDict()

# run the parsing function
self.parse_alignment_file(num_proc=num_proc, _num_block_lines=kwargs.get("_num_block_lines", 10000))
if self.alignment_file.endswith("gz"): # TODO multiprocess with gzip not implemented yet
self.parse_alignment_file(num_proc=1)
else:
self.parse_alignment_file(num_proc=num_proc, _num_block_lines=kwargs.get("_num_block_lines", 10000))
self.build_read_records()

# generate the probabilities of multiple hits for each query
@@ -744,7 +748,7 @@ def __cal_unaligned_likelihood(self, query_seq, query_base_frequencies):
def __cal_alignment_likelihood(self, record, qry_seq, half_kmer, transition_matrix):
this_likelihood = 1.
ref_seq = self.__format_modify_ref_seq(
path=record.path, p_start=record.p_start, p_end=record.p_end + 1, half_kmer=half_kmer)
path=record.path, p_start=record.p_start, p_end=record.p_end, half_kmer=half_kmer) # p_end is open
qry_ali, ref_ali = _insert_gaps_to_alignment(
_q_seq=qry_seq, _q_start=record.q_start, _r_seq=ref_seq, _r_start=half_kmer, _cigar=record.cigar)
kmer = 2 * half_kmer + 1
@@ -974,23 +978,29 @@ def parse_alignment_file_mul(self, num_proc, num_block_lines=10000):
def parse_alignment_file_single(self):
"""
"""
if self.alignment_file.endswith("gz"):
input_f = gzip.open(self.alignment_file, "rt")
else:
input_f = open(self.alignment_file)
if self.alignment_format == "GAF":
# store a list of GAFRecord objects made for each line in GAF file.
with open(self.alignment_file) as input_f:
self.raw_records = _gaf_parse_worker(
# csv_lines_gen=csv.reader(input_f, delimiter="\t"),
csv_lines_gen=input_f,
_min_align_len=self.min_align_len,
_min_identity=self.min_identity,
_parse_cigar=self.parse_cigar)
self.raw_records = _gaf_parse_worker(
# csv_lines_gen=csv.reader(input_f, delimiter="\t"),
csv_lines_gen=input_f,
_min_align_len=self.min_align_len,
_min_identity=self.min_identity,
_parse_cigar=self.parse_cigar)
input_f.close()
elif self.alignment_format == "SPA-TSV":
# store a list of SPAligner SPATSVRecord objects made for each line in TSV file.
with open(self.alignment_file) as input_f:
self.raw_records = _tsv_parse_worker(
# csv_lines_gen=csv.reader(input_f, delimiter="\t"),
csv_lines_gen=input_f,
_min_align_len=self.min_align_len)
input_f.close()
else:
input_f.close()
raise Exception("unsupported format!")

# # filtering raw_records based on min length
@@ -1113,12 +1123,12 @@ def output(self, output_file, output_fmt="gaf"):
pass

def parse_alignment_format_from_postfix(self):
if self.alignment_file.lower().endswith(".gaf"):
if self.alignment_file.lower().endswith(".gaf") or self.alignment_file.lower().endswith(".gaf.gz"):
alignment_format = "GAF"
elif self.alignment_file.lower().endswith(".tsv"):
elif self.alignment_file.lower().endswith(".tsv") or self.alignment_file.lower().endswith(".tsv.gz"):
alignment_format = "SPA-TSV"
else:
raise Exception("Please denote the alignment format using adequate postfix (.gaf/.tsv)")
raise Exception("Please denote the alignment format using adequate postfix (.gaf/.gaf.gz/.tsv/.tsv.gz)")
return alignment_format

def __iter__(self):
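Note: a hedged sketch of the gzip handling added above (the file name in the usage comment is hypothetical). Gzipped alignments are opened transparently in text mode and the format is inferred from the extended postfix list; per the TODO in __init__, the multiprocess parser falls back to a single process for .gz input.

import gzip


def open_alignment(path):
    """Open a (possibly gzip-compressed) alignment file as text,
    following the pattern used in the diff above."""
    if path.endswith("gz"):
        return gzip.open(path, "rt")
    return open(path)


def alignment_format_from_postfix(path):
    """Guess the alignment format from the file name postfix, as in the diff above."""
    lower = path.lower()
    if lower.endswith(".gaf") or lower.endswith(".gaf.gz"):
        return "GAF"
    if lower.endswith(".tsv") or lower.endswith(".tsv.gz"):
        return "SPA-TSV"
    raise Exception("Please denote the alignment format using adequate postfix (.gaf/.gaf.gz/.tsv/.tsv.gz)")


# usage sketch (file name is hypothetical):
# with open_alignment("reads.gaf.gz") as handle:
#     for line in handle:
#         ...  # one GAF record per line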
2 changes: 1 addition & 1 deletion traversome/ModelFitBayesian.py
@@ -58,7 +58,7 @@ def run_mcmc(self, n_generations, n_burn, chosen_ids: typingODict[int, bool] = N
# Because many traversome attributes including subpath information were created using the original variant
# ids, so here we prefer not making traversome.get_multinomial_like_formula complicated. Instead, we create
# variant_percents with foo values inserted when that variant id is not in chosen_ids_set.
real_percents = pm.Dirichlet(name="comp", a=np.ones(chosen_num), shape=(chosen_num,))
real_percents = pm.Dirichlet(name="vid", a=np.ones(chosen_num), shape=(chosen_num,))
variant_percents = [False] * self.num_of_variants
for go_id_id, chosen_id in enumerate(chosen_ids):
variant_percents[chosen_id] = real_percents[go_id_id]
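Note: a minimal sketch of the renamed Dirichlet prior, assuming pymc3 as the backend (the ModelGenerator.py docstring below refers to pymc3); the counts and ids are made up. The proportions variable is now registered under the name "vid" rather than "comp", and placeholder entries are kept for variant ids outside chosen_ids.

import numpy as np
import pymc3 as pm  # assumed backend, per the ModelGenerator.py docstring

chosen_num = 3        # hypothetical number of candidate variants kept in the model
num_of_variants = 5   # hypothetical total number of candidate variants
chosen_ids = [0, 2, 4]

with pm.Model():
    # Dirichlet prior over variant proportions, now named "vid" as in the diff above
    real_percents = pm.Dirichlet(name="vid", a=np.ones(chosen_num), shape=(chosen_num,))
    # foo/placeholder values for variant ids that are not in chosen_ids
    variant_percents = [False] * num_of_variants
    for go_id_id, chosen_id in enumerate(chosen_ids):
        variant_percents[chosen_id] = real_percents[go_id_id]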
59 changes: 27 additions & 32 deletions traversome/ModelFitMaxLike.py
@@ -66,7 +66,6 @@ class ModelFitMaxLike(object):
def __init__(self,
model,
variant_paths,
all_sub_paths,
variant_subpath_counters,
sbp_to_sbp_id,
repr_to_merged_variants,
Expand All @@ -75,7 +74,7 @@ def __init__(self,
self.model = model
self.variant_paths = variant_paths
self.num_put_variants = len(variant_paths)
self.all_sub_paths = all_sub_paths
self.all_sub_paths = model.all_sub_paths
self.variant_subpath_counters = variant_subpath_counters
self.sbp_to_sbp_id = sbp_to_sbp_id
self.repr_to_merged_variants = repr_to_merged_variants
@@ -171,31 +170,21 @@ def reverse_model_selection(self,
for variant_id in range(self.num_put_variants)]
logger.debug("Test variants {}".format(list(chosen_ids)))

# # it was originally combined into one function,
# # but now in order to parallelize minimize_neg_likelihood(), keep them separated
# logger.debug("Generating the likelihood function .. ")
# set_chosen_ids = set(chosen_ids_set)
# neg_loglike_func_obj = self.get_neg_likelihood_of_var_freq(within_variant_ids=set_chosen_ids)
# logger.info("Maximizing the likelihood function for {} variants".format(len(chosen_ids_set)))
# success_run = minimize_neg_likelihood(
# neg_loglike_func=neg_loglike_func_obj.loglike_func,
# num_variables=len(chosen_ids_set),
# verbose=self.traversome.loglevel in ("TRACE", "ALL"))
# previous_prop, previous_echo, previous_like, previous_criteria = \
# self.__summarize_like_and_criteria(success_run, set_chosen_ids, criterion, neg_loglike_func_obj)
previous_prop, previous_echo, previous_like, previous_criteria = \
self.__compute_like_and_criteria(chosen_id_set=chosen_ids, criteria=criterion)

# logger.info("Proportions: %s " % {_iid: previous_prop[_gid] for _gid, _iid in enumerate(chosen_ids_set)})
# logger.info("Log-likelihood: %s" % previous_like)
# drop zero prob variant
self.__drop_zero_variants(chosen_ids, previous_prop, previous_echo, diff_tolerance)
# if one component is identified as indispensable in the n-dimension model,
# it will be indispensable for subsequent (n-m)-dimension models
if user_fixed_ids: # in the case of user assigned fixed 'indispensable' variant id(s)
indispensable_ids = {u_id: True for u_id in user_fixed_ids}
else:
indispensable_ids = {}

previous_prop, previous_echo, previous_like, previous_criteria = \
self.__compute_like_and_criteria(chosen_id_set=chosen_ids, criteria=criterion)

# logger.info("Proportions: %s " % {_iid: previous_prop[_gid] for _gid, _iid in enumerate(chosen_ids_set)})
# logger.info("Log-likelihood: %s" % previous_like)
# drop zero prob variant
self.__drop_zero_variants(chosen_ids, previous_prop, previous_echo, diff_tolerance, indispensable_ids)

# stepwise
while len(chosen_ids) > 1:
logger.debug("Trying dropping {} variant(s) ..".format(self.num_put_variants - len(chosen_ids) + 1))
@@ -268,18 +257,20 @@ def reverse_model_selection(self,
previous_prop = test_id_res[best_drop_id]["prop"]
previous_echo = test_id_res[best_drop_id]["echo"]
chosen_ids.remove(best_drop_id)
logger.info("Drop id_{}".format(self.__str_rep_id(best_drop_id)))
# drop candidate id that minimize criteria
logger.info("Drop {}".format(self.__str_rep_id(best_drop_id)))
logger.info("Proportions: " +
", ".join(["%s:%.4f" % (_id, _p) for _id, _p, in previous_echo.items()]))
logger.info("Log-likelihood: %s" % previous_like)
self.__drop_zero_variants(chosen_ids, previous_prop, previous_echo, diff_tolerance)
self.__drop_zero_variants(
chosen_ids, previous_prop, previous_echo, diff_tolerance, indispensable_ids)
changed = True
break
# for var_id in list(chosen_ids_set):
# if abs(previous_prop[var_id] - 0.) < diff_tolerance:
# del chosen_ids_set[var_id]
# for uid_var_id in self.traversome.repr_to_merged_variants[var_id]:
# del previous_prop[uid_var_id]
# for cid_var_id in self.traversome.repr_to_merged_variants[var_id]:
# del previous_prop[cid_var_id]
# del previous_echo[self.__str_rep_id(var_id)]
# logger.info("Drop {}".format(self.__str_rep_id(var_id)))
# else:
@@ -297,14 +288,18 @@
logger.info("Log-likelihood: %s" % previous_like)
return previous_prop

def __drop_zero_variants(self, chosen_ids, representative_props, echo_props, diff_tolerance):
def __drop_zero_variants(self, chosen_ids, representative_props, echo_props, diff_tolerance, indispensable_ids):
for var_id in list(chosen_ids):
# do not drop a candidate variant if it is fixed either by the user or by a read path
if var_id in indispensable_ids:
continue
if abs(representative_props[var_id] - 0.) < diff_tolerance:
chosen_ids.remove(var_id)
for uid_var_id in self.repr_to_merged_variants[var_id]:
del representative_props[uid_var_id]
for cid_var_id in self.repr_to_merged_variants[var_id]:
del representative_props[cid_var_id]
del echo_props[self.__str_rep_id(var_id)]
logger.info("Drop id_{}".format(self.__str_rep_id(var_id)))
# drop candidate id that has estimated proportion of zero
logger.info("Drop {}".format(self.__str_rep_id(var_id)))

def __test_one_drop(self, var_id, chosen_ids: set, sorted_chosen_ids, criterion, test_id_res, indispensable_ids):
if var_id not in indispensable_ids:
@@ -461,13 +456,13 @@ def __summarize_run_prop(self, success_run, within_var_ids):
echo_prop[self.__str_rep_id(representatives[go])] = this_prop
unidentifiable_var_ids = self.repr_to_merged_variants[representatives[go]]
this_prop /= len(unidentifiable_var_ids)
for uid_var_id in unidentifiable_var_ids:
prop_dict[uid_var_id] = this_prop
for cid_var_id in unidentifiable_var_ids:
prop_dict[cid_var_id] = this_prop
use_prop = OrderedDict([(_id, prop_dict[_id]) for _id in sorted(prop_dict)])
return use_prop, echo_prop

def __str_rep_id(self, rep_id):
return "+".join([str(_uid_var_id) for _uid_var_id in self.repr_to_merged_variants[rep_id]])
return "+".join([f"cid_{(_cid_var_id + 1)}" for _cid_var_id in self.repr_to_merged_variants[rep_id]])

def get_neg_likelihood_of_var_freq(self, within_variant_ids: set = None, scipy_style=True):
# log_like_formula = self.traversome.get_likelihood_binomial_formula(
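Note: a simplified sketch of the new indispensable-variant guard in __drop_zero_variants (the ids, proportions, and tolerance below are made up, and the merged-variant/echo bookkeeping of the real method is omitted): a candidate is dropped only when its estimated proportion is numerically zero and it was not fixed by the user or by a read path.

from collections import OrderedDict


def drop_zero_variants(chosen_ids, props, diff_tolerance, indispensable_ids):
    """Drop candidate variants whose estimated proportion is ~0,
    unless they are marked indispensable, as in the diff above."""
    for var_id in list(chosen_ids):
        if var_id in indispensable_ids:
            continue  # never drop a fixed/indispensable variant
        if abs(props[var_id] - 0.) < diff_tolerance:
            chosen_ids.remove(var_id)
            del props[var_id]


# usage sketch with made-up numbers
chosen = {0, 1, 2}
proportions = OrderedDict([(0, 0.62), (1, 0.38), (2, 0.0)])
drop_zero_variants(chosen, proportions, diff_tolerance=1e-9, indispensable_ids={1: True})
print(chosen, proportions)  # {0, 1} OrderedDict([(0, 0.62), (1, 0.38)])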
6 changes: 3 additions & 3 deletions traversome/ModelGenerator.py
@@ -19,7 +19,7 @@ def get_like_formula(self, variant_percents, log_func, within_variant_ids: Set =
input symengine.Symbols for maximum likelihood analysis (scipy),
e.g. [Symbol("P" + str(variant_id)) for variant_id in range(self.num_put_variants)].
input pm.Dirichlet for bayesian analysis (pymc3),
e.g. pm.Dirichlet(name="comp", a=np.ones(variant_num), shape=(variant_num,)).
e.g. pm.Dirichlet(name="vid", a=np.ones(variant_num), shape=(variant_num,)).
:param log_func:
input symengine.log for maximum likelihood analysis using scipy,
input tt.log for bayesian analysis using pymc3
@@ -57,8 +57,8 @@ def get_like_formula(self, variant_percents, log_func, within_variant_ids: Set =
if within_variant_ids and not (set(these_sp_info[check_sp].from_variants) & within_variant_ids):
del these_sp_info[check_sp]

# calculate the observations
observations = [len(this_sp_info.mapped_records) for this_sp_info in these_sp_info.values()]
# get the observations
observations = [this_sp_info.num_matched for this_sp_info in these_sp_info.values()]

# sub path possible matches
logger.debug(" Formulating the subpath probabilities ..")
(diffs for the remaining changed files not loaded)