Skip to content

Commit

Permalink
Merge pull request #23 from JianjunJin/bootstrap_pangenome
Browse files Browse the repository at this point in the history
Bootstrap and PanGenome
  • Loading branch information
JianjunJin authored Dec 5, 2023
2 parents 803ae6e + 3a01ed8 commit 2f1595f
Show file tree
Hide file tree
Showing 10 changed files with 1,109 additions and 239 deletions.
53 changes: 26 additions & 27 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,45 +1,44 @@


# Traversome (Under active development)
# Traversome
Genomic structure frequency estimation from genome assembly graphs and long reads.


### Install dependencies
### Installation

Install dependencies using conda. I recommend using the mamba version of conda.
```bash
mamba create -n traversome_env
mamba activate traversome_env
mamba install python sympy python-symengine pytensor dill typer loguru
```
<details><summary>[Optional] Install dependencies for running Bayesian MCMC.</summary>
If you want to run Bayesian mcmc with Traversome, you have to install pymc and pytensor.
Due to the fast evolving of pymc, sometimes its installation may be unsuccessful and not seen during the installation.

```bash
conda install dill typer numpy scipy pymc3 sympy loguru -c conda-forge
conda install python-symengine -c symengine -c conda-forge
mamba install pytensor pymc
```
</details>

### Install devel version of traversome
Install Traversome using pip.
```bash
git clone --depth=1 https://github.com/Kinggerm/Traversome
pip install -e ./Traversome --no-deps
git clone --depth=1 https://github.com/JianjunJin/Traversome
pip install ./Traversome --no-deps
```

### Command line interface (CLI)

```bash
traversome thorough -g graph.gfa -a align.gaf -o outdir
traversome thorough -g graph.gfa -a align.gaf -o outdir --topo circular --chr all
```

### Interpreting results
...

## Development

```
# workflow
|-- __main__.py
|-- traversome.py
|-- __init__.py
|-- SimpleAssembly.py
|-- Assembly.py
|-- GraphAlignRecords.py
|-- CleanGraph.py (still working)
|-- EstCopyDepthFromCov.py
|-- EstCopyDepthPrecise.py (still working)
|-- GraphAlignmentPathGenerator.py
|-- GraphOnlyPathGenerator.py
|-- ModelFitBayesian.py
|-- ModelFitMaxLike.py
|-- output_dir
|-- traversome.log.txt running log
|-- variants.info.tab information of survival variants after model selection and bootstrap
|-- bootstrap.replicates.tab bootstrap results
|-- final.result.tab summary of pangenome solutions
|-- pangenome.gfa pangenome graph of the best supported result
```
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,10 @@
"dill",
"numpy",
"scipy",
"pymc>=4",
"symengine",
"sympy",
"matplotlib",
# "pymc>=4", # make mcmc optional
# "matplotlib",
"typer",
"loguru",
]
Expand Down
69 changes: 68 additions & 1 deletion traversome/AssemblySimple.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,9 @@ def __init__(self, graph_file=None, min_cov=0., max_cov=INF, uni_overlap=None):
else:
self.parse_fastg()

# paths, 2023-11-16 added as temporary solution
self.paths = OrderedDict() # label -> {"path" -> [], "prop" -> float, "circular" -> bool}

def __repr__(self):
"""
Human readable desciptive output of the Assembly object
Expand Down Expand Up @@ -511,6 +514,7 @@ def parse_gfa_v1(self, gfa_open):
self.__uni_overlap = overlap_values.pop()

def parse_gfa_v2(self, gfa_open):
# TODO: to be updated
"GFA VERSION 2 PARSING"

# set for storing kmer results
Expand Down Expand Up @@ -728,6 +732,7 @@ def write_to_fasta(self, out_file, interleaved=None, check_postfix=True):

def write_to_gfa(self, out_file, check_postfix=True, other_attr=None):
"""
GFA1
:param out_file: str
:param check_postfix: bool
:param other_attr: dict, e.g. {"CL":"z", "C2":"z"}
Expand All @@ -738,6 +743,7 @@ def write_to_gfa(self, out_file, check_postfix=True, other_attr=None):
other_attr = {}
logger.debug(" writing graph to " + str(out_file))
out_file_handler = open(out_file, "w")
out_file_handler.write("H\tVN:Z:1.0\n")
for vertex_name in self.vertex_info:
out_file_handler.write("\t".join(
[
Expand All @@ -760,4 +766,65 @@ def write_to_gfa(self, out_file, check_postfix=True, other_attr=None):
"L", vertex_name, ("-", "+")[this_end], next_v, ("-", "+")[not next_e],
str(this_overlap) + "M"
]) + "\n")

# temporary solution
for p_id, path_dict in self.paths.items():
path_str_list = []
overlap_list = []
this_path = path_dict["path"]
len_p = len(this_path)
for go_v, (this_vertex, this_end) in enumerate(this_path):
path_str_list.append(this_vertex + ("-", "+")[this_end])
if go_v == len_p - 1:
if path_dict["circular"]:
next_id = 0
else:
break
else:
next_id = go_v + 1
next_v, next_e = this_path[next_id]
overlap_list.append(f"{self.vertex_info[this_vertex].connections[this_end][(next_v, not next_e)]}M")
ot_attr = []
for att, at_val in path_dict.items():
if att not in {"path", "circular"}:
ot_attr.append(f"\t{att}:{type(at_val).__name__}:{at_val}")
out_file_handler.write(f"P\tv{p_id}\t{','.join(path_str_list)}\t{','.join(overlap_list)}"
f"{''.join(ot_attr)}\n")
out_file_handler.close()

# def write_to_gfa2(self, out_file, check_postfix=True, other_attr=None):
# """
# TODO GFA2
# :param out_file: str
# :param check_postfix: bool
# :param other_attr: dict, e.g. {"CL":"z", "C2":"z"}
# """
# if check_postfix and not out_file.endswith(".gfa"):
# out_file += ".gfa"
# if not other_attr:
# other_attr = {}
# logger.debug(" writing graph to " + str(out_file))
# out_file_handler = open(out_file, "w")
# out_file_handler.write("H\tVN:Z:2.0")
# for vertex_name in self.vertex_info:
# out_file_handler.write("\t".join(
# [
# "S", vertex_name, self.vertex_info[vertex_name].seq[True],
# "LN:i:" + str(self.vertex_info[vertex_name].len),
# "RC:i:" + str(int(self.vertex_info[vertex_name].len * self.vertex_info[vertex_name].cov))] +
# [
# "%s:%s:%s" % (attr_name, attr_type, self.vertex_info[vertex_name].other_attr.get(attr_name, ""))
# for attr_name, attr_type in other_attr.items()
# if self.vertex_info[vertex_name].other_attr.get(attr_name, False)
# ]) + "\n")
# recorded_connections = set()
# for vertex_name in self.vertex_info:
# for this_end in (False, True):
# for (next_v, next_e), this_overlap in self.vertex_info[vertex_name].connections[this_end].items():
# this_con = tuple(sorted([(vertex_name, this_end), (next_v, next_e)]))
# if this_con not in recorded_connections:
# recorded_connections.add(this_con)
# out_file_handler.write("\t".join([
# "E", vertex_name, ("-", "+")[this_end], next_v, ("-", "+")[not next_e],
# str(this_overlap) + "M"
# ]) + "\n")
# out_file_handler.close()
Loading

0 comments on commit 2f1595f

Please sign in to comment.