-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from pranjan77/main
Genome loader script
- Loading branch information
Showing
13 changed files
with
36,876 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,23 @@ | ||
# cdm-data-loader-utils | ||
Repo for CDM input data loading and wrangling | ||
|
||
|
||
|
||
**Set up conda environment** | ||
```bash | ||
conda env create -f env.yml | ||
conda activate genome_loader_env | ||
``` | ||
|
||
**Run tests** | ||
```bash | ||
python -m unittest discover -s tests -p test_genome_loader.py | ||
``` | ||
|
||
|
||
**For running tests with checkm2 (Time taking so disabled by default)** | ||
```bash | ||
conda activate genome_loader_env | ||
checkm2 database --download | ||
``` | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
name: genome_loader_env | ||
channels: | ||
- bioconda | ||
- conda-forge | ||
- defaults | ||
dependencies: | ||
- python=3.7 | ||
- bbmap | ||
- biopython | ||
- pandas | ||
- prodigal | ||
- checkm2 | ||
|
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
from collections.abc import Iterable | ||
import hashlib | ||
|
||
DEFAULT_SPLIT = " " | ||
|
||
def _hash_string(s): | ||
return hashlib.sha256(s.encode("utf-8")).hexdigest() | ||
|
||
class HashSeq(str): | ||
def __new__(cls, v): | ||
instance = super().__new__(cls, v.upper()) | ||
return instance | ||
|
||
@property | ||
def hash_value(self): | ||
return _hash_string(self) | ||
|
||
class HashSeqList(list): | ||
def append(self, o): | ||
if isinstance(o, str): | ||
super().append(HashSeq(o)) | ||
elif isinstance(o, HashSeq): | ||
super().append(o) | ||
else: | ||
raise ValueError("bad type") | ||
|
||
@property | ||
def hash_value(self): | ||
h_list = [x.hash_value for x in self] | ||
hash_seq = "_".join(sorted(h_list)) | ||
return _hash_string(hash_seq) | ||
|
||
def extract_features(faa_str, split=DEFAULT_SPLIT, h_func=None): | ||
features = [] | ||
active_seq = None | ||
seq_lines = [] | ||
for line in faa_str.split("\n"): | ||
if line.startswith(">"): | ||
if active_seq is not None: | ||
active_seq.seq = "".join(seq_lines) | ||
features.append(active_seq) | ||
seq_lines = [] | ||
seq_id = line[1:] | ||
desc = None | ||
if h_func: | ||
seq_id, desc = h_func(seq_id) | ||
elif split: | ||
header_data = line[1:].split(split, 1) | ||
seq_id = header_data[0] | ||
if len(header_data) > 1: | ||
desc = header_data[1] | ||
active_seq = Feature(seq_id, "", desc) | ||
else: | ||
seq_lines.append(line.strip()) | ||
if len(seq_lines) > 0: | ||
active_seq.seq = "".join(seq_lines) | ||
features.append(active_seq) | ||
return features | ||
|
||
def read_fasta2(f, split=DEFAULT_SPLIT, h_func=None): | ||
if f.endswith(".gz"): | ||
import gzip | ||
with gzip.open(f, "rb") as fh: | ||
return extract_features(fh.read().decode("utf-8"), split, h_func) | ||
else: | ||
with open(f, "r") as fh: | ||
return extract_features(fh.read(), split, h_func) | ||
|
||
class Feature: | ||
def __init__(self, feature_id, sequence, description=None, aliases=None): | ||
self.id = feature_id | ||
self.seq = sequence | ||
self.description = description | ||
self.ontology_terms = {} | ||
self.aliases = aliases | ||
|
||
def add_ontology_term(self, ontology_term, value): | ||
if ontology_term not in self.ontology_terms: | ||
self.ontology_terms[ontology_term] = [] | ||
if value not in self.ontology_terms[ontology_term]: | ||
self.ontology_terms[ontology_term].append(value) | ||
|
||
def contig_set_hash(features): | ||
hl = HashSeqList() | ||
for contig in features: | ||
seq = HashSeq(contig.seq) | ||
hl.append(seq) | ||
return hl.hash_value | ||
|
Oops, something went wrong.