Skip to content

Commit

Permalink
Merge pull request #1 from pranjan77/main
Browse files Browse the repository at this point in the history
Genome loader script
  • Loading branch information
Fxe authored Nov 18, 2024
2 parents 4ec51f6 + af26e9c commit 04e1e44
Show file tree
Hide file tree
Showing 13 changed files with 36,876 additions and 0 deletions.
21 changes: 21 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,23 @@
# cdm-data-loader-utils
Repo for CDM input data loading and wrangling



**Set up conda environment**
```bash
conda env create -f env.yml
conda activate genome_loader_env
```

**Run tests**
```bash
python -m unittest discover -s tests -p test_genome_loader.py
```


**For running tests with checkm2 (Time taking so disabled by default)**
```bash
conda activate genome_loader_env
checkm2 database --download
```

13 changes: 13 additions & 0 deletions env.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
name: genome_loader_env
channels:
- bioconda
- conda-forge
- defaults
dependencies:
- python=3.7
- bbmap
- biopython
- pandas
- prodigal
- checkm2

Empty file.
89 changes: 89 additions & 0 deletions genome_loader_scripts/calculate_hash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
from collections.abc import Iterable
import hashlib

DEFAULT_SPLIT = " "

def _hash_string(s):
return hashlib.sha256(s.encode("utf-8")).hexdigest()

class HashSeq(str):
def __new__(cls, v):
instance = super().__new__(cls, v.upper())
return instance

@property
def hash_value(self):
return _hash_string(self)

class HashSeqList(list):
def append(self, o):
if isinstance(o, str):
super().append(HashSeq(o))
elif isinstance(o, HashSeq):
super().append(o)
else:
raise ValueError("bad type")

@property
def hash_value(self):
h_list = [x.hash_value for x in self]
hash_seq = "_".join(sorted(h_list))
return _hash_string(hash_seq)

def extract_features(faa_str, split=DEFAULT_SPLIT, h_func=None):
features = []
active_seq = None
seq_lines = []
for line in faa_str.split("\n"):
if line.startswith(">"):
if active_seq is not None:
active_seq.seq = "".join(seq_lines)
features.append(active_seq)
seq_lines = []
seq_id = line[1:]
desc = None
if h_func:
seq_id, desc = h_func(seq_id)
elif split:
header_data = line[1:].split(split, 1)
seq_id = header_data[0]
if len(header_data) > 1:
desc = header_data[1]
active_seq = Feature(seq_id, "", desc)
else:
seq_lines.append(line.strip())
if len(seq_lines) > 0:
active_seq.seq = "".join(seq_lines)
features.append(active_seq)
return features

def read_fasta2(f, split=DEFAULT_SPLIT, h_func=None):
if f.endswith(".gz"):
import gzip
with gzip.open(f, "rb") as fh:
return extract_features(fh.read().decode("utf-8"), split, h_func)
else:
with open(f, "r") as fh:
return extract_features(fh.read(), split, h_func)

class Feature:
def __init__(self, feature_id, sequence, description=None, aliases=None):
self.id = feature_id
self.seq = sequence
self.description = description
self.ontology_terms = {}
self.aliases = aliases

def add_ontology_term(self, ontology_term, value):
if ontology_term not in self.ontology_terms:
self.ontology_terms[ontology_term] = []
if value not in self.ontology_terms[ontology_term]:
self.ontology_terms[ontology_term].append(value)

def contig_set_hash(features):
hl = HashSeqList()
for contig in features:
seq = HashSeq(contig.seq)
hl.append(seq)
return hl.hash_value

Loading

0 comments on commit 04e1e44

Please sign in to comment.