Merge pull request #1 from pranjan77/main

Genome loader script
kbase · Nov 18, 2024 · 04e1e44 · 04e1e44
2 parents 4ec51f6 + af26e9c
commit 04e1e44
Show file tree

Hide file tree

Showing 13 changed files with 36,876 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -1,2 +1,23 @@
 # cdm-data-loader-utils
 Repo for CDM input data loading and wrangling
+
+
+
+**Set up conda environment**  
+```bash
+conda env create -f env.yml
+conda activate genome_loader_env
+```
+
+**Run tests**
+```bash
+ python -m unittest discover -s tests -p test_genome_loader.py
+```
+
+
+**For running tests with checkm2 (Time taking so disabled by default)**
+```bash
+   conda activate genome_loader_env
+   checkm2 database --download
+```
+
diff --git a/env.yml b/env.yml
@@ -0,0 +1,13 @@
+name: genome_loader_env
+channels:
+  - bioconda
+  - conda-forge
+  - defaults
+dependencies:
+  - python=3.7
+  - bbmap
+  - biopython
+  - pandas
+  - prodigal
+  - checkm2
+
diff --git a/genome_loader_scripts/__init__.py b/genome_loader_scripts/__init__.py
diff --git a/genome_loader_scripts/calculate_hash.py b/genome_loader_scripts/calculate_hash.py
@@ -0,0 +1,89 @@
+from collections.abc import Iterable
+import hashlib
+
+DEFAULT_SPLIT = " "
+
+def _hash_string(s):
+    return hashlib.sha256(s.encode("utf-8")).hexdigest()
+
+class HashSeq(str):
+    def __new__(cls, v):
+        instance = super().__new__(cls, v.upper())
+        return instance
+
+    @property
+    def hash_value(self):
+        return _hash_string(self)
+
+class HashSeqList(list):
+    def append(self, o):
+        if isinstance(o, str):
+            super().append(HashSeq(o))
+        elif isinstance(o, HashSeq):
+            super().append(o)
+        else:
+            raise ValueError("bad type")
+
+    @property
+    def hash_value(self):
+        h_list = [x.hash_value for x in self]
+        hash_seq = "_".join(sorted(h_list))
+        return _hash_string(hash_seq)
+
+def extract_features(faa_str, split=DEFAULT_SPLIT, h_func=None):
+    features = []
+    active_seq = None
+    seq_lines = []
+    for line in faa_str.split("\n"):
+        if line.startswith(">"):
+            if active_seq is not None:
+                active_seq.seq = "".join(seq_lines)
+                features.append(active_seq)
+                seq_lines = []
+            seq_id = line[1:]
+            desc = None
+            if h_func:
+                seq_id, desc = h_func(seq_id)
+            elif split:
+                header_data = line[1:].split(split, 1)
+                seq_id = header_data[0]
+                if len(header_data) > 1:
+                    desc = header_data[1]
+            active_seq = Feature(seq_id, "", desc)
+        else:
+            seq_lines.append(line.strip())
+    if len(seq_lines) > 0:
+        active_seq.seq = "".join(seq_lines)
+        features.append(active_seq)
+    return features
+
+def read_fasta2(f, split=DEFAULT_SPLIT, h_func=None):
+    if f.endswith(".gz"):
+        import gzip
+        with gzip.open(f, "rb") as fh:
+            return extract_features(fh.read().decode("utf-8"), split, h_func)
+    else:
+        with open(f, "r") as fh:
+            return extract_features(fh.read(), split, h_func)
+
+class Feature:
+    def __init__(self, feature_id, sequence, description=None, aliases=None):
+        self.id = feature_id
+        self.seq = sequence
+        self.description = description
+        self.ontology_terms = {}
+        self.aliases = aliases
+
+    def add_ontology_term(self, ontology_term, value):
+        if ontology_term not in self.ontology_terms:
+            self.ontology_terms[ontology_term] = []
+        if value not in self.ontology_terms[ontology_term]:
+            self.ontology_terms[ontology_term].append(value)
+
+def contig_set_hash(features):
+    hl = HashSeqList()
+    for contig in features:
+        seq = HashSeq(contig.seq)
+        hl.append(seq)
+    return hl.hash_value
+