diff --git a/README.md b/README.md
index ddac927..3c94718 100644
--- a/README.md
+++ b/README.md
@@ -215,6 +215,7 @@ Most pools require amplification before cloning to convert the ssDNA to dsDNA an
* [Addgene: Guide to Using Pooled Libraries](https://www.addgene.org/guides/pooled-libraries/)
Pooled CRISPR Data Analysis
+
After the experiment, the cells are collected and DNA is isolated. The target sequence is then amplified and adaptors for high-throughput sequencing added. Several data analysis pipelines have been developed to identify target sequences over-represented or under-represented in the pool. The manuscript by Wang et al. (2019) provides a protocol for using a high-quality tool with these capabilities.
diff --git a/bioconda.recipe/guidemaker/meta.yaml b/bioconda.recipe/guidemaker/meta.yaml
index 2aa72f1..ba47e3c 100644
--- a/bioconda.recipe/guidemaker/meta.yaml
+++ b/bioconda.recipe/guidemaker/meta.yaml
@@ -66,4 +66,4 @@ extra:
- ravinpoudel
identifiers:
- biotools:GuideMaker
- - doi:10.5281/zenodo.4849258
+ - doi:10.5281/zenodo.4849258
\ No newline at end of file
diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml
index dc77e42..7a83844 100644
--- a/conda.recipe/meta.yaml
+++ b/conda.recipe/meta.yaml
@@ -1,4 +1,3 @@
-
{% set data = load_setup_py_data() %}
package:
diff --git a/coverage.xml b/coverage.xml
index 89cb337..151d0d5 100644
--- a/coverage.xml
+++ b/coverage.xml
@@ -1,12 +1,12 @@
-
+
-
+
@@ -251,7 +251,7 @@
-
+
@@ -550,29 +550,28 @@
-
+
-
-
-
-
+
+
+
-
+
+
-
-
-
-
-
+
+
+
+
@@ -589,9 +588,9 @@
-
-
-
+
+
+
@@ -620,163 +619,177 @@
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
-
-
-
+
+
+
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
-
-
+
-
-
-
+
+
+
+
+
+
+
-
-
-
+
+
-
+
+
+
-
-
-
-
-
-
-
+
-
-
+
-
+
+
+
+
+
+
-
-
-
-
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
+
+
+
+
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/guidemaker/cli.py b/guidemaker/cli.py
index c0efb2e..9ca6e05 100644
--- a/guidemaker/cli.py
+++ b/guidemaker/cli.py
@@ -191,7 +191,7 @@ def main(arglist: list = None):
target_bed_df=tf_df)
logger.info("Identify genomic features")
anno.get_annotation_features()
- logger.info("Total number of CDS/locus in the input genome: %d" % anno.locuslen())
+ logger.info("Total number of %s in the input genome: %d" % anno.locuslen())
logger.info("Find genomic features closest the guides")
anno._get_nearby_features()
logger.info("Select guides that start between +%s and -%s of a feature start" %
diff --git a/guidemaker/core.py b/guidemaker/core.py
index ec80a7c..d68a94d 100644
--- a/guidemaker/core.py
+++ b/guidemaker/core.py
@@ -663,7 +663,7 @@ def __init__(self, annotation_list: List[str], annotation_type: str, target_bed_
self.qualifiers: object = None
def check_annotation_type(self):
- """determine if the file provided by the GFF argument is a GFF or GTF file
+ """open GTF/GFF and determine if the file provided by the GFF argument is a GFF or GTF file
Args: None
@@ -676,7 +676,10 @@ def search(f):
return "gff"
gtfmatch = re.search("gtf-version", line1)
if gtfmatch is not None:
- return "gtf"
+ return "gtf"
+ else:
+ logger.error("Could not verify the GFF/GTF file type. Please make sure your GFF/GTF file starts with '#gtf-version' or '##gff-version'")
+ raise ValueError
testfile = self.annotation_list[0]
if is_gzip(testfile):
with gzip.open(testfile, 'rt') as f:
@@ -742,20 +745,28 @@ def get_annotation_features(self, feature_types: List[str] = None) -> None:
pddict["name"].append(featid)
featlist = rec[8].split(';')
for feat in featlist:
- if feat.isspace():
+ try:
+ if feat.isspace(): # this handles whitespace strings
+ continue
+ if not feat: # this handles empty strings
+ continue
+ if anno_format == 'gtf':
+ fl = re.search('^[^"]*', feat)
+ fv = re.search('"([^"]*)"', feat)
+ feat_key = fl.group(0).strip()
+ feat_val = fv.group(0).strip('"')
+ elif anno_format =='gff':
+ fl = feat.split('=')
+ feat_key = fl[0]
+ feat_val = fl[1]
+ if not feat_key in feature_dict:
+ feature_dict[feat_key] = {}
+ feature_dict[feat_key][featid] = feat_val
+ except:
+ logger.warning("There appears to be an error in the formatting of an attribute in the "
+ "record below. Please check your input GFF or GTF file. The record is: {rec} "
+ "and the attribute is: {att}. Skipping this feature.".format(rec=featlist, att=feat))
continue
- if anno_format == 'gtf':
- fl = re.search('^[^"]*', feat)
- fv = re.search('"([^"]*)"', feat)
- feat_key = fl.group(0).strip()
- feat_val = fv.group(0).strip('"')
- elif anno_format =='gff':
- fl = feat.split('=')
- feat_key = fl[0]
- feat_val = fl[1]
- if not feat_key in feature_dict:
- feature_dict[feat_key] = {}
- feature_dict[feat_key][featid] = feat_val
genbankbed = pd.DataFrame.from_dict(pddict)
self.genbank_bed_df = genbankbed
self.feature_dict = feature_dict
@@ -963,9 +974,14 @@ def locuslen(self) -> int:
Returns:
(int): Number of locus tag
"""
-
- locus_count = len(self.feature_dict['locus_tag' or 'locus'].keys())
- return(locus_count)
+ da_keys = self.feature_dict.keys()
+ firsttag = (list(da_keys)[0])
+ if firsttag:
+ locus_count = len(self.feature_dict[firsttag].keys())
+ return firsttag, locus_count
+ else:
+ logger.warning("A locus key could not be found.")
+ return "notag", 0