diff --git a/README.md b/README.md index ddac927..3c94718 100644 --- a/README.md +++ b/README.md @@ -215,6 +215,7 @@ Most pools require amplification before cloning to convert the ssDNA to dsDNA an * [Addgene: Guide to Using Pooled Libraries](https://www.addgene.org/guides/pooled-libraries/)

Pooled CRISPR Data Analysis

+ After the experiment, the cells are collected and DNA is isolated. The target sequence is then amplified and adaptors for high-throughput sequencing added. Several data analysis pipelines have been developed to identify target sequences over-represented or under-represented in the pool. The manuscript by Wang et al. (2019) provides a protocol for using a high-quality tool with these capabilities. diff --git a/bioconda.recipe/guidemaker/meta.yaml b/bioconda.recipe/guidemaker/meta.yaml index 2aa72f1..ba47e3c 100644 --- a/bioconda.recipe/guidemaker/meta.yaml +++ b/bioconda.recipe/guidemaker/meta.yaml @@ -66,4 +66,4 @@ extra: - ravinpoudel identifiers: - biotools:GuideMaker - - doi:10.5281/zenodo.4849258 + - doi:10.5281/zenodo.4849258 \ No newline at end of file diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index dc77e42..7a83844 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -1,4 +1,3 @@ - {% set data = load_setup_py_data() %} package: diff --git a/coverage.xml b/coverage.xml index 89cb337..151d0d5 100644 --- a/coverage.xml +++ b/coverage.xml @@ -1,12 +1,12 @@ - + /Users/rivers/Documents/guidemaker/guidemaker - + @@ -251,7 +251,7 @@ - + @@ -550,29 +550,28 @@ - + - - - - + + + - + + - - - - - + + + + @@ -589,9 +588,9 @@ - - - + + + @@ -620,163 +619,177 @@ - - - - - - - - - - + + + + + + + + + + - - - + + + - - - - - - - + + + + + + + + + + - - + - - - + + + + + + + - - - + + - + + + - - - - - - - + - - + - + + + + + + - - - - + + + + + + + - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + - - - - - - - - - + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + - - - - + + + + - - - + + + + + + + + + + + + + + diff --git a/guidemaker/cli.py b/guidemaker/cli.py index c0efb2e..9ca6e05 100644 --- a/guidemaker/cli.py +++ b/guidemaker/cli.py @@ -191,7 +191,7 @@ def main(arglist: list = None): target_bed_df=tf_df) logger.info("Identify genomic features") anno.get_annotation_features() - logger.info("Total number of CDS/locus in the input genome: %d" % anno.locuslen()) + logger.info("Total number of %s in the input genome: %d" % anno.locuslen()) logger.info("Find genomic features closest the guides") anno._get_nearby_features() logger.info("Select guides that start between +%s and -%s of a feature start" % diff --git a/guidemaker/core.py b/guidemaker/core.py index ec80a7c..d68a94d 100644 --- a/guidemaker/core.py +++ b/guidemaker/core.py @@ -663,7 +663,7 @@ def __init__(self, annotation_list: List[str], annotation_type: str, target_bed_ self.qualifiers: object = None def check_annotation_type(self): - """determine if the file provided by the GFF argument is a GFF or GTF file + """open GTF/GFF and determine if the file provided by the GFF argument is a GFF or GTF file Args: None @@ -676,7 +676,10 @@ def search(f): return "gff" gtfmatch = re.search("gtf-version", line1) if gtfmatch is not None: - return "gtf" + return "gtf" + else: + logger.error("Could not verify the GFF/GTF file type. Please make sure your GFF/GTF file starts with '#gtf-version' or '##gff-version'") + raise ValueError testfile = self.annotation_list[0] if is_gzip(testfile): with gzip.open(testfile, 'rt') as f: @@ -742,20 +745,28 @@ def get_annotation_features(self, feature_types: List[str] = None) -> None: pddict["name"].append(featid) featlist = rec[8].split(';') for feat in featlist: - if feat.isspace(): + try: + if feat.isspace(): # this handles whitespace strings + continue + if not feat: # this handles empty strings + continue + if anno_format == 'gtf': + fl = re.search('^[^"]*', feat) + fv = re.search('"([^"]*)"', feat) + feat_key = fl.group(0).strip() + feat_val = fv.group(0).strip('"') + elif anno_format =='gff': + fl = feat.split('=') + feat_key = fl[0] + feat_val = fl[1] + if not feat_key in feature_dict: + feature_dict[feat_key] = {} + feature_dict[feat_key][featid] = feat_val + except: + logger.warning("There appears to be an error in the formatting of an attribute in the " + "record below. Please check your input GFF or GTF file. The record is: {rec} " + "and the attribute is: {att}. Skipping this feature.".format(rec=featlist, att=feat)) continue - if anno_format == 'gtf': - fl = re.search('^[^"]*', feat) - fv = re.search('"([^"]*)"', feat) - feat_key = fl.group(0).strip() - feat_val = fv.group(0).strip('"') - elif anno_format =='gff': - fl = feat.split('=') - feat_key = fl[0] - feat_val = fl[1] - if not feat_key in feature_dict: - feature_dict[feat_key] = {} - feature_dict[feat_key][featid] = feat_val genbankbed = pd.DataFrame.from_dict(pddict) self.genbank_bed_df = genbankbed self.feature_dict = feature_dict @@ -963,9 +974,14 @@ def locuslen(self) -> int: Returns: (int): Number of locus tag """ - - locus_count = len(self.feature_dict['locus_tag' or 'locus'].keys()) - return(locus_count) + da_keys = self.feature_dict.keys() + firsttag = (list(da_keys)[0]) + if firsttag: + locus_count = len(self.feature_dict[firsttag].keys()) + return firsttag, locus_count + else: + logger.warning("A locus key could not be found.") + return "notag", 0