diff --git a/coverage.xml b/coverage.xml index 6ccdbe3..f06a093 100644 --- a/coverage.xml +++ b/coverage.xml @@ -1,12 +1,12 @@ - + /Users/ravinpoudel/Documents/GuideMaker_ALL/GuideMaker/guidemaker - + @@ -64,7 +64,7 @@ - + @@ -103,45 +103,45 @@ - - - - - - - + + + + + + + - + - - - + + + - - + + - - + + - + - - + + - - + + @@ -150,57 +150,55 @@ - - + + - + - - + + - + - + - + - + - - + + - - @@ -223,9 +221,15 @@ + + + + + + - + @@ -606,97 +610,99 @@ - - - + - + - - + - - - + - - - + + + - - - - + - - - - + + - - - - - - + + + + + + + + - - - - - - - + + - - + + + + + + + + - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + - + - - - - + + + + + + + diff --git a/guidemaker/cli.py b/guidemaker/cli.py index 0bef605..ad94bd9 100644 --- a/guidemaker/cli.py +++ b/guidemaker/cli.py @@ -59,7 +59,8 @@ def myparser(): help='List of sequence representing restriction enzymes. Default: None.', default=[]) parser.add_argument('--filter_by_locus', nargs="*", help='List of locus tag. Default: None.', default=[]) - parser.add_argument('--doench_efficiency_score',help='Doench et al. 2016 - only for NGG PAM: None.', action='store_true') + parser.add_argument('--doench_efficiency_score',help='Doench et al. 2016 - only for NGG PAM: Default: None.', action='store_true') + parser.add_argument('--cfd_score',help='CFD score for assessing off-target activity of gRNAs: Default: None.', action='store_true') parser.add_argument('--keeptemp', help="Option to keep intermediate files be kept", action='store_true') parser.add_argument('--plot', help="Option to genereate guidemaker plots", action='store_true') parser.add_argument('--config', help="Path to YAML formatted configuration file, default is " + @@ -183,6 +184,11 @@ def main(arglist: list = None): if args.doench_efficiency_score: logging.info("Creating Efficiency Score based on Doench et al. 2016 - only for NGG PAM...") prettydf = guidemaker.core.get_doench_efficiency_score(df=prettydf, pam_orientation=args.pam_orientation) + + if args.cfd_score: + logging.info("Calculating CFD score for assessing off-target activity of gRNAs") + prettydf = guidemaker.core.cfd_score(df=prettydf) + fd_zero = prettydf['Feature distance'].isin([0]).sum() logging.info("Number of Guides within a gene coordinates i.e. zero Feature distance: %d", fd_zero) if not os.path.exists(args.outdir): diff --git a/guidemaker/core.py b/guidemaker/core.py index 905a2a2..0707607 100644 --- a/guidemaker/core.py +++ b/guidemaker/core.py @@ -845,21 +845,6 @@ def checklen30(seq): if len(seq) == 30: return True return False - - def cfd_calculator(knnstrlist, guide): - knnlist = knnstrlist.split(';') - cfd_list=[] - for item in knnlist: - result=cfd_score_calculator.calc_cfd(guide, item) - cfd_list.append(result) - s = [str(i) for i in cfd_list] - return s - - def get_max_cfd(cfdlist): - newlist = [float(x) for x in cfdlist] - newlist.sort() - maxcfd = newlist[-1] - return(maxcfd) def get_off_target_score(seq): dlist = targetprocessor_object.neighbors[seq]["neighbors"]["dist"] @@ -897,12 +882,8 @@ def get_off_target_seqs(seq): # to match with the numbering with other tools- offset pretty_df['Guide start'] = pretty_df['Guide start'] + 1 pretty_df['Feature start'] = pretty_df['Feature start'] + 1 - pretty_check30merlen=pretty_df.loc[pretty_df['target_seq30'].apply(checklen30)==True] - # add CDF scores - pretty_check30merlen['CFD Similar Guides'] = pretty_check30merlen.apply(lambda x: cfd_calculator(x['Similar guides'], x['Guide sequence']), axis=1) - # Add a column with max CFD score - pretty_check30merlen['Max CFD'] = pretty_check30merlen['CFD Similar Guides'].apply(get_max_cfd) - self.pretty_df = pretty_check30merlen + pretty_df=pretty_df.loc[pretty_df['target_seq30'].apply(checklen30)==True] + self.pretty_df = pretty_df def _filterlocus(self, filter_by_locus:list = []) -> PandasDataFrame: """ @@ -915,10 +896,10 @@ def _filterlocus(self, filter_by_locus:list = []) -> PandasDataFrame: (PandasDataFrame): A formated pandas dataframe """ - filter_pretty_df = deepcopy(self.pretty_df) # anno class object + df = deepcopy(self.pretty_df) # anno class object if len (filter_by_locus) > 0: - filter_pretty_df = filter_pretty_df[filter_pretty_df['locus_tag'].isin(filter_by_locus)] - return filter_pretty_df + df = df[df['locus_tag'].isin(filter_by_locus)] + return df def locuslen(self) -> int: """ @@ -1058,6 +1039,31 @@ def extend_ambiguous_dna(seq: str) -> List[str]: return extend_list +# add CDF scores + +def cfd_score(df): + def cfd_calculator(knnstrlist, guide): + knnlist = knnstrlist.split(';') + cfd_list=[] + for item in knnlist: + result=cfd_score_calculator.calc_cfd(guide, item) + cfd_list.append(result) + s = [str(i) for i in cfd_list] + return s + + def get_max_cfd(cfdlist): + newlist = [float(x) for x in cfdlist] + newlist.sort() + maxcfd = newlist[-1] + return(maxcfd) + + df['CFD Similar Guides'] = df.apply(lambda x: cfd_calculator(x['Similar guides'], x['Guide sequence']), axis=1) + # Add a column with max CFD score + df['Max CFD'] = df['CFD Similar Guides'].apply(get_max_cfd) + return df + + + def get_doench_efficiency_score(df, pam_orientation): checkset={'AGG','CGG','TGG','GGG'} if pam_orientation == "3prime" and set(df.PAM)==checkset: diff --git a/tests/test_core.py b/tests/test_core.py index 748db8f..0615016 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -23,8 +23,8 @@ TEST_DIR = os.path.dirname(os.path.abspath(__file__)) configpath = os.path.join(ROOT_DIR,"data","config_default.yaml") -# TEST_DIR = '/Users/ravinpoudel/Documents/GuideMaker_ALL/GuideMaker/tests' -# configpath="guidemaker/data/config_default.yaml" +TEST_DIR = '/Users/ravinpoudel/Documents/GuideMaker_ALL/GuideMaker/tests' +configpath="guidemaker/data/config_default.yaml" #PamTarget Class @@ -221,7 +221,7 @@ def test_filter_features(): anno._filter_features() anno._get_qualifiers(configpath=configpath) anno._format_guide_table(tl) - assert anno.pretty_df.shape == (869, 25) + assert anno.pretty_df.shape == (869, 23) def test_filterlocus(): @@ -244,7 +244,7 @@ def test_filterlocus(): anno._get_qualifiers(configpath=configpath) anno._format_guide_table(tl) filter_prettydf = anno._filterlocus(filter_by_locus=['CRP_001']) - assert filter_prettydf.shape == (4, 25) + assert filter_prettydf.shape == (4, 23) # Function : get_fastas @pytest.fixture(scope='session') @@ -290,6 +290,29 @@ def test_get_doench_efficiency_score(): anno._filter_features() anno._get_qualifiers(configpath=configpath) anno._format_guide_table(tl) - filter_prettydf = anno._filterlocus() - doench_df = guidemaker.core.get_doench_efficiency_score(df=filter_prettydf, pam_orientation=pamobj.pam_orientation) - assert abs(doench_df.Efficiency[213] - 0.42684514989852906) < 0.0001 \ No newline at end of file + filter_pretty_30mer_df = anno._filterlocus() + doench_df = guidemaker.core.get_doench_efficiency_score(df=filter_pretty_30mer_df, pam_orientation=pamobj.pam_orientation) + assert abs(doench_df.Efficiency[213] - 0.42684514989852906) < 0.0001 + +def test_cfd_score(): + pamobj = guidemaker.core.PamTarget("NGG", "3prime","hamming") + filegbk =os.path.join(TEST_DIR,"test_data", "Carsonella_ruddii.gbk") + file =os.path.join(TEST_DIR, "test_data","Carsonella_ruddii.fasta") + gb = SeqIO.parse(file, "fasta") + pamtargets = pamobj.find_targets(seq_record_iter=gb, target_len=20) + tl = guidemaker.core.TargetProcessor(targets=pamtargets, lsr=10, editdist=2, knum=10) + tl.check_restriction_enzymes(['NRAGCA']) + tl.find_unique_near_pam() + tl.create_index(configpath=configpath) + tl.get_neighbors(configpath=configpath) + tf_df = tl.export_bed() + anno = guidemaker.core.Annotation(genbank_list=[filegbk], + target_bed_df=tf_df) + anno._get_genbank_features() + anno._get_nearby_features() + anno._filter_features() + anno._get_qualifiers(configpath=configpath) + anno._format_guide_table(tl) + filter_pretty_30mer_df = anno._filterlocus() + cfd_df = guidemaker.core.cfd_score(df=filter_pretty_30mer_df) + assert abs(cfd_df['Max CFD'][0] > 0) \ No newline at end of file