From ac5c1676ca885fb2ef63595ad84d51121ed54d84 Mon Sep 17 00:00:00 2001 From: AxelGiottonini <75320942+AxelGiottonini@users.noreply.github.com> Date: Thu, 13 May 2021 17:37:18 +0200 Subject: [PATCH 01/22] plugin version first files --- Plugins/__init__.py | 14 ++ Plugins/read_fasta.py | 34 +++++ Plugins/read_gff_maker_3UTR.py | 27 ++++ Plugins/read_gff_maker_5UTR.py | 27 ++++ Plugins/read_gff_maker_CDS.py | 50 +++++++ Plugins/read_gff_maker_exon.py | 1 + Plugins/read_gff_maker_gene.py | 31 ++++ Plugins/read_gff_maker_mRNA.py | 27 ++++ Plugins/read_gff_maker_main.py | 23 +++ Plugins/read_gff_maker_source.py | 1 + Plugins/read_tab_pannzer_CDS.py | 48 +++++++ Plugins/read_tab_pannzer_gene.py | 28 ++++ core.py | 13 ++ main.py | 236 +++++++++++-------------------- 14 files changed, 407 insertions(+), 153 deletions(-) create mode 100644 Plugins/__init__.py create mode 100644 Plugins/read_fasta.py create mode 100644 Plugins/read_gff_maker_3UTR.py create mode 100644 Plugins/read_gff_maker_5UTR.py create mode 100644 Plugins/read_gff_maker_CDS.py create mode 100644 Plugins/read_gff_maker_exon.py create mode 100644 Plugins/read_gff_maker_gene.py create mode 100644 Plugins/read_gff_maker_mRNA.py create mode 100644 Plugins/read_gff_maker_main.py create mode 100644 Plugins/read_gff_maker_source.py create mode 100644 Plugins/read_tab_pannzer_CDS.py create mode 100644 Plugins/read_tab_pannzer_gene.py create mode 100644 core.py diff --git a/Plugins/__init__.py b/Plugins/__init__.py new file mode 100644 index 0000000..6fc65d5 --- /dev/null +++ b/Plugins/__init__.py @@ -0,0 +1,14 @@ +#__init.py__ + +from Plugins.read_fasta import Plugin + +from Plugins.read_gff_maker_gene import Plugin +from Plugins.read_gff_maker_3UTR import Plugin +from Plugins.read_gff_maker_5UTR import Plugin +from Plugins.read_gff_maker_CDS import Plugin +from Plugins.read_gff_maker_exon import Plugin +from Plugins.read_gff_maker_mRNA import Plugin +from Plugins.read_gff_maker_source import Plugin + +from Plugins.read_tab_pannzer_CDS import Plugin +from Plugins.read_tab_pannzer_gene import Plugin \ No newline at end of file diff --git a/Plugins/read_fasta.py b/Plugins/read_fasta.py new file mode 100644 index 0000000..a6dade6 --- /dev/null +++ b/Plugins/read_fasta.py @@ -0,0 +1,34 @@ +#plugin.py + +import importlib +import itertools + +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord + +class Plugin: + + def process(self, handle, metadata, calls:list=[], target=None): + + for record in handle: + + #initialize record + _record_ = SeqRecord( + record.seq, + record.id, + dbxrefs=["Project:" + metadata["project"]], + annotations={"division":metadata["division"],"molecule_type":metadata["molecule_type"],"organism":metadata["organism"],"taxonomy":metadata["taxonomy"],"topology":metadata["topology"]}, + description="" + ) + + #calls + receiver = [] + for call,*args in calls: + receiver.extend(call.process(*args, target=_record_.id)) + + #post output treatment + _record_.features = list(itertools.chain(*receiver)) + + #outputing + with open(f"out/{_record_.id}.dat", "w") as o: + print(_record_.format("embl"), file=o) \ No newline at end of file diff --git a/Plugins/read_gff_maker_3UTR.py b/Plugins/read_gff_maker_3UTR.py new file mode 100644 index 0000000..dabca35 --- /dev/null +++ b/Plugins/read_gff_maker_3UTR.py @@ -0,0 +1,27 @@ +#read_gff_maker_3UTR.py + +import pandas as pd +from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation + +class Plugin: + + def process(self, handle, metadata, calls:list=[], target=None): + try: + location = (handle.loc[(target[0], f"{target[1]}-mRNA-1", "three_prime_UTR"),:].reset_index()) + + _sub_features_ = [ + SeqFeature( + FeatureLocation(int(location.iloc[0,0]), int(location.iloc[0,1]), (1,-1)[location.iloc[0,2] == "-"]), + type="3'UTR", + qualifiers={ + "gene":target[1], + "note":list()})] + + #calls + receiver = [] + for call,*args in calls: + receiver.extend(call.process(*args, target=(target[0], f"{target[1]}-mRNA-1", "3'UTR"))) + + return _sub_features_ + except KeyError: + return [] \ No newline at end of file diff --git a/Plugins/read_gff_maker_5UTR.py b/Plugins/read_gff_maker_5UTR.py new file mode 100644 index 0000000..5b72281 --- /dev/null +++ b/Plugins/read_gff_maker_5UTR.py @@ -0,0 +1,27 @@ +#read_gff_maker_5UTR.py + +import pandas as pd +from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation + +class Plugin: + + def process(self, handle, metadata, calls:list=[], target=None): + try: + location = (handle.loc[(target[0], f"{target[1]}-mRNA-1", "five_prime_UTR"),:].reset_index()) + + _sub_features_ = [ + SeqFeature( + FeatureLocation(int(location.iloc[0,0]), int(location.iloc[0,1]), (1,-1)[location.iloc[0,2] == "-"]), + type="5'UTR", + qualifiers={ + "gene":target[1], + "note":list()})] + + #calls + receiver = [] + for call,*args in calls: + receiver.extend(call.process(*args, target=(target[0], f"{target[1]}-mRNA-1", "5'UTR"))) + + return _sub_features_ + except KeyError: + return [] \ No newline at end of file diff --git a/Plugins/read_gff_maker_CDS.py b/Plugins/read_gff_maker_CDS.py new file mode 100644 index 0000000..7b29f18 --- /dev/null +++ b/Plugins/read_gff_maker_CDS.py @@ -0,0 +1,50 @@ +#read_gff_maker_CDS.py + +import pandas as pd +import itertools +from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation + +def mergeLocations(_locationArray_): + return _locationArray_[0] if len(_locationArray_) == 1 else CompoundLocation(_locationArray_) + +class Plugin: + + def process(self, handle, metadata, calls:list=[], target=None): + locations = (handle.loc[(target[0], f"{target[1]}-mRNA-1", "CDS"),:].reset_index()) + + _sub_features_ = [ + SeqFeature( + mergeLocations(locations.apply(lambda location: FeatureLocation(int(location[0]), int(location[1]), (1,-1)[location[2] == "-"]), axis=1)), + type="CDS", + qualifiers={ + "gene":target[1], + "product":list(), + "note":list(), + "db_xref":list(), + "translation":list(), + "transl_table":metadata["transl_table"]})] + + #calls + receiver = [] + for call,*args in calls: + receiver.extend(call.process(*args, target=(target[0], f"{target[1]}-mRNA-1", "CDS"))) + + annotations = list(itertools.chain(receiver)) + for annotation in annotations: + if "product" in annotation.keys() and annotation["product"] != []: + for sub_feature in _sub_features_: + sub_feature.qualifiers["product"].extend(annotation["product"]) + + if "note" in annotation.keys() and annotation["note"] != []: + for sub_feature in _sub_features_: + sub_feature.qualifiers["note"].extend(annotation["note"]) + + if "db_xref" in annotation.keys() and annotation["db_xref"] != []: + for sub_feature in _sub_features_: + sub_feature.qualifiers["db_xref"].extend(annotation["db_xref"]) + + if "translation" in annotation.keys() and annotation["translation"] != []: + for sub_feature in _sub_features_: + sub_feature.qualifiers["translation"].extend(annotation["translation"]) + + return _sub_features_ \ No newline at end of file diff --git a/Plugins/read_gff_maker_exon.py b/Plugins/read_gff_maker_exon.py new file mode 100644 index 0000000..711c5ca --- /dev/null +++ b/Plugins/read_gff_maker_exon.py @@ -0,0 +1 @@ +#read_gff_maker_exon.py \ No newline at end of file diff --git a/Plugins/read_gff_maker_gene.py b/Plugins/read_gff_maker_gene.py new file mode 100644 index 0000000..069b66b --- /dev/null +++ b/Plugins/read_gff_maker_gene.py @@ -0,0 +1,31 @@ +#read_gff_maker_gene.py + +import pandas as pd +import itertools +from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation + +class Plugin: + + def process(self, handle, metadata, calls:list=[], target=None): + location = (handle.loc[(target[0], target[1], "gene"),:].reset_index()) + + _sub_features_ = [ + SeqFeature( + FeatureLocation(int(location.iloc[0,0]), int(location.iloc[0,1]), (1,-1)[location.iloc[0,2] == "-"]), + type="gene", + qualifiers={ + "gene":target[1], + "note":list()})] + + #calls + receiver = [] + for call,*args in calls: + receiver.extend(call.process(*args, target=(target[0], target[1], "gene"))) + + annotations = list(itertools.chain(receiver)) + for annotation in annotations: + if "note" in annotation.keys() and annotation["note"] != []: + for sub_feature in _sub_features_: + sub_feature.qualifiers["note"].extend(annotation["note"]) + + return _sub_features_ \ No newline at end of file diff --git a/Plugins/read_gff_maker_mRNA.py b/Plugins/read_gff_maker_mRNA.py new file mode 100644 index 0000000..7639664 --- /dev/null +++ b/Plugins/read_gff_maker_mRNA.py @@ -0,0 +1,27 @@ +#read_gff_maker_mRNA.py + +import pandas as pd +from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation + +def mergeLocations(_locationArray_): + return _locationArray_[0] if len(_locationArray_) == 1 else CompoundLocation(_locationArray_) + +class Plugin: + + def process(self, handle, metadata, calls:list=[], target=None): + locations = (handle.loc[(target[0], f"{target[1]}-mRNA-1", "CDS"),:].reset_index()) + + #initialise + _sub_features_ = [ + SeqFeature( + mergeLocations(locations.apply(lambda location: FeatureLocation(int(location[0]), int(location[1]), (1,-1)[location[2] == "-"]), axis=1)), + type="mRNA", + qualifiers={ + "gene":target[1]})] + + #calls + receiver = [] + for call,*args in calls: + receiver.extend(call.process(*args, target=(target[0], f"{target[1]}-mRNA-1", "mRNA"))) + + return _sub_features_ \ No newline at end of file diff --git a/Plugins/read_gff_maker_main.py b/Plugins/read_gff_maker_main.py new file mode 100644 index 0000000..2043ade --- /dev/null +++ b/Plugins/read_gff_maker_main.py @@ -0,0 +1,23 @@ +#read_gff_maker_main.py + +import pandas as pd +import re + +from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation + +class Plugin: + + def process(self, handle, metadata, calls:list=[], target=None): + + for gene in handle.loc[(target, slice(None), "gene"),:].reset_index()["sub_seq_id"]: + + #initialize features + _features_subset_ = [] + + #calls + receiver = [] + for call,*args in calls: + receiver.extend(call.process(*args, target=(target, gene))) + + _features_subset_ = receiver + yield _features_subset_ \ No newline at end of file diff --git a/Plugins/read_gff_maker_source.py b/Plugins/read_gff_maker_source.py new file mode 100644 index 0000000..f8c0bce --- /dev/null +++ b/Plugins/read_gff_maker_source.py @@ -0,0 +1 @@ +#read_gff_maker_source.py \ No newline at end of file diff --git a/Plugins/read_tab_pannzer_CDS.py b/Plugins/read_tab_pannzer_CDS.py new file mode 100644 index 0000000..7b0fd2e --- /dev/null +++ b/Plugins/read_tab_pannzer_CDS.py @@ -0,0 +1,48 @@ +#read_tab_pannzer_CDS + +import pandas as pd + +class Plugin: + + def process(self, handle, metadata, calls:list=[], target=None): + + #initialisation + try: + anno_bp = handle.loc[(target[1], "BP_ARGOT"),:].reset_index()["id"] + except KeyError: + anno_bp = pd.Series([]) + + try: + anno_cc = handle.loc[(target[1], "CC_ARGOT"),:].reset_index()["id"] + except KeyError: + anno_cc = pd.Series([]) + + try: + anno_mf = handle.loc[(target[1], "MF_ARGOT"),:].reset_index()["id"] + except KeyError: + anno_mf = pd.Series([]) + + try: + anno_qsec = [handle.loc[(target[1], "qseq"),:].reset_index().iloc[0,1]] + except KeyError: + anno_qsec = list() + + try: + anno_de = [handle.loc[(target[1], "DE"),:].reset_index().iloc[0,1]] + except KeyError: + anno_de = list() + + _annotations_ = [{ + "db_xref":[f"GO:{str(go)}" for go in pd.concat([anno_bp, anno_cc, anno_mf])], + "translation": anno_qsec, + "product": anno_de + }] + + + #calls + receiver = [] + for call,*args in calls: + receiver.extend(call.process(*args, target=target)) + + #output + return _annotations_ \ No newline at end of file diff --git a/Plugins/read_tab_pannzer_gene.py b/Plugins/read_tab_pannzer_gene.py new file mode 100644 index 0000000..521797e --- /dev/null +++ b/Plugins/read_tab_pannzer_gene.py @@ -0,0 +1,28 @@ +#read_tab_pannzer_gene.py + +#read_tab_pannzer_CDS + +import pandas as pd + +class Plugin: + + def process(self, handle, metadata, calls:list=[], target=None): + + #initialisation + try: + anno_de = handle.loc[(target[1], "DE"),:].reset_index().iloc[0,1] + except KeyError: + anno_de = [] + + _annotations_ = [{ + "note": anno_de + }] + + + #calls + receiver = [] + for call,*args in calls: + receiver.extend(call.process(*args, target=target)) + + #output + return _annotations_ \ No newline at end of file diff --git a/core.py b/core.py new file mode 100644 index 0000000..03dfa18 --- /dev/null +++ b/core.py @@ -0,0 +1,13 @@ +#core.py + +import importlib + +class app: + def __init__(self, plugins:list=[]): + assert plugins != [], "No plugins specified" + + self.plugins = plugins + + def run(self): + for plugin,*args in self.plugins: + plugin.process(*args) \ No newline at end of file diff --git a/main.py b/main.py index 4e941a0..479331e 100644 --- a/main.py +++ b/main.py @@ -1,153 +1,83 @@ -#!/usr/bin/python3 -# -*-coding:utf8 - -from Bio import SeqIO -from Bio.Seq import Seq -from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation -from Bio.SeqRecord import SeqRecord -from joblib import Parallel, delayed -from tqdm import tqdm -import multiprocessing -import os -import pandas as pd -import re -import sys - -def read_anno(file): - anno = pd.read_csv(file, sep="\t") - anno = anno.sort_values(by=["qpid", "type"]).drop(["score", "PPV"], axis=1).set_index(["qpid", "type", "id", "desc"]) - return anno - -def read_gff(file): - gff = pd.read_csv(file, sep="\t") - gff = gff.reset_index() - gff.columns = ["seq_id", "source", "ft_type", "start", "stop", "score", "strand", "phase", "attr"] - gff = gff.sort_values(by=["seq_id"]).drop(["source", "score", "phase"], axis=1).dropna() - gff["sub_seq_id"] = [re.split(r':',re.search("^ID=.*?;", x).group(0)[3:-1])[0] for x in gff["attr"]] - gff = gff[["seq_id", "sub_seq_id", "ft_type", "start", "stop", "strand"]].sort_values(by=["seq_id", "sub_seq_id", "ft_type"]).set_index(["seq_id", "sub_seq_id", "ft_type", "start", "stop", "strand"]) - gff["start"].apply(lambda x: x-1) - return gff - -def read_config(file): - config = {"description":None, "division":None, "molecule_type":None, "organism":None, "project":None, "taxonomy":None, "topology":None, "transl_table":None} - for line in [line.rsplit("\n")[0] for line in file.readlines()]: - config[re.split(r":", line)[0].lower()]=re.split(r":", line)[1] - return config - -def mergeLocations(_locationArray_): - return _locationArray_[0] if len(_locationArray_) == 1 else CompoundLocation(_locationArray_) - -def init_features(sec, gene, gff): - ft_table = gff.loc[(sec, gene),:].reset_index() - ft_table_mRNA = ft_table[ft_table["ft_type"] == "mRNA"].iloc[0,:] - ft_table_CDSs = [x for i, x in ft_table[ft_table["ft_type"] == "CDS"].iterrows()] - tmp_ft_table_3UTR = ft_table[ft_table["ft_type"] == "three_prime_UTR"] - ft_table_3UTR = pd.Series(dtype="float64") if tmp_ft_table_3UTR.empty else tmp_ft_table_3UTR.iloc[0,:] - tmp_ft_table_5UTR = ft_table[ft_table["ft_type"] == "five_prime_UTR"] - ft_table_5UTR = pd.Series(dtype="float64") if tmp_ft_table_5UTR.empty else tmp_ft_table_5UTR.iloc[0,:] - - return { - "location":FeatureLocation(int(ft_table_mRNA[1]),int(ft_table_mRNA[2]),(1,-1)[ft_table_mRNA[3] == "-"]), - "qualifiers":{"gene":gene,"note":list()}, - "type":"gene" - },{ - "location":mergeLocations([FeatureLocation(int(CDS[1]),int(CDS[2]),(1,-1)[CDS[3] == "-"]) for CDS in ft_table_CDSs]), - "qualifiers":{"gene":gene}, - "type":"mRNA" - },{ - "location":mergeLocations([FeatureLocation(int(CDS[1]),int(CDS[2]),(1,-1)[CDS[3] == "-"]) for CDS in ft_table_CDSs]), - "qualifiers":{"gene":gene,"product":list(),"note":list(),"db_xref":list(),"translation":list(),"transl_table":11}, - "type":"CDS" - },{ - "location": None if ft_table_3UTR.empty else FeatureLocation(int(ft_table_3UTR[1]), int(ft_table_3UTR[2]), (1,-1)[ft_table_3UTR[3] == "-"]), - "qualifiers":{"gene":gene}, - "type":"3'UTR" - },{ - "location":None if ft_table_5UTR.empty else FeatureLocation(int(ft_table_5UTR[1]), int(ft_table_5UTR[2]), (1,-1)[ft_table_5UTR[3] == "-"]), - "qualifiers":{"gene":gene}, - "type":"5'UTR"} - -def merge(record, anno, gff, conf, out_dir): - sec = record.id - - #Record initialisation - _record_ = SeqRecord( - record.seq, - id=sec, - dbxrefs=["Project:" + conf["project"]], - annotations={"division":conf["division"],"molecule_type":conf["molecule_type"],"organism":conf["organism"],"taxonomy":conf["taxonomy"],"topology":conf["topology"]}, - description=conf["description"]) - - #Source feature - ft_table = gff.loc[(sec, slice(None), "contig"), :].reset_index() - _source_ = SeqFeature(FeatureLocation(int(ft_table.iloc[0,3]),int(ft_table.iloc[0,4]),(1,-1)[ft_table.iloc[0,5] == "-"]),type="source",qualifiers={"organism":"test","mol_type":"genomic DNA","db_xref":list()}) - _record_.features.append(_source_) - - #GENE/MRNA/CDS/3UTR/5UTR features - for gene in gff.loc[(sec, slice(None), "gene"),:].reset_index()["sub_seq_id"].apply(lambda x: x+"-mRNA-1"): - _gene_, _mRNA_, _CDS_, _3UTR_, _5UTR_ = init_features(sec, gene, gff) - - try: - anno_table = anno.loc[(gene),:].reset_index() - anno_bp = anno.loc[(gene, "BP_ARGOT"),:].reset_index()["id"] - anno_cc = anno.loc[(gene, "CC_ARGOT"),:].reset_index()["id"] - anno_mf = anno.loc[(gene, "MF_ARGOT"),:].reset_index()["id"] - _CDS_["qualifiers"]["db_xref"] = ["GO:" + str(go) for go in pd.concat([anno_bp, anno_cc, anno_mf])] - _CDS_["qualifiers"]["translation"] = anno.loc[(gene, "qseq"),:].reset_index().iloc[0,1] - _CDS_["qualifiers"]["transl_table"] = conf["transl_table"] - _gene_["qualifiers"]["note"] = _CDS_["qualifiers"]["product"] = anno.loc[(gene, "DE"),:].reset_index().iloc[0,1] - except KeyError: - pass - - for feature in [_gene_, _mRNA_, _CDS_, _3UTR_, _5UTR_]: - if feature["location"]: - _record_.features.append(SeqFeature(feature["location"], type=feature["type"], qualifiers=feature["qualifiers"])) - - #Exon feature - ft_table = gff.loc[sec, slice(None), "exon"].reset_index() - _record_.features.extend([SeqFeature(FeatureLocation(int(exon["start"]), int(exon["stop"]), (1,-1)[exon["strand"]=="-"]), type="exon", qualifiers={}) for i,exon in ft_table.iterrows()]) - - #Print EMBL entry in output folder - with open(out_dir + "/" + sec + ".dat", "w") as file: - print(_record_.format("embl"), file=file) - file.close() - -if __name__ == "__main__": - - GFF_FILE = FASTA_FILE = ANNO_FILE = CONF_FILE = None - OUT_DIR = "out" - - args = sys.argv[1:] - for i in [0,2,4,6,8]: - if args[i] in ["-gff", "-g"]: GFF_FILE = args[i+1] - elif args[i] in ["-fasta", "-f"]: FASTA_FILE = args[i+1] - elif args[i] in ["-anno", "-a"]: ANNO_FILE = args[i+1] - elif args[i] in ["-conf", "-c"]: CONF_FILE = args[i+1] - elif args[i] in ["-out", "-o"]: OUT_DIR = args[i+1] - - if not os.path.exists(OUT_DIR): - os.makedirs(OUT_DIR) - - CONF = None - with open(CONF_FILE) as conf_file: - CONF = read_config(conf_file) - conf_file.close() - print("Configuration file reading : DONE!") - - ANNO = None - with open(ANNO_FILE) as anno_file: - ANNO = read_anno(anno_file) - anno_file.close() - print("Annotation file reading : DONE!") - - GFF = None - with open(GFF_FILE) as gff_file: - GFF = read_gff(gff_file) - gff_file.close() - print("Prediction file reading : DONE!") - - with open(FASTA_FILE) as fasta_file: - num_cores = multiprocessing.cpu_count() - records = list(SeqIO.parse(fasta_file, "fasta")) - processed_list = Parallel(n_jobs=num_cores)(delayed(merge)(record, ANNO, GFF, CONF, OUT_DIR) for record in tqdm(records)) - fasta_file.close() \ No newline at end of file +#main.py + +from core import app + +from Bio import SeqIO +import pandas as pd +import re +import importlib + +def fasta2handle(file_path): + with open(file_path) as handle: + return list(SeqIO.parse(handle, "fasta")) + +def gff_maker2handle(file_path): + with open(file_path) as handle: + gff = pd.read_csv(handle, sep="\t") + gff = gff.reset_index() + gff.columns = ["seq_id", "source", "ft_type", "start", "stop", "score", "strand", "phase", "attr"] + gff = gff.sort_values(by=["seq_id"]).drop(["source", "score", "phase"], axis=1).dropna() + gff["sub_seq_id"] = [re.split(r':',re.search("^ID=.*?;", x).group(0)[3:-1])[0] for x in gff["attr"]] + return gff[["seq_id", "sub_seq_id", "ft_type", "start", "stop", "strand"]].sort_values(by=["seq_id", "sub_seq_id", "ft_type"]).set_index(["seq_id", "sub_seq_id", "ft_type", "start", "stop", "strand"]) + +def tab_pannzer2handle(file_path): + with open(file_path) as handle: + anno = pd.read_csv(handle, sep="\t") + anno = anno.sort_values(by=["qpid", "type"]).drop(["score", "PPV"], axis=1).set_index(["qpid", "type", "id", "desc"]) + return anno + +if __name__ == "__main__": + + _GLOBALS_ = { + "handles":dict(), + "plugins":dict(), + "metadata":dict() + } + + _GLOBALS_["handles"]["fasta"] = fasta2handle("files/sequences.fasta") + _GLOBALS_["handles"]["gff_maker"] = gff_maker2handle("files/data.gff") + _GLOBALS_["handles"]["tab_panzer"] = tab_pannzer2handle("files/anno.out") + + _GLOBALS_["plugins"]["read_fasta"] = importlib.import_module(".read_fasta","Plugins").Plugin() + + _GLOBALS_["plugins"]["read_gff_maker_3UTR"] = importlib.import_module(".read_gff_maker_3UTR","Plugins").Plugin() + _GLOBALS_["plugins"]["read_gff_maker_5UTR"] = importlib.import_module(".read_gff_maker_5UTR","Plugins").Plugin() + _GLOBALS_["plugins"]["read_gff_maker_CDS"] = importlib.import_module(".read_gff_maker_CDS","Plugins").Plugin() + _GLOBALS_["plugins"]["read_gff_maker_exon"] = importlib.import_module(".read_gff_maker_exon","Plugins").Plugin() + _GLOBALS_["plugins"]["read_gff_maker_gene"] = importlib.import_module(".read_gff_maker_gene","Plugins").Plugin() + _GLOBALS_["plugins"]["read_gff_maker_main"] = importlib.import_module(".read_gff_maker_main","Plugins").Plugin() + _GLOBALS_["plugins"]["read_gff_maker_mRNA"] = importlib.import_module(".read_gff_maker_mRNA","Plugins").Plugin() + _GLOBALS_["plugins"]["read_gff_maker_source"] = importlib.import_module(".read_gff_maker_source","Plugins").Plugin() + + _GLOBALS_["plugins"]["read_tab_pannzer_CDS"] = importlib.import_module(".read_tab_pannzer_CDS","Plugins").Plugin() + _GLOBALS_["plugins"]["read_tab_pannzer_gene"] = importlib.import_module(".read_tab_pannzer_gene","Plugins").Plugin() + + + _GLOBALS_["metadata"]["project"] = "temp" + _GLOBALS_["metadata"]["division"] = "INV" + _GLOBALS_["metadata"]["taxonomy"] = "29031" + _GLOBALS_["metadata"]["organism"] = "Phlebotomus papatasi" + _GLOBALS_["metadata"]["molecule_type"] = "genomic DNA" + _GLOBALS_["metadata"]["topology"] = "linear" + _GLOBALS_["metadata"]["description"] = "description" + _GLOBALS_["metadata"]["transl_table"] = 0 + + app = app( + [ + (_GLOBALS_["plugins"]["read_fasta"], _GLOBALS_["handles"]["fasta"], _GLOBALS_["metadata"], [ + (_GLOBALS_["plugins"]["read_gff_maker_main"], _GLOBALS_["handles"]["gff_maker"], _GLOBALS_["metadata"], [ + (_GLOBALS_["plugins"]["read_gff_maker_source"], _GLOBALS_["handles"]["gff_maker"], _GLOBALS_["metadata"],[]) + (_GLOBALS_["plugins"]["read_gff_maker_gene"], _GLOBALS_["handles"]["gff_maker"], _GLOBALS_["metadata"], [ + (_GLOBALS_["plugins"]["read_tab_pannzer_gene"], _GLOBALS_["handles"]["tab_panzer"], _GLOBALS_["metadata"], []) + ]), + (_GLOBALS_["plugins"]["read_gff_maker_mRNA"], _GLOBALS_["handles"]["gff_maker"], _GLOBALS_["metadata"], []), + (_GLOBALS_["plugins"]["read_gff_maker_CDS"], _GLOBALS_["handles"]["gff_maker"], _GLOBALS_["metadata"], [ + (_GLOBALS_["plugins"]["read_tab_pannzer_CDS"], _GLOBALS_["handles"]["tab_panzer"], _GLOBALS_["metadata"], []) + ]), + (_GLOBALS_["plugins"]["read_gff_maker_3UTR"], _GLOBALS_["handles"]["gff_maker"], _GLOBALS_["metadata"], []), + (_GLOBALS_["plugins"]["read_gff_maker_5UTR"], _GLOBALS_["handles"]["gff_maker"], _GLOBALS_["metadata"], []) + ]), + (_GLOBALS_["plugins"]["read_gff_maker_exon"], _GLOBALS_["handles"]["gff_maker"], _GLOBALS_["metadata"], []) + ]) + ]) + app.run() \ No newline at end of file From a3ad89bf3fd545db62f60b7bcf25f1fd15c76094 Mon Sep 17 00:00:00 2001 From: AxelGiottonini <75320942+AxelGiottonini@users.noreply.github.com> Date: Fri, 14 May 2021 08:55:22 +0200 Subject: [PATCH 02/22] Delete read_gff_maker_source.py removed this plugin as it is contained in the read_gff_maker_main plugin. --- Plugins/read_gff_maker_source.py | 1 - 1 file changed, 1 deletion(-) delete mode 100644 Plugins/read_gff_maker_source.py diff --git a/Plugins/read_gff_maker_source.py b/Plugins/read_gff_maker_source.py deleted file mode 100644 index f8c0bce..0000000 --- a/Plugins/read_gff_maker_source.py +++ /dev/null @@ -1 +0,0 @@ -#read_gff_maker_source.py \ No newline at end of file From 128eb82a0a81c3cd0a7b205f4362624248b23a1d Mon Sep 17 00:00:00 2001 From: AxelGiottonini <75320942+AxelGiottonini@users.noreply.github.com> Date: Fri, 14 May 2021 08:57:22 +0200 Subject: [PATCH 03/22] upload refering to read_gff_maker_source removal --- main.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/main.py b/main.py index 479331e..59cd318 100644 --- a/main.py +++ b/main.py @@ -47,7 +47,6 @@ def tab_pannzer2handle(file_path): _GLOBALS_["plugins"]["read_gff_maker_gene"] = importlib.import_module(".read_gff_maker_gene","Plugins").Plugin() _GLOBALS_["plugins"]["read_gff_maker_main"] = importlib.import_module(".read_gff_maker_main","Plugins").Plugin() _GLOBALS_["plugins"]["read_gff_maker_mRNA"] = importlib.import_module(".read_gff_maker_mRNA","Plugins").Plugin() - _GLOBALS_["plugins"]["read_gff_maker_source"] = importlib.import_module(".read_gff_maker_source","Plugins").Plugin() _GLOBALS_["plugins"]["read_tab_pannzer_CDS"] = importlib.import_module(".read_tab_pannzer_CDS","Plugins").Plugin() _GLOBALS_["plugins"]["read_tab_pannzer_gene"] = importlib.import_module(".read_tab_pannzer_gene","Plugins").Plugin() @@ -66,7 +65,6 @@ def tab_pannzer2handle(file_path): [ (_GLOBALS_["plugins"]["read_fasta"], _GLOBALS_["handles"]["fasta"], _GLOBALS_["metadata"], [ (_GLOBALS_["plugins"]["read_gff_maker_main"], _GLOBALS_["handles"]["gff_maker"], _GLOBALS_["metadata"], [ - (_GLOBALS_["plugins"]["read_gff_maker_source"], _GLOBALS_["handles"]["gff_maker"], _GLOBALS_["metadata"],[]) (_GLOBALS_["plugins"]["read_gff_maker_gene"], _GLOBALS_["handles"]["gff_maker"], _GLOBALS_["metadata"], [ (_GLOBALS_["plugins"]["read_tab_pannzer_gene"], _GLOBALS_["handles"]["tab_panzer"], _GLOBALS_["metadata"], []) ]), From 6ce0ed2a658cc569fdb1c7b8d172426efa1ac610 Mon Sep 17 00:00:00 2001 From: AxelGiottonini <75320942+AxelGiottonini@users.noreply.github.com> Date: Fri, 14 May 2021 08:59:18 +0200 Subject: [PATCH 04/22] upload refering to read_gff_maker_source removal --- Plugins/__init__.py | 1 - Plugins/read_gff_maker_main.py | 12 ++++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/Plugins/__init__.py b/Plugins/__init__.py index 6fc65d5..8937431 100644 --- a/Plugins/__init__.py +++ b/Plugins/__init__.py @@ -8,7 +8,6 @@ from Plugins.read_gff_maker_CDS import Plugin from Plugins.read_gff_maker_exon import Plugin from Plugins.read_gff_maker_mRNA import Plugin -from Plugins.read_gff_maker_source import Plugin from Plugins.read_tab_pannzer_CDS import Plugin from Plugins.read_tab_pannzer_gene import Plugin \ No newline at end of file diff --git a/Plugins/read_gff_maker_main.py b/Plugins/read_gff_maker_main.py index 2043ade..eac0f9c 100644 --- a/Plugins/read_gff_maker_main.py +++ b/Plugins/read_gff_maker_main.py @@ -9,6 +9,18 @@ class Plugin: def process(self, handle, metadata, calls:list=[], target=None): + location = (handle.loc[(target, slice(None), "contig"),:].reset_index()) + _feature_ = [ + SeqFeature( + FeatureLocation(int(location.iloc[0,3]), int(location.iloc[0,4]), (1,-1)[location.iloc[0,5] == "-"]), + type="source", + qualifiers={ + "oganism":metadata["organism"], + "mol_type":metadata["molecule_type"], + "db_xref":list()})] + + yield _feature_ + for gene in handle.loc[(target, slice(None), "gene"),:].reset_index()["sub_seq_id"]: #initialize features From c0f313761cd6ca2e69ec9a5ed41052c1d44e3a73 Mon Sep 17 00:00:00 2001 From: AxelGiottonini <75320942+AxelGiottonini@users.noreply.github.com> Date: Fri, 14 May 2021 09:00:02 +0200 Subject: [PATCH 05/22] read_gff_maker_exon set up --- Plugins/read_gff_maker_exon.py | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/Plugins/read_gff_maker_exon.py b/Plugins/read_gff_maker_exon.py index 711c5ca..e4b84ef 100644 --- a/Plugins/read_gff_maker_exon.py +++ b/Plugins/read_gff_maker_exon.py @@ -1 +1,28 @@ -#read_gff_maker_exon.py \ No newline at end of file +#read_gff_maker_exon.py + +import pandas as pd +import re + +from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation + + +class Plugin: + + def process(self, handle, metadata, calls:list=[], target=None): + exons = handle.loc[(target, slice(None), "exon"),:].reset_index() + + for index, exon in exons.iterrows(): + _features_subset_ = [ + SeqFeature( + FeatureLocation(int(exon["start"]), int(exon["stop"]), (1,-1)[exon["strand"] == "-"]), + type="exon", + qualifiers={} + )] + + #calls + receiver = [] + for call, *args in calls: + receiver.extend(call.process(*args, target=(target))) + + + yield _features_subset_ From 0ec1b6c891cbb0eefc3cbfe375b1d515b3386af2 Mon Sep 17 00:00:00 2001 From: AxelGiottonini <75320942+AxelGiottonini@users.noreply.github.com> Date: Fri, 14 May 2021 14:04:56 +0200 Subject: [PATCH 06/22] Create config.info (example file) --- files/config.info | 49 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 files/config.info diff --git a/files/config.info b/files/config.info new file mode 100644 index 0000000..1aa75e3 --- /dev/null +++ b/files/config.info @@ -0,0 +1,49 @@ +# Axel Giottonini +# test config.info +# 14.05.2021 + + +project:temp +division:inv +taxonomy:29031 +organism:Phlebotomus papatasi +molecule_type:genomic DNA +topology:linear +description:empty +transl_table:0 + + + +fasta2handle:.fasta2handle,Plugins +gff_maker2handle:.gff_maker2handle,Plugins +tab_pannzer2handle:.tab_pannzer2handle,Plugins +read_fasta:.read_fasta,Plugins +read_gff_maker_3UTR:.read_gff_maker_3UTR,Plugins +read_gff_maker_5UTR:.read_gff_maker_5UTR,Plugins +read_gff_maker_CDS:.read_gff_maker_CDS,Plugins +read_gff_maker_exon:.read_gff_maker_exon,Plugins +read_gff_maker_gene:.read_gff_maker_gene,Plugins +read_gff_maker_main:.read_gff_maker_main,Plugins +read_gff_maker_mRNA:.read_gff_maker_mRNA,Plugins +read_tab_pannzer_CDS:.read_tab_pannzer_CDS,Plugins +read_tab_pannzer_gene:.read_tab_pannzer_gene,Plugins + + + +fasta:fasta2handle,files/sequences.fasta +gff_maker:gff_maker2handle,files/data.gff +tab_pannzer:tab_pannzer2handle,files/anno.out + + + +-read_fasta,fasta +--read_gff_maker_main,gff_maker +---read_gff_maker_gene,gff_maker +----read_tab_pannzer_gene,tab_pannzer +---read_gff_maker_mRNA,gff_maker +---read_gff_maker_CDS,gff_maker +----read_tab_pannzer_CDS,tab_pannzer +---read_gff_maker_3UTR,gff_maker +---read_gff_maker_5UTR,gff_maker +--read_gff_maker_exon,gff_maker + From e17f02fe47705552fbdae9b16bb6b03291509e6b Mon Sep 17 00:00:00 2001 From: AxelGiottonini <75320942+AxelGiottonini@users.noreply.github.com> Date: Fri, 14 May 2021 14:06:07 +0200 Subject: [PATCH 07/22] Create plugins for handling files --- Plugins/__init__.py | 4 ++++ Plugins/fasta2handle.py | 9 +++++++++ Plugins/gff_maker2handle.py | 16 ++++++++++++++++ Plugins/tab_pannzer2handle.py | 11 +++++++++++ 4 files changed, 40 insertions(+) create mode 100644 Plugins/fasta2handle.py create mode 100644 Plugins/gff_maker2handle.py create mode 100644 Plugins/tab_pannzer2handle.py diff --git a/Plugins/__init__.py b/Plugins/__init__.py index 8937431..583ac6d 100644 --- a/Plugins/__init__.py +++ b/Plugins/__init__.py @@ -1,5 +1,9 @@ #__init.py__ +from Plugins.fasta2handle import Plugin +from Plugins.gff_maker2handle import Plugin +from Plugins.tab_pannzer2handle import Plugin + from Plugins.read_fasta import Plugin from Plugins.read_gff_maker_gene import Plugin diff --git a/Plugins/fasta2handle.py b/Plugins/fasta2handle.py new file mode 100644 index 0000000..b9e319b --- /dev/null +++ b/Plugins/fasta2handle.py @@ -0,0 +1,9 @@ +#fasta2handle.py + +from Bio import SeqIO + +class Plugin: + + def process(self, file_path): + with open(file_path) as handle: + return list(SeqIO.parse(handle, "fasta")) \ No newline at end of file diff --git a/Plugins/gff_maker2handle.py b/Plugins/gff_maker2handle.py new file mode 100644 index 0000000..39124c2 --- /dev/null +++ b/Plugins/gff_maker2handle.py @@ -0,0 +1,16 @@ +#gff_maker2handle.py + +import pandas as pd +import re + +class Plugin: + + def process(self, file_path): + with open(file_path) as handle: + gff = pd.read_csv(handle, sep="\t") + gff = gff.reset_index() + gff.columns = ["seq_id", "source", "ft_type", "start", "stop", "score", "strand", "phase", "attr"] + gff = gff.sort_values(by=["seq_id"]).drop(["source", "score", "phase"], axis=1).dropna() + gff["sub_seq_id"] = [re.split(r':',re.search("^ID=.*?;", x).group(0)[3:-1])[0] for x in gff["attr"]] + return gff[["seq_id", "sub_seq_id", "ft_type", "start", "stop", "strand"]].sort_values(by=["seq_id", "sub_seq_id", "ft_type"]).set_index(["seq_id", "sub_seq_id", "ft_type", "start", "stop", "strand"]) + \ No newline at end of file diff --git a/Plugins/tab_pannzer2handle.py b/Plugins/tab_pannzer2handle.py new file mode 100644 index 0000000..ac7c532 --- /dev/null +++ b/Plugins/tab_pannzer2handle.py @@ -0,0 +1,11 @@ +#tab_pannzer2handle.py + +import pandas as pd + +class Plugin: + + def process(self, file_path): + with open(file_path) as handle: + anno = pd.read_csv(handle, sep="\t") + anno = anno.sort_values(by=["qpid", "type"]).drop(["score", "PPV"], axis=1).set_index(["qpid", "type", "id", "desc"]) + return anno \ No newline at end of file From d5a80b94f85394f0f4d6ce0d157a66d1f7338fe5 Mon Sep 17 00:00:00 2001 From: AxelGiottonini <75320942+AxelGiottonini@users.noreply.github.com> Date: Fri, 14 May 2021 14:07:02 +0200 Subject: [PATCH 08/22] Upload main to work with the new plugin (2handle) and config file --- main.py | 111 +++++++++++++++++++++++++++----------------------------- 1 file changed, 54 insertions(+), 57 deletions(-) diff --git a/main.py b/main.py index 59cd318..4d97e6f 100644 --- a/main.py +++ b/main.py @@ -7,25 +7,6 @@ import re import importlib -def fasta2handle(file_path): - with open(file_path) as handle: - return list(SeqIO.parse(handle, "fasta")) - -def gff_maker2handle(file_path): - with open(file_path) as handle: - gff = pd.read_csv(handle, sep="\t") - gff = gff.reset_index() - gff.columns = ["seq_id", "source", "ft_type", "start", "stop", "score", "strand", "phase", "attr"] - gff = gff.sort_values(by=["seq_id"]).drop(["source", "score", "phase"], axis=1).dropna() - gff["sub_seq_id"] = [re.split(r':',re.search("^ID=.*?;", x).group(0)[3:-1])[0] for x in gff["attr"]] - return gff[["seq_id", "sub_seq_id", "ft_type", "start", "stop", "strand"]].sort_values(by=["seq_id", "sub_seq_id", "ft_type"]).set_index(["seq_id", "sub_seq_id", "ft_type", "start", "stop", "strand"]) - -def tab_pannzer2handle(file_path): - with open(file_path) as handle: - anno = pd.read_csv(handle, sep="\t") - anno = anno.sort_values(by=["qpid", "type"]).drop(["score", "PPV"], axis=1).set_index(["qpid", "type", "id", "desc"]) - return anno - if __name__ == "__main__": _GLOBALS_ = { @@ -34,48 +15,64 @@ def tab_pannzer2handle(file_path): "metadata":dict() } - _GLOBALS_["handles"]["fasta"] = fasta2handle("files/sequences.fasta") - _GLOBALS_["handles"]["gff_maker"] = gff_maker2handle("files/data.gff") - _GLOBALS_["handles"]["tab_panzer"] = tab_pannzer2handle("files/anno.out") + _PROCESSING_ = { + "metadata":[], + "plugins":[], + "handles":[], + "workflow":[] + } + current_field = None + with open("files/config.info") as handle: + for line in handle: + current_line = line.rstrip("\n") + + if line[0] == "#": continue + if current_line == "": continue - _GLOBALS_["plugins"]["read_fasta"] = importlib.import_module(".read_fasta","Plugins").Plugin() + if current_field: + assert current_field in ["metadata", "plugins", "handles", "workflow"] + if re.match(r"^(<\/)(\w+)(>)$", current_line): + current_field = None + else: + _PROCESSING_[current_field].append(current_line) + else: + assert re.match(r"^(<)(\w+)(>)$", current_line) + assert current_line[1:-1] in ["metadata", "plugins", "handles", "workflow"] + current_field = current_line[1:-1] + handle.close() - _GLOBALS_["plugins"]["read_gff_maker_3UTR"] = importlib.import_module(".read_gff_maker_3UTR","Plugins").Plugin() - _GLOBALS_["plugins"]["read_gff_maker_5UTR"] = importlib.import_module(".read_gff_maker_5UTR","Plugins").Plugin() - _GLOBALS_["plugins"]["read_gff_maker_CDS"] = importlib.import_module(".read_gff_maker_CDS","Plugins").Plugin() - _GLOBALS_["plugins"]["read_gff_maker_exon"] = importlib.import_module(".read_gff_maker_exon","Plugins").Plugin() - _GLOBALS_["plugins"]["read_gff_maker_gene"] = importlib.import_module(".read_gff_maker_gene","Plugins").Plugin() - _GLOBALS_["plugins"]["read_gff_maker_main"] = importlib.import_module(".read_gff_maker_main","Plugins").Plugin() - _GLOBALS_["plugins"]["read_gff_maker_mRNA"] = importlib.import_module(".read_gff_maker_mRNA","Plugins").Plugin() + for element in _PROCESSING_["metadata"]: + _GLOBALS_["metadata"][element.split(":")[0]] = element.split(":")[1] + for element in _PROCESSING_["plugins"]: + _GLOBALS_["plugins"][element.split(":")[0]] = importlib.import_module(element.split(":")[1].split(",")[0],element.split(":")[1].split(",")[1]).Plugin() + for element in _PROCESSING_["handles"]: + _GLOBALS_["handles"][element.split(":")[0]] = _GLOBALS_["plugins"][element.split(":")[1].split(",")[0]].process(element.split(":")[1].split(",")[1]) - _GLOBALS_["plugins"]["read_tab_pannzer_CDS"] = importlib.import_module(".read_tab_pannzer_CDS","Plugins").Plugin() - _GLOBALS_["plugins"]["read_tab_pannzer_gene"] = importlib.import_module(".read_tab_pannzer_gene","Plugins").Plugin() + max_level = 0 + for i in range(len(_PROCESSING_["workflow"])): + element = _PROCESSING_["workflow"][i] + regex = re.compile(r"^(-)+") + level = len(regex.search(element).group()) + max_level = max(max_level, level) + _PROCESSING_["workflow"][i] = (level, element[level:], []) + _PROCESSING_["workflow"].insert(0, (0,None,[])) - _GLOBALS_["metadata"]["project"] = "temp" - _GLOBALS_["metadata"]["division"] = "INV" - _GLOBALS_["metadata"]["taxonomy"] = "29031" - _GLOBALS_["metadata"]["organism"] = "Phlebotomus papatasi" - _GLOBALS_["metadata"]["molecule_type"] = "genomic DNA" - _GLOBALS_["metadata"]["topology"] = "linear" - _GLOBALS_["metadata"]["description"] = "description" - _GLOBALS_["metadata"]["transl_table"] = 0 + for i in range(max_level, -1, -1): + for j in range(len(_PROCESSING_["workflow"])): + element = _PROCESSING_["workflow"][j] + if element[0] == i: + for k in range(1, j+1): + if _PROCESSING_["workflow"][j-k] is not None and _PROCESSING_["workflow"][j-k][0] == i-1: + _PROCESSING_["workflow"][j-k][2].append(( + _GLOBALS_["plugins"][_PROCESSING_["workflow"][j][1].split(",")[0]], + _GLOBALS_["handles"][_PROCESSING_["workflow"][j][1].split(",")[1]], + _GLOBALS_["metadata"], + _PROCESSING_["workflow"][j][2] + )) + _PROCESSING_["workflow"][j] = None + break + _PROCESSING_["workflow"] = [x for x in _PROCESSING_["workflow"] if x is not None] - app = app( - [ - (_GLOBALS_["plugins"]["read_fasta"], _GLOBALS_["handles"]["fasta"], _GLOBALS_["metadata"], [ - (_GLOBALS_["plugins"]["read_gff_maker_main"], _GLOBALS_["handles"]["gff_maker"], _GLOBALS_["metadata"], [ - (_GLOBALS_["plugins"]["read_gff_maker_gene"], _GLOBALS_["handles"]["gff_maker"], _GLOBALS_["metadata"], [ - (_GLOBALS_["plugins"]["read_tab_pannzer_gene"], _GLOBALS_["handles"]["tab_panzer"], _GLOBALS_["metadata"], []) - ]), - (_GLOBALS_["plugins"]["read_gff_maker_mRNA"], _GLOBALS_["handles"]["gff_maker"], _GLOBALS_["metadata"], []), - (_GLOBALS_["plugins"]["read_gff_maker_CDS"], _GLOBALS_["handles"]["gff_maker"], _GLOBALS_["metadata"], [ - (_GLOBALS_["plugins"]["read_tab_pannzer_CDS"], _GLOBALS_["handles"]["tab_panzer"], _GLOBALS_["metadata"], []) - ]), - (_GLOBALS_["plugins"]["read_gff_maker_3UTR"], _GLOBALS_["handles"]["gff_maker"], _GLOBALS_["metadata"], []), - (_GLOBALS_["plugins"]["read_gff_maker_5UTR"], _GLOBALS_["handles"]["gff_maker"], _GLOBALS_["metadata"], []) - ]), - (_GLOBALS_["plugins"]["read_gff_maker_exon"], _GLOBALS_["handles"]["gff_maker"], _GLOBALS_["metadata"], []) - ]) - ]) + app = app(_PROCESSING_["workflow"][0][2]) app.run() \ No newline at end of file From 6b9f438d2520b6298deeb00424884403d84f21c5 Mon Sep 17 00:00:00 2001 From: AxelGiottonini <75320942+AxelGiottonini@users.noreply.github.com> Date: Sat, 15 May 2021 16:52:27 +0200 Subject: [PATCH 09/22] Refactoring --- core.py | 149 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- main.py | 72 +-------------------------- 2 files changed, 145 insertions(+), 76 deletions(-) diff --git a/core.py b/core.py index 03dfa18..3bbc502 100644 --- a/core.py +++ b/core.py @@ -1,13 +1,152 @@ #core.py import importlib +import re class app: - def __init__(self, plugins:list=[]): - assert plugins != [], "No plugins specified" + def __init__(self, config_path): + self.metadata = None + self.plugins = None + self.handles = None + self.workflow = None - self.plugins = plugins + config = self.read_config(config_path) + self.set_metadata(config["metadata"]) + self.set_plugins(config["plugins"]) + self.set_handles(config["handles"]) + self.set_workflow(config["workflow"]) + + """ + This function reads the config file which is divided into four fields : + - metadata: contains the general and shared informations for EMBLish + as a couple key:value + - plugins: contains the list of the plugins that will be used as a + triplet plugin_key:plugin_name,plugin_package + - handles: contains the list of files that will be used as inputs as + a triplet handle_key:plugin,file_path + - workflow: contains a hierarchical list of the different step to run + as a couple plugin_key,handle_key + """ + def read_config(self, config_path): + config = { + "metadata":[], + "plugins":[], + "handles":[], + "workflow":[] + } + + current_field = None + current_line = None + + with open(config_path) as handle: + for line in handle: + current_line = line.rstrip("\n") + + #Comments and blank lines ignore + if current_line == "": continue + if current_line[0] == "#": continue + + #Filling the config object with the content of the configuration file + if current_field: + assert current_field in ["metadata", "plugins", "handles", "workflow"] + + #Checking if we are at the end of a field + if re.match(r"^(<\/)(\w+)(>)$", current_line): + current_field = None + else: + config[current_field].append(current_line) + + else: + assert re.match(r"^(<)(\w+)(>)$", current_line) + assert current_line[1:-1] in ["metadata", "plugins", "handles", "workflow"] + + current_field = current_line[1:-1] + + handle.close() + return config + + """ + This function converts the array containing the metadata into a dictionnary + """ + def set_metadata(self, array): + self.metadata = {element.split(":")[0]:element.split(":")[1] for element in array} + + """ + This function converts the array containing the plugins parameters into a dictionnary + with plugins to call with their key + """ + def set_plugins(self, array): + self.plugins = {element.split(":")[0]:importlib.import_module(element.split(":")[1].split(",")[0],element.split(":")[1].split(",")[1]).Plugin() for element in array} + + """ + This function converts the array containing the handles parameters into a dictionnary + with handles to call with their key + """ + def set_handles(self, array): + self.handles = {element.split(":")[0]:self.plugins[element.split(":")[1].split(",")[0]].process(element.split(":")[1].split(",")[1]) for element in array} + + """ + """ + def set_workflow(self, array): + temp = self.refactor_workflow(array) + temp = self.merge_workflow(temp) + self.workflow = self.convert_workflow_task(temp[0]) + + """ + Convert the list element in triplet level,,[] + """ + def refactor_workflow(self, array): + + for i in range(len(array)): + element = array[i] + level = len(re.search(r"^(-)+", element).group()) + + array[i] = (level, element[level:], []) + + return array + + """ + Order the elements and create the hierarchical nodes + """ + def merge_workflow(self, array): + array.insert(0, (0,None,[])) + + max_level = max(array, key = lambda element: element[0])[0] + + for i in range(max_level, -1, -1): + for j in range(len(array)): + element = array[j] + + if element[0] == i: + for k in range(1, j+1): + if ( + array[j-k] is not None and + array[j-k][0] == i-1 + ): + array[j-k][2].append(element) + array[j] = None + break + + array = [element for element in array if element != None] + + return array + + """ + """ + def convert_workflow_task(self, task): + if task[1]: + return ( + self, + task[1].split(",")[0], + task[1].split(",")[1], + [self.convert_workflow_task(sub_task) for sub_task in task[2]] + ) + + return [self.convert_workflow_task(sub_task) for sub_task in task[2]] + + """ + """ def run(self): - for plugin,*args in self.plugins: - plugin.process(*args) \ No newline at end of file + for app, key_plugin, *args in self.workflow: + app.plugins[key_plugin].process(app, *args) \ No newline at end of file diff --git a/main.py b/main.py index 4d97e6f..1331b9a 100644 --- a/main.py +++ b/main.py @@ -2,77 +2,7 @@ from core import app -from Bio import SeqIO -import pandas as pd -import re -import importlib - if __name__ == "__main__": - _GLOBALS_ = { - "handles":dict(), - "plugins":dict(), - "metadata":dict() - } - - _PROCESSING_ = { - "metadata":[], - "plugins":[], - "handles":[], - "workflow":[] - } - current_field = None - with open("files/config.info") as handle: - for line in handle: - current_line = line.rstrip("\n") - - if line[0] == "#": continue - if current_line == "": continue - - if current_field: - assert current_field in ["metadata", "plugins", "handles", "workflow"] - if re.match(r"^(<\/)(\w+)(>)$", current_line): - current_field = None - else: - _PROCESSING_[current_field].append(current_line) - else: - assert re.match(r"^(<)(\w+)(>)$", current_line) - assert current_line[1:-1] in ["metadata", "plugins", "handles", "workflow"] - current_field = current_line[1:-1] - handle.close() - - for element in _PROCESSING_["metadata"]: - _GLOBALS_["metadata"][element.split(":")[0]] = element.split(":")[1] - for element in _PROCESSING_["plugins"]: - _GLOBALS_["plugins"][element.split(":")[0]] = importlib.import_module(element.split(":")[1].split(",")[0],element.split(":")[1].split(",")[1]).Plugin() - for element in _PROCESSING_["handles"]: - _GLOBALS_["handles"][element.split(":")[0]] = _GLOBALS_["plugins"][element.split(":")[1].split(",")[0]].process(element.split(":")[1].split(",")[1]) - - max_level = 0 - for i in range(len(_PROCESSING_["workflow"])): - element = _PROCESSING_["workflow"][i] - regex = re.compile(r"^(-)+") - level = len(regex.search(element).group()) - max_level = max(max_level, level) - _PROCESSING_["workflow"][i] = (level, element[level:], []) - - _PROCESSING_["workflow"].insert(0, (0,None,[])) - - for i in range(max_level, -1, -1): - for j in range(len(_PROCESSING_["workflow"])): - element = _PROCESSING_["workflow"][j] - if element[0] == i: - for k in range(1, j+1): - if _PROCESSING_["workflow"][j-k] is not None and _PROCESSING_["workflow"][j-k][0] == i-1: - _PROCESSING_["workflow"][j-k][2].append(( - _GLOBALS_["plugins"][_PROCESSING_["workflow"][j][1].split(",")[0]], - _GLOBALS_["handles"][_PROCESSING_["workflow"][j][1].split(",")[1]], - _GLOBALS_["metadata"], - _PROCESSING_["workflow"][j][2] - )) - _PROCESSING_["workflow"][j] = None - break - _PROCESSING_["workflow"] = [x for x in _PROCESSING_["workflow"] if x is not None] - - app = app(_PROCESSING_["workflow"][0][2]) + app = app("files/config.info") app.run() \ No newline at end of file From 8d141313f5f5de26e1b2a51b31f7abf0c3598ca7 Mon Sep 17 00:00:00 2001 From: AxelGiottonini <75320942+AxelGiottonini@users.noreply.github.com> Date: Sat, 15 May 2021 16:53:15 +0200 Subject: [PATCH 10/22] Refactoring --- Plugins/read_fasta.py | 68 ++++++++++++++--------- Plugins/read_gff_maker_3UTR.py | 60 ++++++++++++++------- Plugins/read_gff_maker_5UTR.py | 60 ++++++++++++++------- Plugins/read_gff_maker_CDS.py | 92 ++++++++++++++++++-------------- Plugins/read_gff_maker_exon.py | 64 +++++++++++++++------- Plugins/read_gff_maker_gene.py | 62 +++++++++++++-------- Plugins/read_gff_maker_mRNA.py | 75 +++++++++++++++++++------- Plugins/read_gff_maker_main.py | 81 +++++++++++++++++++--------- Plugins/read_tab_pannzer_CDS.py | 92 +++++++++++++++++++++----------- Plugins/read_tab_pannzer_gene.py | 47 ++++++++++------ 10 files changed, 473 insertions(+), 228 deletions(-) diff --git a/Plugins/read_fasta.py b/Plugins/read_fasta.py index a6dade6..ba426db 100644 --- a/Plugins/read_fasta.py +++ b/Plugins/read_fasta.py @@ -1,6 +1,5 @@ #plugin.py -import importlib import itertools from Bio import SeqIO @@ -8,27 +7,46 @@ class Plugin: - def process(self, handle, metadata, calls:list=[], target=None): - - for record in handle: - - #initialize record - _record_ = SeqRecord( - record.seq, - record.id, - dbxrefs=["Project:" + metadata["project"]], - annotations={"division":metadata["division"],"molecule_type":metadata["molecule_type"],"organism":metadata["organism"],"taxonomy":metadata["taxonomy"],"topology":metadata["topology"]}, - description="" - ) - - #calls - receiver = [] - for call,*args in calls: - receiver.extend(call.process(*args, target=_record_.id)) - - #post output treatment - _record_.features = list(itertools.chain(*receiver)) - - #outputing - with open(f"out/{_record_.id}.dat", "w") as o: - print(_record_.format("embl"), file=o) \ No newline at end of file + """ + """ + def feature_initialize(self, pre_feature, metadata): + return SeqRecord( + pre_feature.seq, + pre_feature.id, + dbxrefs=["Project:" + metadata["project"]], + annotations={ + "division":metadata["division"], + "molecule_type":metadata["molecule_type"], + "organism":metadata["organism"], + "taxonomy":metadata["taxonomy"], + "topology":metadata["topology"]}, + description="") + + """ + """ + def callbacks(self, app, calls, target): + sender = [] + + for app, key_plugin, *args in calls: + temp = app.plugins[key_plugin].process(app, *args, target) + if temp: + sender += temp + + return sender + + """ + """ + def merge(self, feature, receiver): + feature.features = receiver + + """ + """ + def process(self, app, key_handle, calls:list=[], target=None): + + for element in app.handles[key_handle]: + feature = self.feature_initialize(element, app.metadata) + receiver = self.callbacks(app, calls, (feature.id)) + self.merge(feature, receiver) + + with open(f"out/{feature.id}.dat", "w") as o: + print(feature.format("embl"), file=o) \ No newline at end of file diff --git a/Plugins/read_gff_maker_3UTR.py b/Plugins/read_gff_maker_3UTR.py index dabca35..ef46754 100644 --- a/Plugins/read_gff_maker_3UTR.py +++ b/Plugins/read_gff_maker_3UTR.py @@ -5,23 +5,47 @@ class Plugin: - def process(self, handle, metadata, calls:list=[], target=None): - try: - location = (handle.loc[(target[0], f"{target[1]}-mRNA-1", "three_prime_UTR"),:].reset_index()) + """ + """ + def feature_initialize(self, pre_feature, metadata): + return SeqFeature( + FeatureLocation(int(pre_feature["start"]), int(pre_feature["stop"]), (1,-1)[pre_feature["strand"] == "-"]), + type="3'UTR", + qualifiers={ + "gene":None, + "note":list()}) + + """ + """ + def callbacks(self, app, calls, target): + sender = [] + + for app, key_plugin, *args in calls: + temp = app.plugins[key_plugin].process(app, *args, target) + if temp: + sender.append(temp) + + return sender + + """ + """ + def merge(self, feature, receiver): + return feature - _sub_features_ = [ - SeqFeature( - FeatureLocation(int(location.iloc[0,0]), int(location.iloc[0,1]), (1,-1)[location.iloc[0,2] == "-"]), - type="3'UTR", - qualifiers={ - "gene":target[1], - "note":list()})] - - #calls - receiver = [] - for call,*args in calls: - receiver.extend(call.process(*args, target=(target[0], f"{target[1]}-mRNA-1", "3'UTR"))) - - return _sub_features_ + """ + """ + def process(self, app, key_handle, calls:list=[], target=None): + try: + feature = self.feature_initialize( + app.handles[key_handle].loc[(target[0], f"{target[1]}-mRNA-1", "three_prime_UTR"),:].reset_index().iloc[0,:], + app.metadata) except KeyError: - return [] \ No newline at end of file + return None + + feature.qualifiers["gene"] = target[1] + receiver = self.callbacks( + app, + calls, + (target[0], f"{target[1]}-mRNA-1", "3'UTR")) + + return self.merge(feature, receiver) \ No newline at end of file diff --git a/Plugins/read_gff_maker_5UTR.py b/Plugins/read_gff_maker_5UTR.py index 5b72281..60fae42 100644 --- a/Plugins/read_gff_maker_5UTR.py +++ b/Plugins/read_gff_maker_5UTR.py @@ -5,23 +5,47 @@ class Plugin: - def process(self, handle, metadata, calls:list=[], target=None): - try: - location = (handle.loc[(target[0], f"{target[1]}-mRNA-1", "five_prime_UTR"),:].reset_index()) + """ + """ + def feature_initialize(self, pre_feature, metadata): + return SeqFeature( + FeatureLocation(int(pre_feature["start"]), int(pre_feature["stop"]), (1,-1)[pre_feature["strand"] == "-"]), + type="5'UTR", + qualifiers={ + "gene":None, + "note":list()}) + + """ + """ + def callbacks(self, app, calls, target): + sender = [] - _sub_features_ = [ - SeqFeature( - FeatureLocation(int(location.iloc[0,0]), int(location.iloc[0,1]), (1,-1)[location.iloc[0,2] == "-"]), - type="5'UTR", - qualifiers={ - "gene":target[1], - "note":list()})] - - #calls - receiver = [] - for call,*args in calls: - receiver.extend(call.process(*args, target=(target[0], f"{target[1]}-mRNA-1", "5'UTR"))) - - return _sub_features_ + for app, key_plugin, *args in calls: + temp = app.plugins[key_plugin].process(app, *args, target) + if temp: + sender.append(temp) + + return sender + + """ + """ + def merge(self, feature, receiver): + return feature + + """ + """ + def process(self, app, key_handle, calls:list=[], target=None): + try: + feature = self.feature_initialize( + app.handles[key_handle].loc[(target[0], f"{target[1]}-mRNA-1", "five_prime_UTR"),:].reset_index().iloc[0,:], + app.metadata) except KeyError: - return [] \ No newline at end of file + return None + + feature.qualifiers["gene"] = target[1] + receiver = self.callbacks( + app, + calls, + (target[0], f"{target[1]}-mRNA-1", "5'UTR")) + + return self.merge(feature, receiver) \ No newline at end of file diff --git a/Plugins/read_gff_maker_CDS.py b/Plugins/read_gff_maker_CDS.py index 7b29f18..48bcb29 100644 --- a/Plugins/read_gff_maker_CDS.py +++ b/Plugins/read_gff_maker_CDS.py @@ -4,47 +4,61 @@ import itertools from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation -def mergeLocations(_locationArray_): - return _locationArray_[0] if len(_locationArray_) == 1 else CompoundLocation(_locationArray_) - class Plugin: - def process(self, handle, metadata, calls:list=[], target=None): - locations = (handle.loc[(target[0], f"{target[1]}-mRNA-1", "CDS"),:].reset_index()) - - _sub_features_ = [ - SeqFeature( - mergeLocations(locations.apply(lambda location: FeatureLocation(int(location[0]), int(location[1]), (1,-1)[location[2] == "-"]), axis=1)), - type="CDS", - qualifiers={ - "gene":target[1], - "product":list(), - "note":list(), - "db_xref":list(), - "translation":list(), - "transl_table":metadata["transl_table"]})] - - #calls - receiver = [] - for call,*args in calls: - receiver.extend(call.process(*args, target=(target[0], f"{target[1]}-mRNA-1", "CDS"))) - - annotations = list(itertools.chain(receiver)) - for annotation in annotations: - if "product" in annotation.keys() and annotation["product"] != []: - for sub_feature in _sub_features_: - sub_feature.qualifiers["product"].extend(annotation["product"]) - - if "note" in annotation.keys() and annotation["note"] != []: - for sub_feature in _sub_features_: - sub_feature.qualifiers["note"].extend(annotation["note"]) + """ + """ + def feature_initialize(self, pre_feature, metadata): + refactor_pre_feature = lambda element: FeatureLocation( + int(element[0]), + int(element[1]), + (1,-1)[element[2] == "-"]) + merge_pre_feature = lambda array: array[0] if len(array) == 1 else CompoundLocation(array) - if "db_xref" in annotation.keys() and annotation["db_xref"] != []: - for sub_feature in _sub_features_: - sub_feature.qualifiers["db_xref"].extend(annotation["db_xref"]) + return SeqFeature( + merge_pre_feature( + pre_feature.apply(refactor_pre_feature, axis=1)), + type="CDS", + qualifiers={ + "gene":None, + "product":list(), + "note":list(), + "db_xref":list(), + "translation":list(), + "transl_table":metadata["transl_table"]}) + + """ + """ + def callbacks(self, app, calls, target): + sender = [] + for app, key_plugin, *args in calls: + temp = app.plugins[key_plugin].process(app, *args, target) + if temp: + sender.append(temp) + return sender - if "translation" in annotation.keys() and annotation["translation"] != []: - for sub_feature in _sub_features_: - sub_feature.qualifiers["translation"].extend(annotation["translation"]) + """ + """ + def merge(self, feature, receiver): + for element in receiver: + for key in element.keys(): + feature.qualifiers[key].extend(element[key]) + return feature - return _sub_features_ \ No newline at end of file + """ + """ + def process(self, app, key_handle, calls:list=[], target=None): + try: + feature = self.feature_initialize( + app.handles[key_handle].loc[(target[0], f"{target[1]}-mRNA-1", "CDS"),:].reset_index(), + app.metadata) + except KeyError: + return None + + feature.qualifiers["gene"] = target[1] + receiver = self.callbacks( + app, + calls, + (target[0], f"{target[1]}-mRNA-1", "CDS")) + + return self.merge(feature, receiver) \ No newline at end of file diff --git a/Plugins/read_gff_maker_exon.py b/Plugins/read_gff_maker_exon.py index e4b84ef..1aeb42b 100644 --- a/Plugins/read_gff_maker_exon.py +++ b/Plugins/read_gff_maker_exon.py @@ -8,21 +8,49 @@ class Plugin: - def process(self, handle, metadata, calls:list=[], target=None): - exons = handle.loc[(target, slice(None), "exon"),:].reset_index() - - for index, exon in exons.iterrows(): - _features_subset_ = [ - SeqFeature( - FeatureLocation(int(exon["start"]), int(exon["stop"]), (1,-1)[exon["strand"] == "-"]), - type="exon", - qualifiers={} - )] - - #calls - receiver = [] - for call, *args in calls: - receiver.extend(call.process(*args, target=(target))) - - - yield _features_subset_ + """ + """ + def feature_initialize(self, pre_feature, metadata): + return SeqFeature( + FeatureLocation(int(pre_feature["start"]), int(pre_feature["stop"]), (1,-1)[pre_feature["strand"] == "-"]), + type="exon", + qualifiers={}) + + """ + """ + def multi_feature_initialize(self, pre_multi_feature, metadata): + for _, element in pre_multi_feature: + yield self.feature_initialize(element, metadata) + + """ + """ + def callbacks(self, app, calls, target): + sender = [] + + for app, key_plugin, *args in calls: + temp = app.plugins[key_plugin].process(app, *args, target) + if temp: + sender.append() + + return sender + + """ + """ + def merge(self, feature, receiver): + return feature + + """ + """ + def process(self, app, key_handle, calls:list=[], target=None): + try: + feature = self.multi_feature_initialize( + app.handles[key_handle].loc[(target, slice(None), "exon"),:].reset_index().iterrows(), + app.metadata) + except KeyError: + return None + + receiver = self.callbacks( + app, + calls, + target) + return self.merge(feature, receiver) diff --git a/Plugins/read_gff_maker_gene.py b/Plugins/read_gff_maker_gene.py index 069b66b..9d5f7da 100644 --- a/Plugins/read_gff_maker_gene.py +++ b/Plugins/read_gff_maker_gene.py @@ -6,26 +6,46 @@ class Plugin: - def process(self, handle, metadata, calls:list=[], target=None): - location = (handle.loc[(target[0], target[1], "gene"),:].reset_index()) + """ + """ + def feature_initialize(self, pre_feature, metadata): + return SeqFeature( + FeatureLocation(int(pre_feature["start"]), int(pre_feature["stop"]), (1,-1)[pre_feature["strand"] == "-"]), + type="gene", + qualifiers={ + "gene":None, + "note":list()}) + + """ + """ + def callbacks(self, app, calls, target): + sender = [] - _sub_features_ = [ - SeqFeature( - FeatureLocation(int(location.iloc[0,0]), int(location.iloc[0,1]), (1,-1)[location.iloc[0,2] == "-"]), - type="gene", - qualifiers={ - "gene":target[1], - "note":list()})] - - #calls - receiver = [] - for call,*args in calls: - receiver.extend(call.process(*args, target=(target[0], target[1], "gene"))) - - annotations = list(itertools.chain(receiver)) - for annotation in annotations: - if "note" in annotation.keys() and annotation["note"] != []: - for sub_feature in _sub_features_: - sub_feature.qualifiers["note"].extend(annotation["note"]) + for app, key_plugin, *args in calls: + temp = app.plugins[key_plugin].process(app, *args, target) + if temp: + sender.append(temp) + + return sender - return _sub_features_ \ No newline at end of file + """ + """ + def merge(self, feature, receiver): + for element in receiver: + for key in element.keys(): + feature.qualifiers[key].extend(element[key]) + return feature + + """ + """ + def process(self, app, key_handle, calls:list=[], target=None): + feature = self.feature_initialize( + app.handles[key_handle].loc[(target[0], target[1], "gene"),:].reset_index().iloc[0,:], + app.metadata) + feature.qualifiers["gene"]=target[1] + receiver = self.callbacks( + app, + calls, + (target[0], f"{target[1]}-mRNA-1", "gene")) + + return self.merge(feature, receiver) \ No newline at end of file diff --git a/Plugins/read_gff_maker_mRNA.py b/Plugins/read_gff_maker_mRNA.py index 7639664..e4e8bc0 100644 --- a/Plugins/read_gff_maker_mRNA.py +++ b/Plugins/read_gff_maker_mRNA.py @@ -3,25 +3,62 @@ import pandas as pd from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation -def mergeLocations(_locationArray_): - return _locationArray_[0] if len(_locationArray_) == 1 else CompoundLocation(_locationArray_) - class Plugin: - def process(self, handle, metadata, calls:list=[], target=None): - locations = (handle.loc[(target[0], f"{target[1]}-mRNA-1", "CDS"),:].reset_index()) - - #initialise - _sub_features_ = [ - SeqFeature( - mergeLocations(locations.apply(lambda location: FeatureLocation(int(location[0]), int(location[1]), (1,-1)[location[2] == "-"]), axis=1)), - type="mRNA", - qualifiers={ - "gene":target[1]})] - - #calls - receiver = [] - for call,*args in calls: - receiver.extend(call.process(*args, target=(target[0], f"{target[1]}-mRNA-1", "mRNA"))) + """ + """ + def feature_initialize(self, pre_feature, metadata): + refactor_pre_feature = lambda element: FeatureLocation( + int(element[0]), + int(element[1]), + (1,-1)[element[2] == "-"] + ) + merge_pre_feature = lambda array: array[0] if len(array) == 1 else CompoundLocation(array) + + return SeqFeature( + merge_pre_feature( + pre_feature.apply(refactor_pre_feature, axis=1) + ), + type="mRNA", + qualifiers={ + "gene":None + } + ) + + """ + """ + def callbacks(self, app, calls, target): + sender = [] + + for app, key_plugin, *args in calls: + temp = app.plugins[key_plugin].process(app, *args, target) + if temp: + sender.append(temp) + + return sender + + + """ + """ + def merge(self, feature, receiver): + return feature + + """ + """ + def process(self, app, key_handle, calls:list=[], target=None): + try: + feature = self.feature_initialize( + app.handles[key_handle].loc[(target[0], f"{target[1]}-mRNA-1", "CDS"),:].reset_index(), + app.metadata + ) + except KeyError: + return None + + feature.qualifiers["gene"]=target[1] + receiver = self.callbacks( + app, + calls, + (target[0], f"{target[1]}-mRNA-1", "mRNA") + ) - return _sub_features_ \ No newline at end of file + return self.merge(feature, receiver) \ No newline at end of file diff --git a/Plugins/read_gff_maker_main.py b/Plugins/read_gff_maker_main.py index eac0f9c..5cc959f 100644 --- a/Plugins/read_gff_maker_main.py +++ b/Plugins/read_gff_maker_main.py @@ -1,35 +1,66 @@ #read_gff_maker_main.py import pandas as pd -import re from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation class Plugin: + """ + """ + def feature_initialize(self, pre_feature, metadata): + return SeqFeature( + FeatureLocation(int(pre_feature["start"]), int(pre_feature["stop"]), (1,-1)[pre_feature["strand"] == "-"]), + type="source", + qualifiers={ + "oganism":metadata["organism"], + "mol_type":metadata["molecule_type"], + "db_xref":list()}) + + """ + """ + def callbacks(self, app, calls, target): + sender = [] - def process(self, handle, metadata, calls:list=[], target=None): - - location = (handle.loc[(target, slice(None), "contig"),:].reset_index()) - _feature_ = [ - SeqFeature( - FeatureLocation(int(location.iloc[0,3]), int(location.iloc[0,4]), (1,-1)[location.iloc[0,5] == "-"]), - type="source", - qualifiers={ - "oganism":metadata["organism"], - "mol_type":metadata["molecule_type"], - "db_xref":list()})] - - yield _feature_ - - for gene in handle.loc[(target, slice(None), "gene"),:].reset_index()["sub_seq_id"]: + for app, key_plugin, *args in calls: + temp = app.plugins[key_plugin].process(app, *args, target) + if temp: + sender.append(temp) + + return sender + + """ + """ + def callbacks_with_iterator(self, app, calls, target, iterator): + sender = [] + + for element in iterator: + temp = self.callbacks(app, calls, (target, element)) + if temp: + sender.extend(temp) + return sender - #initialize features - _features_subset_ = [] - - #calls - receiver = [] - for call,*args in calls: - receiver.extend(call.process(*args, target=(target, gene))) + """ + """ + def merge(self, feature, receiver): + return [feature] + receiver + + """ + """ + def process(self, app, key_handle, calls:list=[], target=None): + try: + feature = self.feature_initialize( + app.handles[key_handle].loc[(target, slice(None), "contig"),:].reset_index().iloc[0,:], + app.metadata) + except KeyError: + return None - _features_subset_ = receiver - yield _features_subset_ \ No newline at end of file + try: + receiver = self.callbacks_with_iterator( + app, + calls, + target, + app.handles[key_handle].loc[(target, slice(None), "gene"),:].reset_index()["sub_seq_id"]) + except KeyError: + receiver = [] + + return self.merge(feature, receiver) \ No newline at end of file diff --git a/Plugins/read_tab_pannzer_CDS.py b/Plugins/read_tab_pannzer_CDS.py index 7b0fd2e..f4fcee0 100644 --- a/Plugins/read_tab_pannzer_CDS.py +++ b/Plugins/read_tab_pannzer_CDS.py @@ -4,45 +4,77 @@ class Plugin: - def process(self, handle, metadata, calls:list=[], target=None): + """ + """ + def feature_initialize(self, pre_feature, metadata): + #print(self.feature_initialize_db_xref(pre_feature)) + return { + "db_xref": self.feature_initialize_db_xref(pre_feature), + "translation": self.feature_initialize_translation(pre_feature), + "product": self.feature_initialize_product(pre_feature) + } - #initialisation - try: - anno_bp = handle.loc[(target[1], "BP_ARGOT"),:].reset_index()["id"] - except KeyError: - anno_bp = pd.Series([]) + """ + """ + def feature_initialize_db_xref(self, pre_feature): + sender = pd.Series([]) - try: - anno_cc = handle.loc[(target[1], "CC_ARGOT"),:].reset_index()["id"] - except KeyError: - anno_cc = pd.Series([]) + for field in ["BP_ARGOT", "CC_ARGOT", "MF_ARGOT"]: + try: + sender = pd.concat([sender, pre_feature(field)["id"]]) + except KeyError: + pass + return [f"GO:{str(element)}" for element in sender] + """ + """ + def feature_initialize_translation(self, pre_feature): + sender = list() try: - anno_mf = handle.loc[(target[1], "MF_ARGOT"),:].reset_index()["id"] + sender = [pre_feature("qseq").iloc[0,1]] except KeyError: - anno_mf = pd.Series([]) + pass + return sender + """ + """ + def feature_initialize_product(self, pre_feature): + sender = list() try: - anno_qsec = [handle.loc[(target[1], "qseq"),:].reset_index().iloc[0,1]] + sender = [pre_feature("DE").iloc[0,1]] except KeyError: - anno_qsec = list() + pass + return sender - try: - anno_de = [handle.loc[(target[1], "DE"),:].reset_index().iloc[0,1]] - except KeyError: - anno_de = list() - - _annotations_ = [{ - "db_xref":[f"GO:{str(go)}" for go in pd.concat([anno_bp, anno_cc, anno_mf])], - "translation": anno_qsec, - "product": anno_de - }] + """ + """ + def callbacks(self, app, calls, target): + sender = [] + for app, key_plugin, *args in calls: + temp = app.plugins[key_plugin].process(app, *args, target) + if temp: + sender += temp - #calls - receiver = [] - for call,*args in calls: - receiver.extend(call.process(*args, target=target)) + return sender + + """ + """ + def merge(self, feature, receiver): + return feature + + """ + """ + def process(self, app, key_handle, calls:list=[], target=None): + + feature = self.feature_initialize( + (lambda field: app.handles[key_handle].loc[(target[1], field)].reset_index()), + app.metadata) + + receiver = self.callbacks( + app, + calls, + target + ) - #output - return _annotations_ \ No newline at end of file + return self.merge(feature, receiver) \ No newline at end of file diff --git a/Plugins/read_tab_pannzer_gene.py b/Plugins/read_tab_pannzer_gene.py index 521797e..3824bc6 100644 --- a/Plugins/read_tab_pannzer_gene.py +++ b/Plugins/read_tab_pannzer_gene.py @@ -1,28 +1,45 @@ #read_tab_pannzer_gene.py -#read_tab_pannzer_CDS - import pandas as pd class Plugin: - def process(self, handle, metadata, calls:list=[], target=None): + def feature_initialize(self, pre_feature, metadata): + return { + "note": self.feature_initialize_note(pre_feature) + } - #initialisation + def feature_initialize_note(self, pre_feature): + sender = list() try: - anno_de = handle.loc[(target[1], "DE"),:].reset_index().iloc[0,1] + sender = [pre_feature("DE").iloc[0,1]] except KeyError: - anno_de = [] + pass + return sender - _annotations_ = [{ - "note": anno_de - }] + def callbacks(self, app, calls, target): + sender = [] + for app, key_plugin, *args in calls: + temp = app.plugins[key_plugin].process(app, *args, target) + if temp: + sender += temp - #calls - receiver = [] - for call,*args in calls: - receiver.extend(call.process(*args, target=target)) + return sender + + def merge(self, feature, receiver): + return feature + + def process(self, app, key_handle, calls:list=[], target=None): + + feature = self.feature_initialize( + (lambda field: app.handles[key_handle].loc[(target[1], field)].reset_index()), + app.metadata) + + receiver = self.callbacks( + app, + calls, + target + ) - #output - return _annotations_ \ No newline at end of file + return self.merge(feature, receiver) \ No newline at end of file From 36c375948dde1098282f53f556e95dc3ca4f879e Mon Sep 17 00:00:00 2001 From: AxelGiottonini <75320942+AxelGiottonini@users.noreply.github.com> Date: Thu, 20 May 2021 09:24:25 +0200 Subject: [PATCH 11/22] Reader refactoring plugins rewritting, hirarchical superclass creation --- Plugins/__read__.py | 60 +++++++++++++++++++++++++ Plugins/__read_gff_maker__.py | 21 +++++++++ Plugins/__read_tab_pannzer__.py | 10 +++++ Plugins/read_fasta.py | 26 +++++------ Plugins/read_gff_maker_3UTR.py | 22 ++-------- Plugins/read_gff_maker_5UTR.py | 22 ++-------- Plugins/read_gff_maker_CDS.py | 22 ++++------ Plugins/read_gff_maker_exon.py | 21 ++------- Plugins/read_gff_maker_gene.py | 20 ++------- Plugins/read_gff_maker_mRNA.py | 23 ++-------- Plugins/read_gff_maker_misc_feature.py | 32 ++++++++++++++ Plugins/read_gff_maker_source.py | 61 ++++++++++++++++++++++++++ Plugins/read_tab_pannzer_CDS.py | 22 ++-------- Plugins/read_tab_pannzer_gene.py | 9 ++-- 14 files changed, 228 insertions(+), 143 deletions(-) create mode 100644 Plugins/__read__.py create mode 100644 Plugins/__read_gff_maker__.py create mode 100644 Plugins/__read_tab_pannzer__.py create mode 100644 Plugins/read_gff_maker_misc_feature.py create mode 100644 Plugins/read_gff_maker_source.py diff --git a/Plugins/__read__.py b/Plugins/__read__.py new file mode 100644 index 0000000..0950107 --- /dev/null +++ b/Plugins/__read__.py @@ -0,0 +1,60 @@ +# __read__.py + +from Plugins.__plugin__ import __Plugin__, RequiredMetadataError, UndefinedMethodError +from Plugins.__caller__ import Caller, CallerFailedVerification + +class __Read__(__Plugin__): + + """ + """ + def feature_initialize(self, pre_feature, metadata): + raise UndefinedMethodError("feature_initialize has not been defined.") + + """ + """ + def callbacks(self, app, calls, target): + raise UndefinedMethodError("callbacks has not been defined.") + + """ + """ + def callbacks_extend(self, app, calls, target): + caller = Caller(app) + sender = [] + for app, key_plugin, *args in calls: + temp = None + try: + temp = caller.run(app.plugins[key_plugin].process, app, *args, target) + except CallerFailedVerification: + sender = [] + if temp: + sender.extend(temp) + return sender + + """ + """ + def callbacks_append(self, app, calls, target): + caller = Caller(app) + sender = [] + for app, key_plugin, *args in calls: + temp = None + try: + temp = caller.run(app.plugins[key_plugin].process, app, *args, target) + except CallerFailedVerification: + sender = [] + if temp: + sender.append(temp) + return sender + + """ + """ + def merge(self, feature, receiver): + return feature + + """ + """ + def required_metadata_check(self, app, keys:list=[]): + if keys: + for key in keys: + if not key in app.metadata: + raise RequiredMetadataError(f"Required metadata attribute, {key}, not found.") + return True \ No newline at end of file diff --git a/Plugins/__read_gff_maker__.py b/Plugins/__read_gff_maker__.py new file mode 100644 index 0000000..13ceb97 --- /dev/null +++ b/Plugins/__read_gff_maker__.py @@ -0,0 +1,21 @@ +#__read_gff_maker__.py + +from Plugins.__read__ import __Read__ + +class __ReadGFFMaker__(__Read__): + + """ + """ + def multi_feature_initialize(self, pre_multi_feature, metadata): + raise UndefinedMethodError("multi_feature_initialize has not been defined.") + + """ + """ + def callbacks(self, app, calls, target): + return super().callbacks_append(app, calls, target) + + """ + """ + def callbacks_with_iterator(self, app, calls, target, iterator): + raise UndefinedMethodError("callbacks_with_iterator has not been defined.") + diff --git a/Plugins/__read_tab_pannzer__.py b/Plugins/__read_tab_pannzer__.py new file mode 100644 index 0000000..4c78378 --- /dev/null +++ b/Plugins/__read_tab_pannzer__.py @@ -0,0 +1,10 @@ +#__read_tab_pannzer__.py + +from Plugins.__read__ import __Read__ + +class __ReadTabPannzer__(__Read__): + + """ + """ + def callbacks(self, app, calls, target): + return self.callbacks_extend(app, calls, target) \ No newline at end of file diff --git a/Plugins/read_fasta.py b/Plugins/read_fasta.py index ba426db..8876aa0 100644 --- a/Plugins/read_fasta.py +++ b/Plugins/read_fasta.py @@ -4,9 +4,9 @@ from Bio import SeqIO from Bio.SeqRecord import SeqRecord +from Plugins.__read__ import __Read__ -class Plugin: - +class Plugin(__Read__): """ """ def feature_initialize(self, pre_feature, metadata): @@ -25,14 +25,7 @@ def feature_initialize(self, pre_feature, metadata): """ """ def callbacks(self, app, calls, target): - sender = [] - - for app, key_plugin, *args in calls: - temp = app.plugins[key_plugin].process(app, *args, target) - if temp: - sender += temp - - return sender + return super().callbacks_extend(app, calls, target) """ """ @@ -41,12 +34,19 @@ def merge(self, feature, receiver): """ """ - def process(self, app, key_handle, calls:list=[], target=None): + def process(self, app, caller_mode, key_handle, calls:list=[], target=None): for element in app.handles[key_handle]: feature = self.feature_initialize(element, app.metadata) + app.current_sequence = feature.seq receiver = self.callbacks(app, calls, (feature.id)) self.merge(feature, receiver) - with open(f"out/{feature.id}.dat", "w") as o: - print(feature.format("embl"), file=o) \ No newline at end of file + #with open(f"out/{feature.id}.dat", "w") as o: + with open(f"{feature.id}.dat", "w") as o: + print(feature.format("embl"), file=o) + + """ + """ + def required_metadata_check(self, app, keys:list=[]): + return super().required_metadata_check(app, ["project", "transl_table", "molecule_type", "organism", "taxonomy", "topology"]) diff --git a/Plugins/read_gff_maker_3UTR.py b/Plugins/read_gff_maker_3UTR.py index ef46754..91ca810 100644 --- a/Plugins/read_gff_maker_3UTR.py +++ b/Plugins/read_gff_maker_3UTR.py @@ -1,9 +1,10 @@ #read_gff_maker_3UTR.py import pandas as pd +from Plugins.__read_gff_maker__ import __ReadGFFMaker__ from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation -class Plugin: +class Plugin(__ReadGFFMaker__): """ """ @@ -17,24 +18,7 @@ def feature_initialize(self, pre_feature, metadata): """ """ - def callbacks(self, app, calls, target): - sender = [] - - for app, key_plugin, *args in calls: - temp = app.plugins[key_plugin].process(app, *args, target) - if temp: - sender.append(temp) - - return sender - - """ - """ - def merge(self, feature, receiver): - return feature - - """ - """ - def process(self, app, key_handle, calls:list=[], target=None): + def process(self, app, caller_mode, key_handle, calls:list=[], target=None): try: feature = self.feature_initialize( app.handles[key_handle].loc[(target[0], f"{target[1]}-mRNA-1", "three_prime_UTR"),:].reset_index().iloc[0,:], diff --git a/Plugins/read_gff_maker_5UTR.py b/Plugins/read_gff_maker_5UTR.py index 60fae42..f7813f0 100644 --- a/Plugins/read_gff_maker_5UTR.py +++ b/Plugins/read_gff_maker_5UTR.py @@ -1,9 +1,10 @@ #read_gff_maker_5UTR.py import pandas as pd +from Plugins.__read_gff_maker__ import __ReadGFFMaker__ from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation -class Plugin: +class Plugin(__ReadGFFMaker__): """ """ @@ -17,24 +18,7 @@ def feature_initialize(self, pre_feature, metadata): """ """ - def callbacks(self, app, calls, target): - sender = [] - - for app, key_plugin, *args in calls: - temp = app.plugins[key_plugin].process(app, *args, target) - if temp: - sender.append(temp) - - return sender - - """ - """ - def merge(self, feature, receiver): - return feature - - """ - """ - def process(self, app, key_handle, calls:list=[], target=None): + def process(self, app, caller_mode, key_handle, calls:list=[], target=None): try: feature = self.feature_initialize( app.handles[key_handle].loc[(target[0], f"{target[1]}-mRNA-1", "five_prime_UTR"),:].reset_index().iloc[0,:], diff --git a/Plugins/read_gff_maker_CDS.py b/Plugins/read_gff_maker_CDS.py index 48bcb29..8b77fd2 100644 --- a/Plugins/read_gff_maker_CDS.py +++ b/Plugins/read_gff_maker_CDS.py @@ -2,9 +2,10 @@ import pandas as pd import itertools +from Plugins.__read_gff_maker__ import __ReadGFFMaker__ from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation -class Plugin: +class Plugin(__ReadGFFMaker__): """ """ @@ -27,16 +28,6 @@ def feature_initialize(self, pre_feature, metadata): "translation":list(), "transl_table":metadata["transl_table"]}) - """ - """ - def callbacks(self, app, calls, target): - sender = [] - for app, key_plugin, *args in calls: - temp = app.plugins[key_plugin].process(app, *args, target) - if temp: - sender.append(temp) - return sender - """ """ def merge(self, feature, receiver): @@ -47,7 +38,7 @@ def merge(self, feature, receiver): """ """ - def process(self, app, key_handle, calls:list=[], target=None): + def process(self, app, caller_mode, key_handle, calls:list=[], target=None): try: feature = self.feature_initialize( app.handles[key_handle].loc[(target[0], f"{target[1]}-mRNA-1", "CDS"),:].reset_index(), @@ -61,4 +52,9 @@ def process(self, app, key_handle, calls:list=[], target=None): calls, (target[0], f"{target[1]}-mRNA-1", "CDS")) - return self.merge(feature, receiver) \ No newline at end of file + return self.merge(feature, receiver) + + """ + """ + def required_metadata_check(self, app, keys:list=[]): + return super().required_metadata_check(app, ["transl_table"]) \ No newline at end of file diff --git a/Plugins/read_gff_maker_exon.py b/Plugins/read_gff_maker_exon.py index 1aeb42b..4b23ebb 100644 --- a/Plugins/read_gff_maker_exon.py +++ b/Plugins/read_gff_maker_exon.py @@ -3,10 +3,11 @@ import pandas as pd import re +from Plugins.__read_gff_maker__ import __ReadGFFMaker__ from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation -class Plugin: +class Plugin(__ReadGFFMaker__): """ """ @@ -22,26 +23,10 @@ def multi_feature_initialize(self, pre_multi_feature, metadata): for _, element in pre_multi_feature: yield self.feature_initialize(element, metadata) - """ - """ - def callbacks(self, app, calls, target): - sender = [] - - for app, key_plugin, *args in calls: - temp = app.plugins[key_plugin].process(app, *args, target) - if temp: - sender.append() - - return sender - - """ - """ - def merge(self, feature, receiver): - return feature """ """ - def process(self, app, key_handle, calls:list=[], target=None): + def process(self, app, caller_mode, key_handle, calls:list=[], target=None): try: feature = self.multi_feature_initialize( app.handles[key_handle].loc[(target, slice(None), "exon"),:].reset_index().iterrows(), diff --git a/Plugins/read_gff_maker_gene.py b/Plugins/read_gff_maker_gene.py index 9d5f7da..00cf5c3 100644 --- a/Plugins/read_gff_maker_gene.py +++ b/Plugins/read_gff_maker_gene.py @@ -2,10 +2,10 @@ import pandas as pd import itertools +from Plugins.__read_gff_maker__ import __ReadGFFMaker__ from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation -class Plugin: - +class Plugin(__ReadGFFMaker__): """ """ def feature_initialize(self, pre_feature, metadata): @@ -15,18 +15,6 @@ def feature_initialize(self, pre_feature, metadata): qualifiers={ "gene":None, "note":list()}) - - """ - """ - def callbacks(self, app, calls, target): - sender = [] - - for app, key_plugin, *args in calls: - temp = app.plugins[key_plugin].process(app, *args, target) - if temp: - sender.append(temp) - - return sender """ """ @@ -38,7 +26,7 @@ def merge(self, feature, receiver): """ """ - def process(self, app, key_handle, calls:list=[], target=None): + def process(self, app, caller_mode, key_handle, calls:list=[], target=None): feature = self.feature_initialize( app.handles[key_handle].loc[(target[0], target[1], "gene"),:].reset_index().iloc[0,:], app.metadata) @@ -47,5 +35,5 @@ def process(self, app, key_handle, calls:list=[], target=None): app, calls, (target[0], f"{target[1]}-mRNA-1", "gene")) - + return self.merge(feature, receiver) \ No newline at end of file diff --git a/Plugins/read_gff_maker_mRNA.py b/Plugins/read_gff_maker_mRNA.py index e4e8bc0..f8e76f8 100644 --- a/Plugins/read_gff_maker_mRNA.py +++ b/Plugins/read_gff_maker_mRNA.py @@ -1,9 +1,10 @@ #read_gff_maker_mRNA.py import pandas as pd +from Plugins.__read_gff_maker__ import __ReadGFFMaker__ from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation -class Plugin: +class Plugin(__ReadGFFMaker__): """ """ @@ -25,27 +26,9 @@ def feature_initialize(self, pre_feature, metadata): } ) - """ - """ - def callbacks(self, app, calls, target): - sender = [] - - for app, key_plugin, *args in calls: - temp = app.plugins[key_plugin].process(app, *args, target) - if temp: - sender.append(temp) - - return sender - - - """ - """ - def merge(self, feature, receiver): - return feature - """ """ - def process(self, app, key_handle, calls:list=[], target=None): + def process(self, app, caller_mode, key_handle, calls:list=[], target=None): try: feature = self.feature_initialize( app.handles[key_handle].loc[(target[0], f"{target[1]}-mRNA-1", "CDS"),:].reset_index(), diff --git a/Plugins/read_gff_maker_misc_feature.py b/Plugins/read_gff_maker_misc_feature.py new file mode 100644 index 0000000..aeae5fc --- /dev/null +++ b/Plugins/read_gff_maker_misc_feature.py @@ -0,0 +1,32 @@ +#read_gff_maker_misc_feature.py + +import pandas as pd +import itertools +from Plugins.__read_gff_maker__ import __ReadGFFMaker__ +from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation + +class Plugin(__ReadGFFMaker__): + """ + """ + def feature_initialize(self, pre_feature, metadata): + return SeqFeature( + FeatureLocation(int(pre_feature["start"]), int(pre_feature["stop"]), (1,-1)[pre_feature["strand"] == "-"]), + type="misc_feature", + qualifiers={ + #"gene":None, + "note":list()}) + + """ + """ + def process(self, app, caller_mode, key_handle, calls:list=[], target=None): + feature = self.feature_initialize( + app.handles[key_handle].loc[(target[0], target[1], "gene"),:].reset_index().iloc[0,:], + app.metadata) + feature.qualifiers["note"].append(target[1]) + #feature.qualifiers["gene"]=target[1] + receiver = self.callbacks( + app, + calls, + (target[0], f"{target[1]}-mRNA-1", "misc_feature")) + + return self.merge(feature, receiver) \ No newline at end of file diff --git a/Plugins/read_gff_maker_source.py b/Plugins/read_gff_maker_source.py new file mode 100644 index 0000000..04fe6db --- /dev/null +++ b/Plugins/read_gff_maker_source.py @@ -0,0 +1,61 @@ +#read_gff_maker_source.py + +import pandas as pd + +from Plugins.__read_gff_maker__ import __ReadGFFMaker__ +from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation + +class Plugin(__ReadGFFMaker__): + + """ + """ + def feature_initialize(self, pre_feature, metadata): + return SeqFeature( + FeatureLocation(int(pre_feature["start"]), int(pre_feature["stop"]), (1,-1)[pre_feature["strand"] == "-"]), + type="source", + qualifiers={ + "organism":metadata["organism"], + "mol_type":metadata["molecule_type"], + "db_xref":list()}) + + """ + """ + def callbacks_with_iterator(self, app, calls, target, iterator): + sender = [] + + for element in iterator: + temp = self.callbacks(app, calls, (target, element)) + if temp: + sender.extend(temp) + return sender + + """ + """ + def merge(self, feature, receiver): + return [feature] + receiver + + """ + """ + def process(self, app, caller_mode, key_handle, calls:list=[], target=None): + try: + feature = self.feature_initialize( + app.handles[key_handle].loc[(target, slice(None), "contig"),:].reset_index().iloc[0,:], + app.metadata) + except KeyError: + return None + + try: + receiver = self.callbacks_with_iterator( + app, + calls, + target, + app.handles[key_handle].loc[(target, slice(None), "gene"),:].reset_index()["sub_seq_id"]) + except KeyError: + receiver = [] + + return self.merge(feature, receiver) + + """ + """ + def required_metadata_check(self, app, keys:list=[]): + return super().required_metadata_check(app, ["organism", "molecule_type"]) \ No newline at end of file diff --git a/Plugins/read_tab_pannzer_CDS.py b/Plugins/read_tab_pannzer_CDS.py index f4fcee0..778d6f7 100644 --- a/Plugins/read_tab_pannzer_CDS.py +++ b/Plugins/read_tab_pannzer_CDS.py @@ -1,8 +1,9 @@ #read_tab_pannzer_CDS import pandas as pd +from Plugins.__read_tab_pannzer__ import __ReadTabPannzer__ -class Plugin: +class Plugin(__ReadTabPannzer__): """ """ @@ -48,24 +49,7 @@ def feature_initialize_product(self, pre_feature): """ """ - def callbacks(self, app, calls, target): - sender = [] - - for app, key_plugin, *args in calls: - temp = app.plugins[key_plugin].process(app, *args, target) - if temp: - sender += temp - - return sender - - """ - """ - def merge(self, feature, receiver): - return feature - - """ - """ - def process(self, app, key_handle, calls:list=[], target=None): + def process(self, app, caller_mode, key_handle, calls:list=[], target=None): feature = self.feature_initialize( (lambda field: app.handles[key_handle].loc[(target[1], field)].reset_index()), diff --git a/Plugins/read_tab_pannzer_gene.py b/Plugins/read_tab_pannzer_gene.py index 3824bc6..ed1fb2a 100644 --- a/Plugins/read_tab_pannzer_gene.py +++ b/Plugins/read_tab_pannzer_gene.py @@ -1,8 +1,9 @@ #read_tab_pannzer_gene.py import pandas as pd +from Plugins.__read_tab_pannzer__ import __ReadTabPannzer__ -class Plugin: +class Plugin(__ReadTabPannzer__): def feature_initialize(self, pre_feature, metadata): return { @@ -27,15 +28,11 @@ def callbacks(self, app, calls, target): return sender - def merge(self, feature, receiver): - return feature - - def process(self, app, key_handle, calls:list=[], target=None): + def process(self, app, caller_mode, key_handle, calls:list=[], target=None): feature = self.feature_initialize( (lambda field: app.handles[key_handle].loc[(target[1], field)].reset_index()), app.metadata) - receiver = self.callbacks( app, calls, From 96b60a6b11a69a5d563d80c65a648624d4efb389 Mon Sep 17 00:00:00 2001 From: AxelGiottonini <75320942+AxelGiottonini@users.noreply.github.com> Date: Thu, 20 May 2021 09:25:29 +0200 Subject: [PATCH 12/22] to_handle renaming --- Plugins/to_handle_fasta.py | 10 ++++++++++ Plugins/to_handle_gff_maker.py | 18 ++++++++++++++++++ Plugins/to_handle_tab_pannzer.py | 16 ++++++++++++++++ 3 files changed, 44 insertions(+) create mode 100644 Plugins/to_handle_fasta.py create mode 100644 Plugins/to_handle_gff_maker.py create mode 100644 Plugins/to_handle_tab_pannzer.py diff --git a/Plugins/to_handle_fasta.py b/Plugins/to_handle_fasta.py new file mode 100644 index 0000000..c1745ef --- /dev/null +++ b/Plugins/to_handle_fasta.py @@ -0,0 +1,10 @@ +#fasta2handle.py + +from Bio import SeqIO +from Plugins.__plugin__ import __Plugin__ + +class Plugin(__Plugin__): + + def process(self, file_path): + with open(file_path) as handle: + return list(SeqIO.parse(handle, "fasta")) \ No newline at end of file diff --git a/Plugins/to_handle_gff_maker.py b/Plugins/to_handle_gff_maker.py new file mode 100644 index 0000000..4c5e93a --- /dev/null +++ b/Plugins/to_handle_gff_maker.py @@ -0,0 +1,18 @@ +#gff_maker2handle.py + +import pandas as pd +import re +from Plugins.__plugin__ import __Plugin__ + +class Plugin(__Plugin__): + + def process(self, file_path): + with open(file_path) as handle: + gff = pd.read_csv(handle, sep="\t") + gff = gff.reset_index() + gff.columns = ["seq_id", "source", "ft_type", "start", "stop", "score", "strand", "phase", "attr"] + gff = gff.sort_values(by=["seq_id"]).drop(["source", "score", "phase"], axis=1).dropna() + gff["sub_seq_id"] = [re.split(r':',re.search("^ID=.*?;", x).group(0)[3:-1])[0] for x in gff["attr"]] + gff["start"] = gff["start"].apply(lambda x: x-1) + return gff[["seq_id", "sub_seq_id", "ft_type", "start", "stop", "strand"]].sort_values(by=["seq_id", "sub_seq_id", "ft_type"]).set_index(["seq_id", "sub_seq_id", "ft_type", "start", "stop", "strand"]) + \ No newline at end of file diff --git a/Plugins/to_handle_tab_pannzer.py b/Plugins/to_handle_tab_pannzer.py new file mode 100644 index 0000000..e97c369 --- /dev/null +++ b/Plugins/to_handle_tab_pannzer.py @@ -0,0 +1,16 @@ +#tab_pannzer2handle.py + +import pandas as pd +from Plugins.__plugin__ import __Plugin__ + +class Plugin(__Plugin__): + + def process(self, file_path): + with open(file_path) as handle: + anno = pd.read_csv(handle, sep="\t") + + anno.sort_values(by=["qpid", "type"], inplace=True) + anno.drop(["score", "PPV"], axis=1, inplace=True) + anno.set_index(["qpid", "type", "id", "desc"], inplace=True) + + return anno \ No newline at end of file From a34b6ed0791ef232f9884719a2663e8d79e51ad0 Mon Sep 17 00:00:00 2001 From: AxelGiottonini <75320942+AxelGiottonini@users.noreply.github.com> Date: Thu, 20 May 2021 09:25:59 +0200 Subject: [PATCH 13/22] verifier plugin creation --- Plugins/__verify__.py | 19 +++++++++++++++++ Plugins/verify_gff_maker_CDS.py | 37 +++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 Plugins/__verify__.py create mode 100644 Plugins/verify_gff_maker_CDS.py diff --git a/Plugins/__verify__.py b/Plugins/__verify__.py new file mode 100644 index 0000000..e0f2c02 --- /dev/null +++ b/Plugins/__verify__.py @@ -0,0 +1,19 @@ +#__verify__.py + +from Plugins.__plugin__ import __Plugin__, RequiredMetadataError, UndefinedMethodError + +class __Verify__(__Plugin__): + pass + +class FailedVerification(Exception): + def __init__(self, *args): + if args: + self.message = args[0] + else: + self.message = None + + def __str__(self): + if self.message: + return f"FailedVerification: {self.message}" + else: + return "FailedVerification has been raised" \ No newline at end of file diff --git a/Plugins/verify_gff_maker_CDS.py b/Plugins/verify_gff_maker_CDS.py new file mode 100644 index 0000000..a9bf4aa --- /dev/null +++ b/Plugins/verify_gff_maker_CDS.py @@ -0,0 +1,37 @@ +#verify_gff_maker_gene.py + +from Plugins.__verify__ import __Verify__, FailedVerification +from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation + +class Plugin(__Verify__): + + def process(self, app, element_to_verify): + conversion_map = {"A":"T", "T":"A", "C":"G", "G":"C"} + convert = lambda array : "".join([conversion_map[element] for element in array[::-1]]) + + start_codon = None + stop_codon = None + + feature_location = element_to_verify.location + if(isinstance(feature_location, FeatureLocation)): + if(feature_location.strand == 1): + start_codon = app.current_sequence[feature_location._start:feature_location._start+3] + stop_codon = app.current_sequence[feature_location._end-3:feature_location._end] + else: + start_codon = convert(app.current_sequence[feature_location._end-3:feature_location._end]) + stop_codon = convert(app.current_sequence[feature_location._start:feature_location._start+3]) + elif(isinstance(feature_location, CompoundLocation)): + if(feature_location.strand == 1): + start_codon = app.current_sequence[feature_location.parts[0]._start:feature_location.parts[0]._start+3] + stop_codon = app.current_sequence[feature_location.parts[-1]._end-3:feature_location.parts[-1]._end] + else: + start_codon = convert(app.current_sequence[feature_location.parts[-1]._end-3:feature_location.parts[-1]._end]) + stop_codon = convert(app.current_sequence[feature_location.parts[0]._start:feature_location.parts[0]._start+3]) + + if start_codon not in ["ATG"]: + raise FailedVerification(f"invalid start codon: {start_codon}") + + if stop_codon not in ["TGA", "TAG", "TAA"]: + raise FailedVerification(f"invalid stop codon: {stop_codon}") + + return None \ No newline at end of file From 2f085e1370f5122f587cca99c9ca8f2d93c2792d Mon Sep 17 00:00:00 2001 From: AxelGiottonini <75320942+AxelGiottonini@users.noreply.github.com> Date: Thu, 20 May 2021 09:27:01 +0200 Subject: [PATCH 14/22] root superclasses and package description --- Plugins/__caller__.py | 62 +++++++++++++++++++++++++++++++++++++++++++ Plugins/__init__.py | 21 +++++++++++---- Plugins/__plugin__.py | 38 ++++++++++++++++++++++++++ 3 files changed, 116 insertions(+), 5 deletions(-) create mode 100644 Plugins/__caller__.py create mode 100644 Plugins/__plugin__.py diff --git a/Plugins/__caller__.py b/Plugins/__caller__.py new file mode 100644 index 0000000..c6a3a5a --- /dev/null +++ b/Plugins/__caller__.py @@ -0,0 +1,62 @@ +#__caller__.py + +""" +""" +from Plugins.__verify__ import FailedVerification + + +class Caller: + """ + """ + def __init__(self, app): + self.app = app + self.status = True + self.last_result = None + + """ + """ + def run(self, callback_function, *args): + if args[1] not in ["default", "verify", "bypass"]: + raise UnknownCallerModeError(f"{args[1]} mode is not defined.") + + if self.status and args[1] in ["default"]: + self.last_result = callback_function(*args) + return self.last_result + + elif self.status and args[1] in ["verify"]: + try: + callback_function(self.app, self.last_result) + except FailedVerification: + self.status = False + raise CallerFailedVerification() + return None + + elif not self.status and args[1] in ["bypass"]: + self.last_result = callback_function(*args) + return self.last_result + +class UnknownCallerModeError(Exception): + def __init__(self, *args): + if args: + self.message = args[0] + else: + self.message = None + + def __str__(self): + if self.message: + return f"UnknownCallerModeError: {self.message}" + else: + return "UnknownCallerModeError has been raised" + +class CallerFailedVerification(Exception): + def __init__(self, *args): + if args: + self.message = args[0] + else: + self.message = None + + def __str__(self): + if self.message: + return f"CallerFailedVerification: {self.message}" + else: + return "CallerFailedVerification has been raised" \ No newline at end of file diff --git a/Plugins/__init__.py b/Plugins/__init__.py index 583ac6d..ff585cf 100644 --- a/Plugins/__init__.py +++ b/Plugins/__init__.py @@ -1,17 +1,28 @@ #__init.py__ -from Plugins.fasta2handle import Plugin -from Plugins.gff_maker2handle import Plugin -from Plugins.tab_pannzer2handle import Plugin +from Plugins.__caller__ import Caller, UnknownCallerModeError, CallerFailedVerification + +from Plugins.__plugin__ import __Plugin__, RequiredMetadataError, UndefinedMethodError +from Plugins.__read__ import __Read__ +from Plugins.__read_gff_maker__ import __ReadGFFMaker__ +from Plugins.__read_tab_pannzer__ import __ReadTabPannzer__ +from Plugins.__verify__ import __Verify__, FailedVerification from Plugins.read_fasta import Plugin -from Plugins.read_gff_maker_gene import Plugin from Plugins.read_gff_maker_3UTR import Plugin from Plugins.read_gff_maker_5UTR import Plugin from Plugins.read_gff_maker_CDS import Plugin from Plugins.read_gff_maker_exon import Plugin +from Plugins.read_gff_maker_gene import Plugin from Plugins.read_gff_maker_mRNA import Plugin +from Plugins.read_gff_maker_source import Plugin from Plugins.read_tab_pannzer_CDS import Plugin -from Plugins.read_tab_pannzer_gene import Plugin \ No newline at end of file +from Plugins.read_tab_pannzer_gene import Plugin + +from Plugins.to_handle_fasta import Plugin +from Plugins.to_handle_gff_maker import Plugin +from Plugins.to_handle_tab_pannzer import Plugin + +from Plugins.verify_gff_maker_CDS import Plugin \ No newline at end of file diff --git a/Plugins/__plugin__.py b/Plugins/__plugin__.py new file mode 100644 index 0000000..9e55c96 --- /dev/null +++ b/Plugins/__plugin__.py @@ -0,0 +1,38 @@ +#__plugin__.py + +class __Plugin__: + def process(*args): + raise UndefinedMethodError("process has not been defined") + + def required_metadata_check(*args): + return True + +""" +""" +class RequiredMetadataError(Exception): + def __init__(self, *args): + if args: + self.message = args[0] + else: + self.message = None + + def __str__(self): + if self.message: + return f"RequiredMetadataError: {self.message}" + else: + return "RequiredMetadataError has been raised" + +""" +""" +class UndefinedMethodError(Exception): + def __init__(self, *args): + if args: + self.message = args[0] + else: + self.message = None + + def __str__(self): + if self.message: + return f"UndefinedMethodError: {self.message}" + else: + return "UndefinedMethodError has been raised" \ No newline at end of file From ec0bbf41ade13e22067879db65fb9fa0cd1e3501 Mon Sep 17 00:00:00 2001 From: AxelGiottonini <75320942+AxelGiottonini@users.noreply.github.com> Date: Thu, 20 May 2021 09:27:32 +0200 Subject: [PATCH 15/22] renamed as to_handle_fasta --- Plugins/fasta2handle.py | 9 --------- 1 file changed, 9 deletions(-) delete mode 100644 Plugins/fasta2handle.py diff --git a/Plugins/fasta2handle.py b/Plugins/fasta2handle.py deleted file mode 100644 index b9e319b..0000000 --- a/Plugins/fasta2handle.py +++ /dev/null @@ -1,9 +0,0 @@ -#fasta2handle.py - -from Bio import SeqIO - -class Plugin: - - def process(self, file_path): - with open(file_path) as handle: - return list(SeqIO.parse(handle, "fasta")) \ No newline at end of file From 2239814117782d0b199f41fd265c9ff62c9b7191 Mon Sep 17 00:00:00 2001 From: AxelGiottonini <75320942+AxelGiottonini@users.noreply.github.com> Date: Thu, 20 May 2021 09:27:51 +0200 Subject: [PATCH 16/22] renamed as to_handle_gff_maker --- Plugins/gff_maker2handle.py | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 Plugins/gff_maker2handle.py diff --git a/Plugins/gff_maker2handle.py b/Plugins/gff_maker2handle.py deleted file mode 100644 index 39124c2..0000000 --- a/Plugins/gff_maker2handle.py +++ /dev/null @@ -1,16 +0,0 @@ -#gff_maker2handle.py - -import pandas as pd -import re - -class Plugin: - - def process(self, file_path): - with open(file_path) as handle: - gff = pd.read_csv(handle, sep="\t") - gff = gff.reset_index() - gff.columns = ["seq_id", "source", "ft_type", "start", "stop", "score", "strand", "phase", "attr"] - gff = gff.sort_values(by=["seq_id"]).drop(["source", "score", "phase"], axis=1).dropna() - gff["sub_seq_id"] = [re.split(r':',re.search("^ID=.*?;", x).group(0)[3:-1])[0] for x in gff["attr"]] - return gff[["seq_id", "sub_seq_id", "ft_type", "start", "stop", "strand"]].sort_values(by=["seq_id", "sub_seq_id", "ft_type"]).set_index(["seq_id", "sub_seq_id", "ft_type", "start", "stop", "strand"]) - \ No newline at end of file From 63dfc33705cd25997a30979425f657c319447f80 Mon Sep 17 00:00:00 2001 From: AxelGiottonini <75320942+AxelGiottonini@users.noreply.github.com> Date: Thu, 20 May 2021 09:28:14 +0200 Subject: [PATCH 17/22] renamed as to_handle_tab_pannzer --- Plugins/tab_pannzer2handle.py | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 Plugins/tab_pannzer2handle.py diff --git a/Plugins/tab_pannzer2handle.py b/Plugins/tab_pannzer2handle.py deleted file mode 100644 index ac7c532..0000000 --- a/Plugins/tab_pannzer2handle.py +++ /dev/null @@ -1,11 +0,0 @@ -#tab_pannzer2handle.py - -import pandas as pd - -class Plugin: - - def process(self, file_path): - with open(file_path) as handle: - anno = pd.read_csv(handle, sep="\t") - anno = anno.sort_values(by=["qpid", "type"]).drop(["score", "PPV"], axis=1).set_index(["qpid", "type", "id", "desc"]) - return anno \ No newline at end of file From a4a1ba885b7e324d063db702f8c8cdf36c81b6d3 Mon Sep 17 00:00:00 2001 From: AxelGiottonini <75320942+AxelGiottonini@users.noreply.github.com> Date: Thu, 20 May 2021 09:29:03 +0200 Subject: [PATCH 18/22] Delete files directory --- files/config.info | 49 ----------------------------------------------- 1 file changed, 49 deletions(-) delete mode 100644 files/config.info diff --git a/files/config.info b/files/config.info deleted file mode 100644 index 1aa75e3..0000000 --- a/files/config.info +++ /dev/null @@ -1,49 +0,0 @@ -# Axel Giottonini -# test config.info -# 14.05.2021 - - -project:temp -division:inv -taxonomy:29031 -organism:Phlebotomus papatasi -molecule_type:genomic DNA -topology:linear -description:empty -transl_table:0 - - - -fasta2handle:.fasta2handle,Plugins -gff_maker2handle:.gff_maker2handle,Plugins -tab_pannzer2handle:.tab_pannzer2handle,Plugins -read_fasta:.read_fasta,Plugins -read_gff_maker_3UTR:.read_gff_maker_3UTR,Plugins -read_gff_maker_5UTR:.read_gff_maker_5UTR,Plugins -read_gff_maker_CDS:.read_gff_maker_CDS,Plugins -read_gff_maker_exon:.read_gff_maker_exon,Plugins -read_gff_maker_gene:.read_gff_maker_gene,Plugins -read_gff_maker_main:.read_gff_maker_main,Plugins -read_gff_maker_mRNA:.read_gff_maker_mRNA,Plugins -read_tab_pannzer_CDS:.read_tab_pannzer_CDS,Plugins -read_tab_pannzer_gene:.read_tab_pannzer_gene,Plugins - - - -fasta:fasta2handle,files/sequences.fasta -gff_maker:gff_maker2handle,files/data.gff -tab_pannzer:tab_pannzer2handle,files/anno.out - - - --read_fasta,fasta ---read_gff_maker_main,gff_maker ----read_gff_maker_gene,gff_maker -----read_tab_pannzer_gene,tab_pannzer ----read_gff_maker_mRNA,gff_maker ----read_gff_maker_CDS,gff_maker -----read_tab_pannzer_CDS,tab_pannzer ----read_gff_maker_3UTR,gff_maker ----read_gff_maker_5UTR,gff_maker ---read_gff_maker_exon,gff_maker - From e599c67d3c9950aae4d4631a0a96c557a75cd197 Mon Sep 17 00:00:00 2001 From: AxelGiottonini <75320942+AxelGiottonini@users.noreply.github.com> Date: Thu, 20 May 2021 09:29:53 +0200 Subject: [PATCH 19/22] core upload handle config file as an argument --- core.py | 157 ++++++++++++++++++++++++++++++++++++++++++++++++-------- main.py | 14 +++-- 2 files changed, 146 insertions(+), 25 deletions(-) diff --git a/core.py b/core.py index 3bbc502..3fed957 100644 --- a/core.py +++ b/core.py @@ -3,13 +3,27 @@ import importlib import re -class app: +""" +""" +class App: + """ + Description: + + Arguments: + + Output: + + Note: + """ def __init__(self, config_path): + self.metadata = None self.plugins = None self.handles = None self.workflow = None + self.current_sequence = None + config = self.read_config(config_path) self.set_metadata(config["metadata"]) @@ -17,16 +31,16 @@ def __init__(self, config_path): self.set_handles(config["handles"]) self.set_workflow(config["workflow"]) + self.all_plugins_required_metadata_check() + """ - This function reads the config file which is divided into four fields : - - metadata: contains the general and shared informations for EMBLish - as a couple key:value - - plugins: contains the list of the plugins that will be used as a - triplet plugin_key:plugin_name,plugin_package - - handles: contains the list of files that will be used as inputs as - a triplet handle_key:plugin,file_path - - workflow: contains a hierarchical list of the different step to run - as a couple plugin_key,handle_key + Description: + + Arguments: + + Output: + + Note: """ def read_config(self, config_path): config = { @@ -67,26 +81,68 @@ def read_config(self, config_path): return config """ - This function converts the array containing the metadata into a dictionnary + Description: + - converts the array containing the metadata into a dictionnary by splitting + each string containing the metadata key and metadata value into an item with + the key and value. + Arguments: + - array: list of strings + Output: + - dictionnary of strings + Note: """ - def set_metadata(self, array): - self.metadata = {element.split(":")[0]:element.split(":")[1] for element in array} + def set_metadata(self, array:list): + + def convert(value): + temp = value.split(",") + if len(temp) > 1: + if temp[1] == "int": + return int(temp[0]) + return value + + self.metadata = {element.split(":")[0]:convert(element.split(":")[1]) for element in array} """ - This function converts the array containing the plugins parameters into a dictionnary - with plugins to call with their key + Description: + - converts the array containing the plugins parameters (name, package) into a + dictionnary by splitting each string containing the plugin key, the plugin name + and the plugin package into an item with the key and the callable plugin. + Store the result in the self.metadata variable. + Arguments: + - array: list of strings + Output: + - dictionnary of plugin objects + Note: """ - def set_plugins(self, array): + def set_plugins(self, array:list): self.plugins = {element.split(":")[0]:importlib.import_module(element.split(":")[1].split(",")[0],element.split(":")[1].split(",")[1]).Plugin() for element in array} """ - This function converts the array containing the handles parameters into a dictionnary - with handles to call with their key + Description: + - converts the array containing the handles parameters into a dictionnary by + splitting each string containing the handle key, the handle converter and the + file path into an item with the key and the converted as a data frame handle. + Store the result in the self.plugin variable. + Arguments: + - array: list of strings + Output: + - dictionnary of data frames + Note: """ def set_handles(self, array): self.handles = {element.split(":")[0]:self.plugins[element.split(":")[1].split(",")[0]].process(element.split(":")[1].split(",")[1]) for element in array} """ + Description: + - converts the array containing the workflow into a recursive automaton where + task are described by a tuple containing the required plugin, the handle where + the data is found and a list of elements to call. + Store the result in the self.handles variable. + Arguments: + - array: list of strings + Output (assigned): + - array: list of tuples (recursive) + Note: """ def set_workflow(self, array): temp = self.refactor_workflow(array) @@ -94,7 +150,14 @@ def set_workflow(self, array): self.workflow = self.convert_workflow_task(temp[0]) """ - Convert the list element in triplet level,,[] + Description: + - converts strings into a tuple containing the level of the task, the rest of the + string and an empty array. + Arguments: + - array list of strings + Output: + - array: list of tuples + Note: """ def refactor_workflow(self, array): @@ -107,7 +170,13 @@ def refactor_workflow(self, array): return array """ - Order the elements and create the hierarchical nodes + Description: + - place the tasks in their parent (level-1) tasks array + Arguments: + - array: list of tuples + Output: + - array: list of tuples (recursive) + Note: """ def merge_workflow(self, array): array.insert(0, (0,None,[])) @@ -133,12 +202,22 @@ def merge_workflow(self, array): return array """ + Description: + - recursively converts the tuples containing the task string into a tuple containing + a reference to the application, the plugin key, the handle key and an array of + subtasks. + Arguments: + - array: list of tuples (recursive) + Output: + - array: list of tuples (recusrive) + Note: """ def convert_workflow_task(self, task): if task[1]: return ( self, task[1].split(",")[0], + task[1].split(",")[2] if len(task[1].split(",")) > 2 else "default", task[1].split(",")[1], [self.convert_workflow_task(sub_task) for sub_task in task[2]] ) @@ -146,7 +225,43 @@ def convert_workflow_task(self, task): return [self.convert_workflow_task(sub_task) for sub_task in task[2]] """ + Description: + + Arguments: + + Output: + + Note: + """ + def all_plugins_required_metadata_check(self): + for key, plugin in self.plugins.items(): + if not plugin.required_metadata_check(self): + raise InvalidConfigurationError(f"{key} plugin could not find required metadata") + + """ + Description: + + Arguments: + + Output: + + Note: """ def run(self): for app, key_plugin, *args in self.workflow: - app.plugins[key_plugin].process(app, *args) \ No newline at end of file + app.plugins[key_plugin].process(app, *args) + +""" +""" +class InvalidConfigurationError(Exception): + def __init__(self, *args): + if args: + self.message = args[0] + else: + self.message = None + + def __str__(self): + if self.message: + return f"InvalidConfigurationError, {self.message}" + else: + return "InvalidConfigurationError has been raised" \ No newline at end of file diff --git a/main.py b/main.py index 1331b9a..875477b 100644 --- a/main.py +++ b/main.py @@ -1,8 +1,14 @@ #main.py -from core import app +from core import App +import sys -if __name__ == "__main__": +def main(): + args = sys.argv[1:] + + config_file = args[0] - app = app("files/config.info") - app.run() \ No newline at end of file + App(config_file).run() + +if __name__ == "__main__": + main() \ No newline at end of file From c418d55a43df71333a5ba2295d3d00fd17fbaa45 Mon Sep 17 00:00:00 2001 From: AxelGiottonini <75320942+AxelGiottonini@users.noreply.github.com> Date: Thu, 20 May 2021 09:33:12 +0200 Subject: [PATCH 20/22] Update README.md --- README.md | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 40f2b8e..734ccf6 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,58 @@ # GFF3-Tabular_to_EMBL -## Status of the project +``` +# Axel Giottonini +# test config.info +# 19.05.2021 -## Upcoming features + +project:PRJEB1234 +division:inv +taxonomy:29031 +organism:Phlebotomus papatasi +molecule_type:genomic DNA +topology:linear +description:empty +transl_table:1,int + + + +read_fasta:.read_fasta,Plugins +read_gff_maker_3UTR:.read_gff_maker_3UTR,Plugins +read_gff_maker_5UTR:.read_gff_maker_5UTR,Plugins +read_gff_maker_CDS:.read_gff_maker_CDS,Plugins +read_gff_maker_exon:.read_gff_maker_exon,Plugins +read_gff_maker_gene:.read_gff_maker_gene,Plugins +read_gff_maker_misc_feature:.read_gff_maker_misc_feature,Plugins +read_gff_maker_mRNA:.read_gff_maker_mRNA,Plugins +read_gff_maker_source:.read_gff_maker_source,Plugins +read_tab_pannzer_CDS:.read_tab_pannzer_CDS,Plugins +read_tab_pannzer_gene:.read_tab_pannzer_gene,Plugins +to_handle_fasta:.to_handle_fasta,Plugins +to_handle_gff_maker:.to_handle_gff_maker,Plugins +to_handle_tab_pannzer:.to_handle_tab_pannzer,Plugins +verify_gff_maker_CDS:.verify_gff_maker_CDS,Plugins + + + +fasta:to_handle_fasta,sequences.fasta +gff_maker:to_handle_gff_maker,data.gff +tab_pannzer:to_handle_tab_pannzer,anno.out + + + +-read_fasta,fasta +--read_gff_maker_source,gff_maker +---read_gff_maker_gene,gff_maker +----read_tab_pannzer_gene,tab_pannzer +---read_gff_maker_mRNA,gff_maker +---read_gff_maker_CDS,gff_maker +----read_tab_pannzer_CDS,tab_pannzer +---verify_gff_maker_CDS,NF,verify +---read_gff_maker_misc_feature,gff_maker,bypass +---read_gff_maker_3UTR,gff_maker +---read_gff_maker_5UTR,gff_maker +--read_gff_maker_exon,gff_maker + + +``` From 12795dd374a55eb8e771fad6ad41fb9b816e3635 Mon Sep 17 00:00:00 2001 From: AxelGiottonini <75320942+AxelGiottonini@users.noreply.github.com> Date: Thu, 20 May 2021 09:34:00 +0200 Subject: [PATCH 21/22] Delete Pipfile --- Pipfile | 15 --------------- 1 file changed, 15 deletions(-) delete mode 100644 Pipfile diff --git a/Pipfile b/Pipfile deleted file mode 100644 index c029e52..0000000 --- a/Pipfile +++ /dev/null @@ -1,15 +0,0 @@ -[[source]] -url = "https://pypi.org/simple" -verify_ssl = true -name = "pypi" - -[packages] -biopython = "*" -tqdm = "*" -pandas = "*" -joblib = "*" - -[dev-packages] - -[requires] -python_version = "3.9" From 6429b39fceecbd5a635bd89928b2219c5cb805ca Mon Sep 17 00:00:00 2001 From: AxelGiottonini <75320942+AxelGiottonini@users.noreply.github.com> Date: Thu, 20 May 2021 09:34:07 +0200 Subject: [PATCH 22/22] Delete Pipfile.lock --- Pipfile.lock | 134 --------------------------------------------------- 1 file changed, 134 deletions(-) delete mode 100644 Pipfile.lock diff --git a/Pipfile.lock b/Pipfile.lock deleted file mode 100644 index 43ebfb6..0000000 --- a/Pipfile.lock +++ /dev/null @@ -1,134 +0,0 @@ -{ - "_meta": { - "hash": { - "sha256": "bc0663e00dfee42678acee85bb844332840e78c75b12bf90e09c3210744e343b" - }, - "pipfile-spec": 6, - "requires": { - "python_version": "3.9" - }, - "sources": [ - { - "name": "pypi", - "url": "https://pypi.org/simple", - "verify_ssl": true - } - ] - }, - "default": { - "biopython": { - "hashes": [ - "sha256:010142a8ec2549ff0649edd497658964ef1a18eefdb9fd942ec1e81b292ce2d9", - "sha256:0b9fbb0d3022dc22716da108b8a81b80d952cd97ac1f106de491dce850f92f62", - "sha256:0df5cddef2819c975e6508adf5d85aa046e449df5420d02b04871c7836b41273", - "sha256:194528eda6856a4c68f840ca0bcc9b544a5edee3548b97521084e7ac38c833ca", - "sha256:195f099c2c0c39518b6df921ab2b3cc43a601896018fc61909ac8385d5878866", - "sha256:1ee0a0b6c2376680fea6642d5080baa419fd73df104a62d58a8baf7a8bbe4564", - "sha256:2bd5a630be2a8e593094f7b1717fc962eda8931b68542b97fbf9bd8e2ac1e08d", - "sha256:4565c97fab16c5697d067b821b6a1da0ec3ef36a9c96cf103ac7b4a94eb9f9ba", - "sha256:48d424453a5512a1d1d41a4acabdfe5291da1f491a2d3606f2b0e4fbd63aeda6", - "sha256:5c0b369f91a76b8e5e36624d075585c3f0f088ea4a6e3d015c48f08e48ce0114", - "sha256:75b55000793f6b76334b8e80dc7e6d8cd2b019af917aa431cea6646e8e696c7f", - "sha256:ada611f12ee3b0bef7308ef41ee7b94898613b369ab44e0268d74bd1d6a06920", - "sha256:cc3b0b78022d14f11d508038a288a189d03c97c476d6636c7b6f98bd8bc8462b", - "sha256:e0af107cc62a905d13d35dd7b38f335a37752ede45e4617139e84409a6a88dc4", - "sha256:f1076653937947773768455556b1d24acad9575759e9089082f32636b09add54", - "sha256:f5021a398c898b9cf6815cc5171c146a601b935b55364c53e6516a2545ab740c" - ], - "index": "pypi", - "version": "==1.78" - }, - "joblib": { - "hashes": [ - "sha256:9c17567692206d2f3fb9ecf5e991084254fe631665c450b443761c4186a613f7", - "sha256:feeb1ec69c4d45129954f1b7034954241eedfd6ba39b5e9e4b6883be3332d5e5" - ], - "index": "pypi", - "version": "==1.0.1" - }, - "numpy": { - "hashes": [ - "sha256:2428b109306075d89d21135bdd6b785f132a1f5a3260c371cee1fae427e12727", - "sha256:377751954da04d4a6950191b20539066b4e19e3b559d4695399c5e8e3e683bf6", - "sha256:4703b9e937df83f5b6b7447ca5912b5f5f297aba45f91dbbbc63ff9278c7aa98", - "sha256:471c0571d0895c68da309dacee4e95a0811d0a9f9f532a48dc1bea5f3b7ad2b7", - "sha256:61d5b4cf73622e4d0c6b83408a16631b670fc045afd6540679aa35591a17fe6d", - "sha256:6c915ee7dba1071554e70a3664a839fbc033e1d6528199d4621eeaaa5487ccd2", - "sha256:6e51e417d9ae2e7848314994e6fc3832c9d426abce9328cf7571eefceb43e6c9", - "sha256:719656636c48be22c23641859ff2419b27b6bdf844b36a2447cb39caceb00935", - "sha256:780ae5284cb770ade51d4b4a7dce4faa554eb1d88a56d0e8b9f35fca9b0270ff", - "sha256:878922bf5ad7550aa044aa9301d417e2d3ae50f0f577de92051d739ac6096cee", - "sha256:924dc3f83de20437de95a73516f36e09918e9c9c18d5eac520062c49191025fb", - "sha256:97ce8b8ace7d3b9288d88177e66ee75480fb79b9cf745e91ecfe65d91a856042", - "sha256:9c0fab855ae790ca74b27e55240fe4f2a36a364a3f1ebcfd1fb5ac4088f1cec3", - "sha256:9cab23439eb1ebfed1aaec9cd42b7dc50fc96d5cd3147da348d9161f0501ada5", - "sha256:a8e6859913ec8eeef3dbe9aed3bf475347642d1cdd6217c30f28dee8903528e6", - "sha256:aa046527c04688af680217fffac61eec2350ef3f3d7320c07fd33f5c6e7b4d5f", - "sha256:abc81829c4039e7e4c30f7897938fa5d4916a09c2c7eb9b244b7a35ddc9656f4", - "sha256:bad70051de2c50b1a6259a6df1daaafe8c480ca98132da98976d8591c412e737", - "sha256:c73a7975d77f15f7f68dacfb2bca3d3f479f158313642e8ea9058eea06637931", - "sha256:d15007f857d6995db15195217afdbddfcd203dfaa0ba6878a2f580eaf810ecd6", - "sha256:d76061ae5cab49b83a8cf3feacefc2053fac672728802ac137dd8c4123397677", - "sha256:e8e4fbbb7e7634f263c5b0150a629342cc19b47c5eba8d1cd4363ab3455ab576", - "sha256:e9459f40244bb02b2f14f6af0cd0732791d72232bbb0dc4bab57ef88e75f6935", - "sha256:edb1f041a9146dcf02cd7df7187db46ab524b9af2515f392f337c7cbbf5b52cd" - ], - "markers": "python_version >= '3.7'", - "version": "==1.20.2" - }, - "pandas": { - "hashes": [ - "sha256:09761bf5f8c741d47d4b8b9073288de1be39bbfccc281d70b889ade12b2aad29", - "sha256:0f27fd1adfa256388dc34895ca5437eaf254832223812afd817a6f73127f969c", - "sha256:43e00770552595c2250d8d712ec8b6e08ca73089ac823122344f023efa4abea3", - "sha256:46fc671c542a8392a4f4c13edc8527e3a10f6cb62912d856f82248feb747f06e", - "sha256:475b7772b6e18a93a43ea83517932deff33954a10d4fbae18d0c1aba4182310f", - "sha256:4d821b9b911fc1b7d428978d04ace33f0af32bb7549525c8a7b08444bce46b74", - "sha256:5e3c8c60541396110586bcbe6eccdc335a38e7de8c217060edaf4722260b158f", - "sha256:621c044a1b5e535cf7dcb3ab39fca6f867095c3ef223a524f18f60c7fee028ea", - "sha256:72ffcea00ae8ffcdbdefff800284311e155fbb5ed6758f1a6110fc1f8f8f0c1c", - "sha256:8a051e957c5206f722e83f295f95a2cf053e890f9a1fba0065780a8c2d045f5d", - "sha256:97b1954533b2a74c7e20d1342c4f01311d3203b48f2ebf651891e6a6eaf01104", - "sha256:9f5829e64507ad10e2561b60baf285c470f3c4454b007c860e77849b88865ae7", - "sha256:a93e34f10f67d81de706ce00bf8bb3798403cabce4ccb2de10c61b5ae8786ab5", - "sha256:d59842a5aa89ca03c2099312163ffdd06f56486050e641a45d926a072f04d994", - "sha256:dbb255975eb94143f2e6ec7dadda671d25147939047839cd6b8a4aff0379bb9b", - "sha256:df6f10b85aef7a5bb25259ad651ad1cc1d6bb09000595cab47e718cbac250b1d" - ], - "index": "pypi", - "version": "==1.2.3" - }, - "python-dateutil": { - "hashes": [ - "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c", - "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a" - ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==2.8.1" - }, - "pytz": { - "hashes": [ - "sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da", - "sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798" - ], - "version": "==2021.1" - }, - "six": { - "hashes": [ - "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259", - "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced" - ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==1.15.0" - }, - "tqdm": { - "hashes": [ - "sha256:9fdf349068d047d4cfbe24862c425883af1db29bcddf4b0eeb2524f6fbdb23c7", - "sha256:d666ae29164da3e517fcf125e41d4fe96e5bb375cd87ff9763f6b38b5592fe33" - ], - "index": "pypi", - "version": "==4.59.0" - } - }, - "develop": {} -}