From b13106550c21bf5d3c8ccbb71a512042d5a4c781 Mon Sep 17 00:00:00 2001 From: suecharo Date: Fri, 13 Apr 2018 16:13:49 +0900 Subject: [PATCH] release. --- KPHMMER/__init__.py | 5 - LICENSE | 21 --- MANIFEST.in | 2 + script/KPHMMER_API_access.py | 296 +++++++++++++++++++++++++++++++ script/KPHMMER_stat.py | 131 ++++++++++++++ script/README.txt | 8 + script/calc_pvalue.py | 41 +++++ script/domain_text.py | 20 +++ script/dup.py | 14 ++ script/make_ensemble_tsv.py | 78 ++++++++ script/tsv.py | 41 +++++ script/tsv2txt.py | 41 +++++ script/txt2hmm_domain_changer.py | 37 ++++ script/yaml_gene_count_getter.py | 25 +++ setup.cfg | 4 + setup.py | 60 +++---- 16 files changed, 758 insertions(+), 66 deletions(-) delete mode 100644 LICENSE create mode 100644 script/KPHMMER_API_access.py create mode 100644 script/KPHMMER_stat.py create mode 100644 script/README.txt create mode 100644 script/calc_pvalue.py create mode 100644 script/domain_text.py create mode 100644 script/dup.py create mode 100644 script/make_ensemble_tsv.py create mode 100644 script/tsv.py create mode 100644 script/tsv2txt.py create mode 100644 script/txt2hmm_domain_changer.py create mode 100644 script/yaml_gene_count_getter.py create mode 100644 setup.cfg diff --git a/KPHMMER/__init__.py b/KPHMMER/__init__.py index 72b6acf..4269b49 100644 --- a/KPHMMER/__init__.py +++ b/KPHMMER/__init__.py @@ -11,8 +11,3 @@ from .query import Query from .search import Search from .util import check_status_code, dump_log, get_kegg, get_pfam - -__author__ = "Hirotaka Suetake" -__author_email__ = "hirotaka.suetake@riken.jp" -__version__ = "1.0" -__release__ = "1" diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 2053b24..0000000 --- a/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2018 suecharo - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/MANIFEST.in b/MANIFEST.in index 052c0b5..0366f42 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1 +1,3 @@ recursive-include KPHMMER/config_files * +recursive-include data * +recursive-include script * diff --git a/script/KPHMMER_API_access.py b/script/KPHMMER_API_access.py new file mode 100644 index 0000000..a010898 --- /dev/null +++ b/script/KPHMMER_API_access.py @@ -0,0 +1,296 @@ +#!/bin/env python +# coding: utf-8 +""" +KPHMMER でそれぞれの API を叩いた回数を表示するバージョン +$ python3 KPHMMER_API-access.py で普通の使い方 +""" +import sys +import traceback +from collections import defaultdict + +from KPHMMER import (Analysis, Config, Convert, Query, Search, + determine_submethod, get_args) + +from .util import check_status_code, dump_log, get_kegg, get_pfam + + +class QueryCount(Query): + def __init__(self, args=None): + super().__init__() + self.kegg_count = 0 + + def _search_pathway(self): + dump_log("Start searching pathway") + for organism in self.l_organism_code: + endpoint = "/list/pathway/{}".format(organism) + status_code, text = get_kegg(endpoint) + self.kegg_count += 1 + msg = "Query organism code {} is wrong.".format(organism) + if check_status_code(status_code, msg): + self.d_organism[organism] = dict() + self.d_organism[organism]["pathway"] = [] + for row in text.split("\n"): + ele = row.split("\t")[0] + pathway_id = ele[-5:] + if pathway_id != "": + self.d_organism[organism]["pathway"].append(pathway_id) + + return True + + def _find_gene(self): + dump_log("Start finding genes") + for organism, value in self.d_organism.items(): + endpoint = "/link/{}/pathway".format(organism) + status_code, text = get_kegg(endpoint) + self.kegg_count += 1 + msg = "This endpoint {} is Nothing.".format(endpoint) + d_path_to_gene = defaultdict(list) + if check_status_code(status_code, msg): + for row in text.split("\n"): + l_ele = row.split("\t") + if len(l_ele) < 2: + continue + d_path_to_gene[l_ele[0]].append(l_ele[1]) + + s_1st_gene = set() + s_2nd_gene = set() + for name in ["1st", "2nd"]: + for pathway_id in value["{}_pathway".format(name)]: + l_gene_id = d_path_to_gene[pathway_id] + for gene_id in l_gene_id: + if name == "1st": + s_1st_gene.add(gene_id) + elif name == "2nd": + s_2nd_gene.add(gene_id) + s_duplicate = s_1st_gene & s_2nd_gene + + len_1 = len(s_1st_gene) + len_2 = len(s_2nd_gene) + len_dup = len(s_duplicate) + dump_log("{0}'s 1st gene count : {1}".format(organism, len_1)) + dump_log("{0}'s 2nd gene count : {1}".format(organism, len_2)) + msg = "{0}'s duplicate gene count : {1}".format(organism, len_dup) + dump_log(msg) + + if self.d_config["CONFIG"]["INSERT_DUPLICATE"] == "1st": + s_2nd_gene = s_2nd_gene - s_duplicate + elif self.d_config["CONFIG"]["INSERT_DUPLICATE"] == "2nd": + s_1st_gene = s_1st_gene - s_duplicate + else: + msg = "Please check your config.yml-INSERT_DUPLICATE" + raise ValueError(msg) + + self.d_organism[organism]["1st_gene"] = list(s_1st_gene) + self.d_organism[organism]["2nd_gene"] = list(s_2nd_gene) + self.d_organism[organism]["duplicate_gene"] = list(s_duplicate) + + return True + + def _find_domain(self): + dump_log("Start finding pathways") + for organism, value in self.d_organism.items(): + dump_log("Organism : {}".format(organism)) + self.d_organism[organism]["d_domain"] = dict() + all_gene = value["1st_gene"] + value["2nd_gene"] + group_num = 10 + dump_log("Number of genes : {}".format(len(all_gene))) + count = 0 + for i in range(0, len(all_gene), group_num): + count += 10 + dump_log("{} / {}".format(count, len(all_gene))) + chunk = all_gene[i:i + group_num] + endpoint = "/get/{}".format("+".join(chunk)) + status_code, text = get_kegg(endpoint) + self.kegg_count += 1 + msg = "This endpoint {} is Nothing.".format(endpoint) + if check_status_code(status_code, msg): + l_text = text.split("//") + for j in range(len(chunk)): + gene = chunk[j] + ele_text = l_text[j] + l_domain = [] + b_domain = False + for row in ele_text.split("\n"): + if len(row) < 5: + continue + if b_domain is True: + if row[0] != " ": + break + else: + l_row = row.split(" ") + else: + if row[:5] == "MOTIF": + b_domain = True + l_row = row.split(" ") + else: + continue + for ele in l_row: + if ele in ["MOTIF", "", "Pfam:"]: + continue + else: + l_domain.append(ele) + self.d_organism[organism]["d_domain"][gene] = l_domain + + return True + + +class ConvertCount(Convert): + def __init__(self, args=None): + super().__init__() + self.kegg_count = 0 + + def _dump_fasta(self): + dump_log("Start dumping fasta files") + for organism, value in self.d_domain.items(): + dump_log("Organism : {}".format(organism)) + gene_1st = value["gene_1st"] + gene_2nd = value["gene_2nd"] + l_fasta_1st = [] + l_fasta_2nd = [] + l_fasta_all = [] + group_num = 10 + len_gene = len(gene_1st) + len(gene_2nd) + dump_log("Number of genes : {}".format(len_gene)) + count = 0 + for i in range(0, len(gene_1st), group_num): + count += 10 + dump_log("{} / {}".format(count, len_gene)) + chunk = gene_1st[i:i + group_num] + endpoint = "/get/{}/aaseq".format("+".join(chunk)) + status_code, text = get_kegg(endpoint) + self.kegg_count += 1 + msg = "This endpoint {} is Nothing.".format(endpoint) + if check_status_code(status_code, msg): + l_fasta = [] + l_content = [] + for row in text.split("\n"): + if len(row) == 0: + continue + if row[0] == ">": + if len(l_content) != 0: + amino = "".join(l_content) + l_fasta.append(amino) + l_content = [] + l_fasta.append(row) + else: + l_fasta.append(row) + else: + l_content.append(row) + else: + if len(l_content) != 0: + amino = "".join(l_content) + l_fasta.append(amino) + l_fasta_1st.extend(l_fasta) + l_fasta_all.extend(l_fasta) + + count = len(gene_1st) + for i in range(0, len(gene_2nd), group_num): + count += 10 + dump_log("{} / {}".format(count, len_gene)) + chunk = gene_2nd[i:i + group_num] + endpoint = "/get/{}/aaseq".format("+".join(chunk)) + status_code, text = get_kegg(endpoint) + self.kegg_count += 1 + msg = "This endpoint {} is Nothing.".format(endpoint) + if check_status_code(status_code, msg): + l_fasta = [] + l_content = [] + for row in text.split("\n"): + if len(row) == 0: + continue + if row[0] == ">": + if len(l_content) != 0: + amino = "".join(l_content) + l_fasta.append(amino) + l_content = [] + l_fasta.append(row) + else: + l_fasta.append(row) + else: + l_content.append(row) + else: + if len(l_content) != 0: + amino = "".join(l_content) + l_fasta.append(amino) + l_fasta_2nd.extend(l_fasta) + l_fasta_all.extend(l_fasta) + + fasta_1st = "\n".join(l_fasta_1st) + fasta_2nd = "\n".join(l_fasta_2nd) + fasta_all = "\n".join(l_fasta_all) + for (fa, name) in [[fasta_1st, "1st"], [fasta_2nd, "2nd"], + [fasta_all, "all"]]: + file_name = "{}_{}.fasta".format(organism, name) + file_path = self.output.joinpath(file_name) + with file_path.open(mode="w") as f: + f.write(fa) + + return True + + +class AnalysisCount(Analysis): + def __init__(self, args=None): + super().__init__() + self.pfam_count = 0 + + def _dump_hmm(self): + dump_log("Start dumping hmm files") + for category in ["1st", "2nd"]: + dump_log("Start {} category".format(category)) + if category == "1st": + use_dict = self.d_count_1st + elif category == "2nd": + use_dict = self.d_count_2nd + dir_name = "{}_{}".format("_".join(self.d_domain.keys()), category) + dir_path = self.output.joinpath(dir_name) + dir_path.mkdir(parents=True) + dump_log("Number of domains : {}".format(len(use_dict.keys()))) + count = 0 + for domain in use_dict.keys(): + count += 1 + dump_log("{} : {}".format(count, domain)) + endpoint = "/family/{}/hmm".format(domain) + status_code, text = get_pfam(endpoint) + self.pfam_count += 1 + msg = "input domain {} is not found.".format(domain) + if check_status_code(status_code, msg): + file_path = dir_path.joinpath("{}.hmm".format(domain)) + with file_path.open(mode="w") as f: + f.write(text) + + return True + + +def main(): + args = get_args() + submethod = determine_submethod(args) + if submethod is False: + get_args(usage=True) + + try: + if submethod == "query": + my_submethod = Query(args) + kegg = "KEGG API access count : {}".format(my_submethod.kegg_count) + dump_log(kegg) + elif submethod == "search": + my_submethod = Search(args) + elif submethod == "analysis": + my_submethod = Analysis(args) + pfam = "Pfam API access count : {}".format(my_submethod.pfam_count) + dump_log(pfam) + elif submethod == "convert": + my_submethod = Convert(args) + kegg = "KEGG API access count : {}".format(my_submethod.kegg_count) + dump_log(kegg) + elif submethod == "config": + my_submethod = Config(args) + my_submethod.run() + except: + traceback.print_exc() + sys.exit(1) + + return True + + +if __name__ == "__main__": + main() diff --git a/script/KPHMMER_stat.py b/script/KPHMMER_stat.py new file mode 100644 index 0000000..d5c4570 --- /dev/null +++ b/script/KPHMMER_stat.py @@ -0,0 +1,131 @@ +#!/bin/env python +# coding: utf-8 +from KPHMMER.util import dump_log +import traceback +import sys + +class KphmmerStat: + def __init__(self, tsv_path, yml_path): + self.tsv_path = tsv_path + self.yml_path = yml_path + self.l_domain_gene = [] + + def run(self): + dump_log("Start KPHMMER stat") + dump_log("=== Your Input ===") + dump_log("TSV Path : {}".format(self.tsv_path)) + dump_log("YML Path : {}".format(self.yml_path)) + try: + self.read_tsv_data() + except: + traceback.print_exc() + sys.exit(1) + print(self.l_domain_gene) + + def read_tsv_data(self): + with open(self.tsv_path, "r") as f: + d_tsv = f.read() + l_row = d_tsv.split("\n") + for row in l_row: + l_ele = row.split() + if len(row.split()) >= 4: + self.l_domain_gene.append([l_ele[0], l_ele[3]]) + if len(self.l_domain_gene) >= 11: + self.l_domain_gene = self.l_domain_gene[3:-7] + + return True + +# +# +# +# +# +# +# +# +# +# +# +# def get_domain_gene_name(tsv_hmmer_output): +# +# +# +# def count_gene_domain(l_domain_gene): +# gene_domain = defaultdict(list) +# for [domain, gene] in l_domain_gene: +# gene_domain[gene].append(domain) +# +# return gene_domain +# +# +# def split_l_gene(motif_yaml_path, l_gene): +# with open(motif_yaml_path, "r") as f: +# data_yml = f.read() +# d_yml = yaml.load(data_yml) +# gene_1 = set(d_yml["GENE"]["1ST"]) +# gene_2 = set(d_yml["GENE"]["2ND"]) +# +# count_1 = 0 +# count_2 = 0 +# +# for gene in l_gene: +# if gene in gene_1: +# count_1 += 1 +# elif gene in gene_2: +# count_2 += 1 +# else: +# print(gene) +# +# l_count = [len(gene_1), count_1, len(gene_2), count_2] +# +# return l_count +# +# +# def main(): +# print("=== HMMER output analysis start ===") +# print("=== Your input ===") +# hmm_output_path = os.path.abspath(sys.argv[1]) +# motif_yaml_path = os.path.abspath(sys.argv[2]) +# print("HMMER output tsv file path : {}".format(hmm_output_path)) +# print("Domain yaml file : {}".format(motif_yaml_path)) +# print("\n") +# +# l_domain_gene = get_domain_gene_name(tsv_hmmer_output) +# gene_domain = count_gene_domain(l_domain_gene) +# +# +# with open(hmm_output_path, "r") as f: +# tsv_hmmer_output = f.read() +# l_domain_gene = get_domain_gene_name(tsv_hmmer_output) +# gene_domain = count_gene_domain(l_domain_gene) +# +# print("=== Found Gene list and domain ===") +# for key, value in gene_domain.items(): +# print("{} : {}".format(key, value)) +# +# l_gene = list(gene_domain.keys()) +# l_count = split_l_gene(motif_yaml_path, l_gene) +# +# TP = l_count[3] +# FP = l_count[1] +# FN = l_count[2] - l_count[3] +# TN = l_count[0] - l_count[1] +# +# print("\n") +# print("=== 2 * 2 contingency table ===") +# print("|-----------|-------|-------|") +# print("|-----------| 2nd | 1st |") +# print("|-----------|-------|-------|") +# print("| Found | {:>5} | {:>5} |".format(str(TP), str(FP))) +# print("| Not Found | {:>5} | {:>5} |".format(str(FN), str(TN))) +# print("|-----------|-------|-------|") +# print("Precision : {}".format(TP / (TP + FP))) +# print("Recall : {}".format(TP / (TP + FN))) + + +def main(): + my_kphmmer_stat = KphmmerStat() + my_kphmmer_stat.run(sys.argv[1], sys.argv[2]) + +if __name__ == "__main__": + main() diff --git a/script/README.txt b/script/README.txt new file mode 100644 index 0000000..b3f05fe --- /dev/null +++ b/script/README.txt @@ -0,0 +1,8 @@ +## KPHMMER_API_access.py +- KPHMMER でそれぞれの API を叩いた回数を表示するバージョン +$ python3 KPHMMER_API-access.py で普通の使い方 + +## KPHMMER_stat.py +- KPHMMER analysis で出力した,hmm file を用いて,対象の fasta に対し HMMER をかける。 +- その出力の tsv file と 対象生物の KPHMMER query で出力された yaml file を入力として,統計的な結果を求める +$ python3 KPHMMER_stat.py hoge.tsv piyo.yml diff --git a/script/calc_pvalue.py b/script/calc_pvalue.py new file mode 100644 index 0000000..c4b8651 --- /dev/null +++ b/script/calc_pvalue.py @@ -0,0 +1,41 @@ +# coding: utf-8 +import numpy +import sys +from scipy import stats + + +def calc_pvalue(count_1, count_2, count_no_1, count_no_2): + data = numpy.array([[count_1, count_2], [count_no_1, count_no_2]]) + x2, p, dof, exp = stats.chi2_contingency(data) + + return p + + +def main(): + with open(sys.argv[1], "r") as f: + data = f.read() + l_row = data.split("\n") + header = l_row[0].split("\t") + gene_1st_all_count = int(header[1][10:-1]) + gene_2nd_all_count = int(header[2][10:-1]) + + print("=== Start calculation P value. ===") + with open(sys.argv[2], "w") as f: + f.write("{}\tp_value\n".format("\t".join(header))) + for row in l_row[1:]: + if row == "": + continue + l_ele = row.split("\t") + count_1 = int(l_ele[1]) + count_2 = int(l_ele[2]) + count_no_1 = gene_1st_all_count - count_1 + count_no_2 = gene_2nd_all_count - count_2 + p_value = calc_pvalue(count_1, count_2, count_no_1, count_no_2) + f.write("{}\t{}\n".format(row, p_value)) + print("=== Done. ===") + + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/script/domain_text.py b/script/domain_text.py new file mode 100644 index 0000000..6cc3e53 --- /dev/null +++ b/script/domain_text.py @@ -0,0 +1,20 @@ +# coding: utf-8 +import sys + + +def main(): + with open(sys.argv[1], "r") as f: + data = f.read() + l_row = data.split("\n") + l_domain = [] + for row in l_row: + if row == "": + continue + if row[:4] == "NAME": + l_domain.append(row[6:]) + with open(sys.argv[2], "w") as f: + f.write("\n".join(l_domain)) + + +if __name__ == "__main__": + main() diff --git a/script/dup.py b/script/dup.py new file mode 100644 index 0000000..5f5f627 --- /dev/null +++ b/script/dup.py @@ -0,0 +1,14 @@ +# coding: utf-8 + + +def main(): + with open("./KPHMMER_domains.txt", "r") as f: + data_kphmmer = set(f.read().split("\n")) + with open("./Pfam-A_domains.txt", "r") as f: + data_pfam = set(f.read().split("\n")) + print(len(data_kphmmer & data_pfam)) + with open("dup.txt", "w") as f: + f.write("\n".join(list(data_kphmmer & data_pfam))) + +if __name__ == "__main__": + main() diff --git a/script/make_ensemble_tsv.py b/script/make_ensemble_tsv.py new file mode 100644 index 0000000..c9661cb --- /dev/null +++ b/script/make_ensemble_tsv.py @@ -0,0 +1,78 @@ +# coding: utf-8 +import numpy +import sys +from scipy import stats + + +def calc_pvalue(count_1, count_2, count_no_1, count_no_2, cochran): + data = numpy.array([[count_1, count_2], [count_no_1, count_no_2]]) + x2, p, dof, exp = stats.chi2_contingency(data) + + if p < 0.05: + l_exp = list(exp.ravel()) + b_check = False + for num_exp in l_exp: + if num_exp <= cochran: + b_check = True + break + if b_check: + return False + res = data - exp + res_var = numpy.zeros(res.shape) + it = numpy.nditer(data, flags=["multi_index"]) + d_sum = data.sum() + while not it.finished: + var = (1 - (data[:, it.multi_index[1]].sum() / d_sum)) * \ + (1 - (data[it.multi_index[0], :].sum() / d_sum)) + res_var[it.multi_index[0], it.multi_index[1]] = var + it.iternext() + stdres = res / numpy.sqrt(exp * res_var) + if stdres[0][1] >= 1.96: + return p + else: + return False + + return False + + +def main(): + with open("./sco_sma_sgr_sen_all.tsv", "r") as f: + data = f.read() + l_row = data.split("\n") + header = l_row[0].split("\t") + gene_1st_all_count = int(header[1][10:-1]) + gene_2nd_all_count = int(header[2][10:-1]) + + print("=== Start ensemble ===") + l_cochran = list(range(11)) + for cochran in l_cochran: + file_name = "./count_file/count_{}.tsv".format(cochran) + + print("P value : 0.05, Cochran : {}".format(cochran)) + count = 0 + with open(file_name, "w") as f: + f.write("{}\tp_value\n".format("\t".join(header))) + for row in l_row[1:]: + if row == "": + continue + l_ele = row.split("\t") + count_1 = int(l_ele[1]) + count_2 = int(l_ele[2]) + count_no_1 = gene_1st_all_count - count_1 + count_no_2 = gene_2nd_all_count - count_2 + p_value = calc_pvalue(count_1, count_2, count_no_1, + count_no_2, cochran) + if p_value is False: + continue + else: + f.write("{}\t{}\n".format(row, p_value)) + count += 1 + print("2nd count : {}".format(count)) + print("=" * 10) + print("=== Done. ===") + + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/script/tsv.py b/script/tsv.py new file mode 100644 index 0000000..b968075 --- /dev/null +++ b/script/tsv.py @@ -0,0 +1,41 @@ +#!/bin/env python3 +# coding: utf-8 +import requests +import sys + + +def main(): + with open(sys.argv[1], "r") as f: + data = f.read() + l_tmp = data.split("\n") + if len(l_tmp) == 0: + print("Your input file is null.") + sys.exit(1) + l_domain = [] + for row in l_tmp[1:]: + if row == "": + continue + l_ele = row.split("\t") + l_domain.append(l_ele[0]) + + print("=== Number of domains ===") + print(len(l_domain)) + print("=== Domain list ===") + print(" ".join(l_domain)) + + print("=== Convert start ===") + count = 0 + for domain in l_domain: + count += 1 + print("{} : {}".format(str(count), domain)) + endpoint = "/family/{}/hmm".format(domain) + url = "http://pfam.xfam.org" + endpoint + ret = requests.get(url) + text = ret.text + with open("./{}/{}.hmm".format(sys.argv[2], domain), "w") as f: + f.write(text) + print("=== Convert finish ===") + + +if __name__ == "__main__": + main() diff --git a/script/tsv2txt.py b/script/tsv2txt.py new file mode 100644 index 0000000..b968075 --- /dev/null +++ b/script/tsv2txt.py @@ -0,0 +1,41 @@ +#!/bin/env python3 +# coding: utf-8 +import requests +import sys + + +def main(): + with open(sys.argv[1], "r") as f: + data = f.read() + l_tmp = data.split("\n") + if len(l_tmp) == 0: + print("Your input file is null.") + sys.exit(1) + l_domain = [] + for row in l_tmp[1:]: + if row == "": + continue + l_ele = row.split("\t") + l_domain.append(l_ele[0]) + + print("=== Number of domains ===") + print(len(l_domain)) + print("=== Domain list ===") + print(" ".join(l_domain)) + + print("=== Convert start ===") + count = 0 + for domain in l_domain: + count += 1 + print("{} : {}".format(str(count), domain)) + endpoint = "/family/{}/hmm".format(domain) + url = "http://pfam.xfam.org" + endpoint + ret = requests.get(url) + text = ret.text + with open("./{}/{}.hmm".format(sys.argv[2], domain), "w") as f: + f.write(text) + print("=== Convert finish ===") + + +if __name__ == "__main__": + main() diff --git a/script/txt2hmm_domain_changer.py b/script/txt2hmm_domain_changer.py new file mode 100644 index 0000000..9589a15 --- /dev/null +++ b/script/txt2hmm_domain_changer.py @@ -0,0 +1,37 @@ +#!/bin/env python3 +# coding: utf-8 +import requests +import sys + + +def main(): + with open(sys.argv[1], "r") as f: + data = f.read() + l_tmp = data.split("\n") + l_domain = [] + for domain in l_tmp: + if domain == "": + continue + l_domain.append(str(domain)) + + print("=== Number of domains ===") + print(len(l_domain)) + print("=== Domain list ===") + print(" ".join(l_domain)) + + print("=== Convert start ===") + count = 0 + with open(sys.argv[2], "w") as f: + for domain in l_domain: + count += 1 + print("{} : {}".format(str(count), domain)) + endpoint = "/family/{}/hmm".format(domain) + url = "http://pfam.xfam.org" + endpoint + ret = requests.get(url) + text = ret.text + f.write(text) + print("=== Convert finish ===") + + +if __name__ == "__main__": + main() diff --git a/script/yaml_gene_count_getter.py b/script/yaml_gene_count_getter.py new file mode 100644 index 0000000..54a9959 --- /dev/null +++ b/script/yaml_gene_count_getter.py @@ -0,0 +1,25 @@ +#!/bin/env python3 +# coding: utf-8 +import yaml +import os +import sys + + +def main(): + file_path = sys.argv[1] + with open(file_path, "r") as f: + data = yaml.load(f) + count_1 = len(data["GENE"]["1ST"]) + count_2 = len(data["GENE"]["2ND"]) + count_all = count_1 + count_2 + file_abs = os.path.abspath(file_path) + file_name = os.path.basename(file_abs) + f_title, f_ext = os.path.splitext(file_name) + print("=== {} all gene count ===".format(f_title)) + print("1st count : {}".format(count_1)) + print("2nd count : {}".format(count_2)) + print("all count : {}".format(count_all)) + + +if __name__ == "__main__": + main() diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..8bfd5a1 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,4 @@ +[egg_info] +tag_build = +tag_date = 0 + diff --git a/setup.py b/setup.py index 5fc9ce7..3054fcf 100644 --- a/setup.py +++ b/setup.py @@ -1,52 +1,32 @@ -from KPHMMER import ( - __author__, - __author_email__, - __version__, - __release__ -) - from setuptools import setup -from setuptools import find_packages - -import sys - - -# validation -if sys.version_info < (3, 4): - print("Building SAMPLE_PROJECT requires at least Python 3.4 to run.") - sys.exit(1) def main(): - description = "KPHMMER" - setup( name="KPHMMER", - version=__version__, - author=__author__, - author_email=__author_email__, - url="www.example.jp", - description=description, - long_description=description, + version="1.0.1", + description="KPHMMER: Hidden Markov Model generator for detecting KEGG PATHWAY-specific genes", + author="Hirotaka Suetake", + author_email="hirotaka.suetake@riken.jp", + license="MIT", + keywords=["Life Science", "Bioinfomatics", "HMMER", "KEGG"], + packages=["KPHMMER"], zip_safe=False, include_package_data=True, - packages=[ - "KPHMMER" - ], - install_requires=[], - tests_require=[], - setup_requires=[], - scripts=[ - "bin/kphmmer" + install_requires=[ + "numpy", + "PyYAML", + "scipy", + "requests" ], - license="GNU Lesser General Public License v3 or later (LGPLv3+)", - keywords="", - platforms="Linux", - classifiers=["Intended Audience :: System Administrators", - "License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)", - "Natural Language :: Japanese", - "Programming Language :: Python :: 3.4", - ], + scripts=["bin/kphmmer"], + classifiers=[ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3 :: Only", + "Topic :: Scientific/Engineering :: Bio-Informatics", + ] )