From b13106550c21bf5d3c8ccbb71a512042d5a4c781 Mon Sep 17 00:00:00 2001
From: suecharo <suehiro619@gmail.com>
Date: Fri, 13 Apr 2018 16:13:49 +0900
Subject: [PATCH] release.

---
 KPHMMER/__init__.py              |   5 -
 LICENSE                          |  21 ---
 MANIFEST.in                      |   2 +
 script/KPHMMER_API_access.py     | 296 +++++++++++++++++++++++++++++++
 script/KPHMMER_stat.py           | 131 ++++++++++++++
 script/README.txt                |   8 +
 script/calc_pvalue.py            |  41 +++++
 script/domain_text.py            |  20 +++
 script/dup.py                    |  14 ++
 script/make_ensemble_tsv.py      |  78 ++++++++
 script/tsv.py                    |  41 +++++
 script/tsv2txt.py                |  41 +++++
 script/txt2hmm_domain_changer.py |  37 ++++
 script/yaml_gene_count_getter.py |  25 +++
 setup.cfg                        |   4 +
 setup.py                         |  60 +++----
 16 files changed, 758 insertions(+), 66 deletions(-)
 delete mode 100644 LICENSE
 create mode 100644 script/KPHMMER_API_access.py
 create mode 100644 script/KPHMMER_stat.py
 create mode 100644 script/README.txt
 create mode 100644 script/calc_pvalue.py
 create mode 100644 script/domain_text.py
 create mode 100644 script/dup.py
 create mode 100644 script/make_ensemble_tsv.py
 create mode 100644 script/tsv.py
 create mode 100644 script/tsv2txt.py
 create mode 100644 script/txt2hmm_domain_changer.py
 create mode 100644 script/yaml_gene_count_getter.py
 create mode 100644 setup.cfg

diff --git a/KPHMMER/__init__.py b/KPHMMER/__init__.py
index 72b6acf..4269b49 100644
--- a/KPHMMER/__init__.py
+++ b/KPHMMER/__init__.py
@@ -11,8 +11,3 @@
 from .query import Query
 from .search import Search
 from .util import check_status_code, dump_log, get_kegg, get_pfam
-
-__author__ = "Hirotaka Suetake"
-__author_email__ = "hirotaka.suetake@riken.jp"
-__version__ = "1.0"
-__release__ = "1"
diff --git a/LICENSE b/LICENSE
deleted file mode 100644
index 2053b24..0000000
--- a/LICENSE
+++ /dev/null
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2018 suecharo <hirotaka.suetake@riken.jp>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
diff --git a/MANIFEST.in b/MANIFEST.in
index 052c0b5..0366f42 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1 +1,3 @@
 recursive-include KPHMMER/config_files *
+recursive-include data *
+recursive-include script *
diff --git a/script/KPHMMER_API_access.py b/script/KPHMMER_API_access.py
new file mode 100644
index 0000000..a010898
--- /dev/null
+++ b/script/KPHMMER_API_access.py
@@ -0,0 +1,296 @@
+#!/bin/env python
+# coding: utf-8
+"""
+A version of KPHMMER that reports how many times each API was accessed.
+Use it like the normal command: $ python3 KPHMMER_API_access.py
+"""
+import sys
+import traceback
+from collections import defaultdict
+
+from KPHMMER import (Analysis, Config, Convert, Query, Search,
+                     determine_submethod, get_args)
+# Absolute import: this file runs as a script, where "from .util" fails.
+from KPHMMER.util import check_status_code, dump_log, get_kegg, get_pfam
+
+
+class QueryCount(Query):
+    """Query variant that counts its KEGG API requests in self.kegg_count."""
+
+    def __init__(self, args=None):
+        # Hand args on to Query.__init__ instead of silently discarding them.
+        super().__init__(args)
+        self.kegg_count = 0
+
+    def _search_pathway(self):
+        """Store each organism's pathway ids, one KEGG request each."""
+        dump_log("Start searching pathway")
+        for organism in self.l_organism_code:
+            endpoint = "/list/pathway/{}".format(organism)
+            status_code, text = get_kegg(endpoint)
+            self.kegg_count += 1
+            msg = "Query organism code {} is wrong.".format(organism)
+            if check_status_code(status_code, msg):
+                self.d_organism[organism] = dict()
+                self.d_organism[organism]["pathway"] = []
+                for row in text.split("\n"):
+                    # The id is the last five characters of the first column.
+                    pathway_id = row.split("\t")[0][-5:]
+                    if pathway_id != "":
+                        self.d_organism[organism]["pathway"].append(pathway_id)
+
+        return True
+
+    def _find_gene(self):
+        """Collect each organism's 1st/2nd genes, one KEGG request each.
+
+        Raises:
+            ValueError: if config INSERT_DUPLICATE is not "1st" or "2nd".
+        """
+        dump_log("Start finding genes")
+        for organism, value in self.d_organism.items():
+            endpoint = "/link/{}/pathway".format(organism)
+            status_code, text = get_kegg(endpoint)
+            self.kegg_count += 1
+            msg = "This endpoint {} is Nothing.".format(endpoint)
+            d_path_to_gene = defaultdict(list)
+            if check_status_code(status_code, msg):
+                for row in text.split("\n"):
+                    l_ele = row.split("\t")
+                    if len(l_ele) < 2:
+                        continue
+                    d_path_to_gene[l_ele[0]].append(l_ele[1])
+
+            s_1st_gene = set()
+            s_2nd_gene = set()
+            for pathway_id in value["1st_pathway"]:
+                s_1st_gene.update(d_path_to_gene[pathway_id])
+            for pathway_id in value["2nd_pathway"]:
+                s_2nd_gene.update(d_path_to_gene[pathway_id])
+            s_duplicate = s_1st_gene & s_2nd_gene
+
+            len_1 = len(s_1st_gene)
+            len_2 = len(s_2nd_gene)
+            len_dup = len(s_duplicate)
+            dump_log("{0}'s 1st gene count : {1}".format(organism, len_1))
+            dump_log("{0}'s 2nd gene count : {1}".format(organism, len_2))
+            msg = "{0}'s duplicate gene count : {1}".format(organism, len_dup)
+            dump_log(msg)
+
+            # A gene found in both groups stays only in the configured group.
+            if self.d_config["CONFIG"]["INSERT_DUPLICATE"] == "1st":
+                s_2nd_gene = s_2nd_gene - s_duplicate
+            elif self.d_config["CONFIG"]["INSERT_DUPLICATE"] == "2nd":
+                s_1st_gene = s_1st_gene - s_duplicate
+            else:
+                msg = "Please check your config.yml-INSERT_DUPLICATE"
+                raise ValueError(msg)
+
+            self.d_organism[organism]["1st_gene"] = list(s_1st_gene)
+            self.d_organism[organism]["2nd_gene"] = list(s_2nd_gene)
+            self.d_organism[organism]["duplicate_gene"] = list(s_duplicate)
+
+        return True
+
+    def _find_domain(self):
+        """Store each gene's MOTIF entries, ten genes per KEGG request."""
+        dump_log("Start finding pathways")
+        for organism, value in self.d_organism.items():
+            dump_log("Organism : {}".format(organism))
+            self.d_organism[organism]["d_domain"] = dict()
+            all_gene = value["1st_gene"] + value["2nd_gene"]
+            group_num = 10
+            dump_log("Number of genes : {}".format(len(all_gene)))
+            count = 0
+            for i in range(0, len(all_gene), group_num):
+                count += 10
+                dump_log("{} / {}".format(count, len(all_gene)))
+                chunk = all_gene[i:i + group_num]
+                endpoint = "/get/{}".format("+".join(chunk))
+                status_code, text = get_kegg(endpoint)
+                self.kegg_count += 1
+                msg = "This endpoint {} is Nothing.".format(endpoint)
+                if check_status_code(status_code, msg):
+                    l_text = text.split("//")
+                    for j, gene in enumerate(chunk):
+                        l_domain = []
+                        b_domain = False
+                        for row in l_text[j].split("\n"):
+                            if len(row) < 5:
+                                continue
+                            if b_domain:
+                                # A non-indented row ends the MOTIF field.
+                                if row[0] != " ":
+                                    break
+                            elif row[:5] == "MOTIF":
+                                b_domain = True
+                            else:
+                                continue
+                            for ele in row.split(" "):
+                                if ele not in ["MOTIF", "", "Pfam:"]:
+                                    l_domain.append(ele)
+                        self.d_organism[organism]["d_domain"][gene] = l_domain
+
+        return True
+
+
+class ConvertCount(Convert):
+    """Convert variant that counts its KEGG API requests in self.kegg_count."""
+
+    def __init__(self, args=None):
+        # Hand args on to Convert.__init__ instead of silently discarding them.
+        super().__init__(args)
+        self.kegg_count = 0
+
+    def _dump_fasta(self):
+        """Write <organism>_{1st,2nd,all}.fasta files under self.output.
+
+        Amino acid sequences are requested from KEGG ten genes at a time.
+        """
+        dump_log("Start dumping fasta files")
+        for organism, value in self.d_domain.items():
+            dump_log("Organism : {}".format(organism))
+            gene_1st = value["gene_1st"]
+            gene_2nd = value["gene_2nd"]
+            l_fasta_1st = []
+            l_fasta_2nd = []
+            l_fasta_all = []
+            group_num = 10
+            len_gene = len(gene_1st) + len(gene_2nd)
+            dump_log("Number of genes : {}".format(len_gene))
+            count = 0
+            for i in range(0, len(gene_1st), group_num):
+                count += 10
+                dump_log("{} / {}".format(count, len_gene))
+                chunk = gene_1st[i:i + group_num]
+                endpoint = "/get/{}/aaseq".format("+".join(chunk))
+                status_code, text = get_kegg(endpoint)
+                self.kegg_count += 1
+                msg = "This endpoint {} is Nothing.".format(endpoint)
+                if check_status_code(status_code, msg):
+                    l_fasta = []
+                    l_content = []
+                    for row in text.split("\n"):
+                        if len(row) == 0:
+                            continue
+                        if row[0] != ">":
+                            l_content.append(row)
+                            continue
+                        # A header row closes the previous record's sequence.
+                        if len(l_content) != 0:
+                            l_fasta.append("".join(l_content))
+                            l_content = []
+                        l_fasta.append(row)
+                    # Flush the sequence of the last record.
+                    if len(l_content) != 0:
+                        l_fasta.append("".join(l_content))
+                    l_fasta_1st.extend(l_fasta)
+                    l_fasta_all.extend(l_fasta)
+
+            count = len(gene_1st)
+            for i in range(0, len(gene_2nd), group_num):
+                count += 10
+                dump_log("{} / {}".format(count, len_gene))
+                chunk = gene_2nd[i:i + group_num]
+                endpoint = "/get/{}/aaseq".format("+".join(chunk))
+                status_code, text = get_kegg(endpoint)
+                self.kegg_count += 1
+                msg = "This endpoint {} is Nothing.".format(endpoint)
+                if check_status_code(status_code, msg):
+                    # Same record assembly as for the 1st genes.
+                    l_fasta = []
+                    l_content = []
+                    for row in text.split("\n"):
+                        if len(row) == 0:
+                            continue
+                        if row[0] != ">":
+                            l_content.append(row)
+                            continue
+                        if len(l_content) != 0:
+                            l_fasta.append("".join(l_content))
+                            l_content = []
+                        l_fasta.append(row)
+                    if len(l_content) != 0:
+                        l_fasta.append("".join(l_content))
+                    l_fasta_2nd.extend(l_fasta)
+                    l_fasta_all.extend(l_fasta)
+
+            fasta_1st = "\n".join(l_fasta_1st)
+            fasta_2nd = "\n".join(l_fasta_2nd)
+            fasta_all = "\n".join(l_fasta_all)
+            for (fa, name) in [[fasta_1st, "1st"], [fasta_2nd, "2nd"],
+                               [fasta_all, "all"]]:
+                file_name = "{}_{}.fasta".format(organism, name)
+                file_path = self.output.joinpath(file_name)
+                with file_path.open(mode="w") as f:
+                    f.write(fa)
+
+        return True
+
+
+class AnalysisCount(Analysis):
+    """Analysis variant that counts its Pfam API requests in self.pfam_count."""
+
+    def __init__(self, args=None):
+        # Hand args on to Analysis.__init__ instead of discarding them.
+        super().__init__(args)
+        self.pfam_count = 0
+
+    def _dump_hmm(self):
+        """Save one Pfam HMM file per key of d_count_1st and d_count_2nd."""
+        dump_log("Start dumping hmm files")
+        for category in ["1st", "2nd"]:
+            dump_log("Start {} category".format(category))
+            use_dict = (self.d_count_1st if category == "1st"
+                        else self.d_count_2nd)
+            dir_name = "{}_{}".format("_".join(self.d_domain.keys()), category)
+            dir_path = self.output.joinpath(dir_name)
+            dir_path.mkdir(parents=True)
+            dump_log("Number of domains : {}".format(len(use_dict.keys())))
+            for count, domain in enumerate(use_dict.keys(), 1):
+                dump_log("{} : {}".format(count, domain))
+                endpoint = "/family/{}/hmm".format(domain)
+                status_code, text = get_pfam(endpoint)
+                self.pfam_count += 1
+                msg = "input domain {} is not found.".format(domain)
+                if check_status_code(status_code, msg):
+                    file_path = dir_path.joinpath("{}.hmm".format(domain))
+                    with file_path.open(mode="w") as f:
+                        f.write(text)
+
+        return True
+
+
+def main():
+    """Run the requested submethod, then log its API access counts.
+
+    Any exception is printed as a traceback and the process exits with 1.
+    """
+    args = get_args()
+    submethod = determine_submethod(args)
+    if submethod is False:
+        get_args(usage=True)
+
+    # The counting subclasses must be instantiated here: only they define the
+    # kegg_count / pfam_count attributes that are logged after run().
+    d_submethod = {"query": QueryCount, "convert": ConvertCount,
+                   "analysis": AnalysisCount, "search": Search,
+                   "config": Config}
+    try:
+        my_submethod = d_submethod[submethod](args)
+        my_submethod.run()
+        if hasattr(my_submethod, "kegg_count"):
+            kegg = "KEGG API access count : {}".format(my_submethod.kegg_count)
+            dump_log(kegg)
+        if hasattr(my_submethod, "pfam_count"):
+            pfam = "Pfam API access count : {}".format(my_submethod.pfam_count)
+            dump_log(pfam)
+    except Exception:
+        traceback.print_exc()
+        sys.exit(1)
+
+    return True
+
+
+if __name__ == "__main__":
+    main()
diff --git a/script/KPHMMER_stat.py b/script/KPHMMER_stat.py
new file mode 100644
index 0000000..d5c4570
--- /dev/null
+++ b/script/KPHMMER_stat.py
@@ -0,0 +1,131 @@
+#!/bin/env python
+# coding: utf-8
+from KPHMMER.util import dump_log
+import traceback
+import sys
+
+class KphmmerStat:
+    """Collect [field 1, field 4] pairs from the rows of a text table."""
+
+    def __init__(self, tsv_path, yml_path):
+        self.l_domain_gene = []
+        self.tsv_path, self.yml_path = tsv_path, yml_path
+
+    def run(self):
+        """Log the input paths, read the table and print the pairs."""
+        for msg in ["Start KPHMMER stat", "=== Your Input ===",
+                    "TSV Path : {}".format(self.tsv_path),
+                    "YML Path : {}".format(self.yml_path)]:
+            dump_log(msg)
+        try:
+            self.read_tsv_data()
+        except:
+            traceback.print_exc()
+            sys.exit(1)
+        print(self.l_domain_gene)
+
+    def read_tsv_data(self):
+        """Load the pairs; from 11 pairs on, drop the first 3 and last 7."""
+        with open(self.tsv_path, "r") as f:
+            l_row = f.read().split("\n")
+        l_field = [row.split() for row in l_row]
+        self.l_domain_gene.extend(
+            [ele[0], ele[3]] for ele in l_field if len(ele) >= 4)
+        if len(self.l_domain_gene) >= 11:
+            self.l_domain_gene = self.l_domain_gene[3:-7]
+        return True
+
+#
+#
+#
+#
+#
+#
+#
+#
+#
+#
+#
+# def get_domain_gene_name(tsv_hmmer_output):
+#
+#
+#
+# def count_gene_domain(l_domain_gene):
+#     gene_domain = defaultdict(list)
+#     for [domain, gene] in l_domain_gene:
+#         gene_domain[gene].append(domain)
+#
+#     return gene_domain
+#
+#
+# def split_l_gene(motif_yaml_path, l_gene):
+#     with open(motif_yaml_path, "r") as f:
+#         data_yml = f.read()
+#     d_yml = yaml.load(data_yml)
+#     gene_1 = set(d_yml["GENE"]["1ST"])
+#     gene_2 = set(d_yml["GENE"]["2ND"])
+#
+#     count_1 = 0
+#     count_2 = 0
+#
+#     for gene in l_gene:
+#         if gene in gene_1:
+#             count_1 += 1
+#         elif gene in gene_2:
+#             count_2 += 1
+#         else:
+#             print(gene)
+#
+#     l_count = [len(gene_1), count_1, len(gene_2), count_2]
+#
+#     return l_count
+#
+#
+# def main():
+#     print("=== HMMER output analysis start ===")
+#     print("=== Your input ===")
+#     hmm_output_path = os.path.abspath(sys.argv[1])
+#     motif_yaml_path = os.path.abspath(sys.argv[2])
+#     print("HMMER output tsv file path : {}".format(hmm_output_path))
+#     print("Domain yaml file : {}".format(motif_yaml_path))
+#     print("\n")
+#
+#     l_domain_gene = get_domain_gene_name(tsv_hmmer_output)
+#     gene_domain = count_gene_domain(l_domain_gene)
+#
+#
+#     with open(hmm_output_path, "r") as f:
+#         tsv_hmmer_output = f.read()
+#     l_domain_gene = get_domain_gene_name(tsv_hmmer_output)
+#     gene_domain = count_gene_domain(l_domain_gene)
+#
+#     print("=== Found Gene list and domain ===")
+#     for key, value in gene_domain.items():
+#         print("{} : {}".format(key, value))
+#
+#     l_gene = list(gene_domain.keys())
+#     l_count = split_l_gene(motif_yaml_path, l_gene)
+#
+#     TP = l_count[3]
+#     FP = l_count[1]
+#     FN = l_count[2] - l_count[3]
+#     TN = l_count[0] - l_count[1]
+#
+#     print("\n")
+#     print("=== 2 * 2 contingency table ===")
+#     print("|-----------|-------|-------|")
+#     print("|-----------|   2nd |   1st |")
+#     print("|-----------|-------|-------|")
+#     print("| Found     | {:>5} | {:>5} |".format(str(TP), str(FP)))
+#     print("| Not Found | {:>5} | {:>5} |".format(str(FN), str(TN)))
+#     print("|-----------|-------|-------|")
+#     print("Precision : {}".format(TP / (TP + FP)))
+#     print("Recall : {}".format(TP / (TP + FN)))
+
+
+def main():
+    """Run KphmmerStat on the TSV and YAML paths given as argv[1] and argv[2]."""
+    KphmmerStat(sys.argv[1], sys.argv[2]).run()
+
+if __name__ == "__main__":
+    main()
diff --git a/script/README.txt b/script/README.txt
new file mode 100644
index 0000000..b3f05fe
--- /dev/null
+++ b/script/README.txt
@@ -0,0 +1,8 @@
+## KPHMMER_API_access.py
+- KPHMMER でそれぞれの API を叩いた回数を表示するバージョン
+$ python3 KPHMMER_API_access.py で普通の使い方
+
+## KPHMMER_stat.py
+- KPHMMER analysis で出力した，hmm file を用いて，対象の fasta に対し HMMER をかける。
+- その出力の tsv file と 対象生物の KPHMMER query で出力された yaml file を入力として，統計的な結果を求める
+$ python3 KPHMMER_stat.py hoge.tsv piyo.yml
diff --git a/script/calc_pvalue.py b/script/calc_pvalue.py
new file mode 100644
index 0000000..c4b8651
--- /dev/null
+++ b/script/calc_pvalue.py
@@ -0,0 +1,41 @@
+# coding: utf-8
+import numpy
+import sys
+from scipy import stats
+
+
+def calc_pvalue(count_1, count_2, count_no_1, count_no_2):
+    """Return the chi-square test p-value of the 2x2 table of the counts."""
+    table = numpy.array([[count_1, count_2], [count_no_1, count_no_2]])
+    result = stats.chi2_contingency(table)
+    return result[1]
+
+
+def main():
+    """Append a p_value column to the count TSV argv[1], writing to argv[2].
+
+    Group totals are taken from header cells 2 and 3 as cell[10:-1].
+    """
+    with open(sys.argv[1], "r") as f:
+        header_row, *l_body = f.read().split("\n")
+    header = header_row.split("\t")
+    total_1st, total_2nd = (int(header[i][10:-1]) for i in (1, 2))
+
+    print("=== Start calculation P value. ===")
+    with open(sys.argv[2], "w") as f:
+        f.write("{}\tp_value\n".format("\t".join(header)))
+        for row in l_body:
+            if row == "":
+                continue
+            l_ele = row.split("\t")
+            count_1, count_2 = int(l_ele[1]), int(l_ele[2])
+            p_value = calc_pvalue(count_1, count_2, total_1st - count_1,
+                                  total_2nd - count_2)
+            f.write("{}\t{}\n".format(row, p_value))
+    print("=== Done. ===")
+
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/script/domain_text.py b/script/domain_text.py
new file mode 100644
index 0000000..6cc3e53
--- /dev/null
+++ b/script/domain_text.py
@@ -0,0 +1,20 @@
+# coding: utf-8
+import sys
+
+
+def main():
+    """Copy the value of every NAME row of argv[1] to argv[2], one per line.
+
+    The value is the row text from its seventh character onwards.
+    """
+    with open(sys.argv[1], "r") as f:
+        l_row = f.read().split("\n")
+
+    # startswith() is already false for empty rows, so no separate check.
+    l_domain = [row[6:] for row in l_row if row.startswith("NAME")]
+    with open(sys.argv[2], "w") as f:
+        f.write("\n".join(l_domain))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/script/dup.py b/script/dup.py
new file mode 100644
index 0000000..5f5f627
--- /dev/null
+++ b/script/dup.py
@@ -0,0 +1,14 @@
+# coding: utf-8
+
+
+def main():
+    """Print how many lines two domain lists share and save them in dup.txt."""
+    with open("./KPHMMER_domains.txt", "r") as f_1, \
+            open("./Pfam-A_domains.txt", "r") as f_2:
+        s_dup = set(f_1.read().split("\n")) & set(f_2.read().split("\n"))
+    print(len(s_dup))
+    with open("dup.txt", "w") as f:
+        f.write("\n".join(s_dup))
+
+if __name__ == "__main__":
+    main()
diff --git a/script/make_ensemble_tsv.py b/script/make_ensemble_tsv.py
new file mode 100644
index 0000000..c9661cb
--- /dev/null
+++ b/script/make_ensemble_tsv.py
@@ -0,0 +1,78 @@
+# coding: utf-8
+import numpy
+import sys
+from scipy import stats
+
+
+def calc_pvalue(count_1, count_2, count_no_1, count_no_2, cochran):
+    """Return the chi-square p-value of a 2x2 table, or False if rejected.
+
+    The p-value is returned only when it is below 0.05, every expected
+    frequency is greater than cochran, and the adjusted standardized
+    residual of the count_2 cell is at least 1.96.
+    """
+    table = numpy.array([[count_1, count_2], [count_no_1, count_no_2]])
+    _, p_value, _, expected = stats.chi2_contingency(table)
+
+    if not p_value < 0.05:
+        return False
+    # Reject tables with an expected frequency at or below the threshold.
+    if any(num_exp <= cochran for num_exp in expected.ravel()):
+        return False
+
+    # Variance factor per cell: (1 - column share) * (1 - row share).
+    total = table.sum()
+    residual_var = numpy.zeros(table.shape)
+    for i_row in range(table.shape[0]):
+        for i_col in range(table.shape[1]):
+            col_part = 1 - (table[:, i_col].sum() / total)
+            row_part = 1 - (table[i_row, :].sum() / total)
+            residual_var[i_row, i_col] = col_part * row_part
+    std_residual = (table - expected) / numpy.sqrt(expected * residual_var)
+    # Cell [0][1] holds count_2.
+    if std_residual[0][1] >= 1.96:
+        return p_value
+    return False
+
+
+def main():
+    """Write ./count_file/count_<n>.tsv for each cochran value n in 0..10.
+
+    Each file keeps the rows of ./sco_sma_sgr_sen_all.tsv for which
+    calc_pvalue() returns a p-value, with that value appended as p_value.
+    Progress and the number of kept rows are printed per threshold.
+    """
+    with open("./sco_sma_sgr_sen_all.tsv", "r") as f:
+        header_row, *l_body = f.read().split("\n")
+    header = header_row.split("\t")
+    # Group totals are taken from header cells 2 and 3 as cell[10:-1].
+    total_1st = int(header[1][10:-1])
+    total_2nd = int(header[2][10:-1])
+
+    print("=== Start ensemble ===")
+    for cochran in range(11):
+        file_name = "./count_file/count_{}.tsv".format(cochran)
+        print("P value : 0.05, Cochran : {}".format(cochran))
+        count = 0
+        with open(file_name, "w") as f:
+            f.write("{}\tp_value\n".format("\t".join(header)))
+            for row in l_body:
+                if row == "":
+                    continue
+                l_ele = row.split("\t")
+                count_1, count_2 = int(l_ele[1]), int(l_ele[2])
+                p_value = calc_pvalue(count_1, count_2, total_1st - count_1,
+                                      total_2nd - count_2, cochran)
+                # False marks a rejected row: not written, not counted.
+                if p_value is not False:
+                    f.write("{}\t{}\n".format(row, p_value))
+                    count += 1
+        print("2nd count : {}".format(count))
+        print("=" * 10)
+    print("=== Done. ===")
+
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/script/tsv.py b/script/tsv.py
new file mode 100644
index 0000000..b968075
--- /dev/null
+++ b/script/tsv.py
@@ -0,0 +1,41 @@
+#!/bin/env python3
+# coding: utf-8
+import requests
+import sys
+
+
+def main():
+    """Download the Pfam HMM of every domain listed in the TSV file argv[1].
+
+    The first row is skipped; the first column of each later row is used as
+    a domain name. Output goes to ./<argv[2]>/<domain>.hmm.
+    """
+    with open(sys.argv[1], "r") as f:
+        data = f.read()
+    # split() never returns an empty list, so emptiness is tested on the text.
+    if data.strip() == "":
+        print("Your input file is null.")
+        sys.exit(1)
+    l_row = data.split("\n")[1:]
+    l_domain = [row.split("\t")[0] for row in l_row if row != ""]
+
+    print("=== Number of domains ===")
+    print(len(l_domain))
+    print("=== Domain list ===")
+    print(" ".join(l_domain))
+
+    print("=== Convert start ===")
+    for count, domain in enumerate(l_domain, 1):
+        print("{} : {}".format(str(count), domain))
+        ret = requests.get("http://pfam.xfam.org/family/{}/hmm".format(domain))
+        if not ret.ok:
+            # Do not save an HTTP error page as if it were an HMM file.
+            print("{} : download failed ({})".format(domain, ret.status_code))
+            continue
+        with open("./{}/{}.hmm".format(sys.argv[2], domain), "w") as f:
+            f.write(ret.text)
+    print("=== Convert finish ===")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/script/tsv2txt.py b/script/tsv2txt.py
new file mode 100644
index 0000000..b968075
--- /dev/null
+++ b/script/tsv2txt.py
@@ -0,0 +1,41 @@
+#!/bin/env python3
+# coding: utf-8
+import requests
+import sys
+
+
+def main():
+    """Download the Pfam HMM of every domain listed in the TSV file argv[1].
+
+    The first row is skipped; the first column of each later row is used as
+    a domain name. Output goes to ./<argv[2]>/<domain>.hmm.
+    """
+    with open(sys.argv[1], "r") as f:
+        data = f.read()
+    # split() never returns an empty list, so emptiness is tested on the text.
+    if data.strip() == "":
+        print("Your input file is null.")
+        sys.exit(1)
+    l_row = data.split("\n")[1:]
+    l_domain = [row.split("\t")[0] for row in l_row if row != ""]
+
+    print("=== Number of domains ===")
+    print(len(l_domain))
+    print("=== Domain list ===")
+    print(" ".join(l_domain))
+
+    print("=== Convert start ===")
+    for count, domain in enumerate(l_domain, 1):
+        print("{} : {}".format(str(count), domain))
+        ret = requests.get("http://pfam.xfam.org/family/{}/hmm".format(domain))
+        if not ret.ok:
+            # Do not save an HTTP error page as if it were an HMM file.
+            print("{} : download failed ({})".format(domain, ret.status_code))
+            continue
+        with open("./{}/{}.hmm".format(sys.argv[2], domain), "w") as f:
+            f.write(ret.text)
+    print("=== Convert finish ===")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/script/txt2hmm_domain_changer.py b/script/txt2hmm_domain_changer.py
new file mode 100644
index 0000000..9589a15
--- /dev/null
+++ b/script/txt2hmm_domain_changer.py
@@ -0,0 +1,37 @@
+#!/bin/env python3
+# coding: utf-8
+import requests
+import sys
+
+
+def main():
+    """Concatenate the Pfam HMMs of the domains listed in argv[1] into argv[2].
+
+    argv[1] holds one domain name per line; empty lines are ignored.
+    Domains whose download fails are reported and left out.
+    """
+    with open(sys.argv[1], "r") as f:
+        l_domain = [domain for domain in f.read().split("\n") if domain != ""]
+
+    print("=== Number of domains ===")
+    print(len(l_domain))
+    print("=== Domain list ===")
+    print(" ".join(l_domain))
+
+    print("=== Convert start ===")
+    with open(sys.argv[2], "w") as f:
+        for count, domain in enumerate(l_domain, 1):
+            print("{} : {}".format(str(count), domain))
+            url = "http://pfam.xfam.org/family/{}/hmm".format(domain)
+            ret = requests.get(url)
+            if not ret.ok:
+                # An HTTP error page would corrupt the combined HMM file.
+                print("{} : download failed ({})".format(
+                    domain, ret.status_code))
+                continue
+            f.write(ret.text)
+    print("=== Convert finish ===")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/script/yaml_gene_count_getter.py b/script/yaml_gene_count_getter.py
new file mode 100644
index 0000000..54a9959
--- /dev/null
+++ b/script/yaml_gene_count_getter.py
@@ -0,0 +1,25 @@
+#!/bin/env python3
+# coding: utf-8
+import yaml
+import os
+import sys
+
+
+def main():
+    """Print the 1st, 2nd and total gene counts of the YAML file argv[1]."""
+    file_path = sys.argv[1]
+    with open(file_path, "r") as f:
+        # safe_load: yaml.load() without a Loader is rejected by PyYAML 6.
+        data = yaml.safe_load(f)
+    count_1 = len(data["GENE"]["1ST"])
+    count_2 = len(data["GENE"]["2ND"])
+    file_name = os.path.basename(os.path.abspath(file_path))
+    f_title = os.path.splitext(file_name)[0]
+    print("=== {} all gene count ===".format(f_title))
+    print("1st count : {}".format(count_1))
+    print("2nd count : {}".format(count_2))
+    print("all count : {}".format(count_1 + count_2))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..8bfd5a1
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,4 @@
+[egg_info]
+tag_build = 
+tag_date = 0
+
diff --git a/setup.py b/setup.py
index 5fc9ce7..3054fcf 100644
--- a/setup.py
+++ b/setup.py
@@ -1,52 +1,32 @@
-from KPHMMER import (
-    __author__,
-    __author_email__,
-    __version__,
-    __release__
-)
-
 from setuptools import setup
-from setuptools import find_packages
-
-import sys
-
-
-# validation
-if sys.version_info < (3, 4):
-    print("Building SAMPLE_PROJECT requires at least Python 3.4 to run.")
-    sys.exit(1)
 
 
 def main():
-    description = "KPHMMER"
-
     setup(
         name="KPHMMER",
-        version=__version__,
-        author=__author__,
-        author_email=__author_email__,
-        url="www.example.jp",
-        description=description,
-        long_description=description,
+        version="1.0.1",
+        description="KPHMMER: Hidden Markov Model generator for detecting KEGG PATHWAY-specific genes",
+        author="Hirotaka Suetake",
+        author_email="hirotaka.suetake@riken.jp",
+        license="MIT",
+        keywords=["Life Science", "Bioinfomatics", "HMMER", "KEGG"],
+        packages=["KPHMMER"],
         zip_safe=False,
         include_package_data=True,
-        packages=[
-            "KPHMMER"
-        ],
-        install_requires=[],
-        tests_require=[],
-        setup_requires=[],
-        scripts=[
-            "bin/kphmmer"
+        install_requires=[
+            "numpy",
+            "PyYAML",
+            "scipy",
+            "requests"
         ],
-        license="GNU Lesser General Public License v3 or later (LGPLv3+)",
-        keywords="",
-        platforms="Linux",
-        classifiers=["Intended Audience :: System Administrators",
-                     "License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)",
-                     "Natural Language :: Japanese",
-                     "Programming Language :: Python :: 3.4",
-                     ],
+        scripts=["bin/kphmmer"],
+        classifiers=[
+            "Development Status :: 5 - Production/Stable",
+            "Intended Audience :: Science/Research",
+            "License :: OSI Approved :: MIT License",
+            "Programming Language :: Python :: 3 :: Only",
+            "Topic :: Scientific/Engineering :: Bio-Informatics",
+        ]
     )