diff --git a/.gitignore b/.gitignore index 2b0bc09..d326694 100644 --- a/.gitignore +++ b/.gitignore @@ -6,24 +6,6 @@ __pycache__/ # C extensions *.so -# Distribution / packaging -.Python -env/ -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -*.egg-info/ -.installed.cfg -*.egg # PyInstaller # Usually these files are written by a python script from a template diff --git a/KPHMMER.egg-info/PKG-INFO b/KPHMMER.egg-info/PKG-INFO new file mode 100644 index 0000000..fc74134 --- /dev/null +++ b/KPHMMER.egg-info/PKG-INFO @@ -0,0 +1,16 @@ +Metadata-Version: 1.1 +Name: KPHMMER +Version: 1.0.2 +Summary: KPHMMER: Hidden Markov Model generator for detecting KEGG PATHWAY-specific genes +Home-page: UNKNOWN +Author: Hirotaka Suetake +Author-email: hirotaka.suetake@riken.jp +License: MIT +Description: UNKNOWN +Keywords: Life Science,Bioinfomatics,HMMER,KEGG +Platform: UNKNOWN +Classifier: Development Status :: 5 - Production/Stable +Classifier: Intended Audience :: Science/Research +Classifier: License :: OSI Approved :: MIT License +Classifier: Programming Language :: Python :: 3 :: Only +Classifier: Topic :: Scientific/Engineering :: Bio-Informatics diff --git a/KPHMMER.egg-info/SOURCES.txt b/KPHMMER.egg-info/SOURCES.txt new file mode 100644 index 0000000..0cad2b9 --- /dev/null +++ b/KPHMMER.egg-info/SOURCES.txt @@ -0,0 +1,328 @@ +LICENSE +MANIFEST.in +README.md +setup.cfg +setup.py +KPHMMER/__init__.py +KPHMMER/analysis.py +KPHMMER/argument_parser.py +KPHMMER/config.py +KPHMMER/config_file_manager.py +KPHMMER/convert.py +KPHMMER/query.py +KPHMMER/search.py +KPHMMER/util.py +KPHMMER.egg-info/PKG-INFO +KPHMMER.egg-info/SOURCES.txt +KPHMMER.egg-info/dependency_links.txt +KPHMMER.egg-info/not-zip-safe +KPHMMER.egg-info/requires.txt +KPHMMER.egg-info/top_level.txt +KPHMMER/config_files/config.yml +KPHMMER/config_files/config.yml-dist +KPHMMER/config_files/help_message.yml +bin/kphmmer +data/2ndfind/.DS_Store 
+data/2ndfind/2ndfind.hmm +data/2ndfind/2ndfind.hmm.h3f +data/2ndfind/domains/AATase.hmm +data/2ndfind/domains/AATase_new.hmm +data/2ndfind/domains/ACP_syn_III_C.hmm +data/2ndfind/domains/ACP_syn_III_C_new.hmm +data/2ndfind/domains/AMP-binding.hmm +data/2ndfind/domains/ATP-grasp_3_new.hmm +data/2ndfind/domains/Abhydrolase_2.hmm +data/2ndfind/domains/Abhydrolase_2_new.hmm +data/2ndfind/domains/Acyl_transf_1.hmm +data/2ndfind/domains/Alk_phosphatase_new.hmm +data/2ndfind/domains/Aminotran_1_2.hmm +data/2ndfind/domains/BTAD.hmm +data/2ndfind/domains/BTAD_new.hmm +data/2ndfind/domains/CP_ATPgrasp_2.hmm +data/2ndfind/domains/CP_ATPgrasp_2_new.hmm +data/2ndfind/domains/Chal_sti_synt_C.hmm +data/2ndfind/domains/Chal_sti_synt_C_new.hmm +data/2ndfind/domains/Chal_sti_synt_N.hmm +data/2ndfind/domains/CheR.hmm +data/2ndfind/domains/Condensation_new.hmm +data/2ndfind/domains/Cyclase_polyket.hmm +data/2ndfind/domains/DHQS.hmm +data/2ndfind/domains/DHQS_new.hmm +data/2ndfind/domains/DUF1295.hmm +data/2ndfind/domains/DUF1295_new.hmm +data/2ndfind/domains/DUF1702.hmm +data/2ndfind/domains/DUF1702_new.hmm +data/2ndfind/domains/DUF2172_new.hmm +data/2ndfind/domains/DUF3463.hmm +data/2ndfind/domains/Docking.hmm +data/2ndfind/domains/ERG4_ERG24.hmm +data/2ndfind/domains/ERG4_ERG24_new.hmm +data/2ndfind/domains/Eno-Rase_FAD_bd_new.hmm +data/2ndfind/domains/Enoyl_reductase.hmm +data/2ndfind/domains/Enoyl_reductase_new.hmm +data/2ndfind/domains/FabA.hmm +data/2ndfind/domains/FabA_new.hmm +data/2ndfind/domains/FcoT_new.hmm +data/2ndfind/domains/Glyco_trans_4_4_new.hmm +data/2ndfind/domains/HMG-CoA_red.hmm +data/2ndfind/domains/HMG-CoA_red_new.hmm +data/2ndfind/domains/HMG_CoA_synt_N.hmm +data/2ndfind/domains/HMG_CoA_synt_N_new.hmm +data/2ndfind/domains/Hexose_dehydrat.hmm +data/2ndfind/domains/ICMT_new.hmm +data/2ndfind/domains/IucA_IucC.hmm +data/2ndfind/domains/KR.hmm +data/2ndfind/domains/KR_new.hmm +data/2ndfind/domains/Ketoacyl-synt_C.hmm +data/2ndfind/domains/Ketoacyl-synt_C_new.hmm 
+data/2ndfind/domains/Lant_dehyd_C.hmm +data/2ndfind/domains/Lant_dehydr_N.hmm +data/2ndfind/domains/Lant_dehydr_N_new.hmm +data/2ndfind/domains/LuxC.hmm +data/2ndfind/domains/Lycopene_cycl.hmm +data/2ndfind/domains/Lycopene_cycl_new.hmm +data/2ndfind/domains/MbtH_new.hmm +data/2ndfind/domains/MelC1.hmm +data/2ndfind/domains/MelC1_new.hmm +data/2ndfind/domains/Methyltransf_13_new.hmm +data/2ndfind/domains/Methyltransf_14_new.hmm +data/2ndfind/domains/NTF2.hmm +data/2ndfind/domains/OCD_Mu_crystall.hmm +data/2ndfind/domains/PEMT.hmm +data/2ndfind/domains/PEMT_new.hmm +data/2ndfind/domains/ParBc.hmm +data/2ndfind/domains/Peptidase_M42.hmm +data/2ndfind/domains/Peptidase_M42_new.hmm +data/2ndfind/domains/Polyketide_cyc.hmm +data/2ndfind/domains/Polyketide_cyc2.hmm +data/2ndfind/domains/Polyketide_cyc2_new.hmm +data/2ndfind/domains/PqqD.hmm +data/2ndfind/domains/PqqD_new.hmm +data/2ndfind/domains/Prenyltrans.hmm +data/2ndfind/domains/Prenyltrans_new.hmm +data/2ndfind/domains/Regulator_TrmB.hmm +data/2ndfind/domains/Regulator_TrmB_new.hmm +data/2ndfind/domains/Ring_hydroxyl_A.hmm +data/2ndfind/domains/Ring_hydroxyl_A_new.hmm +data/2ndfind/domains/SAF_new.hmm +data/2ndfind/domains/SLA_LP_auto_ag.hmm +data/2ndfind/domains/SLA_LP_auto_ag_new.hmm +data/2ndfind/domains/SQS_PSY.hmm +data/2ndfind/domains/SQS_PSY_new.hmm +data/2ndfind/domains/Steroid_dh.hmm +data/2ndfind/domains/Terpene_synth_C.hmm +data/2ndfind/domains/Terpene_synth_C_new.hmm +data/2ndfind/domains/Thioesterase.hmm +data/2ndfind/domains/Thioesterase_new.hmm +data/2ndfind/domains/Transglut_core3.hmm +data/2ndfind/domains/Transglut_core3_new.hmm +data/2ndfind/domains/TylF.hmm +data/2ndfind/domains/TylF_new.hmm +data/2ndfind/domains/Tyrosinase.hmm +data/2ndfind/domains/Tyrosinase_new.hmm +data/2ndfind/domains/UnbV_ASPIC.hmm +data/2ndfind/domains/YcaO.hmm +data/2ndfind/domains/YcaO_new.hmm +data/2ndfind/domains/dTDP_sugar_isom_new.hmm +data/2ndfind/domains/ketoacyl-synt.hmm +data/2ndfind/domains/p450.hmm 
+data/2ndfind/domains/p450_new.hmm +data/2ndfind/domains/peroxidase.hmm +data/2ndfind/domains/peroxidase_new.hmm +data/2ndfind/domains/polyprenyl_synt_new.hmm +data/scb/scb.yml +data/scb/scb_1st.fasta +data/scb/scb_2nd.fasta +data/scb/scb_all.fasta +data/scb/scb_analysis_2ndfind.txt +data/scb/scb_analysis_KPHMMER.txt +data/scb/scb_domain_2ndfind.tsv +data/scb/scb_domain_KPHMMER.tsv +data/sco_sma_sgr_sen/sco.yml +data/sco_sma_sgr_sen/sco_1st.fasta +data/sco_sma_sgr_sen/sco_2nd.fasta +data/sco_sma_sgr_sen/sco_all.fasta +data/sco_sma_sgr_sen/sco_sma_sgr_sen_1st.tsv +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd.tsv +data/sco_sma_sgr_sen/sco_sma_sgr_sen_all.tsv +data/sco_sma_sgr_sen/sen.yml +data/sco_sma_sgr_sen/sen_1st.fasta +data/sco_sma_sgr_sen/sen_2nd.fasta +data/sco_sma_sgr_sen/sen_all.fasta +data/sco_sma_sgr_sen/sgr.yml +data/sco_sma_sgr_sen/sgr_1st.fasta +data/sco_sma_sgr_sen/sgr_2nd.fasta +data/sco_sma_sgr_sen/sgr_all.fasta +data/sco_sma_sgr_sen/sma.yml +data/sco_sma_sgr_sen/sma_1st.fasta +data/sco_sma_sgr_sen/sma_2nd.fasta +data/sco_sma_sgr_sen/sma_all.fasta +data/sco_sma_sgr_sen/sco_sma_sgr_sen_1st/AAA_16.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_1st/AAA_22.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_1st/AAA_33.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_1st/ATP-grasp.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_1st/Alpha-amylase.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_1st/Amidohydro_1.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_1st/Amidohydro_3.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_1st/Aminotran_3.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_1st/Biotin_lipoyl.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_1st/CPSase_L_D2.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_1st/His_biosynth.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_1st/IlvN.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_1st/K_oxygenase.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_1st/TPP_enzyme_N.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/3Beta_HSD.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/3HCDH.hmm 
+data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/A2M_comp.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/AAT.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/AA_kinase.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/ACP_syn_III.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/ACT_7.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/ATP_bind_3.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/AXE1.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Abhydrolase_3.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Acyl_transf_1.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Aldedh.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Alliinase_C.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Amino_oxidase.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Aminotran_1_2.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Anth_synt_I_N.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/BcrAD_BadFG.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Beta-lactamase.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Beta-lactamase2.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/CMAS.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Chorismate_bind.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Condensation.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/DAO.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/DEDD_Tnp_IS110.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/DHDPS.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/DUF1205.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/DUF1487.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/DUF1702.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/DUF5423.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/DXPR_C.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/DXP_redisom_C.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/DXP_reductoisom.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/DXP_synthase_N.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/DapB_C.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/DapB_N.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Dimerisation.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Dimerisation2.hmm 
+data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Diphthami_syn_2.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Docking.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/ECH_1.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/ECH_2.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/EHN.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Epimerase.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Erythro-docking.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/FAD_binding_2.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/FAD_binding_3.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Flavin_Reduct.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/GDP_Man_Dehyd.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/GFO_IDH_MocA.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/GFO_IDH_MocA_C.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/GH3.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/GTP_EFTU.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/GTP_EFTU_D2.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/GcpE.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Glucokinase.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Glyco_tran_28_C.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/HI0933_like.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/HTH_21.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/HTH_Crp_2.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Herpes_BBRF1.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Hexose_dehydrat.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Hydant_A_N.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Inos-1-P_synth.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Inositol_P.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/IspD.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/KAsynt_C_assoc.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/KR.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Ketoacyl-synt_C.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/LYTB.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Lactamase_B.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Lactamase_B_2.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/MCRA.hmm 
+data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Mago-bind.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/MbtH.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Methyltransf_12.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Methyltransf_13.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Methyltransf_14.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Methyltransf_2.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Methyltransf_23.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Methyltransf_32.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Methyltransf_9.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/NAD_binding_10.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/NAD_binding_4.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/NAD_binding_5.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/NAD_binding_8.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/NAD_binding_9.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/NTP_transf_3.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/NUDIX.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/NmrA.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/NodS.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/OmdA.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/PAPS_reduct.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/PCMT.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/PP-binding.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/PS-DH.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/PUA.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Penicil_amidase.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Peptidase_M22.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Peptidase_S9.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Polyketide_cyc.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Polyketide_cyc2.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Polysacc_synt_2.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Prenyltrans.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Prenyltransf.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Pyr_redox.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/QueC.hmm 
+data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/ROK.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Ribosomal_L50.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/RmlD_sub_bind.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/S4.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/SQHop_cyclase_C.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/SQHop_cyclase_N.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/SQS_PSY.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/SRP72.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Sacchrp_dh_NADP.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Semialdhyde_dh.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Semialdhyde_dhC.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/SpoVAD.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Terpene_synth_C.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Thi4.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Thiolase_C.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Thiolase_N.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Transket_pyr.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Transketolase_C.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Trp_halogenase.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/TylF.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/UDPGT.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/Ubie_methyltran.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/YgbB.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/adh_short.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/adh_short_C2.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/dTDP_sugar_isom.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/ketoacyl-synt.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/p450.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/polyprenyl_synt.hmm +data/sco_sma_sgr_sen/sco_sma_sgr_sen_2nd/zf-RING_7.hmm +data/sve/sve.yml +data/sve/sve_1st.fasta +data/sve/sve_2nd.fasta +data/sve/sve_all.fasta +data/sve/sve_analysis_2ndfind.txt +data/sve/sve_analysis_KPHMMER.txt +data/sve/sve_domain_2ndfind.tsv +data/sve/sve_domain_KPHMMER.tsv +script/KPHMMER_API_access.py +script/KPHMMER_stat.py 
+script/README.txt +script/calc_pvalue.py +script/domain_text.py +script/dup.py +script/make_ensemble_tsv.py +script/tsv.py +script/tsv2txt.py +script/txt2hmm_domain_changer.py +script/yaml_gene_count_getter.py \ No newline at end of file diff --git a/KPHMMER.egg-info/dependency_links.txt b/KPHMMER.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/KPHMMER.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/KPHMMER.egg-info/not-zip-safe b/KPHMMER.egg-info/not-zip-safe new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/KPHMMER.egg-info/not-zip-safe @@ -0,0 +1 @@ + diff --git a/KPHMMER.egg-info/requires.txt b/KPHMMER.egg-info/requires.txt new file mode 100644 index 0000000..c553248 --- /dev/null +++ b/KPHMMER.egg-info/requires.txt @@ -0,0 +1,4 @@ +numpy +PyYAML +scipy +requests diff --git a/KPHMMER.egg-info/top_level.txt b/KPHMMER.egg-info/top_level.txt new file mode 100644 index 0000000..ca57ef2 --- /dev/null +++ b/KPHMMER.egg-info/top_level.txt @@ -0,0 +1 @@ +KPHMMER diff --git a/build/lib/KPHMMER/__init__.py b/build/lib/KPHMMER/__init__.py new file mode 100644 index 0000000..4269b49 --- /dev/null +++ b/build/lib/KPHMMER/__init__.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python +# coding: utf-8 +""" +KPHMMER: Hidden Markov Model generator for detecting KEGG PATHWAY-specific genes +""" +from .analysis import Analysis +from .argument_parser import determine_submethod, get_args +from .config import Config +from .config_file_manager import ConfigFileManager +from .convert import Convert +from .query import Query +from .search import Search +from .util import check_status_code, dump_log, get_kegg, get_pfam diff --git a/build/lib/KPHMMER/analysis.py b/build/lib/KPHMMER/analysis.py new file mode 100644 index 0000000..a6eb42b --- /dev/null +++ b/build/lib/KPHMMER/analysis.py @@ -0,0 +1,251 @@ +#!/usr/bin/env python +# coding: utf-8 +from pathlib import Path + +import numpy +import yaml +from scipy import stats + 
class Analysis:
    """Find Pfam domains that are specific to 1st- or 2nd-category genes.

    The work is done by :meth:`run`, which validates the input domain files,
    loads them, counts how many genes of each category carry each domain,
    runs a chi-square test with residual analysis per domain, and finally
    writes TSV summaries and downloads the HMM file of every selected domain.
    """

    # Standardized residuals at or above this value mark a domain as
    # over-represented (1.96 is the two-sided 5% point of the standard
    # normal distribution).
    STDRES_THRESHOLD = 1.96

    def __init__(self, args=None):
        """
        Prepare an empty analysis state and load the configuration.

        args : parsed command line arguments providing
               ``analysis_domain_file`` and ``output``.  When it is None,
               ``l_domain_file`` and ``output`` are left unset and have to be
               assigned by the caller before :meth:`run` is used.
        """
        if args is not None:
            self.l_domain_file = args.analysis_domain_file
            self.output = Path.cwd().joinpath(args.output).resolve()
            # exist_ok makes the "create if missing" step a single call; it
            # still raises if the path exists but is not a directory.
            self.output.mkdir(parents=True, exist_ok=True)

        self.d_domain = dict()
        self.d_count = dict()
        self.g_count_1 = 0
        self.g_count_2 = 0
        self.d_count_1st = dict()
        self.d_count_2nd = dict()

        my_config_file_manager = ConfigFileManager()
        self.d_config = my_config_file_manager.read_config()

    def run(self):
        """
        Execute every analysis step in order and return True.
        """
        dump_log("=== Starting Analysis method ===")
        self._comfirm_domain_file()
        self._read_domain()
        self._count_domain()
        self._stat_domain()
        self._dump_tsv()
        self._dump_hmm()
        dump_log("=== Analysis method finished ===")

        return True

    @staticmethod
    def _resolve_domain_path(domain):
        """
        Return `domain` as a validated pathlib.Path.

        A relative path is resolved against the current working directory.
        Raises ValueError when the suffix is not ".yml" or when the file does
        not exist.
        """
        domain = Path(domain)
        # is_absolute and exists are methods: they have to be *called*.
        # Comparing the bound method itself with False is never true, which
        # silently disabled both the path resolution and the existence check.
        if not domain.is_absolute():
            domain = Path.cwd().joinpath(domain).resolve()
        if domain.suffix != ".yml":
            msg = "Your input {} is not yaml file".format(domain)
            raise ValueError(msg)
        if not domain.exists():
            msg = "Your input {} is not found".format(domain)
            raise ValueError(msg)

        return domain

    def _comfirm_domain_file(self):
        """
        Confirm existence of every entry of l_domain_file and replace
        l_domain_file with the list of validated paths.

        Raises ValueError for an entry that is not a ".yml" file or that
        does not exist.
        """
        dump_log("Start Comfirming domain file ")
        self.l_domain_file = [self._resolve_domain_path(domain)
                              for domain in self.l_domain_file]

        return True

    def _read_domain(self):
        """
        Load every domain file into d_domain, keyed by organism code.

        d_domain
            sco :
                category_1st : ["1.1", "1.2"]
                category_2nd : ["1.3", "1.4"]
                duplicate : 2nd
                gene_1st : [...]
                gene_2nd : [...]
                domain :
                    sco:SC00063 : [hoge]
                    ...
            sma :
                ...
        """
        dump_log("Start reading domains")
        for domain_file in self.l_domain_file:
            with domain_file.open(mode="r") as f:
                # safe_load: yaml.load() without an explicit Loader is
                # deprecated and rejected by recent PyYAML releases.
                # NOTE(review): assumes the domain files hold plain YAML
                # data (no python-specific tags) - confirm against the
                # code that writes them.
                data = yaml.safe_load(f)
            config = data["CONFIG"]
            gene = data["GENE"]
            self.d_domain[config["ORGANISM"]] = {
                "category_1st": config["1ST_CATEGORY"],
                "category_2nd": config["2ND_CATEGORY"],
                "duplicate": config["INSERT_DUPLICATE"],
                "gene_1st": gene["1ST"],
                "gene_2nd": gene["2ND"],
                "domain": data["DOMAIN"],
            }

        return True

    def _tally(self, l_gene_id, d_gene_domain, count_key):
        """
        Add one to d_count[domain][count_key] for every domain of every gene
        in l_gene_id and return the number of genes visited.
        """
        gene_count = 0
        for gene_id in l_gene_id:
            gene_count += 1
            for domain in d_gene_domain[gene_id]:
                entry = self.d_count.setdefault(
                    domain, {"1st_count": 0, "2nd_count": 0})
                entry[count_key] += 1

        return gene_count

    def _count_domain(self):
        """
        Count, per domain, the genes of each category that carry it, and
        accumulate the total number of genes per category in g_count_1 and
        g_count_2.

        d_domain
            sco :
                gene_1st : [...]
                gene_2nd : [...]
                domain :
                    sco:SC00063 : [ROK, Beta-lactamase2]
                    ...

        d_count
            ROK :
                1st_count : 10
                2nd_count : 5
            Beta-lactamase2 :
                ...
        """
        dump_log("Start counting domain")
        for value in self.d_domain.values():
            self.g_count_1 += self._tally(value["gene_1st"],
                                          value["domain"], "1st_count")
            self.g_count_2 += self._tally(value["gene_2nd"],
                                          value["domain"], "2nd_count")

        return True

    def _stat_domain(self):
        """
        Perform a chi-square test for every counted domain.

        The p value is stored in d_count for all domains.  A domain is
        examined further only when its p value is below the configured
        THRESHOLD_P_VALUE and no expected frequency is less than or equal to
        the configured COCHRAN_RULE.  For those domains a residual analysis
        is performed and the domain is copied into d_count_1st and/or
        d_count_2nd when the corresponding standardized residual reaches
        STDRES_THRESHOLD.
        """
        dump_log("Start calculating domain stats")
        for domain, value in self.d_count.items():
            count_1 = value["1st_count"]
            count_2 = value["2nd_count"]
            # 2x2 table: genes with the domain / genes without the domain.
            data = numpy.array([
                [count_1, count_2],
                [self.g_count_1 - count_1, self.g_count_2 - count_2],
            ])
            _, p, _, exp = stats.chi2_contingency(data)
            value["p_value"] = p
            # "not <" (instead of ">=") keeps a NaN p value excluded.
            if not p < self.d_config["CONFIG"]["THRESHOLD_P_VALUE"]:
                continue
            if (exp <= self.d_config["CONFIG"]["COCHRAN_RULE"]).any():
                continue

            # Adjusted standardized residuals:
            # (observed - expected) / sqrt(expected * (1 - row share)
            #                                       * (1 - column share))
            d_sum = data.sum()
            row_rest = 1 - data.sum(axis=1) / d_sum
            col_rest = 1 - data.sum(axis=0) / d_sum
            res_var = numpy.outer(row_rest, col_rest)
            stdres = (data - exp) / numpy.sqrt(exp * res_var)

            result = {"1st_count": count_1,
                      "2nd_count": count_2,
                      "p_value": p}
            if stdres[0][0] >= self.STDRES_THRESHOLD:
                self.d_count_1st[domain] = dict(result)
            if stdres[0][1] >= self.STDRES_THRESHOLD:
                self.d_count_2nd[domain] = dict(result)

        return True

    def _dump_tsv(self):
        """
        Write <organisms>_all.tsv, <organisms>_1st.tsv and
        <organisms>_2nd.tsv into the output directory, one domain per row.
        """
        dump_log("Start dumping tsv files")
        header = "Pfam_domain\t" +\
            "1st_count({})\t".format(self.g_count_1) +\
            "2nd_count({})\t".format(self.g_count_2) +\
            "p_value\n"
        organisms = "_".join(self.d_domain.keys())
        l_table = [("all", self.d_count),
                   ("1st", self.d_count_1st),
                   ("2nd", self.d_count_2nd)]
        for category, use_dict in l_table:
            file_name = "{0}_{1}.tsv".format(organisms, category)
            file_path = self.output.joinpath(file_name)
            with file_path.open(mode="w") as f:
                f.write(header)
                for domain, value in use_dict.items():
                    l_write = [domain,
                               value["1st_count"],
                               value["2nd_count"],
                               value["p_value"]]
                    f.write("\t".join(map(str, l_write)) + "\n")

        return True

    def _dump_hmm(self):
        """
        Fetch the HMM of every selected domain through get_pfam and store it
        as <output>/<organisms>_<category>/<domain>.hmm.
        """
        dump_log("Start dumping hmm files")
        organisms = "_".join(self.d_domain.keys())
        l_table = [("1st", self.d_count_1st), ("2nd", self.d_count_2nd)]
        for category, use_dict in l_table:
            dump_log("Start {} category".format(category))
            dir_path = self.output.joinpath(
                "{}_{}".format(organisms, category))
            # exist_ok: running the analysis twice into the same output
            # directory must not fail with FileExistsError.
            dir_path.mkdir(parents=True, exist_ok=True)
            dump_log("Number of domains : {}".format(len(use_dict)))
            for count, domain in enumerate(use_dict, start=1):
                dump_log("{} : {}".format(count, domain))
                endpoint = "/family/{}/hmm".format(domain)
                status_code, text = get_pfam(endpoint)
                msg = "input domain {} is not found.".format(domain)
                if check_status_code(status_code, msg):
                    file_path = dir_path.joinpath("{}.hmm".format(domain))
                    with file_path.open(mode="w") as f:
                        f.write(text)

        return True
+ """ + config_file_manager = ConfigFileManager() + help_message_path = config_file_manager.get_help_message_path() + with open(help_message_path, "r") as f: + d_help_message = yaml.load(f) + + desc_argparser = d_help_message["DESCRIPTION"]["ARGPARSER"] + desc_subparser = d_help_message["DESCRIPTION"]["SUBPARSER"] + desc_query = d_help_message["DESCRIPTION"]["QUERY"] + desc_search = d_help_message["DESCRIPTION"]["SEARCH"] + desc_analysis = d_help_message["DESCRIPTION"]["ANALYSIS"] + desc_convert = d_help_message["DESCRIPTION"]["CONVERT"] + desc_config = d_help_message["DESCRIPTION"]["CONFIG"] + help_sub_query = d_help_message["SUBPARSER"]["QUERY"] + help_sub_search = d_help_message["SUBPARSER"]["SEARCH"] + help_sub_analysis = d_help_message["SUBPARSER"]["ANALYSIS"] + help_sub_convert = d_help_message["SUBPARSER"]["CONVERT"] + help_sub_config = d_help_message["SUBPARSER"]["CONFIG"] + help_query_organism = d_help_message["QUERY"]["ORGANISM"] + help_query_output = d_help_message["QUERY"]["OUTPUT"] + help_query_analysis = d_help_message["QUERY"]["ANALYSIS"] + help_search_query = d_help_message["SEARCH"]["QUERY"] + help_search_all = d_help_message["SEARCH"]["ALL"] + help_analysis_domain = d_help_message["ANALYSIS"]["DOMAIN"] + help_analysis_output = d_help_message["ANALYSIS"]["OUTPUT"] + help_convert_domain = d_help_message["CONVERT"]["DOMAIN"] + help_convert_output = d_help_message["CONVERT"]["OUTPUT"] + help_config_default = d_help_message["CONFIG"]["DEFAULT"] + help_config_1st = d_help_message["CONFIG"]["1ST"] + help_config_2nd = d_help_message["CONFIG"]["2ND"] + help_config_duplicate = d_help_message["CONFIG"]["DUPLICATE"] + + parser = argparse.ArgumentParser(description=desc_argparser) + subparser = parser.add_subparsers(help=desc_subparser) + + query_parser = subparser.add_parser("query", + description=desc_query, + help=help_sub_query) + query_parser.add_argument("organism_code", + nargs="+", + type=str, + help=help_query_organism, + metavar="CODE") + 
query_parser.add_argument("-o", + "--output", + nargs="?", + default=".", + type=str, + help=help_query_output) + query_parser.add_argument("-a", + "--with-analysis", + action="store_true", + help=help_query_analysis) + + search_parser = subparser.add_parser("search", + description=desc_search, + help=help_sub_search) + search_group = search_parser.add_mutually_exclusive_group(required=True) + search_group.add_argument("search_query", + nargs="?", + const=None, + type=str, + help=help_search_query, + metavar="STRING") + search_group.add_argument("-a", + "--show-all", + action="store_true", + help=help_search_all) + + analysis_parser = subparser.add_parser("analysis", + description=desc_analysis, + help=help_sub_analysis) + analysis_parser.add_argument("analysis_domain_file", + nargs="+", + type=str, + help=help_analysis_domain, + metavar="DOMAIN") + analysis_parser.add_argument("-o", + "--output", + nargs="?", + default=".", + type=str, + help=help_analysis_output, + metavar="OUTPUT") + + convert_parser = subparser.add_parser("convert", + description=desc_convert, + help=help_sub_convert) + convert_parser.add_argument("convert_domain_file", + nargs="+", + type=str, + help=help_convert_domain, + metavar="DOMAIN") + convert_parser.add_argument("-o", + "--output", + nargs="?", + default=".", + type=str, + help=help_convert_output, + metavar="OUTPUT") + + config_parser = subparser.add_parser("config", + description=desc_config, + help=help_sub_config) + config_parser.add_argument("-s", + "--set-default", + action="store_true", + help=help_config_default) + config_parser.add_argument("-1", + "--category-1st", + nargs="+", + type=str, + help=help_config_1st) + config_parser.add_argument("-2", + "--category-2nd", + nargs="+", + type=str, + help=help_config_2nd) + config_parser.add_argument("-d", + "--duplicate", + nargs="?", + choices=["1st", "2nd"], + help=help_config_duplicate) + + if usage is True: + print(parser.format_help()) + sys.exit(1) + else: + args = 
def determine_submethod(args):
    """
    Map parsed arguments to the name of the sub command that produced them.

    Each sub parser defines an argument no other sub parser has; the first
    such marker found in `args` decides the result.  Returns False when no
    marker is present.
    """
    marker_to_submethod = (
        ("organism_code", "query"),
        ("search_query", "search"),
        ("analysis_domain_file", "analysis"),
        ("convert_domain_file", "convert"),
        ("duplicate", "config"),
    )
    for marker, name in marker_to_submethod:
        if marker in args:
            return name

    return False
class ConfigFileManager:
    """Locate, read and update the YAML files kept in ``config_files``.

    The directory is looked up next to this module, so the files that are
    read and rewritten are the ones shipped inside the package itself.
    """

    def __init__(self):
        self.file = Path(__file__)
        self.config_dir_path = self.file.parents[0].joinpath("config_files")

    def _existing_path(self, file_name, error_message):
        """
        Return the path of `file_name` inside the config directory.

        Raises OSError(error_message) when the file does not exist.
        """
        file_path = self.config_dir_path.joinpath(file_name)
        if not file_path.exists():
            raise OSError(error_message)

        return file_path

    def get_help_message_path(self):
        """
        Return the path of the help message file (OSError if missing).
        """
        return self._existing_path(HELP_MESSAGE,
                                   "Help Message file is not found.")

    def get_config_file_path(self):
        """
        Return the path of the config file (OSError if missing).
        """
        return self._existing_path(CONFIG, "Config file is not found.")

    def read_config(self):
        """
        Parse the config file and return its content as a dict.

        Keys read elsewhere in this class:
            d_config["CONFIG"]["1ST_CATEGORY"]
            d_config["CONFIG"]["2ND_CATEGORY"]
            d_config["CONFIG"]["INSERT_DUPLICATE"]
        """
        config_file_path = self.get_config_file_path()
        with open(config_file_path, "r") as f:
            # safe_load: yaml.load() without an explicit Loader is deprecated
            # and rejected by recent PyYAML releases; the config only holds
            # plain scalars, lists and mappings.
            d_config = yaml.safe_load(f)

        return d_config

    def print_config(self):
        """
        Print the category lists and the duplicate policy currently stored
        in the config file.
        """
        d_config = self.read_config()
        l_1st_category = d_config["CONFIG"]["1ST_CATEGORY"]
        l_2nd_category = d_config["CONFIG"]["2ND_CATEGORY"]
        insert_duplicate = d_config["CONFIG"]["INSERT_DUPLICATE"]
        print("1st category : {}".format(l_1st_category))
        print("2nd category : {}".format(l_2nd_category))
        print("Insert duplicate : {}".format(insert_duplicate))

        return True

    def set_value(self, l_1st_category_num=None,
                  l_2nd_category_num=None,
                  insert_duplicate=None):
        """
        Overwrite the given settings in the config file, then print the
        resulting configuration.  Arguments left as None keep their stored
        value.
        """
        config_file_path = self.get_config_file_path()
        d_config = self.read_config()

        d_update = {
            "1ST_CATEGORY": l_1st_category_num,
            "2ND_CATEGORY": l_2nd_category_num,
            "INSERT_DUPLICATE": insert_duplicate,
        }
        for key, new_value in d_update.items():
            if new_value is not None:
                d_config["CONFIG"][key] = new_value

        with config_file_path.open(mode="w") as f:
            f.write(yaml.dump(d_config))

        self.print_config()

        return True

    def set_default(self):
        """
        Restore the category lists and the duplicate policy from the
        distributed default config file (OSError if that file is missing).
        """
        config_dist_file_path = self._existing_path(
            CONFIG_DIST, "Config dist file is not found.")
        with config_dist_file_path.open() as f:
            d_config_dist = yaml.safe_load(f)

        l_1st = d_config_dist["CONFIG"]["1ST_CATEGORY"]
        l_2nd = d_config_dist["CONFIG"]["2ND_CATEGORY"]
        ins_dup = d_config_dist["CONFIG"]["INSERT_DUPLICATE"]

        self.set_value(l_1st_category_num=l_1st, l_2nd_category_num=l_2nd,
                       insert_duplicate=ins_dup)

        return True
01058, '00232', 00965, 00966, '00402', '00311', '00332', '00261', '00331', '00521', + '00524', '00525', '00231', '00401', '00404', '00405', '00333', '00254'] + '1.11': ['00362', '00627', '00364', '00625', '00361', '00623', '00622', '00633', + '00642', '00643', 00791, 00930, '00351', '00363', '00621', '00626', '00624', '00365', + 00984, 00980, '45000', '45000'] + '1.12': ['01010', '01060', '01061', '01062', '01063', '01064', '01065', '01066', + '01070'] + '1.2': [00190, 00195, 00196, '00710', '00720', 00680, 00910, 00920] + '1.3': ['00061', '00062', '00071', '00072', '00073', '00100', '00120', '00121', + '00140', '00561', '00564', '00565', '00600', 00590, 00591, 00592, '01040'] + '1.4': ['00230', '00240'] + '1.5': ['00250', '00260', '00270', 00280, 00290, '00300', '00310', '00220', '00330', + '00340', '00350', '00360', 00380, '00400'] + '1.6': ['00410', '00430', '00440', '00450', '00460', '00471', '00472', '00473', + 00480] + '1.7': ['00510', '00513', '00512', '00515', '00514', '00532', '00534', '00533', + '00531', '00563', '00601', '00603', '00604', '00540', '00550', '00511'] + '1.8': ['00730', '00740', '00750', '00760', '00770', 00780, 00785, 00790, '00670', + 00830, 00860, '00130'] + '1.9': [00900, 00902, 00909, 00904, 00906, 00905, 00981, 00908, 00903, 00281, '01052', + '00522', '01051', 01059, '01056', '01057', '00253', '00523', '01054', '01053', + '01055'] + '2.1': ['03020', '03022', '03040'] + '2.2': ['03010', 00970, '03013', '03015', 03008] + '2.3': ['03060', '04141', '04130', '04120', '04122', '03050', 03018] + '2.4': ['03030', '03410', '03420', '03430', '03440', '03450', '03460'] + '3.1': ['02010', '02060', '03070'] + '3.2': ['02020', '04014', '04015', '04010', '04013', '04016', '04011', '04012', + '04310', '04330', '04340', '04341', '04350', 04390, 04391, 04392, '04370', '04371', + '04630', '04064', 04668, '04066', 04068, '04020', '04070', '04072', '04071', '04024', + '04022', '04151', '04152', '04150', '04075'] + '3.3': [04080, '04060', '04512', '04514'] 
+ '4.1': ['04144', '04145', '04142', '04146', '04140', 04138, '04136', '04137', 04139] + '4.2': ['04110', '04111', '04112', '04113', '04114', '04210', '04214', '04215', + '04216', '04217', '04115', 04218] + '4.3': ['04510', '04520', '04530', '04540', '04550'] + '4.4': ['02024', '05111', '02025', '02026'] + '4.5': ['02030', '02040', 04810] + '5.1': ['04640', '04610', '04611', '04620', '04624', '04621', '04622', '04623', + '04650', '04612', '04660', 04658, 04659, '04657', '04662', '04664', '04666', '04670', + '04672', '04062'] + '5.10': ['04710', '04713', '04711', '04712', '04714', '04626'] + '5.2': [04911, 04910, 04922, 04923, 04920, '03320', 04912, 04913, 04915, 04914, + 04917, 04921, 04926, 04918, 04919, 04916, 04924, '04614', 04925, 04927] + '5.3': ['04260', '04261', '04270'] + '5.4': [04970, 04971, 04972, 04976, 04973, 04974, 04975, 04979, 04977, 04978] + '5.5': [04962, 04960, 04961, 04964, 04966] + '5.6': ['04724', '04727', '04725', 04728, '04726', '04720', '04730', '04723', '04721', + '04722'] + '5.7': ['04744', '04745', '04740', '04742', '04750'] + '5.8': ['04320', '04360', 04380] + '5.9': ['04211', '04212', '04213'] + '6.1': ['05200', '05230', '05231', '05202', '05206', '05205', '05204', '05203'] + '6.10': ['05146', '05144', '05145', '05140', '05142', '05143'] + '6.11': ['01501', '01502', '01503'] + '6.12': ['01521', '01524', '01523', '01522'] + '6.2': ['05210', '05212', '05225', '05226', '05214', '05216', '05221', '05220', + '05217', 05218, '05211', 05219, '05215', '05213', '05224', '05222', '05223'] + '6.3': ['05310', '05322', '05323', '05320', '05321', '05330', '05332', '05340'] + '6.4': ['05010', '05012', '05014', '05016', '05020'] + '6.5': ['05030', '05031', '05032', '05033', '05034'] + '6.6': [05418, '05410', '05412', '05414', '05416'] + '6.7': [04930, 04940, 04950, 04932, 04931, 04933] + '6.8': ['05110', '05120', '05130', '05132', '05131', '05133', '05134', '05150', + '05152', '05100'] + '6.9': ['05166', '05162', '05164', '05161', '05160', 05168, 
'05167', 05169, '05165'] + '7.1': ['07011', '07012', '07013', '07021', 07019, '07020', '07014', '07023', '07026', + '07044', '07053'] + '7.10': ['07025', '07034', '07035'] + '7.11': ['07110', '07112', '07114', '07117'] + '7.2': ['07040', '07041', '07042', '07043', '07045'] + '7.3': ['07032', '07030', '07033', '07015', 07039, 07028, 07029, '07031', '07027', + '07056', '07057'] + '7.4': ['07055', '07016', '07017', 07018, '07037', 07038, '07046', '07047', 07048, + 07049, '07050', '07051', '07052', '07054'] + '7.5': ['07220', '07215', '07214', '07213', '07212', '07227', '07211', 07228, '07224', + 07229] + '7.6': ['07225', '07226', '07223', '07222'] + '7.7': ['07221', '07230', '07036', '07231', '07232', '07235'] + '7.8': ['07233', '07234'] + '7.9': ['07216', 07219, '07024', '07217', 07218] diff --git a/build/lib/KPHMMER/config_files/config.yml-dist b/build/lib/KPHMMER/config_files/config.yml-dist new file mode 100644 index 0000000..b99fb9c --- /dev/null +++ b/build/lib/KPHMMER/config_files/config.yml-dist @@ -0,0 +1,94 @@ +CONFIG: + 1ST_CATEGORY: ['1.1', '1.2', '1.3', '1.4', '1.5', '1.6', '1.7', '1.8', '1.11'] + 2ND_CATEGORY: ['1.9', '1.10', '1.12'] + INSERT_DUPLICATE: 2nd + THRESHOLD_P_VALUE: 0.05 + COCHRAN_RULE: 0 + KEGG_URL_BASE: http://rest.kegg.jp + PFAM_URL_BASE: http://pfam.xfam.org + WAIT_TIME: 0 +PATHWAY_ID: + '1.1': ['00010', '00020', '00030', '00040', '00051', '00052', '00053', '00500', + '00520', '00620', '00630', '00640', '00650', '00660', '00562'] + '1.10': [00940, 00945, 00941, 00944, 00942, 00943, 00901, '00403', 00950, 00960, + 01058, '00232', 00965, 00966, '00402', '00311', '00332', '00261', '00331', '00521', + '00524', '00525', '00231', '00401', '00404', '00405', '00333', '00254'] + '1.11': ['00362', '00627', '00364', '00625', '00361', '00623', '00622', '00633', + '00642', '00643', 00791, 00930, '00351', '00363', '00621', '00626', '00624', '00365', + 00984, 00980, '45000', '45000'] + '1.12': ['01010', '01060', '01061', '01062', '01063', '01064', 
'01065', '01066', + '01070'] + '1.2': [00190, 00195, 00196, '00710', '00720', 00680, 00910, 00920] + '1.3': ['00061', '00062', '00071', '00072', '00073', '00100', '00120', '00121', + '00140', '00561', '00564', '00565', '00600', 00590, 00591, 00592, '01040'] + '1.4': ['00230', '00240'] + '1.5': ['00250', '00260', '00270', 00280, 00290, '00300', '00310', '00220', '00330', + '00340', '00350', '00360', 00380, '00400'] + '1.6': ['00410', '00430', '00440', '00450', '00460', '00471', '00472', '00473', + 00480] + '1.7': ['00510', '00513', '00512', '00515', '00514', '00532', '00534', '00533', + '00531', '00563', '00601', '00603', '00604', '00540', '00550', '00511'] + '1.8': ['00730', '00740', '00750', '00760', '00770', 00780, 00785, 00790, '00670', + 00830, 00860, '00130'] + '1.9': [00900, 00902, 00909, 00904, 00906, 00905, 00981, 00908, 00903, 00281, '01052', + '00522', '01051', 01059, '01056', '01057', '00253', '00523', '01054', '01053', + '01055'] + '2.1': ['03020', '03022', '03040'] + '2.2': ['03010', 00970, '03013', '03015', 03008] + '2.3': ['03060', '04141', '04130', '04120', '04122', '03050', 03018] + '2.4': ['03030', '03410', '03420', '03430', '03440', '03450', '03460'] + '3.1': ['02010', '02060', '03070'] + '3.2': ['02020', '04014', '04015', '04010', '04013', '04016', '04011', '04012', + '04310', '04330', '04340', '04341', '04350', 04390, 04391, 04392, '04370', '04371', + '04630', '04064', 04668, '04066', 04068, '04020', '04070', '04072', '04071', '04024', + '04022', '04151', '04152', '04150', '04075'] + '3.3': [04080, '04060', '04512', '04514'] + '4.1': ['04144', '04145', '04142', '04146', '04140', 04138, '04136', '04137', 04139] + '4.2': ['04110', '04111', '04112', '04113', '04114', '04210', '04214', '04215', + '04216', '04217', '04115', 04218] + '4.3': ['04510', '04520', '04530', '04540', '04550'] + '4.4': ['02024', '05111', '02025', '02026'] + '4.5': ['02030', '02040', 04810] + '5.1': ['04640', '04610', '04611', '04620', '04624', '04621', '04622', '04623', + 
'04650', '04612', '04660', 04658, 04659, '04657', '04662', '04664', '04666', '04670', + '04672', '04062'] + '5.10': ['04710', '04713', '04711', '04712', '04714', '04626'] + '5.2': [04911, 04910, 04922, 04923, 04920, '03320', 04912, 04913, 04915, 04914, + 04917, 04921, 04926, 04918, 04919, 04916, 04924, '04614', 04925, 04927] + '5.3': ['04260', '04261', '04270'] + '5.4': [04970, 04971, 04972, 04976, 04973, 04974, 04975, 04979, 04977, 04978] + '5.5': [04962, 04960, 04961, 04964, 04966] + '5.6': ['04724', '04727', '04725', 04728, '04726', '04720', '04730', '04723', '04721', + '04722'] + '5.7': ['04744', '04745', '04740', '04742', '04750'] + '5.8': ['04320', '04360', 04380] + '5.9': ['04211', '04212', '04213'] + '6.1': ['05200', '05230', '05231', '05202', '05206', '05205', '05204', '05203'] + '6.10': ['05146', '05144', '05145', '05140', '05142', '05143'] + '6.11': ['01501', '01502', '01503'] + '6.12': ['01521', '01524', '01523', '01522'] + '6.2': ['05210', '05212', '05225', '05226', '05214', '05216', '05221', '05220', + '05217', 05218, '05211', 05219, '05215', '05213', '05224', '05222', '05223'] + '6.3': ['05310', '05322', '05323', '05320', '05321', '05330', '05332', '05340'] + '6.4': ['05010', '05012', '05014', '05016', '05020'] + '6.5': ['05030', '05031', '05032', '05033', '05034'] + '6.6': [05418, '05410', '05412', '05414', '05416'] + '6.7': [04930, 04940, 04950, 04932, 04931, 04933] + '6.8': ['05110', '05120', '05130', '05132', '05131', '05133', '05134', '05150', + '05152', '05100'] + '6.9': ['05166', '05162', '05164', '05161', '05160', 05168, '05167', 05169, '05165'] + '7.1': ['07011', '07012', '07013', '07021', 07019, '07020', '07014', '07023', '07026', + '07044', '07053'] + '7.10': ['07025', '07034', '07035'] + '7.11': ['07110', '07112', '07114', '07117'] + '7.2': ['07040', '07041', '07042', '07043', '07045'] + '7.3': ['07032', '07030', '07033', '07015', 07039, 07028, 07029, '07031', '07027', + '07056', '07057'] + '7.4': ['07055', '07016', '07017', 07018, 
'07037', 07038, '07046', '07047', 07048, + 07049, '07050', '07051', '07052', '07054'] + '7.5': ['07220', '07215', '07214', '07213', '07212', '07227', '07211', 07228, '07224', + 07229] + '7.6': ['07225', '07226', '07223', '07222'] + '7.7': ['07221', '07230', '07036', '07231', '07232', '07235'] + '7.8': ['07233', '07234'] + '7.9': ['07216', 07219, '07024', '07217', 07218] diff --git a/build/lib/KPHMMER/config_files/help_message.yml b/build/lib/KPHMMER/config_files/help_message.yml new file mode 100644 index 0000000..e1588c4 --- /dev/null +++ b/build/lib/KPHMMER/config_files/help_message.yml @@ -0,0 +1,32 @@ +DESCRIPTION : + ARGPARSER: KPHMMER; Hidden Markov Model generator for detecting KEGG pathway-specific genes written by suecharo. + SUBPARSER: KPHMMER has each submethod. Refer to the help of each submethod. + QUERY: Receive the KEGG organism code and output the Pfam domain file in the form of yaml file. If you do not know KEGG's organism code (e.g. hsa), use search method. + SEARCH: Receive query keywords (e.g. human) and search KEGG's organism code (e.g. hsa). + ANALYSIS: Receive the Pfam domain file in the form of yaml file as input and output hmm file created by frequently detected Pfam domains in the set category. + CONVERT: Receive the Pfam domain file in the form of yaml file and output the gene included therein in the form of FASTA file. + CONFIG: Check the currently set category. Or change the setting. +SUBPARSER: + QUERY: Receive the KEGG's organism code (e.g. hsa) and output the Pfam domain file in the form of yaml file. + SEARCH: Receive query keywords (e.g. human) and search KEGG's organism code (e.g. hsa). + ANALYSIS: Receive the Pfam domain file in the form of yaml file as input and output hmm file created by frequently detected Pfam domains in the set category. + CONVERT: Receive the Pfam domain file in the form of yaml file and output the gene included therein in the form of FASTA file. + CONFIG: Check the currently set category. 
Or change the setting. +QUERY: + ORGANISM: Specify KEGG's organism code (e.g. hsa). Multiple inputs are accepted. + OUTPUT: Specify where output file is generated. (default = ./) + ANALYSIS: If you want to do analysis together, specify this option. +SEARCH: + QUERY: Perform the keyword search on the input character string. + ALL: Display all organism codes present on KEGG. Be careful as there are many outputs. +ANALYSIS: + DOMAIN: Specify Pfam domain file path in the form of yaml file. Multiple inputs are accepted. + OUTPUT: Specify where output file is generated. (default = ./) +CONVERT: + DOMAIN: Specify Pfam domain file path in the form of yaml file. Multiple inputs are accepted. + OUTPUT: Specify where output file is generated. (default = ./) +CONFIG: + DEFAULT: Restore config to the default values. + 1ST: Set 1st category. Multiple inputs are accepted. + 2ND: Set 2nd category. Multiple inputs are accepted. + DUPLICATE: Set which categories to place duplicates. diff --git a/build/lib/KPHMMER/convert.py b/build/lib/KPHMMER/convert.py new file mode 100644 index 0000000..89890a8 --- /dev/null +++ b/build/lib/KPHMMER/convert.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python +# coding: utf-8 +from pathlib import Path + +import yaml + +from .util import check_status_code, dump_log, get_kegg + + +class Convert: + def __init__(self, args): + self.l_domain_file = args.convert_domain_file + self.output = Path.cwd().joinpath(args.output).resolve() + if self.output.is_dir() is False: + self.output.mkdir(parents=True) + + self.d_domain = dict() + + def run(self): + dump_log("=== Starting Convert method ===") + self._comfirm_domain_file() + self._read_domain() + self._dump_fasta() + dump_log("=== Convert method finished ===") + + return True + + def _comfirm_domain_file(self): + """ + Configirm existence of l_domain_file and obtain absolute path. 
+ """ + dump_log("Start Comfirming domain file ") + l_domain = [] + for domain in self.l_domain_file: + domain = Path(domain) + if domain.is_absolute is False: + domain = Path.cwd().joinpath(domain).resolve() + if domain.suffix != ".yml": + msg = "Your input {} is not yaml file".format(domain) + raise ValueError(msg) + if domain.exists is False: + msg = "Your input {} is not found".format(domain) + raise ValueError(msg) + l_domain.append(domain) + + self.l_domain_file = l_domain + + return True + + def _read_domain(self): + """ + d_domain + sco : + category_1st : ["1.1", "1.2"] + category_2nd : ["1.3", "1.4"] + duplicate : 2nd + gene_1st : [...] + gene_2nd : [...] + domain : + sco:SC00063 : [hoge] + ... + sma : + ... + """ + dump_log("Start reading domains") + for domain in self.l_domain_file: + with domain.open(mode="r") as f: + data = yaml.load(f) + organism = data["CONFIG"]["ORGANISM"] + category_1st = data["CONFIG"]["1ST_CATEGORY"] + category_2nd = data["CONFIG"]["2ND_CATEGORY"] + duplicate = data["CONFIG"]["INSERT_DUPLICATE"] + gene_1st = data["GENE"]["1ST"] + gene_2nd = data["GENE"]["2ND"] + domain = data["DOMAIN"] + self.d_domain[organism] = dict() + self.d_domain[organism]["category_1st"] = category_1st + self.d_domain[organism]["category_2nd"] = category_2nd + self.d_domain[organism]["duplicate"] = duplicate + self.d_domain[organism]["gene_1st"] = gene_1st + self.d_domain[organism]["gene_2nd"] = gene_2nd + self.d_domain[organism]["domain"] = domain + + return True + + def _dump_fasta(self): + dump_log("Start dumping fasta files") + for organism, value in self.d_domain.items(): + dump_log("Organism : {}".format(organism)) + gene_1st = value["gene_1st"] + gene_2nd = value["gene_2nd"] + l_fasta_1st = [] + l_fasta_2nd = [] + l_fasta_all = [] + group_num = 10 + len_gene = len(gene_1st) + len(gene_2nd) + dump_log("Number of genes : {}".format(len_gene)) + count = 0 + for i in range(0, len(gene_1st), group_num): + count += 10 + dump_log("{} / {}".format(count, 
len_gene)) + chunk = gene_1st[i:i + group_num] + endpoint = "/get/{}/aaseq".format("+".join(chunk)) + status_code, text = get_kegg(endpoint) + msg = "This endpoint {} is Nothing.".format(endpoint) + if check_status_code(status_code, msg): + l_fasta = [] + l_content = [] + for row in text.split("\n"): + if len(row) == 0: + continue + if row[0] == ">": + if len(l_content) != 0: + amino = "".join(l_content) + l_fasta.append(amino) + l_content = [] + l_fasta.append(row) + else: + l_fasta.append(row) + else: + l_content.append(row) + else: + if len(l_content) != 0: + amino = "".join(l_content) + l_fasta.append(amino) + l_fasta_1st.extend(l_fasta) + l_fasta_all.extend(l_fasta) + + count = len(gene_1st) + for i in range(0, len(gene_2nd), group_num): + count += 10 + dump_log("{} / {}".format(count, len_gene)) + chunk = gene_2nd[i:i + group_num] + endpoint = "/get/{}/aaseq".format("+".join(chunk)) + status_code, text = get_kegg(endpoint) + msg = "This endpoint {} is Nothing.".format(endpoint) + if check_status_code(status_code, msg): + l_fasta = [] + l_content = [] + for row in text.split("\n"): + if len(row) == 0: + continue + if row[0] == ">": + if len(l_content) != 0: + amino = "".join(l_content) + l_fasta.append(amino) + l_content = [] + l_fasta.append(row) + else: + l_fasta.append(row) + else: + l_content.append(row) + else: + if len(l_content) != 0: + amino = "".join(l_content) + l_fasta.append(amino) + l_fasta_2nd.extend(l_fasta) + l_fasta_all.extend(l_fasta) + + fasta_1st = "\n".join(l_fasta_1st) + fasta_2nd = "\n".join(l_fasta_2nd) + fasta_all = "\n".join(l_fasta_all) + for (fa, name) in [[fasta_1st, "1st"], [fasta_2nd, "2nd"], + [fasta_all, "all"]]: + file_name = "{}_{}.fasta".format(organism, name) + file_path = self.output.joinpath(file_name) + with file_path.open(mode="w") as f: + f.write(fa) + + return True diff --git a/build/lib/KPHMMER/query.py b/build/lib/KPHMMER/query.py new file mode 100644 index 0000000..bf4e490 --- /dev/null +++ 
b/build/lib/KPHMMER/query.py @@ -0,0 +1,296 @@ +#!/usr/bin/env python +# coding: utf-8 +from collections import defaultdict +from datetime import datetime +from pathlib import Path + +import yaml + +from .analysis import Analysis +from .config_file_manager import ConfigFileManager +from .util import check_status_code, dump_log, get_kegg + + +class Query: + def __init__(self, args): + self.args = args + + self.l_organism_code = args.organism_code + self.output = Path.cwd().joinpath(args.output).resolve() + if self.output.is_dir() is False: + self.output.mkdir(parents=True) + self.with_analysis = args.with_analysis + + self.d_organism = dict() + + my_config_file_manager = ConfigFileManager() + self.d_config = my_config_file_manager.read_config() + self.l_1st_pathway_id = [] + self.l_2nd_pathway_id = [] + for category_num in self.d_config["CONFIG"]["1ST_CATEGORY"]: + l_ids = self.d_config["PATHWAY_ID"][category_num] + self.l_1st_pathway_id.extend(l_ids) + for category_num in self.d_config["CONFIG"]["2ND_CATEGORY"]: + l_ids = self.d_config["PATHWAY_ID"][category_num] + self.l_2nd_pathway_id.extend(l_ids) + + self.l_domain_file = [] + + def run(self): + dump_log("=== Starting Query method ===") + dump_log("Your query : {}".format(self.l_organism_code)) + self._search_pathway() + self._format_pathway() + self._find_gene() + self._find_domain() + self._dump_domain_file() + dump_log("=== Query method finished ===") + if self.with_analysis is True: + my_analysis = Analysis() + my_analysis.l_domain_file = self.l_domain_file + my_analysis.output = self.output + my_analysis.run() + + return True + + def _search_pathway(self): + """ + Search for the pathway that each creature has. + Here the pathway is a str type number like "00010". + self.d_organism : + hsa : + pathway : create here + sco : + ... 
+ """ + dump_log("Start searching pathway") + for organism in self.l_organism_code: + endpoint = "/list/pathway/{}".format(organism) + status_code, text = get_kegg(endpoint) + msg = "Query organism code {} is wrong.".format(organism) + if check_status_code(status_code, msg): + self.d_organism[organism] = dict() + self.d_organism[organism]["pathway"] = [] + for row in text.split("\n"): + ele = row.split("\t")[0] + pathway_id = ele[-5:] + if pathway_id != "": + self.d_organism[organism]["pathway"].append(pathway_id) + + return True + + def _format_pathway(self): + """ + Information obtained from config.yml is divided into l_1st and l_2nd. + The pathway not specified in config.yml is excluded. + self.d_organism : + hsa : + pathway : + 1st_pathway : create here + 2nd_pathway : create here + sco : + ... + """ + dump_log("Start formatting pathways") + for organism, value in self.d_organism.items(): + l_1st = list(set(self.l_1st_pathway_id) & set(value["pathway"])) + l_2nd = list(set(self.l_2nd_pathway_id) & set(value["pathway"])) + l_1st = ["path:{0}{1}".format(organism, p_id) for p_id in l_1st] + l_2nd = ["path:{0}{1}".format(organism, p_id) for p_id in l_2nd] + self.d_organism[organism]["1st_pathway"] = l_1st + self.d_organism[organism]["2nd_pathway"] = l_2nd + len_1 = len(l_1st) + len_2 = len(l_2nd) + dump_log("{0}'s 1st pathway count : {1}".format(organism, len_1)) + dump_log("{0}'s 2nd pathway count : {1}".format(organism, len_2)) + + return True + + def _find_gene(self): + """ + Search for genes contained in each pathway. + The duplicated gene enters the one specified by config.yml. + self.d_organism : + hsa : + pathway : + 1st_pathway : + 2nd_pathway : + 1st_gene : create here + 2nd_gene : create here + duplicate_gene : create here + sco : + ... 
+ """ + dump_log("Start finding genes") + for organism, value in self.d_organism.items(): + endpoint = "/link/{}/pathway".format(organism) + status_code, text = get_kegg(endpoint) + msg = "This endpoint {} is Nothing.".format(endpoint) + d_path_to_gene = defaultdict(list) + if check_status_code(status_code, msg): + for row in text.split("\n"): + l_ele = row.split("\t") + if len(l_ele) < 2: + continue + d_path_to_gene[l_ele[0]].append(l_ele[1]) + + s_1st_gene = set() + s_2nd_gene = set() + for name in ["1st", "2nd"]: + for pathway_id in value["{}_pathway".format(name)]: + l_gene_id = d_path_to_gene[pathway_id] + for gene_id in l_gene_id: + if name == "1st": + s_1st_gene.add(gene_id) + elif name == "2nd": + s_2nd_gene.add(gene_id) + s_duplicate = s_1st_gene & s_2nd_gene + + len_1 = len(s_1st_gene) + len_2 = len(s_2nd_gene) + len_dup = len(s_duplicate) + dump_log("{0}'s 1st gene count : {1}".format(organism, len_1)) + dump_log("{0}'s 2nd gene count : {1}".format(organism, len_2)) + msg = "{0}'s duplicate gene count : {1}".format(organism, len_dup) + dump_log(msg) + + if self.d_config["CONFIG"]["INSERT_DUPLICATE"] == "1st": + s_2nd_gene = s_2nd_gene - s_duplicate + elif self.d_config["CONFIG"]["INSERT_DUPLICATE"] == "2nd": + s_1st_gene = s_1st_gene - s_duplicate + else: + msg = "Please check your config.yml-INSERT_DUPLICATE" + raise ValueError(msg) + + self.d_organism[organism]["1st_gene"] = list(s_1st_gene) + self.d_organism[organism]["2nd_gene"] = list(s_2nd_gene) + self.d_organism[organism]["duplicate_gene"] = list(s_duplicate) + + return True + + def _find_domain(self): + """ + Find pfam domain. + self.d_organism : + hsa : + pathway : + 1st_pathway : + 2nd_pathway : + 1st_gene : + 2nd_gene : + duplicate_gene : + d_domain : create now + gene_id : domain list + sco : + ... 
+ """ + dump_log("Start finding pathways") + for organism, value in self.d_organism.items(): + dump_log("Organism : {}".format(organism)) + self.d_organism[organism]["d_domain"] = dict() + all_gene = value["1st_gene"] + value["2nd_gene"] + group_num = 10 + dump_log("Number of genes : {}".format(len(all_gene))) + count = 0 + for i in range(0, len(all_gene), group_num): + count += 10 + dump_log("{} / {}".format(count, len(all_gene))) + chunk = all_gene[i:i + group_num] + endpoint = "/get/{}".format("+".join(chunk)) + status_code, text = get_kegg(endpoint) + msg = "This endpoint {} is Nothing.".format(endpoint) + if check_status_code(status_code, msg): + l_text = text.split("//") + for j in range(len(chunk)): + gene = chunk[j] + ele_text = l_text[j] + l_domain = [] + b_domain = False + for row in ele_text.split("\n"): + if len(row) < 5: + continue + if b_domain is True: + if row[0] != " ": + break + else: + l_row = row.split(" ") + else: + if row[:5] == "MOTIF": + b_domain = True + l_row = row.split(" ") + else: + continue + for ele in l_row: + if ele in ["MOTIF", "", "Pfam:"]: + continue + else: + l_domain.append(ele) + self.d_organism[organism]["d_domain"][gene] = l_domain + + return True + + def _dump_domain_file(self): + """ + Output gene and pfam as ${output_dir}/${organism}.yml + CONFIG : + ORGANISM : sco + 1ST_LIST : + - 1.1 + - 1.2 + - ... + 2ND_LIST : + - 1.9 + - ... + INSERT_DUPLICATE : "2nd" + CREATE_DATE : 2018/02/01 00:00:00 + PATHWAY : + 1ST : + - sco19291 + - ... + 2ND : + - ... + GENE : + 1ST : + - SCO1931 + - SCO2382 + - ... + 2ND : + - SCO7290 + - ... + DOMAIN : + SCO1931 : ['Rieske', 'Phage_holin_3_6', 'Bac_export_2'] + ... 
+ """ + dump_log("Start dumping domain files") + for organism, value in self.d_organism.items(): + d_write = dict() + d_write["CONFIG"] = dict() + d_write["CONFIG"]["ORGANISM"] = organism + d_write["CONFIG"]["1ST_CATEGORY"] = \ + self.d_config["CONFIG"]["1ST_CATEGORY"] + d_write["CONFIG"]["2ND_CATEGORY"] = \ + self.d_config["CONFIG"]["2ND_CATEGORY"] + d_write["CONFIG"]["INSERT_DUPLICATE"] = \ + self.d_config["CONFIG"]["INSERT_DUPLICATE"] + now = datetime.now().strftime("%Y/%m/%d %H:%M:%S") + d_write["CONFIG"]["CREATE_DATE"] = now + d_write["PATHWAY"] = dict() + p_1st = [organism + p_id for p_id in value["1st_pathway"]] + p_2nd = [organism + p_id for p_id in value["2nd_pathway"]] + d_write["PATHWAY"]["1ST"] = p_1st + d_write["PATHWAY"]["2ND"] = p_2nd + d_write["GENE"] = dict() + d_write["GENE"]["1ST"] = value["1st_gene"] + d_write["GENE"]["2ND"] = value["2nd_gene"] + d_write["DOMAIN"] = value["d_domain"] + + file_name = "{}.yml".format(organism) + file_path = self.output.joinpath(file_name) + with file_path.open(mode="w") as f: + credit = "# Created by KPHMMER " +\ + "\n" + f.write(credit) + f.write(yaml.dump(d_write)) + self.l_domain_file.append(file_path) + + return True diff --git a/build/lib/KPHMMER/search.py b/build/lib/KPHMMER/search.py new file mode 100644 index 0000000..cc67cac --- /dev/null +++ b/build/lib/KPHMMER/search.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python +# coding: utf-8 +from .util import check_status_code, dump_log, get_kegg + + +class Search: + def __init__(self, args): + self.search_query = args.search_query + self.b_show_all = args.show_all + + def run(self): + dump_log("=== Starting Search method ===") + if self.b_show_all is True: + self._show_all() + else: + self._ordinaly_search() + dump_log("=== Search method finished ===") + + return True + + def _show_all(self): + dump_log("Start showing all items") + endpoint = "/list/organism" + status_code, text = get_kegg(endpoint) + msg = "method is show all." 
+ if check_status_code(status_code, msg): + print(text) + + return True + + def _ordinaly_search(self): + dump_log("Start ordinaly search method") + endpoint = "/find/genome/{}".format(self.search_query) + status_code, text = get_kegg(endpoint) + msg = "Your query [{}] is wrong.".format(self.search_query) + if check_status_code(status_code, msg): + print(text) + + return True diff --git a/build/lib/KPHMMER/util.py b/build/lib/KPHMMER/util.py new file mode 100644 index 0000000..7686299 --- /dev/null +++ b/build/lib/KPHMMER/util.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python +# coding: utf-8 +import time +from datetime import datetime + +import requests + +from .config_file_manager import ConfigFileManager + + +def dump_log(msg): + now = datetime.now().strftime("%Y/%m/%d %H:%M:%S") + print("[{}] {}".format(now, msg), flush=True) + + return True + + +def get_kegg(endpoint): + my_config_file_manager = ConfigFileManager() + d_config = my_config_file_manager.read_config() + time.sleep(d_config["CONFIG"]["WAIT_TIME"]) + url = d_config["CONFIG"]["KEGG_URL_BASE"] + endpoint + ret = requests.get(url) + status_code = ret.status_code + text = ret.text + + return status_code, text + + +def get_pfam(endpoint): + my_config_file_manager = ConfigFileManager() + d_config = my_config_file_manager.read_config() + time.sleep(d_config["CONFIG"]["WAIT_TIME"]) + url = d_config["CONFIG"]["PFAM_URL_BASE"] + endpoint + ret = requests.get(url) + status_code = ret.status_code + text = ret.text + + return status_code, text + + +def check_status_code(status_code, msg): + if status_code == 200: + return True + elif status_code == 400: + raise ConnectionError(msg) + elif status_code == 404: + raise ValueError(msg) + else: + raise ConnectionError("status code is {}".format(status_code)) + + return True diff --git a/build/scripts-3.6/kphmmer b/build/scripts-3.6/kphmmer new file mode 100755 index 0000000..4d909d3 --- /dev/null +++ b/build/scripts-3.6/kphmmer @@ -0,0 +1,36 @@ +#!python +# coding: utf-8 
+import sys +import traceback + +from KPHMMER import (Analysis, Config, Convert, Query, Search, + determine_submethod, get_args) + + +def main(): + args = get_args() + submethod = determine_submethod(args) + if submethod is False: + get_args(usage=True) + + try: + if submethod == "query": + my_submethod = Query(args) + elif submethod == "search": + my_submethod = Search(args) + elif submethod == "analysis": + my_submethod = Analysis(args) + elif submethod == "convert": + my_submethod = Convert(args) + elif submethod == "config": + my_submethod = Config(args) + my_submethod.run() + except: + traceback.print_exc() + sys.exit(1) + + return True + + +if __name__ == "__main__": + main() diff --git a/dist/KPHMMER-1.0.2-py3-none-any.whl b/dist/KPHMMER-1.0.2-py3-none-any.whl new file mode 100644 index 0000000..dea33e8 Binary files /dev/null and b/dist/KPHMMER-1.0.2-py3-none-any.whl differ diff --git a/dist/KPHMMER-1.0.2.tar.gz b/dist/KPHMMER-1.0.2.tar.gz new file mode 100644 index 0000000..f4499ca Binary files /dev/null and b/dist/KPHMMER-1.0.2.tar.gz differ