# decompose_groups.py

import pandas as pd
import pdb
import json
from rdkit import Chem

def count_substructures(radius,molecule):
    """Count the atom-centred fragments that make up a molecule.

    For every atom, the bonds returned by
    ``Chem.FindAtomEnvironmentOfRadiusN`` are collected, the fragment they
    span is written out as a canonical SMILES string, and identical strings
    are tallied. These tallies are the per-molecule vector used for the
    relaxed molecular signature.

    Parameters
    ----------
    radius : int
        bond distance handed to ``Chem.FindAtomEnvironmentOfRadiusN``; it
        controls how many neighbouring atoms belong to each environment.
    molecule : Molecule
        an RDKit molecule object (e.g. from Chem.MolFromInchi(inchi_code)
        or Chem.MolFromSmiles(smiles_code))

    Returns
    -------
    dict
        {fragment_smiles: count}, where count is the number of atoms whose
        environment produced that fragment SMILES
    """
    fragment_counts = {}

    for center in range(molecule.GetNumAtoms()):
        bond_ids = Chem.FindAtomEnvironmentOfRadiusN(molecule, radius, center)

        # Every atom touched by a bond of the environment is in the fragment.
        members = set()
        for bond_id in bond_ids:
            bond = molecule.GetBondWithIdx(bond_id)
            members.update((bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()))

        # No bonds came back (e.g. the lone O of H2O): the fragment is just
        # the centre atom itself.
        if not members:
            members = {center}

        fragment = Chem.MolFragmentToSmiles(
            molecule,
            atomsToUse=list(members),
            bondsToUse=bond_ids,
            canonical=True,
        )
        fragment_counts[fragment] = fragment_counts.get(fragment, 0) + 1

    return fragment_counts

def decompse_ac(db_smiles,radius=1,remove_stereo=False,out_path=None):
    """Decompose every compound into atom-environment fragment counts.

    Each SMILES string is parsed with RDKit, passed through ``Chem.RemoveHs``
    and handed to ``count_substructures``. The per-compound results are
    written to a JSON file as ``{compound_id: {fragment_smiles: count}}``.

    Parameters
    ----------
    db_smiles : dict
        maps compound id -> SMILES string
    radius : int, optional
        bond distance forwarded to ``count_substructures`` (default 1)
    remove_stereo : bool, optional
        when True, ``Chem.RemoveStereochemistry`` is applied to each molecule
        before counting. Defaults to False, which is the previous behaviour.
    out_path : str, optional
        where to write the JSON. When omitted,
        './data/decompose_vector_ac.json' is used, or
        './data/decompose_vector_ac_nostereo.json' if ``remove_stereo`` is
        set.

    Returns
    -------
    list
        ids of the compounds that could not be decomposed (previously this
        list was built and then discarded; callers that ignore the return
        value are unaffected)
    """
    if out_path is None:
        if remove_stereo:
            out_path = './data/decompose_vector_ac_nostereo.json'
        else:
            out_path = './data/decompose_vector_ac.json'

    non_decomposable = []
    decompose_vector = dict()

    for cid, smiles_pH7 in db_smiles.items():
        # Best effort: one bad compound must not abort the whole run, so any
        # failure is recorded against its id and the loop carries on.
        try:
            mol = Chem.MolFromSmiles(smiles_pH7)
            if mol is None:
                # Parsing failed; nothing to decompose for this compound.
                non_decomposable.append(cid)
                continue
            mol = Chem.RemoveHs(mol)
            if remove_stereo:
                Chem.RemoveStereochemistry(mol)
            decompose_vector[cid] = count_substructures(radius,mol)
        except Exception:
            non_decomposable.append(cid)

    with open(out_path,'w') as fp:
        json.dump(decompose_vector,fp)

    return non_decomposable

# Compound ids the reaction-rule code treats as hydrogen: reactions that
# contain them are still accepted even when they have no signature column.
_HYDROGEN_METS = ("C00080", "C00282")

# Stoichiometry of every reaction: {reaction_id: {compound_id: coefficient}}.
_REACTION_DICT_PATH = './data/optstoic_v3_Sji_dict.json'


def _load_json(path):
    """Read one JSON document from *path*, closing the file afterwards."""
    with open(path) as fp:
        return json.load(fp)


def _write_rxn_rule(signature_path, output_path, tecrdb_mets_path=None):
    """Build the reaction-rule table and save it as CSV.

    A reaction rule is the stoichiometry-weighted sum of the fragment-count
    vectors of the compounds taking part in the reaction.

    Parameters
    ----------
    signature_path : str
        JSON file holding {compound_id: {fragment_smiles: count}}
    output_path : str
        CSV file to write; rows are fragments, columns are reaction ids
    tecrdb_mets_path : str, optional
        headerless text file whose first column lists compound ids. When
        given, those compounds are accepted even without a signature and
        never contribute to a rule. When omitted, the hydrogen ids play that
        role instead.

    Returns
    -------
    None
    """
    reaction_dict = _load_json(_REACTION_DICT_PATH)
    molsigna_df = pd.DataFrame.from_dict(_load_json(signature_path)).fillna(0)
    with_signature = set(molsigna_df.columns)

    if tecrdb_mets_path is None:
        no_contribution = set(_HYDROGEN_METS)
    else:
        mets_TECRDB_df = pd.read_csv(tecrdb_mets_path, header=None)
        no_contribution = set(mets_TECRDB_df[0].tolist())

    # Sets give constant-time membership tests inside the reaction loop.
    accepted = with_signature | set(_HYDROGEN_METS) | no_contribution

    # Columns are collected first and the frame is built once at the end,
    # instead of inserting one column per reaction into a growing frame.
    rule_columns = {}
    for rid, value in reaction_dict.items():
        # skip the reactions with missing metabolites
        if any(met not in accepted for met in value):
            continue

        rule = pd.Series(0, index=molsigna_df.index)
        for met, stoic in value.items():
            # Accepted compounds without a signature column (hydrogen ids)
            # count as zero; indexing molsigna_df with them would raise
            # KeyError.
            if met in no_contribution or met not in with_signature:
                continue
            rule = rule + molsigna_df[met] * stoic
        rule_columns[rid] = rule

    rule_df = pd.DataFrame(rule_columns, index=molsigna_df.index)
    rule_df.to_csv(output_path, index=True)


def get_rxn_rule():
    """Calculate reaction rules based on the relaxed molecular signatures.

    Reads './data/optstoic_v3_Sji_dict.json' and
    './data/decompose_vector_ac.json'. Reactions containing a compound
    without a signature are skipped; the hydrogen ids C00080 and C00282 are
    accepted and contribute zero.

    Returns
    -------
    None
        All of the reaction rules are saved to './data/reaction_rule.csv'

    """
    _write_rxn_rule('./data/decompose_vector_ac.json',
                    "./data/reaction_rule.csv")


def get_rxn_rule_no_stero():
    """Calculate reaction rules from the stereo-free molecular signatures.

    Same as ``get_rxn_rule`` but the signatures are read from
    './data/decompose_vector_ac_nostereo.json'.

    Returns
    -------
    None
        All of the reaction rules are saved to
        './data/reaction_rule_no_stero.csv'

    """
    _write_rxn_rule('./data/decompose_vector_ac_nostereo.json',
                    "./data/reaction_rule_no_stero.csv")


def get_rxn_rule_remove_TECRDB_mets():
    """Calculate reaction rules while ignoring the TECRDB compounds.

    Reads './data/optstoic_v3_Sji_dict.json',
    './data/decompose_vector_ac.json' and './data/TECRBD_mets.txt'.
    Compounds listed in the TECRDB file are accepted even without a
    signature and never contribute to a rule. The hydrogen ids C00080 and
    C00282 are accepted as well; they contribute only when they have a
    signature and are not in the TECRDB list.

    Returns
    -------
    None
        All of the reaction rules are saved to
        './data/reaction_rule_remove_TECRDB_mets.csv'

    """
    _write_rxn_rule('./data/decompose_vector_ac.json',
                    "./data/reaction_rule_remove_TECRDB_mets.csv",
                    tecrdb_mets_path='./data/TECRBD_mets.txt')


def get_rxn_rule_no_stero_remove_TECRDB_mets():
    """Calculate stereo-free reaction rules while ignoring TECRDB compounds.

    Same as ``get_rxn_rule_remove_TECRDB_mets`` but the signatures are read
    from './data/decompose_vector_ac_nostereo.json'.

    Returns
    -------
    None
        All of the reaction rules are saved to
        './data/reaction_rule_nostereo_remove_TECRDB_mets.csv'

    """
    _write_rxn_rule('./data/decompose_vector_ac_nostereo.json',
                    "./data/reaction_rule_nostereo_remove_TECRDB_mets.csv",
                    tecrdb_mets_path='./data/TECRBD_mets.txt')


if __name__ == '__main__':
    # Script entry point. The pipeline stages are toggled by hand: uncomment
    # the stage(s) to run.

    # Stage 1 (disabled): load the compound table, map compound id -> SMILES
    # at pH 7, and decompose every compound into fragment counts.
    # db = pd.read_csv('./data/cache_compounds_20160818.csv',index_col='compound_id')
    # db_smiles = db['smiles_pH7'].to_dict()
    # decompse_ac(db_smiles)

    # Stage 2 (disabled alternatives): other reaction-rule variants.
    # get_rxn_rule()

    # get_rxn_rule_remove_TECRDB_mets()

    # Currently enabled: reaction rules built from the no-stereo signatures.
    # Needs ./data/optstoic_v3_Sji_dict.json,
    # ./data/decompose_vector_ac_nostereo.json and ./data/TECRBD_mets.txt.
    get_rxn_rule_no_stero_remove_TECRDB_mets()