diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 87de0099..6b54b4a0 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -3,6 +3,8 @@ name: Run Pre-Commit on: pull_request: {} push: + paths-ignore: + - 'examples/**' branches: - dev - main diff --git a/.github/workflows/tox.yml b/.github/workflows/tox.yml new file mode 100644 index 00000000..9ce47285 --- /dev/null +++ b/.github/workflows/tox.yml @@ -0,0 +1,28 @@ +name: Run Tox + +on: + pull_request: {} + push: + branches: [main] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + python-version: ['3.8', '3.9', '3.10'] + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip setuptools wheel build + python -m pip install tox tox-gh-actions + - name: Test with tox + run: | + tox + python -m build . diff --git a/.gitignore b/.gitignore index 6390162b..87619079 100644 --- a/.gitignore +++ b/.gitignore @@ -131,3 +131,8 @@ dmypy.json # Pyre type checker .pyre/ + +.pydevproject +.settings/* +*data/* +*.lp diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 04cde634..325706ab 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,7 +21,9 @@ repos: args: - --pytest-test-first - id: check-json + exclude: examples/ - id: pretty-format-json + exclude: examples/ args: - --autofix - --top-keys=_id diff --git a/.travis.yml b/.travis.yml index e72cfaff..20911611 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,8 +1,8 @@ language: python python: - - 3.6 - 3.7 - 3.8 + - 3.9 before_install: - python --version - pip install -U pip diff --git a/examples/Flux Analysis/FullThermodynamicsExample.ipynb b/examples/Flux Analysis/FullThermodynamicsExample.ipynb index b5ffac67..776c4933 100644 --- a/examples/Flux Analysis/FullThermodynamicsExample.ipynb +++ b/examples/Flux Analysis/FullThermodynamicsExample.ipynb @@ -1373,7 +1373,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.9" + "version": "3.9.12" } }, "nbformat": 4, diff --git a/examples/Model Reconstruction/ATPGapfilling.ipynb b/examples/Model Reconstruction/ATPGapfilling.ipynb index f0116989..d236d609 100644 --- a/examples/Model Reconstruction/ATPGapfilling.ipynb +++ b/examples/Model Reconstruction/ATPGapfilling.ipynb @@ -526,7 +526,13 @@ "cell_type": "code", "execution_count": 60, "id": "6ade9096-f3f4-40f8-a1ea-53b5b63ec2c0", - "metadata": {}, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, "outputs": [ { "name": "stderr", @@ -1174,123 +1180,417 @@ }, { "cell_type": "code", - "execution_count": 67, - "id": "7aba6de8-9252-4980-95b0-bd1a72db2e05", + "execution_count": 1, + "id": "e24d8e82-357a-4658-9362-6073f502b6bc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "modelseedpy 0.2.2\n" + ] + } + ], + "source": [ + "import modelseedpy" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1080bc7b-58c2-4105-91a2-2defaa8a1c92", "metadata": {}, "outputs": [], "source": [ - "atp_correction.apply_growth_media_gapfilling()" + "%run /home/fliu/workspace/python3/ModelSEEDpy/tests/core/test_msatpcorreption.py" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": 
"3ee9a1dd-9b8c-4204-b846-609cecebffc7", + "metadata": {}, + "outputs": [], + "source": [ + "def get_model(ko):\n", + " def _method(ko=ko, added_compounds=None, added_reactions=None):\n", + " if ko is None:\n", + " ko = []\n", + " with open(\n", + " '/home/fliu/workspace/python3/ModelSEEDpy/tests/test_data/e_coli_core.json',\n", + " \"r\",\n", + " ) as fh:\n", + " model_json = json.load(fh)\n", + " model_json[\"compartments\"] = {\n", + " k + \"0\": v for (k, v) in model_json[\"compartments\"].items()\n", + " }\n", + " metabolites = {}\n", + " for m in model_json[\"metabolites\"]:\n", + " m[\"id\"] += \"0\"\n", + " m[\"compartment\"] += \"0\"\n", + " metabolites[m[\"id\"]] = m\n", + " for r in model_json[\"reactions\"]:\n", + " r[\"metabolites\"] = {i + \"0\": v for (i, v) in r[\"metabolites\"].items()}\n", + " compartments = set(\n", + " [metabolites[k][\"compartment\"] for k in r[\"metabolites\"].keys()]\n", + " )\n", + " if r[\"id\"].endswith(\"_e\"):\n", + " r[\"id\"] += \"0\"\n", + " elif len(compartments) == 1:\n", + " r[\"id\"] += \"_\" + list(compartments)[0]\n", + " else:\n", + " r[\"id\"] += (\n", + " \"_\" + \"c0\"\n", + " ) # hack cause there is only combo between e0 and c0\n", + "\n", + " model_json[\"reactions\"] = [\n", + " x for x in model_json[\"reactions\"] if x[\"id\"] not in ko\n", + " ]\n", + "\n", + " if added_compounds:\n", + " for o in added_compounds:\n", + " model_json[\"metabolites\"].append(o)\n", + " if added_reactions:\n", + " for o in added_reactions:\n", + " model_json[\"reactions\"].append(o)\n", + " model = cobra.io.from_json(json.dumps(model_json))\n", + " model.reactions.ATPM_c0.lower_bound = 0\n", + " model.reactions.ATPM_c0.upper_bound = 1000\n", + " return model\n", + "\n", + " return _method(ko)" ] }, { "cell_type": "code", - "execution_count": 18, - "id": "e8107ba2-f470-4e05-8b80-731fc00febe7", + "execution_count": 45, + "id": "928bb140-9110-4a1a-b750-dbd9d6a2acc6", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "logger = logging.getLogger(__name__)" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": "95db6e6f-bedc-4c0d-9e73-c6eec5365c16", + "metadata": {}, + "outputs": [], + "source": [ + "model = get_model([\"NADH16_c0\", \"CYTBD_c0\", \"O2t_c0\", \"GLCpts_c0\"])\n", + "with open('/home/fliu/workspace/python3/ModelSEEDpy/tests/test_data/template_core_bigg.json', 'r') as fh:\n", + " template = MSTemplateBuilder.from_dict(json.load(fh)).build()\n", + "media_glucose_aerobic = MSMedia.from_dict(\n", + " {\n", + " \"glc__D\": (-1, 1000),\n", + " \"o2\": (-1000, 1000),\n", + " \"h\": (-1000, 1000),\n", + " \"h2o\": (-1000, 1000),\n", + " }\n", + " )\n", + "media_glucose_aerobic.id = 'glc/o2'\n", + "media_acetate_aerobic = MSMedia.from_dict(\n", + " {\n", + " \"ac\": (-1, 1000),\n", + " \"o2\": (-1000, 1000),\n", + " \"h\": (-1000, 1000),\n", + " \"h2o\": (-1000, 1000),\n", + " }\n", + " )\n", + "media_acetate_aerobic.id = 'ac/o2'\n", + "medias = [media_glucose_aerobic, media_acetate_aerobic]" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "8fdc8faf-fcc8-45cd-b775-e6bc143a42cc", + "metadata": {}, + "outputs": [], + "source": [ + "%run /home/fliu/workspace/python3/ModelSEEDpy/modelseedpy/core/msatpcorrection.py\n", + "atp_correction = MSATPCorrection(\n", + " model,\n", + " template,\n", + " medias,\n", + " atp_hydrolysis_id=\"ATPM_c0\",\n", + " load_default_medias=False,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "fc07b43d-88f5-477c-9149-28756a5cd926", 
"metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0" + "[[, 0.01],\n", + " [, 0.01]]" ] }, - "execution_count": 18, + "execution_count": 98, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "atp_correction.max_gapfilling" + "atp_correction.atp_medias" ] }, { "cell_type": "code", - "execution_count": 19, - "id": "1af1e574-76b2-40f7-82f8-4ffd1bb2c442", + "execution_count": 99, + "id": "369ef2d4-f696-4762-9370-d91276e3b95f", "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Name: e_coli_core
Memory address: 7ff258653370
Number of metabolites: 72
Number of reactions: 91
Number of genes: 137
Number of groups: 0
Objective expression: 1.0*BIOMASS_Ecoli_core_w_GAM_c0 - 1.0*BIOMASS_Ecoli_core_w_GAM_c0_reverse_70c47
Compartments: extracellular space, cytosol
" + ], "text/plain": [ - "0" + "" ] }, - "execution_count": 19, + "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "atp_correction.gapfilling_delta" + "model" ] }, { "cell_type": "code", - "execution_count": 43, - "id": "0a344084-edad-456f-9e88-064a404039d4", + "execution_count": 100, + "id": "62862b90-d73b-4597-8e3f-c8bf55e9090e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[]" + "{'glc/o2': 0.0, 'ac/o2': 0.0}" ] }, - "execution_count": 43, + "execution_count": 100, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "atp_correction.gapfilling_tests" + "atp_correction.evaluate_growth_media()" ] }, { "cell_type": "code", - "execution_count": 44, - "id": "9e78779d-b7e7-4e73-a77c-9813bee3c6a9", + "execution_count": 101, + "id": "e67db875-e06f-464c-b96c-8e4ce7eb6324", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[]" + "{: {'reversed': {},\n", + " 'new': {'GLCpts_c0': '>'},\n", + " 'media': ,\n", + " 'target': 'ATPM_c0',\n", + " 'minobjective': 0.01,\n", + " 'binary_check': False},\n", + " : {'reversed': {},\n", + " 'new': {'CYTBD_c0': '>', 'NADH16_c0': '>', 'O2t_c0': '>'},\n", + " 'media': ,\n", + " 'target': 'ATPM_c0',\n", + " 'minobjective': 0.01,\n", + " 'binary_check': False}}" ] }, - "execution_count": 44, + "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "atp_correction.gapfilling_tests" + "atp_correction.media_gapfill_stats" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "id": "47da598f-b3cd-423d-93eb-0e68f11eaef9", + "metadata": {}, + "outputs": [], + "source": [ + "atp_correction.determine_growth_media()" ] }, { "cell_type": "code", - "execution_count": 68, - "id": "669e1ddb-493b-461e-bef9-d19cb1f5e542", + "execution_count": 105, + "id": "42673388-2500-4922-83b9-3e4dfa7acb17", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[]" + "'glc/o2'" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "atp_correction.selected_media[0].id" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "id": "c0e29cc8-85d5-450e-a3d6-c1207d297963", + "metadata": {}, + "outputs": [], + "source": [ + "atp_correction.apply_growth_media_gapfilling()" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "id": "43f29d4f-30b3-452f-a5f9-49489b97d646", + "metadata": {}, + "outputs": [], + "source": [ + "media_eval = atp_correction.evaluate_growth_media()" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "id": "f8044fd4-70f1-4082-9316-e601ac06ac7e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'glc/o2': 2.75, 'ac/o2': 0.0}" ] }, - "execution_count": 68, + "execution_count": 108, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "atp_correction.gapfilling_tests" + "media_eval" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "id": "db1e8df2-4a86-408b-a479-5eebf13e9971", + "metadata": {}, + "outputs": [], + "source": [ + "atp_correction.expand_model_to_genome_scale()" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "id": "d76dcb54-1ea2-4e53-8853-521790cd8300", + "metadata": {}, + "outputs": [], + "source": [ + "tests = atp_correction.build_tests()" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "id": "f30e70fa-5258-42fd-b624-aafdce509b80", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "empty {'media': , 'is_max_threshold': 
True, 'threshold': 1e-05, 'objective': 'ATPM_c0'}\n", + "glc/o2 {'media': , 'is_max_threshold': True, 'threshold': 3.3, 'objective': 'ATPM_c0'}\n" + ] + } + ], + "source": [ + "for t in tests:\n", + " print(t['media'].id, t)" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "id": "c35d3047-da1f-4331-a907-765c2b43048d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'media': ,\n", + " 'is_max_threshold': True,\n", + " 'threshold': 1e-05,\n", + " 'objective': 'ATPM_c0'},\n", + " {'media': ,\n", + " 'is_max_threshold': True,\n", + " 'threshold': 3.3,\n", + " 'objective': 'ATPM_c0'}]" + ] + }, + "execution_count": 114, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tests" ] }, { "cell_type": "code", "execution_count": null, - "id": "e24d8e82-357a-4658-9362-6073f502b6bc", + "id": "7b718e1d-059d-410b-bf1a-05a734f09e0d", "metadata": {}, "outputs": [], "source": [] @@ -1298,7 +1598,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, diff --git a/examples/Model Reconstruction/Biomass.ipynb b/examples/Model Reconstruction/Biomass.ipynb index e4a2c901..3726f959 100644 --- a/examples/Model Reconstruction/Biomass.ipynb +++ b/examples/Model Reconstruction/Biomass.ipynb @@ -2,18 +2,17 @@ "cells": [ { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "id": "5434992c-fc67-40f5-ae08-82f44790666c", "metadata": {}, "outputs": [], "source": [ - "from modelseedpy.helpers import get_template\n", - "from modelseedpy.core.mstemplate import MSTemplateBuilder" + "import modelseedpy" ] }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 2, "id": "b243e00a-4a8b-489d-a778-61844a439e63", "metadata": {}, "outputs": [ @@ -21,7 +20,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "cobrakbase 0.2.8\n" + "cobrakbase 0.3.1\n" ] } ], @@ -30,6 +29,157 @@ "kbase = cobrakbase.KBaseAPI()" ] }, + { + "cell_type": "code", + "execution_count": 3, + "id": "3a177c16-ecb0-4050-bbf5-47aad10f2af9", + "metadata": {}, + "outputs": [], + "source": [ + "template = kbase.get_from_ws('GramNegModelTemplateV3', 'NewKBaseModelTemplates')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "4ce52552-dce2-4c44-9884-cf00d15e76ab", + "metadata": {}, + "outputs": [], + "source": [ + "from modelseedpy import MSBuilder" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6f216f6a-5e25-4697-bf6b-9ae63475b5c7", + "metadata": {}, + "outputs": [], + "source": [ + "from cobra.core import Model\n", + "model = Model('test')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d9763d58-daba-4751-811f-23581b390025", + "metadata": {}, + "outputs": [], + "source": [ + "biomass = template.biomasses[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d3e884ac-2568-445a-ac04-1508b536c88a", + "metadata": {}, + "outputs": [], + "source": [ + "reaction = biomass.build_biomass(model, '0', True)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f5140ac5-273f-4eb5-b806-ddd9178b252e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cpd00010_c0 {'modelseed_template_id': 'cpd00010_c'}\n", + "cpd11493_c0 {'modelseed_template_id': 'cpd11493_c'}\n", + "cpd12370_c0 {'modelseed_template_id': 'cpd12370_c'}\n", + "cpd00003_c0 {'modelseed_template_id': 'cpd00003_c'}\n", + "cpd00006_c0 {'modelseed_template_id': 
'cpd00006_c'}\n", + "cpd00205_c0 {'modelseed_template_id': 'cpd00205_c'}\n", + "cpd00254_c0 {'modelseed_template_id': 'cpd00254_c'}\n", + "cpd10516_c0 {'modelseed_template_id': 'cpd10516_c'}\n", + "cpd00063_c0 {'modelseed_template_id': 'cpd00063_c'}\n", + "cpd00009_c0 {'modelseed_template_id': 'cpd00009_c'}\n", + "cpd00099_c0 {'modelseed_template_id': 'cpd00099_c'}\n", + "cpd00149_c0 {'modelseed_template_id': 'cpd00149_c'}\n", + "cpd00058_c0 {'modelseed_template_id': 'cpd00058_c'}\n", + "cpd00015_c0 {'modelseed_template_id': 'cpd00015_c'}\n", + "cpd10515_c0 {'modelseed_template_id': 'cpd10515_c'}\n", + "cpd00030_c0 {'modelseed_template_id': 'cpd00030_c'}\n", + "cpd00048_c0 {'modelseed_template_id': 'cpd00048_c'}\n", + "cpd00034_c0 {'modelseed_template_id': 'cpd00034_c'}\n", + "cpd00016_c0 {'modelseed_template_id': 'cpd00016_c'}\n", + "cpd00220_c0 {'modelseed_template_id': 'cpd00220_c'}\n", + "cpd00017_c0 {'modelseed_template_id': 'cpd00017_c'}\n", + "cpd00201_c0 {'modelseed_template_id': 'cpd00201_c'}\n", + "cpd00087_c0 {'modelseed_template_id': 'cpd00087_c'}\n", + "cpd00345_c0 {'modelseed_template_id': 'cpd00345_c'}\n", + "cpd00042_c0 {'modelseed_template_id': 'cpd00042_c'}\n", + "cpd00028_c0 {'modelseed_template_id': 'cpd00028_c'}\n", + "cpd00557_c0 {'modelseed_template_id': 'cpd00557_c'}\n", + "cpd00264_c0 {'modelseed_template_id': 'cpd00264_c'}\n", + "cpd00118_c0 {'modelseed_template_id': 'cpd00118_c'}\n", + "cpd00056_c0 {'modelseed_template_id': 'cpd00056_c'}\n", + "cpd15560_c0 {'modelseed_template_id': 'cpd15560_c'}\n", + "cpd15352_c0 {'modelseed_template_id': 'cpd15352_c'}\n", + "cpd15500_c0 {'modelseed_template_id': 'cpd15500_c'}\n", + "cpd00166_c0 {'modelseed_template_id': 'cpd00166_c'}\n", + "cpd01997_c0 {'modelseed_template_id': 'cpd01997_c'}\n", + "cpd03422_c0 {'modelseed_template_id': 'cpd03422_c'}\n", + "cpd00104_c0 {'modelseed_template_id': 'cpd00104_c'}\n", + "cpd00037_c0 {'modelseed_template_id': 'cpd00037_c'}\n", + "cpd00050_c0 {'modelseed_template_id': 'cpd00050_c'}\n", + "cpd15793_c0 {'modelseed_template_id': 'cpd15793_c'}\n", + "cpd15540_c0 {'modelseed_template_id': 'cpd15540_c'}\n", + "cpd15533_c0 {'modelseed_template_id': 'cpd15533_c'}\n", + "cpd15432_c0 {'modelseed_template_id': 'cpd15432_c'}\n", + "cpd02229_c0 {'modelseed_template_id': 'cpd02229_c'}\n", + "cpd15665_c0 {'modelseed_template_id': 'cpd15665_c'}\n", + "cpd15666_c0 {'modelseed_template_id': 'cpd15666_c'}\n", + "cpd00023_c0 {'modelseed_template_id': 'cpd00023_c'}\n", + "cpd00001_c0 {'modelseed_template_id': 'cpd00001_c'}\n", + "cpd00033_c0 {'modelseed_template_id': 'cpd00033_c'}\n", + "cpd00035_c0 {'modelseed_template_id': 'cpd00035_c'}\n", + "cpd00039_c0 {'modelseed_template_id': 'cpd00039_c'}\n", + "cpd00041_c0 {'modelseed_template_id': 'cpd00041_c'}\n", + "cpd00051_c0 {'modelseed_template_id': 'cpd00051_c'}\n", + "cpd00053_c0 {'modelseed_template_id': 'cpd00053_c'}\n", + "cpd00054_c0 {'modelseed_template_id': 'cpd00054_c'}\n", + "cpd00060_c0 {'modelseed_template_id': 'cpd00060_c'}\n", + "cpd00065_c0 {'modelseed_template_id': 'cpd00065_c'}\n", + "cpd00066_c0 {'modelseed_template_id': 'cpd00066_c'}\n", + "cpd00069_c0 {'modelseed_template_id': 'cpd00069_c'}\n", + "cpd00084_c0 {'modelseed_template_id': 'cpd00084_c'}\n", + "cpd00107_c0 {'modelseed_template_id': 'cpd00107_c'}\n", + "cpd00119_c0 {'modelseed_template_id': 'cpd00119_c'}\n", + "cpd00129_c0 {'modelseed_template_id': 'cpd00129_c'}\n", + "cpd00132_c0 {'modelseed_template_id': 'cpd00132_c'}\n", + "cpd00156_c0 {'modelseed_template_id': 
'cpd00156_c'}\n", + "cpd00161_c0 {'modelseed_template_id': 'cpd00161_c'}\n", + "cpd00322_c0 {'modelseed_template_id': 'cpd00322_c'}\n", + "cpd00115_c0 {'modelseed_template_id': 'cpd00115_c'}\n", + "cpd00012_c0 {'modelseed_template_id': 'cpd00012_c'}\n", + "cpd00241_c0 {'modelseed_template_id': 'cpd00241_c'}\n", + "cpd00356_c0 {'modelseed_template_id': 'cpd00356_c'}\n", + "cpd00357_c0 {'modelseed_template_id': 'cpd00357_c'}\n", + "cpd00002_c0 {'modelseed_template_id': 'cpd00002_c'}\n", + "cpd00038_c0 {'modelseed_template_id': 'cpd00038_c'}\n", + "cpd00052_c0 {'modelseed_template_id': 'cpd00052_c'}\n", + "cpd00062_c0 {'modelseed_template_id': 'cpd00062_c'}\n", + "cpd00008_c0 {'modelseed_template_id': 'cpd00008_c'}\n", + "cpd00067_c0 {'modelseed_template_id': 'cpd00067_c'}\n", + "cpd11416_c0 {'modelseed_template_id': 'cpd11416_c'}\n", + "cpd17041_c0 {'modelseed_template_id': 'cpd17041_c'}\n", + "cpd17042_c0 {'modelseed_template_id': 'cpd17042_c'}\n", + "cpd17043_c0 {'modelseed_template_id': 'cpd17043_c'}\n" + ] + } + ], + "source": [ + "for m in reaction.metabolites:\n", + " print(m, m.notes)" + ] + }, { "cell_type": "code", "execution_count": 42, @@ -551,7 +701,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, diff --git a/examples/Model Reconstruction/Gapfilling.ipynb b/examples/Model Reconstruction/Gapfilling.ipynb index eea0c536..88eadaa6 100644 --- a/examples/Model Reconstruction/Gapfilling.ipynb +++ b/examples/Model Reconstruction/Gapfilling.ipynb @@ -2,17 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "cobrakbase 0.2.8\n" - ] - } - ], + "outputs": [], "source": [ "import cobra\n", "#If you have CPLEX, uncomment this\n", @@ -20,31 +12,37 @@ "import cobrakbase\n", "#import modelseedpy.fbapkg\n", "from modelseedpy import GapfillingPkg, KBaseMediaPkg\n", - "from modelseedpy import FBAHelper, MSBuilder" + "from modelseedpy import FBAHelper, MSBuilder\n", + "kbase_api = cobrakbase.KBaseAPI()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "model = kbase_api.get_from_ws(\"test_model\",18528)" + ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - }, "tags": [] }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:modelseedpy.core.msmodelutl:cpd00244 not found in model!\n" + ] + }, { "data": { "text/html": [ - "

Objective (before): 1.0 bio1 = 0.8048653841131165
Objective (after):  1.0 bio1 = 0.7997546667881398

Uptake (flux before → after):

Metabolite   Reaction        Flux                 C-Number  C-Flux
cpd00009_e0  EX_cpd00009_e0  0.9998 → 0.9934      0         0.00%
cpd00013_e0  EX_cpd00013_e0  6.037 → 6.094        0         0.00%
cpd00030_e0  EX_cpd00030_e0  0.00639 → 0.00635    0         0.00%
cpd00034_e0  EX_cpd00034_e0  0.00639 → 0.00635    0         0.00%
cpd00048_e0  EX_cpd00048_e0  0.1755 → 0.1744      0         0.00%
cpd00058_e0  EX_cpd00058_e0  0.00639 → 0.00635    0         0.00%
cpd00063_e0  EX_cpd00063_e0  0.00639 → 0.00635    0         0.00%
cpd00067_e0  EX_cpd00067_e0  61.85 → 61.43        0         0.00%
cpd00099_e0  EX_cpd00099_e0  0.00639 → 0.00635    0         0.00%
cpd00149_e0  EX_cpd00149_e0  0.00639 → 0.00635    0         0.00%
cpd00205_e0  EX_cpd00205_e0  0.00639 → 0.00635    0         0.00%
cpd00254_e0  EX_cpd00254_e0  0.00639 → 0.00635    0         0.00%
cpd10516_e0  EX_cpd10516_e0  0.02556 → 0.0254     0         0.00%
cpd17041_c0  rxn13782_c0     0.8049 → 0.7998      0         0.00%
cpd17042_c0  rxn13783_c0     0.8049 → 0.7998      0         0.00%
cpd17043_c0  rxn13784_c0     0.8049 → 0.7998      0         0.00%

Secretion (flux before → after):

Metabolite   Reaction        Flux                 C-Number  C-Flux
cpd00001_e0  EX_cpd00001_e0  -82.26 → -81.95      0         0.00%
cpd00007_e0  EX_cpd00007_e0  -2.928 → -2.869      0         0.00%
cpd15378_e0  EX_cpd15378_e0  -0.00639 → -0.00635  7         100.00% → 18.92%
cpd03091_c0  SK_cpd03091_c0  -0.01905 (added)     10        81.08%
cpd11416_c0  SK_cpd11416_c0  -0.8049 → -0.7998    0         0.00%
" ], "text/plain": [ - "" + "" ] }, - "execution_count": 2, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "kbase_api = cobrakbase.KBaseAPI()\n", - "model = kbase_api.get_from_ws(\"test_model\",18528)\n", - "#If you have CPLEX, uncomment this\n", - "#model.solver = 'optlang-cplex'\n", "template = kbase_api.get_from_ws(\"GramNegModelTemplateV3\",\"NewKBaseModelTemplates\")\n", "media = kbase_api.get_from_ws(\"Carbon-D-Glucose\",\"KBaseMedia\")\n", "model = MSBuilder.gapfill_model(model,\"bio1\",template,media)\n", @@ -17910,7 +17911,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, diff --git a/examples/Model Reconstruction/Genomes.ipynb b/examples/Model Reconstruction/Genomes.ipynb index 60270468..8ea82ef4 100644 --- a/examples/Model Reconstruction/Genomes.ipynb +++ b/examples/Model Reconstruction/Genomes.ipynb @@ -1,223 +1,300 @@ { "cells": [ { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "cell_type": "markdown", + "metadata": { + "tags": [] + }, "source": [ - "import modelseedpy\n", - "from modelseedpy.core.msgenome import MSGenome\n", - "from modelseedpy.core.rast_client import RastClient" + "### Genomes\n", + "\n", + "ModelSEEDpy provides its own genome object type `modelseedpy.core.msgenome.MSGenome` to manipulate genomes" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "genome = MS" + "import modelseedpy\n", + "from modelseedpy.core.msgenome import MSGenome" ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "cell_type": "markdown", + "metadata": { + "jp-MarkdownHeadingCollapsed": true, + "tags": [] + }, "source": [ - "1" + "#### Reading faa file\n", + "\n", + "To load a genome we can read a `.faa` file that contains protein sequences" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "genome = MSGenome.from_fasta('GCF_000005845.2_ASM584v2_protein.faa', split=' ')" + ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "rast = RastClient()" + "genome" ] }, { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], + "cell_type": "markdown", + "metadata": { + "jp-MarkdownHeadingCollapsed": true, + "tags": [] + }, "source": [ - "genome = MSGenome.from_fasta('GCF_000005845.2.faa', split=' ')" + "#### Manipulating genes\n", + "\n", + "Each gene is stored as a `modelseedpy.core.msgenome.MSFeature` in the `.features` of type `cobra.core.dictlist.DictList` similiar to the cobrapy `.reactions` and `.metabolites` in the `cobra.core.Model`" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 4, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of features: 3\n" - ] + "data": { + "text/plain": [ + "4285" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "print('Number of features:', len(genome.features))" + "len(genome.features)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], 
+ "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "for f in genome.features:\n", - " print(f.id, len(f.seq), f.description)" + "gene = genome.features.get_by_id('NP_414542.1')\n", + "gene" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 14, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[{'execution_time': 1622756127.36331,\n", - " 'tool_name': 'kmer_search',\n", - " 'hostname': 'pear',\n", - " 'parameters': ['-a',\n", - " '-g',\n", - " 200,\n", - " '-m',\n", - " 5,\n", - " '-d',\n", - " '/opt/patric-common/data/kmer_metadata_v2',\n", - " '-u',\n", - " 'http://pear.mcs.anl.gov:6100/query'],\n", - " 'id': '9CCA6D20-C4B3-11EB-A893-36A8BEF382BD'},\n", - " {'parameters': ['annotate_hypothetical_only=1',\n", - " 'dataset_name=Release70',\n", - " 'kmer_size=8'],\n", - " 'hostname': 'pear',\n", - " 'tool_name': 'KmerAnnotationByFigfam',\n", - " 'id': '9CE3769E-C4B3-11EB-A893-36A8BEF382BD',\n", - " 'execution_time': 1622756127.52738},\n", - " {'execute_time': 1622756127.88296,\n", - " 'hostname': 'pear',\n", - " 'parameters': [],\n", - " 'tool_name': 'annotate_proteins_similarity',\n", - " 'id': '9D19B7EA-C4B3-11EB-9714-71B3BDF382BD'}]" + "modelseedpy.core.msgenome.MSFeature" ] }, - "execution_count": 14, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "rast.annotate_genome(genome)" + "type(gene)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Equivalent call from the client it self" + "##### Gene annotation\n", + "Annotation is store as an **ontology term**. When loading from a `.faa` file no ontology term is present but we can add them later." ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "#genome, res = rast.annotate_genome_from_fasta('GCF_000005845.2_ASM584v2_protein.faa', split=' ')\n", - "#res" + "gene.ontology_terms" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "'thr operon leader peptide [Escherichia coli str. K-12 substr. MG1655]'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gene.description" + ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 9, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "{'annotation': ['thr operon leader peptide [Escherichia coli str. K-12 substr. 
MG1655]']}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gene.add_ontology_term('annotation', gene.description)\n", + "gene.ontology_terms" + ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] + "cell_type": "markdown", + "metadata": { + "jp-MarkdownHeadingCollapsed": true, + "tags": [] + }, + "source": [ + "#### RAST\n", + "It is possible to annotate genomes with RAST by calling the `RastClient`" + ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "from modelseedpy.core.rast_client import RastClient\n", + "rast = RastClient()" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "[{'id': 'C54F08A4-CDB3-11ED-A7E9-CAF09D6086F0',\n", + " 'parameters': ['-a',\n", + " '-g',\n", + " 200,\n", + " '-m',\n", + " 5,\n", + " '-d',\n", + " '/opt/patric-common/data/kmer_metadata_v2',\n", + " '-u',\n", + " 'http://pear.mcs.anl.gov:6100/query'],\n", + " 'hostname': 'pear',\n", + " 'tool_name': 'kmer_search',\n", + " 'execution_time': 1680040751.14837},\n", + " {'id': 'C5638324-CDB3-11ED-A7E9-CAF09D6086F0',\n", + " 'parameters': ['annotate_hypothetical_only=1',\n", + " 'dataset_name=Release70',\n", + " 'kmer_size=8'],\n", + " 'tool_name': 'KmerAnnotationByFigfam',\n", + " 'hostname': 'pear',\n", + " 'execution_time': 1680040751.28257},\n", + " {'parameters': [],\n", + " 'id': 'C5944E1E-CDB3-11ED-8217-51F29F6086F0',\n", + " 'execute_time': 1680040751.60236,\n", + " 'tool_name': 'annotate_proteins_similarity',\n", + " 'hostname': 'pear'}]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rast.annotate_genome(genome)" + ] }, { - "cell_type": "code", - "execution_count": 34, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "feature = genome.features.get_by_id('YP_588478.1')" + "RAST annotation is stored in the ontology term **RAST** and this is used as default to build metabolic models with the ModelSEED templates" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'RAST': 'DUF1435 domain-containing protein YjjZ [Escherichia coli str. K-12 substr. MG1655]'}" + "{'annotation': ['thr operon leader peptide [Escherichia coli str. K-12 substr. 
MG1655]'],\n", + " 'RAST': ['Thr operon leader peptide']}" ] }, - "execution_count": 36, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "feature.ontology_terms" + "gene.ontology_terms" ] }, { @@ -225,14 +302,12 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "feature.add_ontology_term('')" - ] + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -246,7 +321,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.8.10" } }, "nbformat": 4, diff --git a/examples/Model Reconstruction/build_metabolic_model.ipynb b/examples/Model Reconstruction/build_metabolic_model.ipynb index 2f1e8d3f..ea2e8d41 100644 --- a/examples/Model Reconstruction/build_metabolic_model.ipynb +++ b/examples/Model Reconstruction/build_metabolic_model.ipynb @@ -1,12 +1,26 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Build Metabolic Model from Genome .faa file" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* MSGenome: to read a faa file\n", + "* MSBuilder: to build metabolic model from the genome" + ] + }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "import modelseedpy\n", "from modelseedpy import MSBuilder, MSGenome" ] }, @@ -19,21 +33,1446 @@ "genome = MSGenome.from_fasta('GCF_000005845.2_ASM584v2_protein.faa', split=' ')" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`MSBuilder.build_metabolic_model` default parameters runs RAST, ML prediction to select template (gram neg, gram pos, cyano [not implemented], archaea [not implemented]), builds draft model and gapfills with complete media" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "type object argument after ** must be a mapping, not str", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/tmp/ipykernel_3118582/859642788.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mmodelseedpy\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mRastClient\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mrast\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mRastClient\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mrast\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mannotate_genome\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgenome\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/.local/lib/python3.8/site-packages/modelseedpy/core/rast_client.py\u001b[0m in \u001b[0;36mannotate_genome\u001b[0;34m(self, genome)\u001b[0m\n\u001b[1;32m 68\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseq\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseq\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m 
\u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 69\u001b[0m \u001b[0mp_features\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"id\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mid\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"protein_translation\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseq\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 70\u001b[0;31m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp_features\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 71\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mo\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"features\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.local/lib/python3.8/site-packages/modelseedpy/core/rast_client.py\u001b[0m in \u001b[0;36mf\u001b[0;34m(self, p_features)\u001b[0m\n\u001b[1;32m 91\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mp_features\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 92\u001b[0m \u001b[0mparams\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"features\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mp_features\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m\"stages\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstages\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 93\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrpc_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcall\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"GenomeAnnotation.run_pipeline\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 94\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.local/lib/python3.8/site-packages/modelseedpy/core/rpcclient.py\u001b[0m in \u001b[0;36mcall\u001b[0;34m(self, method, params, token)\u001b[0m\n\u001b[1;32m 73\u001b[0m \u001b[0merr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mret\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 74\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m\"error\"\u001b[0m \u001b[0;32min\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 75\u001b[0;31m \u001b[0;32mraise\u001b[0m 
\u001b[0mServerError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0merr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"error\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 76\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 77\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mServerError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Unknown\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mret\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mTypeError\u001b[0m: type object argument after ** must be a mapping, not str" + ] + } + ], + "source": [ + "from modelseedpy import RastClient\n", + "rast = RastClient()\n", + "rast.annotate_genome(genome)" + ] + }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "Number of features: 4285\n" + "/home/fliu/.local/lib/python3.8/site-packages/cobra/io/dict.py:89: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " if isinstance(value, np.float):\n", + "/home/fliu/.local/lib/python3.8/site-packages/cobra/io/dict.py:91: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " if isinstance(value, np.bool):\n" ] } ], "source": [ - "print('Number of features:', len(genome.features))" + "model = MSBuilder.build_metabolic_model('ecoli', genome, classic_biomass=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Objective: 1.0 bio1 = 141.02637369025626

Uptake:

Metabolite   Reaction        Flux    C-Number  C-Flux
cpd00007_e0  EX_cpd00007_e0  244.3   0   0.00%
cpd00024_e0  EX_cpd00024_e0  83.07   5   2.58%
cpd00028_e0  EX_cpd00028_e0  0.3955  34  0.08%
cpd00030_e0  EX_cpd00030_e0  0.3955  0   0.00%
cpd00033_e0  EX_cpd00033_e0  79.81   2   0.99%
cpd00034_e0  EX_cpd00034_e0  0.3955  0   0.00%
cpd00039_e0  EX_cpd00039_e0  31.42   6   1.17%
cpd00051_e0  EX_cpd00051_e0  34.74   6   1.29%
cpd00054_e0  EX_cpd00054_e0  34.35   3   0.64%
cpd00058_e0  EX_cpd00058_e0  0.3955  0   0.00%
cpd00060_e0  EX_cpd00060_e0  31.09   5   0.96%
cpd00063_e0  EX_cpd00063_e0  0.3955  0   0.00%
cpd00065_e0  EX_cpd00065_e0  6.647   11  0.45%
cpd00066_e0  EX_cpd00066_e0  21.76   9   1.21%
cpd00069_e0  EX_cpd00069_e0  16.99   9   0.95%
cpd00079_e0  EX_cpd00079_e0  499.9   6   18.61%
cpd00080_e0  EX_cpd00080_e0  609.4   3   11.34%
cpd00099_e0  EX_cpd00099_e0  0.3955  0   0.00%
cpd00106_e0  EX_cpd00106_e0  401.2   4   9.96%
cpd00107_e0  EX_cpd00107_e0  52.86   6   1.97%
cpd00118_e0  EX_cpd00118_e0  0.3955  4   0.01%
cpd00119_e0  EX_cpd00119_e0  11.16   6   0.42%
cpd00129_e0  EX_cpd00129_e0  25.96   5   0.81%
cpd00130_e0  EX_cpd00130_e0  199.1   4   4.94%
cpd00132_e0  EX_cpd00132_e0  28.28   4   0.70%
cpd00136_e0  EX_cpd00136_e0  0.3955  7   0.02%
cpd00149_e0  EX_cpd00149_e0  0.3955  0   0.00%
cpd00156_e0  EX_cpd00156_e0  49.6    5   1.54%
cpd00161_e0  EX_cpd00161_e0  29.72   4   0.74%
cpd00184_e0  EX_cpd00184_e0  221.1   10  13.71%
cpd00205_e0  EX_cpd00205_e0  0.3955  0   0.00%
cpd00208_e0  EX_cpd00208_e0  3.526   12  0.26%
cpd00209_e0  EX_cpd00209_e0  190     0   0.00%
cpd00249_e0  EX_cpd00249_e0  11.56   9   0.65%
cpd00254_e0  EX_cpd00254_e0  0.3955  0   0.00%
cpd00264_e0  EX_cpd00264_e0  0.3955  7   0.02%
cpd00268_e0  EX_cpd00268_e0  0.1978  0   0.00%
cpd00277_e0  EX_cpd00277_e0  22.59   10  1.40%
cpd00305_e0  EX_cpd00305_e0  0.3955  12  0.03%
cpd00322_e0  EX_cpd00322_e0  34.05   6   1.27%
cpd00355_e0  EX_cpd00355_e0  0.791   11  0.05%
cpd00367_e0  EX_cpd00367_e0  12.99   9   0.73%
cpd00383_e0  EX_cpd00383_e0  1.978   7   0.09%
cpd00412_e0  EX_cpd00412_e0  2.769   9   0.15%
cpd00438_e0  EX_cpd00438_e0  241     10  14.95%
cpd00644_e0  EX_cpd00644_e0  0.791   9   0.04%
cpd00794_e0  EX_cpd00794_e0  14.1    12  1.05%
cpd01080_e0  EX_cpd01080_e0  35.09   18  3.92%
cpd03847_e0  EX_cpd03847_e0  3.526   14  0.31%
cpd10515_e0  EX_cpd10515_e0  0.791   0   0.00%
cpd10516_e0  EX_cpd10516_e0  0.3955  0   0.00%
cpd17041_c0  rxn13782_c0     141     0   0.00%
cpd17042_c0  rxn13783_c0     141     0   0.00%
cpd17043_c0  rxn13784_c0     141     0   0.00%

Secretion:

Metabolite   Reaction        Flux     C-Number  C-Flux
cpd00009_e0  EX_cpd00009_e0  -1000    0   0.00%
cpd00011_e0  EX_cpd00011_e0  -796.8   1   7.50%
cpd00020_e0  EX_cpd00020_e0  -282.1   3   7.97%
cpd00027_e0  EX_cpd00027_e0  -445.8   6   25.18%
cpd00029_e0  EX_cpd00029_e0  -490     2   9.22%
cpd00035_e0  EX_cpd00035_e0  -185.2   3   5.23%
cpd00047_e0  EX_cpd00047_e0  -2.373   1   0.02%
cpd00100_e0  EX_cpd00100_e0  -4.386   3   0.12%
cpd00108_e0  EX_cpd00108_e0  -3.526   6   0.20%
cpd00116_e0  EX_cpd00116_e0  -0.3955  1   0.00%
cpd00139_e0  EX_cpd00139_e0  -1.187   2   0.02%
cpd00151_e0  EX_cpd00151_e0  -221.1   5   10.40%
cpd00159_e0  EX_cpd00159_e0  -835.5   3   23.60%
cpd00226_e0  EX_cpd00226_e0  -220.8   5   10.39%
cpd02701_c0  SK_cpd02701_c0  -0.3955  15  0.06%
cpd03091_c0  SK_cpd03091_c0  -0.791   10  0.07%
cpd11416_c0  SK_cpd11416_c0  -141     0   0.00%
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Ignore this below ..." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from modelseedpy import RastClient\n", + "rast = RastClient()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Objective

1.0 bio1 = 141.02637369025626

Uptake

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MetaboliteReactionFluxC-NumberC-Flux
cpd00007_e0EX_cpd00007_e0244.300.00%
cpd00024_e0EX_cpd00024_e083.0752.58%
cpd00028_e0EX_cpd00028_e00.3955340.08%
cpd00030_e0EX_cpd00030_e00.395500.00%
cpd00033_e0EX_cpd00033_e079.8120.99%
cpd00034_e0EX_cpd00034_e00.395500.00%
cpd00039_e0EX_cpd00039_e031.4261.17%
cpd00051_e0EX_cpd00051_e034.7461.29%
cpd00054_e0EX_cpd00054_e034.3530.64%
cpd00058_e0EX_cpd00058_e00.395500.00%
cpd00060_e0EX_cpd00060_e031.0950.96%
cpd00063_e0EX_cpd00063_e00.395500.00%
cpd00065_e0EX_cpd00065_e06.647110.45%
cpd00066_e0EX_cpd00066_e021.7691.21%
cpd00069_e0EX_cpd00069_e016.9990.95%
cpd00079_e0EX_cpd00079_e0499.9618.61%
cpd00080_e0EX_cpd00080_e0609.4311.34%
cpd00099_e0EX_cpd00099_e00.395500.00%
cpd00106_e0EX_cpd00106_e0401.249.96%
cpd00107_e0EX_cpd00107_e052.8661.97%
cpd00118_e0EX_cpd00118_e00.395540.01%
cpd00119_e0EX_cpd00119_e011.1660.42%
cpd00129_e0EX_cpd00129_e025.9650.81%
cpd00130_e0EX_cpd00130_e0199.144.94%
cpd00132_e0EX_cpd00132_e028.2840.70%
cpd00136_e0EX_cpd00136_e00.395570.02%
cpd00149_e0EX_cpd00149_e00.395500.00%
cpd00156_e0EX_cpd00156_e049.651.54%
cpd00161_e0EX_cpd00161_e029.7240.74%
cpd00184_e0EX_cpd00184_e0221.11013.71%
cpd00205_e0EX_cpd00205_e00.395500.00%
cpd00208_e0EX_cpd00208_e03.526120.26%
cpd00209_e0EX_cpd00209_e019000.00%
cpd00249_e0EX_cpd00249_e011.5690.65%
cpd00254_e0EX_cpd00254_e00.395500.00%
cpd00264_e0EX_cpd00264_e00.395570.02%
cpd00268_e0EX_cpd00268_e00.197800.00%
cpd00277_e0EX_cpd00277_e022.59101.40%
cpd00305_e0EX_cpd00305_e00.3955120.03%
cpd00322_e0EX_cpd00322_e034.0561.27%
cpd00355_e0EX_cpd00355_e00.791110.05%
cpd00367_e0EX_cpd00367_e012.9990.73%
cpd00383_e0EX_cpd00383_e01.97870.09%
cpd00412_e0EX_cpd00412_e02.76990.15%
cpd00438_e0EX_cpd00438_e02411014.95%
cpd00644_e0EX_cpd00644_e00.79190.04%
cpd00794_e0EX_cpd00794_e014.1121.05%
cpd01080_e0EX_cpd01080_e035.09183.92%
cpd03847_e0EX_cpd03847_e03.526140.31%
cpd10515_e0EX_cpd10515_e00.79100.00%
cpd10516_e0EX_cpd10516_e00.395500.00%
cpd17041_c0rxn13782_c014100.00%
cpd17042_c0rxn13783_c014100.00%
cpd17043_c0rxn13784_c014100.00%

Secretion

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MetaboliteReactionFluxC-NumberC-Flux
cpd00009_e0EX_cpd00009_e0-100000.00%
cpd00011_e0EX_cpd00011_e0-796.817.50%
cpd00020_e0EX_cpd00020_e0-282.137.97%
cpd00027_e0EX_cpd00027_e0-445.8625.18%
cpd00029_e0EX_cpd00029_e0-49029.22%
cpd00035_e0EX_cpd00035_e0-185.235.23%
cpd00047_e0EX_cpd00047_e0-2.37310.02%
cpd00100_e0EX_cpd00100_e0-4.38630.12%
cpd00108_e0EX_cpd00108_e0-3.52660.20%
cpd00116_e0EX_cpd00116_e0-0.395510.00%
cpd00139_e0EX_cpd00139_e0-1.18720.02%
cpd00151_e0EX_cpd00151_e0-221.1510.40%
cpd00159_e0EX_cpd00159_e0-835.5323.60%
cpd00226_e0EX_cpd00226_e0-220.8510.39%
cpd02701_c0SK_cpd02701_c0-0.3955150.06%
cpd03091_c0SK_cpd03091_c0-0.791100.07%
cpd11416_c0SK_cpd11416_c0-14100.00%
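For reference, the same uptake/secretion listing can be pulled straight from a cobrapy solution instead of the HTML renderer; a minimal sketch, assuming `model` is the gapfilled model used in this notebook:

    solution = model.optimize()
    for rxn in model.boundary:                       # EX_* exchanges and SK_* sinks
        flux = solution.fluxes[rxn.id]
        if abs(flux) < 1e-6:                         # skip inactive boundary reactions
            continue
        met = next(iter(rxn.metabolites))
        # raw convention: negative boundary flux = uptake; summary() displays uptake as positive
        kind = "uptake" if flux < 0 else "secretion"
        print(f"{met.id}\t{rxn.id}\t{flux:.4g}\t{kind}")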
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of features: 4285\n" + ] + } + ], + "source": [ + "print('Number of features:', len(genome.features))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "builder = MSBuilder(genome)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "ename": "IndexError", + "evalue": "The genomes or genomeSet that you have submitted wasn’t annotated using the RAST annotation pipeline. Please annotate the genomes via ‘Annotate Microbial Genome’ app (https://narrative.kbase.us/#appcatalog/app/RAST_SDK/reannotate_microbial_genome/release)or genomeSets via Annotate Multiple Microbial Genomes’ app (https://narrative.kbase.us/#appcatalog/app/RAST_SDK/reannotate_microbial_genomes/release) and resubmit the RAST annotated genome/genomeSets into the Predict Phenotype app. (", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m~/.local/lib/python3.8/site-packages/modelseedpy/ml/predict_phenotype.py\u001b[0m in \u001b[0;36mcreate_indicator_matrix\u001b[0;34m(ref_to_role, master_role_list)\u001b[0m\n\u001b[1;32m 93\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 94\u001b[0;31m \u001b[0mindicators\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmatching_index\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 95\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mIndexError\u001b[0m: arrays used as indices must be of integer (or boolean) type", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/tmp/ipykernel_3016957/3197840996.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mbuilder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mauto_select_template\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/.local/lib/python3.8/site-packages/modelseedpy/core/msbuilder.py\u001b[0m in \u001b[0;36mauto_select_template\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 664\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 665\u001b[0m \u001b[0mgenome_classifier\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_classifier\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"knn_ACNP_RAST_filter_01_17_2023\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 666\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgenome_class\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mgenome_classifier\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclassify\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgenome\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 667\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 668\u001b[0m \u001b[0;31m# TODO: update with enum MSGenomeClass\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.local/lib/python3.8/site-packages/modelseedpy/core/msgenomeclassifier.py\u001b[0m in \u001b[0;36mclassify\u001b[0;34m(self, genome_or_roles, ontology_term)\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0mgenome_or_roles\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0montology_term\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 32\u001b[0m )\n\u001b[0;32m---> 33\u001b[0;31m indicator_df, master_role_list = create_indicator_matrix(\n\u001b[0m\u001b[1;32m 34\u001b[0m \u001b[0mgenome_or_roles\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfeatures\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 35\u001b[0m )\n", + "\u001b[0;32m~/.local/lib/python3.8/site-packages/modelseedpy/ml/predict_phenotype.py\u001b[0m in \u001b[0;36mcreate_indicator_matrix\u001b[0;34m(ref_to_role, master_role_list)\u001b[0m\n\u001b[1;32m 94\u001b[0m \u001b[0mindicators\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmatching_index\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 96\u001b[0;31m raise IndexError(\n\u001b[0m\u001b[1;32m 97\u001b[0m \u001b[0;31m\"\u001b[0m\u001b[0mThe\u001b[0m \u001b[0mgenomes\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mgenomeSet\u001b[0m \u001b[0mthat\u001b[0m \u001b[0myou\u001b[0m \u001b[0mhave\u001b[0m \u001b[0msubmitted\u001b[0m \u001b[0mwasn\u001b[0m\u001b[0;31m’\u001b[0m\u001b[0mt\u001b[0m \u001b[0mannotated\u001b[0m \u001b[0musing\u001b[0m \u001b[0mthe\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 98\u001b[0m \u001b[0mRAST\u001b[0m \u001b[0mannotation\u001b[0m \u001b[0mpipeline\u001b[0m\u001b[0;34m.\u001b[0m \u001b[0mPlease\u001b[0m \u001b[0mannotate\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mgenomes\u001b[0m \u001b[0mvia\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m‘\u001b[0m\u001b[0mAnnotate\u001b[0m \u001b[0mMicrobial\u001b[0m \u001b[0mGenome\u001b[0m\u001b[0;31m’\u001b[0m \u001b[0mapp\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mIndexError\u001b[0m: The genomes or genomeSet that you have submitted wasn’t annotated using the RAST annotation pipeline. Please annotate the genomes via ‘Annotate Microbial Genome’ app (https://narrative.kbase.us/#appcatalog/app/RAST_SDK/reannotate_microbial_genome/release)or genomeSets via Annotate Multiple Microbial Genomes’ app (https://narrative.kbase.us/#appcatalog/app/RAST_SDK/reannotate_microbial_genomes/release) and resubmit the RAST annotated genome/genomeSets into the Predict Phenotype app. 
(" + ] + } + ], + "source": [ + "builder.auto_select_template()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from cobra.core import Reaction" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "rxn = Reaction('SK_cpd11416_c0', 'SK_cpd11416_c0', '', 0, 1000)\n", + "rxn.add_metabolites({model.metabolites.cpd11416_c0: -1})\n", + "model.add_reactions([rxn])" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/fliu/.local/lib/python3.8/site-packages/cobra/io/dict.py:89: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " if isinstance(value, np.float):\n", + "/home/fliu/.local/lib/python3.8/site-packages/cobra/io/dict.py:91: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " if isinstance(value, np.bool):\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Name                     ecoli
Memory address           7f3dd51e8400
Number of metabolites    1458
Number of reactions      1772
Number of genes          1295
Number of groups         1323
Objective expression     1.0*bio1 - 1.0*bio1_reverse_b18f7
Compartments             Cytosol, Extracellular
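The table above is the model returned by MSBuilder.gapfill_model (the call itself appears in the cell source just below). A quick way to sanity-check the result is to re-optimize the objective; a minimal sketch, reusing only names already defined in this notebook:

    gapfilled = MSBuilder.gapfill_model(model, "bio1", builder.template, None)
    print("reactions:", len(gapfilled.reactions))     # 1772 after gapfilling, per the table above
    print("bio1 flux:", gapfilled.slim_optimize())    # the summary that follows reports 1.0 bio1 = 0.0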
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "MSBuilder.gapfill_model(model, \"bio1\", builder.template, None)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Objective: 1.0 bio1 = 0.0

Uptake: (empty — no metabolite carries flux; columns Metabolite, Reaction, Flux, C-Number, C-Flux)

Secretion: (empty)
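A blank Uptake/Secretion summary like this one usually means that no boundary reaction carries flux under the current bounds. A small diagnostic sketch, assuming the same `model` object:

    print("objective value:", model.slim_optimize())    # reported as 0.0 above
    print("open exchanges:", model.medium)              # an empty medium would explain the blank tables
    closed = [r.id for r in model.boundary if r.lower_bound == 0 and r.upper_bound == 0]
    print(len(closed), "boundary reactions are fully closed")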
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cpd00010_c0 CoA [c0] 80\n", + "cpd11493_c0 ACP [c0] 39\n", + "cpd12370_c0 apo-ACP [c0] 3\n", + "cpd00003_c0 NAD [c0] 127\n", + "cpd00006_c0 NADP [c0] 89\n", + "cpd00205_c0 K+ [c0] 5\n", + "cpd00254_c0 Mg [c0] 3\n", + "cpd10516_c0 fe3 [c0] 5\n", + "cpd00063_c0 Ca2+ [c0] 2\n", + "cpd00009_c0 Phosphate [c0] 210\n", + "cpd00099_c0 Cl- [c0] 3\n", + "cpd00149_c0 Co2+ [c0] 2\n", + "cpd00058_c0 Cu2+ [c0] 3\n", + "cpd00015_c0 FAD [c0] 13\n", + "cpd10515_c0 Fe2+ [c0] 5\n", + "cpd00030_c0 Mn2+ [c0] 2\n", + "cpd00048_c0 Sulfate [c0] 4\n", + "cpd00034_c0 Zn2+ [c0] 2\n", + "cpd00016_c0 Pyridoxal phosphate [c0] 5\n", + "cpd00220_c0 Riboflavin [c0] 5\n", + "cpd00017_c0 S-Adenosyl-L-methionine [c0] 21\n", + "cpd00201_c0 10-Formyltetrahydrofolate [c0] 7\n", + "cpd00087_c0 Tetrahydrofolate [c0] 12\n", + "cpd00345_c0 5-Methyltetrahydrofolate [c0] 3\n", + "cpd00042_c0 GSH [c0] 13\n", + "cpd00028_c0 Heme [c0] 4\n", + "cpd00557_c0 Siroheme [c0] 2\n", + "cpd00264_c0 Spermidine [c0] 8\n", + "cpd00118_c0 Putrescine [c0] 9\n", + "cpd00056_c0 TPP [c0] 7\n", + "cpd15560_c0 Ubiquinone-8 [c0] 18\n", + "cpd15352_c0 2-Demethylmenaquinone 8 [c0] 7\n", + "cpd15500_c0 Menaquinone 8 [c0] 12\n", + "cpd00166_c0 Calomide [c0] 4\n", + "cpd01997_c0 Dimethylbenzimidazole [c0] 2\n", + "cpd03422_c0 Cobinamide [c0] 2\n", + "cpd00104_c0 BIOT [c0] 5\n", + "cpd00037_c0 UDP-N-acetylglucosamine [c0] 16\n", + "cpd00050_c0 FMN [c0] 11\n", + "cpd15793_c0 Stearoylcardiolipin (B. subtilis) [c0] 1\n", + "cpd15540_c0 Phosphatidylglycerol dioctadecanoyl [c0] 3\n", + "cpd15533_c0 phosphatidylethanolamine dioctadecanoyl [c0] 3\n", + "cpd15432_c0 core oligosaccharide lipid A [c0] 2\n", + "cpd02229_c0 Bactoprenyl diphosphate [c0] 5\n", + "cpd15665_c0 Peptidoglycan polymer (n subunits) [c0] 2\n", + "cpd15666_c0 Peptidoglycan polymer (n-1 subunits) [c0] 2\n", + "cpd00023_c0 L-Glutamate [c0] 57\n", + "cpd00001_c0 H2O [c0] 556\n", + "cpd00033_c0 Glycine [c0] 21\n", + "cpd00035_c0 L-Alanine [c0] 17\n", + "cpd00039_c0 L-Lysine [c0] 8\n", + "cpd00041_c0 L-Aspartate [c0] 19\n", + "cpd00051_c0 L-Arginine [c0] 6\n", + "cpd00053_c0 L-Glutamine [c0] 17\n", + "cpd00054_c0 L-Serine [c0] 23\n", + "cpd00060_c0 L-Methionine [c0] 19\n", + "cpd00065_c0 L-Tryptophan [c0] 5\n", + "cpd00066_c0 L-Phenylalanine [c0] 4\n", + "cpd00069_c0 L-Tyrosine [c0] 6\n", + "cpd00084_c0 L-Cysteine [c0] 14\n", + "cpd00107_c0 L-Leucine [c0] 6\n", + "cpd00119_c0 L-Histidine [c0] 4\n", + "cpd00129_c0 L-Proline [c0] 11\n", + "cpd00132_c0 L-Asparagine [c0] 6\n", + "cpd00156_c0 L-Valine [c0] 5\n", + "cpd00161_c0 L-Threonine [c0] 7\n", + "cpd00322_c0 L-Isoleucine [c0] 4\n", + "cpd00115_c0 dATP [c0] 7\n", + "cpd00012_c0 PPi [c0] 134\n", + "cpd00241_c0 dGTP [c0] 8\n", + "cpd00356_c0 dCTP [c0] 6\n", + "cpd00357_c0 TTP [c0] 7\n", + "cpd00002_c0 ATP [c0] 276\n", + "cpd00038_c0 GTP [c0] 20\n", + "cpd00052_c0 CTP [c0] 25\n", + "cpd00062_c0 UTP [c0] 13\n", + "cpd00008_c0 ADP [c0] 214\n", + "cpd00067_c0 H+ [c0] 896\n", + "cpd11416_c0 Biomass [c0] 2\n", + "cpd17041_c0 Protein biosynthesis [c0] 2\n", + "cpd17042_c0 DNA replication [c0] 2\n", + "cpd17043_c0 RNA transcription [c0] 2\n" + ] + } + ], + "source": [ + "for m in model.reactions.bio1.metabolites:\n", + " print(m, m.name, 
len(m.reactions))" ] }, { diff --git a/examples/Others/Biochem.ipynb b/examples/Others/Biochem.ipynb index 2433f4dd..00b845b8 100644 --- a/examples/Others/Biochem.ipynb +++ b/examples/Others/Biochem.ipynb @@ -4,18 +4,17 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "cobrakbase 0.2.8\n" - ] - } - ], + "outputs": [], + "source": [ + "import modelseedpy" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, "source": [ - "import modelseedpy\n", - "import cobrakbase" + "### Load the database object from local github repository\n", + "https://github.com/ModelSEED/ModelSEEDDatabase" ] }, { @@ -24,336 +23,221 @@ "metadata": {}, "outputs": [], "source": [ - "modelseed = modelseedpy.biochem.from_local('../../../ModelSEEDDatabase')" + "database_path = '../../../ModelSEEDDatabase/'\n", + "modelseed = modelseedpy.biochem.from_local(database_path)" ] }, { - "cell_type": "code", - "execution_count": 3, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Compounds" + ] + }, + { + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "modelseedpy.biochem.modelseed_biochem.ModelSEEDBiochem" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "type(modelseed)" + "### Fetch compounds" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "{'C00001', 'C01328'}" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "Pyruvate\n", + "L-Lactate\n" + ] } ], "source": [ - "modelseed.compound_aliases['cpd00001']['KEGG']" + "cpd_pyruvate = modelseed.compounds.cpd00020\n", + "print(cpd_pyruvate.name)\n", + "cpd_lactate = modelseed.compounds.get_by_id('cpd00159')\n", + "print(cpd_lactate.name)" ] }, { - "cell_type": "code", - "execution_count": 2, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "kbase = cobrakbase.KBaseAPI()" + "### Read Aliases" ] }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "12218/444/1\n" + "Aliases dict_keys(['AlgaGEM', 'AraCyc', 'AraGEM', 'BiGG', 'BiGG1', 'BrachyCyc', 'ChlamyCyc', 'CornCyc', 'DF_Athaliana', 'EcoCyc', 'JM_Creinhardtii', 'JP_Creinhardtii_MSB', 'JP_Creinhardtii_NMeth', 'KEGG', 'MaizeCyc', 'Maize_C4GEM', 'MetaCyc', 'PlantCyc', 'PoplarCyc', 'RiceCyc', 'SorghumCyc', 'SoyCyc', 'TS_Athaliana', 'iAF1260', 'iAF692', 'iAG612', 'iAO358', 'iAbaylyiv4', 'iGT196', 'iIN800', 'iIT341', 'iJN746', 'iJR904', 'iMA945', 'iMEO21', 'iMM904', 'iMO1053-PAO1', 'iMO1056', 'iND750', 'iNJ661', 'iPS189', 'iRR1083', 'iRS1563', 'iRS1597', 'iSB619', 'iSO783', 'iYO844', 'metanetx.chemical', 'SMILE', 'InChIKey', 'InChI'])\n", + "KEGG {'C00022'}\n" ] } ], "source": [ - "template = kbase.get_from_ws('CoreBacteria_updated', 12218)\n", - "print(template.info)" + "print('Aliases', cpd_pyruvate.annotation.keys())\n", + "print('KEGG', cpd_pyruvate.annotation['KEGG'])" ] }, { - "cell_type": "code", - "execution_count": 103, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "from cobrakbase.core.kbasefba.newmodeltemplate_metabolite import NewModelTemplateCompound\n", - "compounds = {}\n", - "for cc in template.compcompounds:\n", - " if cc.compound is None:\n", - " cpd = 
modelseed.get_seed_compound(cc.id[:-2])\n", - " if cpd.id not in compounds:\n", - " template_compound = NewModelTemplateCompound(cpd.id, cpd.formula, cpd.name)\n", - " compounds[template_compound.id] = NewModelTemplateCompound(cpd.id, cpd.formula, cpd.name)\n", - " print(cpd)" + "### Read Structures" ] }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 5, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 101, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "SMILES C[C@H](O)C(=O)[O-]\n", + "InChI InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)/p-1/t2-/m0/s1\n", + "InChI Key JVTAAEKCZFNVCJ-REOHCLBHSA-M\n" + ] } ], "source": [ - "kbase.save_object('CoreBacteria_updated', 12218, template.info.type, template)" + "print('SMILES', cpd_lactate.smiles)\n", + "print('InChI', cpd_lactate.inchi)\n", + "print('InChI Key', cpd_lactate.inchi_key)" ] }, { - "cell_type": "code", - "execution_count": 100, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "template.add_compounds(list(compounds.values()))" + "### Fetch by inchi key\n", + "`find_compounds_by_inchi_key(inchi_key, exact=True)` exact forces first and second key match `exact=False` searches by first inchi hash only" ] }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 6, "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Compound identifier      cpd26984
Name                     DsrC-disulfide-form
Memory address           0x07fc27bc0b710
Formula                  C6H9N2O2R3S2
In 0 species
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 91, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "cpd00159 L-Lactate JVTAAEKCZFNVCJ-REOHCLBHSA-M\n" + ] } ], "source": [ - "\n", - "template_compound" + "for cpd in modelseed.find_compounds_by_inchi_key('JVTAAEKCZFNVCJ-REOHCLBHSA-M', True):\n", + " print(cpd, cpd.name, cpd.inchi_key)" ] }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 7, "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Compound identifier      cpd26984
Name                     DsrC-disulfide-form
Memory address           0x07fc27bb9ea50
Formula                  C6H9N2O2R3S2
In 0 species
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 90, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "cpd00221 D-Lactate JVTAAEKCZFNVCJ-UWTATZPHSA-M\n", + "cpd00159 L-Lactate JVTAAEKCZFNVCJ-REOHCLBHSA-M\n", + "cpd01022 Lactate JVTAAEKCZFNVCJ-UHFFFAOYSA-M\n" + ] } ], - "source": [] + "source": [ + "for cpd in modelseed.find_compounds_by_inchi_key('JVTAAEKCZFNVCJ-REOHCLBHSA-M', False):\n", + " print(cpd, cpd.name, cpd.inchi_key)" + ] }, { - "cell_type": "code", - "execution_count": 104, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "template_reaction = template.reactions.rxa45615_c" + "# Reactions" ] }, { - "cell_type": "code", - "execution_count": 17, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': 'rxa45615_c',\n", - " 'name': 'rxa45615_c',\n", - " 'GapfillDirection': '=',\n", - " 'base_cost': 1000,\n", - " 'reverse_penalty': 1000,\n", - " 'forward_penalty': 1000,\n", - " 'upper_bound': 1000,\n", - " 'lower_bound': -1000,\n", - " 'direction': '=',\n", - " 'maxforflux': 1000,\n", - " 'maxrevflux': 1000.0,\n", - " 'reaction_ref': 'kbase/default/reactions/id/rxa45615',\n", - " 'templateReactionReagents': [{'coefficient': -2,\n", - " 'templatecompcompound_ref': '~/compcompounds/id/cpd00067_c'},\n", - " {'coefficient': -3,\n", - " 'templatecompcompound_ref': '~/compcompounds/id/cpd00971_c'},\n", - " {'coefficient': -2,\n", - " 'templatecompcompound_ref': '~/compcompounds/id/cpd11620_c'},\n", - " {'coefficient': -1,\n", - " 'templatecompcompound_ref': '~/compcompounds/id/cpd08701_c'},\n", - " {'coefficient': 2,\n", - " 'templatecompcompound_ref': '~/compcompounds/id/cpd00067_e'},\n", - " {'coefficient': 3,\n", - " 'templatecompcompound_ref': '~/compcompounds/id/cpd00971_e'},\n", - " {'coefficient': 2,\n", - " 'templatecompcompound_ref': '~/compcompounds/id/cpd11621_c'},\n", - " {'coefficient': 1,\n", - " 'templatecompcompound_ref': '~/compcompounds/id/cpd08702_c'}],\n", - " 'templatecompartment_ref': '~/compartments/id/c',\n", - " 'templatecomplex_refs': [],\n", - " 'type': 'spontaneous'}" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "template_reaction.get_data()" + "### Fetch Reactions" ] }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 140472454487120 cpd00067_c cpd00067:H+ H+\n", - " 140472433862480 cpd00971_c cpd00971:Na+ Na+\n", - " 140472454486480 cpd11620_c cpd11620:Reducedferredoxin Reducedferredoxin\n", - " 140472433931728 cpd08701_c cpd08701:Methanophenazine Methanophenazine\n", - " 140472433821840 cpd00067_e cpd00067:H+ H+\n", - " 140472433861840 cpd00971_e cpd00971:Na+ Na+\n", - " 140472433893520 cpd11621_c cpd11621:Oxidizedferredoxin Oxidizedferredoxin\n", - " 140472433931856 cpd08702_c cpd08702:Dihydromethanophenazine Dihydromethanophenazine\n" + "rxn00148: cpd00002_0 + cpd00020_0 <=> cpd00008_0 + cpd00061_0 + cpd00067_0\n", + "ATP + Pyruvate <=> ADP + Phosphoenolpyruvate + H+\n" ] } ], "source": [ - "for o in template_reaction.metabolites:\n", - " print(type(o), id(o), o.id, o.compound, o.name)" + "reaction_PYK = modelseed.reactions.rxn00148\n", + "print(reaction_PYK)\n", + "print(reaction_PYK.build_reaction_string(True))" ] }, { - "cell_type": "code", - "execution_count": 65, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "data": { 
- "text/plain": [ - "{'charge': 0,\n", - " 'id': 'cpd08701_c',\n", - " 'maxuptake': 0,\n", - " 'templatecompartment_ref': '~/compartments/id/c',\n", - " 'templatecompound_ref': '~/compounds/id/cpd08701'}" - ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "template.compcompounds.cpd08701_c.get_data()" + "### Read Aliases" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 58, + "execution_count": 9, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "140472724747984" - ] - }, - "execution_count": 58, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "Aliases dict_keys(['AlgaGEM', 'AraCyc', 'AraGEM', 'BiGG', 'BrachyCyc', 'ChlamyCyc', 'CornCyc', 'DF_Athaliana', 'EcoCyc', 'JM_Creinhardtii', 'JP_Creinhardtii_MSB', 'JP_Creinhardtii_NMeth', 'KEGG', 'MaizeCyc', 'Maize_C4GEM', 'MetaCyc', 'PlantCyc', 'PoplarCyc', 'RiceCyc', 'SorghumCyc', 'SoyCyc', 'TS_Athaliana', 'iAF1260', 'iAF692', 'iAG612', 'iAO358', 'iGT196', 'iIN800', 'iJN746', 'iJR904', 'iMA945', 'iMEO21', 'iMM904', 'iMO1053-PAO1', 'iMO1056', 'iND750', 'iNJ661', 'iPS189', 'iRR1083', 'iRS1563', 'iRS1597', 'iSB619', 'iSO783', 'iYO844', 'metanetx.reaction', 'rhea', 'ec-code'])\n", + "KEGG {'R00200'}\n", + "ec-code {'2.7.1.40'}\n" + ] } ], "source": [ - "id(template.compcompounds.cpd08701_c.cpd08701_c)" + "print('Aliases', reaction_PYK.annotation.keys())\n", + "print('KEGG', reaction_PYK.annotation['KEGG'])\n", + "print('ec-code', reaction_PYK.annotation['ec-code'])" ] }, { - "cell_type": "code", - "execution_count": 61, + "cell_type": "markdown", "metadata": {}, + "source": [ + "### Instantiate reaction \n", + "Instantiate database reaction to a template reaction with cytosol `c` assigned to token `0`" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "tags": [] + }, "outputs": [ { "data": { @@ -361,70 +245,47 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - "
-   Metabolite identifier   cpd08701_c
-   Name
-   Memory address          0x07fc25936bb90
-   Formula
-   Compartment             c
-   In 3 reaction(s)        rxn03126_c, rxa45615_c, rxn15961_c
+   Reaction identifier     rxn00148_c
+   Name                    ATP:pyruvate 2-O-phosphotransferase
+   Memory address          0x7f5eb42f82e0
+   Stoichiometry           cpd00002_c + cpd00020_c <=> cpd00008_c + cpd00061_c + cpd00067_c
+                           (ATP + Pyruvate <=> ADP + Phosphoenolpyruvate + H+)
+   GPR
+   Lower bound             -1000
+   Upper bound             1000
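The added table above is the template reaction produced by to_template_reaction (the call appears in the cell source just below); the argument is simply a token-to-compartment map, so other compartments follow the same pattern. A minimal sketch reusing the notebook's reaction_PYK:

    # token 0 -> cytosol 'c'; {0: 'e'} would instead produce rxn00148_e with _e metabolites
    template_PYK_cytosol = reaction_PYK.to_template_reaction({0: 'c'})
    print(template_PYK_cytosol.id)                        # rxn00148_c
    print(template_PYK_cytosol.build_reaction_string())   # cpd00002_c + cpd00020_c <=> ...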
" + " \n", + " " ], "text/plain": [ - "" + "" ] }, - "execution_count": 61, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "template.compcompounds.cpd08701_c" + "template_PYK_cytosol = reaction_PYK.to_template_reaction({0: 'c'})\n", + "template_PYK_cytosol" ] }, { - "cell_type": "code", - "execution_count": 134, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(1) cpd00001[0] + (1) cpd00012[0] <=> (2) cpd00009[0] = >\n", - "(1) cpd00001[0] + (1) cpd00742[0] <=> (2) cpd00011[0] + (2) cpd00013[0] > >\n", - "(1) cpd00011[0] + (1) cpd00668[0] <=> (2) cpd00020[0] < <\n", - "(1) cpd02570[0] <=> (2) cpd00020[0] = =\n", - "(2) cpd00025[0] <=> (2) cpd00001[0] + (1) cpd00007[0] > >\n", - "(1) cpd00001[0] + (1) cpd00794[0] <=> (2) cpd00027[0] > =\n", - "(2) cpd00001[0] <=> (1) cpd00025[0] = <\n", - "(2) cpd00038[0] <=> (1) cpd00012[0] + (1) cpd00925[0] > =\n", - "(2) cpd00040[0] <=> (1) cpd00011[0] + (1) cpd00843[0] > =\n", - "(1) cpd00011[0] + (1) cpd03049[0] <=> (1) cpd00020[0] + (1) cpd00056[0] > <\n", - "(2) cpd00076[0] <=> (1) cpd00027[0] + (1) cpd02298[0] = =\n" - ] - } - ], "source": [ - "i =0 \n", - "for r in modelseed.reactions:\n", - " print(modelseed.reactions[r]['code'], modelseed.reactions[r]['direction'], modelseed.reactions[r]['reversibility'])\n", - " #print(modelseed.reactions[r]['code'])\n", - " #print(modelseed.reactions[r]['stoichiometry'])\n", - " #print(modelseed.reactions[r]['definition'])\n", - " \n", - " \n", - " i+= 1\n", - " if i > 10:\n", - " break" + "# Random debug stuff ignore for now" ] }, { @@ -532,83 +393,6 @@ "with open('/Users/fliu/workspace/jupyter/python3/annotation-server/data/extra_reactions.json', 'w') as fh:\n", " fh.write(json.dumps(extra_reactions))\n" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "modelseed.reactions.update" - ] - }, - { - "cell_type": "code", - "execution_count": 156, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "rxn45615: cpd00003 [0] + cpd00067 [0] + cpd00971 [0] + 2.0 cpd28082 [0] <=> cpd00004 [0] + cpd00971 [1] + 2.0 cpd27757 [0]\n" - ] - } - ], - "source": [ - "rxn = modelseed.get_seed_reaction('rxn45615')\n", - "print(type(rxn))\n", - "print(rxn)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': 'rxn45615',\n", - " 'abbreviation': nan,\n", - " 'name': nan,\n", - " 'code': '(1) cpd00003[0] + (1) cpd00971[0] + (2) cpd28082[0] <=> (1) cpd00004[0] + (1) cpd00971[1] + (2) cpd27757[0]',\n", - " 'stoichiometry': '-1:cpd00003:0:0:NAD;-1:cpd00067:0:0:H+;-1:cpd00971:0:0:Na+;-2:cpd28082:0:0:Reduced-ferredoxins;1:cpd00004:0:0:NADH;1:cpd00971:1:0:Na+;2:cpd27757:0:0:Oxidized-ferredoxins',\n", - " 'is_transport': 1,\n", - " 'equation': '(1) cpd00003[0] + (1) cpd00067[0] + (1) cpd00971[0] + (2) cpd28082[0] <=> (1) cpd00004[0] + (1) cpd00971[1] + (2) cpd27757[0]',\n", - " 'definition': '(1) NAD[0] + (1) H+[0] + (1) Na+[0] + (2) Reduced-ferredoxins[0] <=> (1) NADH[0] + (1) Na+[1] + (2) Oxidized-ferredoxins[0]',\n", - " 'reversibility': '?',\n", - " 'direction': '=',\n", - " 'abstract_reaction': nan,\n", - " 'pathways': nan,\n", - " 'aliases': 'MetaCyc: TRANS-RXN-276',\n", - " 'ec_numbers': '7.2.1.2',\n", - " 'deltag': 10000000.0,\n", - " 'deltagerr': 10000000.0,\n", - " 'compound_ids': 
'cpd00003;cpd00004;cpd00067;cpd00971;cpd27757;cpd28082',\n", - " 'status': 'OK',\n", - " 'is_obsolete': 0,\n", - " 'linked_reaction': nan,\n", - " 'notes': 'GCP|EQP',\n", - " 'source': 'Primary Database'}" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "modelseed.reactions['rxn45615']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -627,7 +411,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.13" + "version": "3.8.10" } }, "nbformat": 4, diff --git a/modelseedpy/__init__.py b/modelseedpy/__init__.py index 7f135055..981165c5 100644 --- a/modelseedpy/__init__.py +++ b/modelseedpy/__init__.py @@ -5,33 +5,20 @@ # set the warning format to be on a single line import sys import logging +import cobra import warnings as _warnings from os import name as _name from os.path import abspath as _abspath from os.path import dirname as _dirname from modelseedpy.helpers import config -logging_hash = { - "debug": logging.DEBUG, - "critical": logging.CRITICAL, - "error": logging.ERROR, - "warning": logging.WARNING, - "info": logging.INFO, -} +__author__ = "Christopher Henry" +__email__ = "chenry@anl.gov" +__version__ = "0.3.3" -# Configuing modelseedpy logger logger = logging.getLogger(__name__) -c_handler = logging.StreamHandler() -c_handler.setLevel(logging_hash[config.get("logging", "console_level")]) -c_format = logging.Formatter("%(name)s - %(levelname)s - %(message)s") -c_handler.setFormatter(c_format) -logger.addHandler(c_handler) -if config.get("logging", "log_file") == "yes": - f_handler = logging.FileHandler(config.get("logging", "filename"), mode="a") - f_handler.setLevel(logging_hash[config.get("logging", "file_level")]) - f_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") - f_handler.setFormatter(f_format) - logger.addHandler(f_handler) + +print("modelseedpy", __version__) if sys.version_info[0] == 2: logger.warning( @@ -41,6 +28,9 @@ "still work but we will no longer actively maintain Python 2 support." 
) +if "e0" not in cobra.medium.annotations.compartment_shortlist["e"]: + cobra.medium.annotations.compartment_shortlist["e"].append("e0") + import modelseedpy from modelseedpy.core import ( RastClient, @@ -48,17 +38,22 @@ MSBuilder, MSMedia, MSGrowthPhenotypes, + MSGrowthPhenotype, MSModelUtil, FBAHelper, MSEditorAPI, MSATPCorrection, MSGapfill, MSEquation, + MSModelReport, + AnnotationOntology, ) from modelseedpy.core.exceptions import * from modelseedpy.community import MSCommunity, MSCompatibility, CommKineticPkg +from modelseedpy.biochem import ModelSEEDBiochem + from modelseedpy.fbapkg import ( BaseFBAPkg, RevBinPkg, @@ -78,8 +73,8 @@ ObjConstPkg, ChangeOptPkg, ElementUptakePkg, + ReactionActivationPkg, + ExpressionActivationPkg ) from modelseedpy.multiomics import MSExpression - -__version__ = "0.2.2" diff --git a/modelseedpy/biochem/modelseed_biochem.py b/modelseedpy/biochem/modelseed_biochem.py index 43cc865e..882d0bfb 100644 --- a/modelseedpy/biochem/modelseed_biochem.py +++ b/modelseedpy/biochem/modelseed_biochem.py @@ -1,12 +1,18 @@ # -*- coding: utf-8 -*- import logging +import os +import json import pandas as pd from cobra.core.dictlist import DictList from modelseedpy.biochem.modelseed_compound import ModelSEEDCompound, ModelSEEDCompound2 from modelseedpy.biochem.modelseed_reaction import ModelSEEDReaction, ModelSEEDReaction2 +from modelseedpy.helpers import config +from modelseedpy.core.msmodel import get_reaction_constraints_from_direction logger = logging.getLogger(__name__) +_BIOCHEM_FOLDER = "Biochemistry" + ALIAS_CPD_IDENTIFIERS_ORG = { "BiGG": "bigg.metabolite", "KEGG": "kegg.compound", @@ -54,6 +60,29 @@ "TS_Athaliana", } +def convert_to_searchname(name): + OriginalName = name + ending = ""; + if name[-1] == "-": + ending = "-" + name = name.lower() + name.replace(" ","") + name.replace(",","") + name.replace("-","") + name.replace("_","") + name.replace("(","") + name.replace(")","") + name.replace("}","") + name.replace("{","") + name.replace("[","") + name.replace("]","") + name.replace(":","") + name.replace("�","") + name.replace("'","") + name.replace("_","") + name += ending + name.replace("icacid","ate") + return name; def get_low(ids): low = None @@ -134,10 +163,13 @@ def load_metabolites_from_df( if cpd_id in structures: if "SMILE" in structures[cpd_id]: smiles = structures[cpd_id]["SMILE"] + aliases_annotation["SMILE"] = smiles if "InChI" in structures[cpd_id]: inchi = structures[cpd_id]["InChI"] + aliases_annotation["InChI"] = inchi if "InChIKey" in structures[cpd_id]: inchi_key = structures[cpd_id]["InChIKey"] + aliases_annotation["InChIKey"] = inchi_key inchi_key = None if pd.isna(inchi_key) or len(inchi_key) == 0 else inchi_key other_names = set() if cpd_id in names: @@ -153,9 +185,6 @@ def load_metabolites_from_df( mass, delta_g, delta_g_err, - smiles, - inchi_key, - inchi, is_core, is_obsolete, is_cofactor, @@ -174,6 +203,150 @@ def load_metabolites_from_df( return compounds +def _load_aliases_df(df_aliases, seed_index=1, source_index=3, alias_id_index=2): + aliases = {} + for i in df_aliases.itertuples(): + seed_id = i[seed_index] + alias_id = i[alias_id_index] + source = i[source_index] + if seed_id not in aliases: + aliases[seed_id] = {} + if source not in aliases[seed_id]: + aliases[seed_id][source] = set() + aliases[seed_id][source].add(alias_id) + return aliases + + +def _load_metabolites( + database_path: str, aliases=None, names=None, structures=None +) -> dict: + if aliases is None: + aliases = {} + if names is None: + names = {} + if 
structures is None: + structures = {} + metabolites = {} + contents = os.listdir(f"{database_path}/{_BIOCHEM_FOLDER}") + for f in contents: + if f.startswith("compound_") and f.endswith(".json"): + with open(f"{database_path}/{_BIOCHEM_FOLDER}/{f}", "r") as fh: + _compounds_data = json.load(fh) + for o in _compounds_data: + if "id" in o and o["id"]: + cpd_names = set() + if o["id"] in names: + cpd_names |= names[o["id"]] + cpd = ModelSEEDCompound2( + o["id"], + o.get("formula"), + o.get("name"), + o.get("charge"), + "", + o.get("abbreviation"), + cpd_names, + o.get("mass"), + o.get("deltag"), + o.get("deltagerr"), + o.get("is_core"), + o.get("is_obsolete"), + None, + o.get("pka"), + o.get("pkb"), + o.get("source"), + ) + if cpd.id in aliases: + cpd.annotation.update(aliases[cpd.id]) + if cpd.id in structures: + for alias_type in structures[cpd.id]: + v = structures[cpd.id][alias_type] + if len(v) == 1: + cpd.annotation[alias_type] = list(v)[0] + else: + logger.warning( + f"multiple {alias_type} structures found for {cpd.id}" + ) + metabolites[cpd.id] = cpd + else: + print("error", o) + # print(_compounds_data[0].keys()) + return metabolites + + +def _load_reactions( + database_path: str, metabolites: dict, aliases=None, names=None, ec_numbers=None +) -> (dict, dict): + if aliases is None: + aliases = {} + if names is None: + names = {} + if ec_numbers is None: + ec_numbers = {} + reactions = {} + contents = os.listdir(f"{database_path}/{_BIOCHEM_FOLDER}") + metabolites_indexed = {} + for f in contents: + if f.startswith("reaction_") and f.endswith(".json"): + with open(f"{database_path}/{_BIOCHEM_FOLDER}/{f}", "r") as fh: + _reactions_data = json.load(fh) + for o in _reactions_data: + if "id" in o and o["id"]: + rxn_names = set() + if o["id"] in names: + rxn_names |= names[o["id"]] + ( + lower_bound, + upper_bound, + ) = get_reaction_constraints_from_direction( + o.get("reversibility") + ) + stoichiometry = o.get("stoichiometry") + reaction_metabolites = {} + for s in stoichiometry: + cmp_token = s["compartment"] + value = s["coefficient"] + cpd = metabolites[s["compound"]] + cpd_index_id = f"{cpd.id}_{cmp_token}" + if cpd_index_id not in metabolites_indexed: + cpd_token = cpd.copy() + cpd_token.id = f"{cpd.id}_{cmp_token}" + cpd_token.base_id = cpd.id + cpd_token.compartment = cmp_token + metabolites_indexed[cpd_index_id] = cpd_token + reaction_metabolites[ + metabolites_indexed[cpd_index_id] + ] = value + rxn = ModelSEEDReaction2( + o["id"], + o.get("name"), + "", + lower_bound, + upper_bound, + "", + rxn_names, + o.get("deltag"), + o.get("deltagerr"), + o.get("is_obsolete"), + None, + o.get("status"), + o.get("source"), + ) + if "linked_reaction" in o and o.get("linked_reaction"): + ids = o.get("linked_reaction").split(";") + rxn.annotation["modelseed"] = ids[0] + rxn.add_metabolites(reaction_metabolites) + if rxn.id in aliases: + rxn.annotation.update(aliases[rxn.id]) + if rxn.id in ec_numbers: + rxn.annotation["ec-code"] = ec_numbers[rxn.id] + metabolites[cpd.id] = cpd + reactions[rxn.id] = rxn + else: + logger.error(f"failed to read reaction record {o}") + + return reactions, metabolites_indexed + + def load_reactions_from_df( df: pd.DataFrame, database_metabolites: dict, @@ -271,16 +444,31 @@ class ModelSEEDDatabase: ModelSEED database instance. 
""" - def __init__(self, compounds, reactions, compound_tokens): + def __init__(self, compounds: list, reactions: list, compound_tokens: list): self.compounds = DictList() self.compound_tokens = DictList() self.reactions = DictList() self.compounds += compounds self.reactions += reactions - self.reactions += compound_tokens + self.compound_tokens += compound_tokens + self.inchi_key_lookup = {} self.metabolite_reactions = {} + self._index_inchi() + + def _index_inchi(self): + for m in self.compounds: + if m.inchi_key: + f, s, p = m.inchi_key.split("-") + if f not in self.inchi_key_lookup: + self.inchi_key_lookup[f] = {} + if s not in self.inchi_key_lookup[f]: + self.inchi_key_lookup[f][s] = set() + proton_pair = (m.id, p) + if proton_pair not in self.inchi_key_lookup[f][s]: + self.inchi_key_lookup[f][s].add(proton_pair) + def compounds_by_alias(self, alias, value): pass @@ -288,9 +476,27 @@ def reactions_by_alias(self, alias, value): pass def find_compounds_by_inchi_key(self, inchi_key, exact=True): - pass + f, s, p = inchi_key.split("-") + if exact and f in self.inchi_key_lookup and s in self.inchi_key_lookup[f]: + # x is tuple (cpd.id, protonation) + return [self.compounds.get_by_id(x[0]) for x in self.inchi_key_lookup[f][s]] + elif f in self.inchi_key_lookup and not exact: + seed_ids = set() + for s in self.inchi_key_lookup[f]: + # x is tuple (cpd.id, protonation) + seed_ids |= {x[0] for x in self.inchi_key_lookup[f][s]} + + return [self.compounds.get_by_id(seed_id) for seed_id in seed_ids] + else: + return [] + + def find_reactions_by_compounds(self, compounds, or_instead_of_and=False): + """ - def find_reactions_by_compounds(self, compounds): + @param compounds: list of seed compound ids + @param or_instead_of_and: use OR logic instead of AND (default) + @return: + """ pass def add_compound(self, cpd): @@ -310,6 +516,16 @@ def add_reaction(self, rxn): class ModelSEEDBiochem: + default_biochemistry = None + + @staticmethod + def get(create_if_missing=True): + if not ModelSEEDBiochem.default_biochemistry: + ModelSEEDBiochem.default_biochemistry = from_local( + config.get("biochem", "path") + ) + return ModelSEEDBiochem.default_biochemistry + def __init__( self, compounds, @@ -549,7 +765,7 @@ def load_database( return database -def from_local(path): +def from_local_old(path): database_repo = path reactions_url = database_repo + "/Biochemistry/reactions.tsv" compounds_url = database_repo + "/Biochemistry/compounds.tsv" @@ -617,6 +833,76 @@ def from_local(path): return modelseed +def from_local(database_path: str): + contents = os.listdir(f"{database_path}/Biochemistry/") + if "compounds.tsv" in contents: + return from_local_old(database_path) + + compound_aliases_url = ( + f"{database_path}/Biochemistry/Aliases/Unique_ModelSEED_Compound_Aliases.txt" + ) + reaction_aliases_url = ( + f"{database_path}/Biochemistry/Aliases/Unique_ModelSEED_Reaction_Aliases.txt" + ) + compound_aliases = _load_aliases_df( + pd.read_csv(compound_aliases_url, index_col=None, sep="\t") + ) + reaction_aliases = _load_aliases_df( + pd.read_csv(reaction_aliases_url, index_col=None, sep="\t") + ) + + compound_structures_url = ( + f"{database_path}/Biochemistry/Structures/Unique_ModelSEED_Structures.txt" + ) + compound_structures = _load_aliases_df( + pd.read_csv(compound_structures_url, index_col=None, sep="\t"), + source_index=2, + alias_id_index=6, + ) + + compound_names_url = ( + f"{database_path}/Biochemistry/Aliases/Unique_ModelSEED_Compound_Names.txt" + ) + reaction_names_url = ( + 
f"{database_path}/Biochemistry/Aliases/Unique_ModelSEED_Reaction_Names.txt" + ) + compound_names = _load_aliases_df( + pd.read_csv(compound_names_url, index_col=None, sep="\t") + ) + reaction_names = _load_aliases_df( + pd.read_csv(reaction_names_url, index_col=None, sep="\t") + ) + + reaction_ecs_url = ( + f"{database_path}/Biochemistry/Aliases/Unique_ModelSEED_Reaction_ECs.txt" + ) + reaction_ecs = _load_aliases_df( + pd.read_csv(reaction_ecs_url, index_col=None, sep="\t") + ) + + # build metabolites unpack names + metabolites = _load_metabolites( + database_path, + compound_aliases, + {k: v["name"] for k, v in compound_names.items()}, + compound_structures, + ) + + # build reactions unpack names, ecs + reactions, metabolite_tokens = _load_reactions( + database_path, + metabolites, + reaction_aliases, + {k: v["name"] for k, v in reaction_names.items()}, + {k: v["Enzyme Class"] for k, v in reaction_ecs.items()}, + ) + database = ModelSEEDDatabase( + metabolites.values(), reactions.values(), metabolite_tokens.values() + ) + + return database + + def get_names_from_df(df): names = {} for t in df.itertuples(): diff --git a/modelseedpy/biochem/modelseed_compound.py b/modelseedpy/biochem/modelseed_compound.py index fdb54065..a3ea75f3 100644 --- a/modelseedpy/biochem/modelseed_compound.py +++ b/modelseedpy/biochem/modelseed_compound.py @@ -1,9 +1,13 @@ # -*- coding: utf-8 -*- from modelseedpy.biochem.seed_object import ModelSEEDObject -from modelseedpy.core.mstemplate import MSTemplateSpecies +from modelseedpy.core.mstemplate import MSTemplateSpecies, MSTemplateMetabolite from cobra.core import Metabolite import pandas as pd +_SMILE_ALIAS = "SMILE" +_INCHI_ALIAS = "InChI" +_INCHI_KEY_ALIAS = "InChIKey" + class ModelSEEDCompound2(Metabolite): def __init__( @@ -18,9 +22,6 @@ def __init__( mass=None, delta_g=None, delta_g_error=None, - smiles=None, - inchi_key=None, - inchi=None, is_core=False, is_obsolete=False, is_cofactor=False, @@ -48,10 +49,6 @@ def __init__( self.delta_g = delta_g self.delta_g_error = delta_g_error - self.smiles = smiles - self.inchi_key = inchi_key - self.inchi = inchi - self.linked_compound = None self.pka = pka self.pkb = pkb @@ -60,11 +57,51 @@ def __init__( self.flags |= set(flags) def to_template_compartment_compound(self, compartment): - res = self.copy() - res.id = f"{self.seed_id}_{compartment}" - res.compartment = compartment + cpd_id = f"{self.seed_id}_{compartment}" + # build Template Compound + metabolite = MSTemplateMetabolite( + self.seed_id, + self.formula, + self.name, + self.charge, + self.mass, + self.delta_g, + self.delta_g_error, + self.is_cofactor, + self.abbr, + ) + # build Template Compartment Compound + res = MSTemplateSpecies(cpd_id, self.charge, compartment, metabolite.id) + + # assign Compound to Compartment Compound + res._template_compound = metabolite + res.annotation.update(self.annotation) return res + @property + def smiles(self): + return ( + None + if _SMILE_ALIAS not in self.annotation + else self.annotation[_SMILE_ALIAS] + ) + + @property + def inchi_key(self): + return ( + None + if _INCHI_KEY_ALIAS not in self.annotation + else self.annotation[_INCHI_KEY_ALIAS] + ) + + @property + def inchi(self): + return ( + None + if _INCHI_ALIAS not in self.annotation + else self.annotation[_INCHI_ALIAS] + ) + class ModelSEEDCompound(ModelSEEDObject): @property diff --git a/modelseedpy/biochem/modelseed_reaction.py b/modelseedpy/biochem/modelseed_reaction.py index 5d19a1b4..b43430ce 100644 --- a/modelseedpy/biochem/modelseed_reaction.py +++ 
b/modelseedpy/biochem/modelseed_reaction.py @@ -2,6 +2,7 @@ import math from modelseedpy.biochem.seed_object import ModelSEEDObject from cobra.core import Reaction +from modelseedpy.core.mstemplate import MSTemplateReaction def to_str2(rxn, cmp_replace=None, cpd_replace={}): @@ -145,10 +146,20 @@ def __init__( self.status = status self.is_obsolete = is_obsolete + if self.is_obsolete: + self.is_obsolete = True + else: + self.is_obsolete = False self.is_abstract = is_abstract - self.delta_g = delta_g - self.delta_g_error = delta_g_error + self.delta_g = float(delta_g) if delta_g else None + self.delta_g_error = float(delta_g_error) if delta_g_error else None + + # removing symbolic high values representing null/none + if self.delta_g and self.delta_g > 10000: + self.delta_g = None + if self.delta_g_error and self.delta_g_error > 10000: + self.delta_g_error = None self.flags = set() if flags: @@ -156,7 +167,7 @@ def __init__( @property def compound_ids(self): - pass + return None def to_template_reaction(self, compartment_setup=None): if compartment_setup is None: @@ -178,10 +189,11 @@ def to_template_reaction(self, compartment_setup=None): # if len(str(index)) > 0: # name = f'{self.name} [{compartment}]' - reaction = Reaction( - rxn_id, name, self.subsystem, self.lower_bound, self.upper_bound + reaction = MSTemplateReaction( + rxn_id, self.id, name, self.subsystem, self.lower_bound, self.upper_bound ) reaction.add_metabolites(metabolites) + reaction.annotation.update(self.annotation) return reaction @property diff --git a/modelseedpy/community/__init__.py b/modelseedpy/community/__init__.py index ebf07888..e4e6208e 100644 --- a/modelseedpy/community/__init__.py +++ b/modelseedpy/community/__init__.py @@ -5,6 +5,10 @@ # import pyximport; pyximport.install(language_level=3) # improve computational speed from modelseedpy.community.mscommunity import * -from modelseedpy.community.dfbapkg import dFBAPkg -from modelseedpy.community.mscompatibility import MSCompatibility +from modelseedpy.community.datastandardization import * from modelseedpy.community.commkineticpkg import CommKineticPkg +from modelseedpy.community.mscompatibility import MSCompatibility +from modelseedpy.community.mssteadycom import MSSteadyCom +from modelseedpy.community.commphitting import CommPhitting +from modelseedpy.community.commhelper import build_from_species_models, phenotypes +from modelseedpy.community.mskineticsfba import MSKineticsFBA diff --git a/modelseedpy/community/commhelper.py b/modelseedpy/community/commhelper.py new file mode 100644 index 00000000..b17f8867 --- /dev/null +++ b/modelseedpy/community/commhelper.py @@ -0,0 +1,530 @@ +from modelseedpy.core.msminimalmedia import minimizeFlux_withGrowth, bioFlux_check +from modelseedpy.core.exceptions import NoFluxError, ObjectiveError +from modelseedpy.fbapkg.mspackagemanager import MSPackageManager +from modelseedpy.core.msmodelutl import MSModelUtil +from modelseedpy.core.fbahelper import FBAHelper +from cobra import Model, Reaction, Metabolite +from cobra.medium import minimal_medium + +# from commscores import GEMCompatibility +from cobra.flux_analysis import pfba +from collections import OrderedDict +from optlang.symbolics import Zero +from optlang import Constraint +from math import inf, isclose +from pandas import DataFrame +from pprint import pprint +from numpy import mean +import re + + +def strip_comp(ID): + ID = ID.replace("-", "~") + return re.sub("(\_\w\d)", "", ID) + + +def export_lp(model, name): + with open(f"{name}.lp", "w") as out: + 
out.write(model.solver.to_lp()) + + +def correct_nonMSID(nonMSobject, output, model_index): + name, compartment = output + index = 0 if compartment == "e" else model_index + nonMSobject.compartment = compartment + str(index) + comp = re.search(r"(_[a-z]\d+$)", nonMSobject.id) + if comp is None and rf"[{compartment}]" in nonMSobject.id: + return nonMSobject.id.replace( + rf"[{compartment}]", f"_{nonMSobject.compartment}" + ) + elif comp is None: + return nonMSobject.id + f"_{nonMSobject.compartment}" + return "_".join([nonMSobject.id.replace(comp.group(), ""), nonMSobject.compartment]) + + +def build_from_species_models( + org_models, + model_id=None, + name=None, + abundances=None, + standardize=False, + MSmodel=True, + commkinetics=True, + copy_models=True, + printing=False, +): + """Merges the input list of single species metabolic models into a community metabolic model + + Parameters + ---------- + org_models : list to be merged into a community model + model_id : string specifying community model ID + name : string specifying community model name + names : list human-readable names for models being merged + abundances : dict relative abundances for input models in community model + cobra_model : bool for whether the raw COBRA model is returned + standardize: bool for whether the exchanges of each member model will be standardized (True) or just aligned. + + Returns + ------- + Cobra.Model for the desired Community + + Raises + ------ + """ + # construct the new model + models = org_models # if not standardize else GEMCompatibility.standardize( + # org_models, exchanges=True, conflicts_file_name='exchanges_conflicts.json') + biomass_indices = [] + biomass_index = minimal_biomass_index = 2 + new_metabolites, new_reactions = set(), set() + member_biomasses = {} + for model_index, org_model in enumerate(models): + model_util = MSModelUtil(org_model, copy=copy_models) + model_reaction_ids = [rxn.id for rxn in model_util.model.reactions] + model_index += 1 + # if MSmodel: + # Rename metabolites + for met in model_util.model.metabolites: + # Renaming compartments + output = MSModelUtil.parse_id(met) + if printing: + print(met, output) + if output is None: + if printing: + print( + f"The {met.id} ({output}; {hasattr(met, 'compartment')}) is unpredictable." + ) + met.id = correct_nonMSID(met, (met.id, "c"), model_index) + elif len(output) == 2: + met.id = correct_nonMSID(met, output, model_index) + elif len(output) == 3: + name, compartment, out_index = output + index = 0 if compartment == "e" else model_index + if out_index == "": + met.id += str(index) + met.compartment += str(index) + elif compartment == "e": + met.compartment = "e0" + else: + met.compartment = compartment + str(index) + met.id = name + "_" + met.compartment + new_metabolites.add(met) + if "cpd11416_c" in met.id or "biomass" in met.id: + member_biomasses[org_model.id] = met + # Rename reactions + for ( + rxn + ) in ( + model_util.model.reactions + ): # !!! 
all reactions should have a non-zero compartment index + if rxn.id[0:3] != "EX_": + ## biomass reactions + if re.search("^(bio)(\d+)$", rxn.id): + index = int(re.sub(r"(^bio)", "", rxn.id)) + if biomass_index == 2: + while f"bio{biomass_index}" in model_reaction_ids: + biomass_index += 1 + if index not in biomass_indices and index >= minimal_biomass_index: + biomass_indices.append(index) + else: # biomass indices can be decoupled from the respective reaction indices of the same model + rxn.id = "bio" + str(biomass_index) + if rxn.id not in model_reaction_ids: + biomass_indices.append(biomass_index) + else: + index = minimal_biomass_index + rxn.id = "bio" + str(index) + while ( + rxn.id not in model_reaction_ids + and index not in biomass_indices + ): + index += 1 + rxn.id = "bio" + str(index) + biomass_indices.append(index) + biomass_index += 1 + ## non-biomass reactions + else: + initialID = str(rxn.id) + output = MSModelUtil.parse_id(rxn) + if output is None: + if printing: + print( + f"The {rxn.id} ({output}; {hasattr(rxn, 'compartment')}) is unpredictable." + ) + try: + rxn.id = correct_nonMSID(rxn, (rxn.id, "c"), model_index) + except ValueError: + pass + elif len(output) == 2: + rxn.id = correct_nonMSID(rxn, output, model_index) + elif len(output) == 3: + name, compartment, index = output + if compartment != "e": + rxn.name = f"{name}_{compartment}{model_index}" + rxn_id = re.search(r"(.+\_\w)(?=\d+)", rxn.id).group() + if index == "": + rxn.id += str(model_index) + else: + rxn.id = rxn_id + str(model_index) + finalID = str(rxn.id) + string_diff = "" + for index, let in enumerate(finalID): + if ( + index >= len(initialID) + or index < len(initialID) + and let != initialID[index] + ): + string_diff += let + if string_diff != f"_{compartment}{model_index}" and printing: + print( + f"The ID {initialID} is changed with {string_diff} to create the final ID {finalID}" + ) + new_reactions.add(rxn) + # else: + # # TODO develop a method for compartmentalizing models without editing all reaction IDs or assuming their syntax + # pass + # adds only unique reactions and metabolites to the community model + newmodel = Model( + model_id or "+".join([model.id for model in models]), + name or "+".join([model.name for model in models]), + ) + newmodel.add_reactions(FBAHelper.filter_cobra_set(new_reactions)) + newmodel.add_metabolites(FBAHelper.filter_cobra_set(new_metabolites)) + + # Create community biomass + comm_biomass = Metabolite("cpd11416_c0", None, "Community biomass", 0, "c0") + metabolites = {comm_biomass: 1} + ## constrain the community abundances + if abundances: + abundances = { + met: abundances[memberID] for memberID, met in member_biomasses.items() + } + else: + abundances = { + cpd: -1 / len(member_biomasses) for cpd in member_biomasses.values() + } + ## define community biomass components + metabolites.update(abundances) + comm_biorxn = Reaction(id="bio1", name="bio1", lower_bound=0, upper_bound=1000) + comm_biorxn.add_metabolites(metabolites) + newmodel.add_reactions([comm_biorxn]) + # update model components + newutl = MSModelUtil(newmodel) + newutl.add_objective(comm_biorxn.flux_expression) + newutl.model.add_boundary( + comm_biomass, "sink" + ) # Is a sink reaction for reversible cpd11416_c0 consumption necessary? 
+ ## proportionally limit the fluxes to their abundances + if commkinetics: + add_commkinetics(newutl, models, member_biomasses, abundances) + # add the metadata of community composition + if hasattr(newutl.model, "_context"): + newutl.model._contents.append(member_biomasses) + elif hasattr(newutl.model, "notes"): + newutl.model.notes.update(member_biomasses) + # print([cons.name for cons in newutl.model.constraints]) + return newutl.model + + +def add_commkinetics(util, models, member_biomasses, abundances): + # TODO this creates an error with the member biomass reactions not being identified in the model + coef = {} + for model in models: + coef[member_biomasses[model.id]] = -abundances[member_biomasses[model.id]] + for rxn in model.reactions: + if rxn.id[:3] == "rxn": + coef[rxn.forward_variable] = coef[rxn.reverse_variable] = 1 + util.create_constraint( + Constraint(Zero, name="member_flux_limit"), coef=coef, printing=True + ) + + +def phenotypes(community_members, phenotype_flux_threshold=0.1, solver: str = "glpk"): + # log information of each respective model + models = OrderedDict() + solutions = [] + media_conc = set() + # calculate all phenotype profiles for all members + comm_members = community_members.copy() + # print(community_members) + for ( + org_model, + content, + ) in ( + community_members.items() + ): # community_members excludes the stationary phenotype + print("\n", org_model.id) + org_model.solver = solver + all_phenotypes = "phenotypes" not in content + model_util = MSModelUtil(org_model, True) + if "org_coef" not in locals(): + org_coef = { + model_util.model.reactions.get_by_id( + "EX_cpd00007_e0" + ).reverse_variable: -1 + } + model_util.standard_exchanges() + models[org_model.id] = { + "exchanges": model_util.exchange_list(), + "solutions": {}, + "name": content["name"], + } + phenotypes = ( + { + met.name: {"consumed": met.id.replace("EX_", "").replace("_e0", "")} + for met in model_util.carbon_exchange_mets_list(include_unknown=False) + } + if all_phenotypes + else content["phenotypes"] + ) + # print(phenotypes) + models[org_model.id]["phenotypes"] = ["stationary"] + [ + content["phenotypes"].keys() for member, content in comm_members.items() + ] + phenoRXNs = [ + pheno_cpd + for pheno, pheno_cpds in content["phenotypes"].items() + for pheno_cpd in pheno_cpds["consumed"] + ] + media = {cpd: 100 for cpd, flux in model_util.model.medium.items()} + # TODO correct or remove the media, since it seems to be overwritten by the optimization of all carbon exchanges + ### eliminate hydrogen absorption + media.update({"EX_cpd11640_e0": 0}) + past_phenoRXNs = [] + for name, phenoCPDs in phenotypes.items(): + pheno_util = MSModelUtil(model_util.model, True) + metID = phenoCPDs["consumed"][0] + try: + phenoRXN = pheno_util.model.reactions.get_by_id(f"EX_{metID}_e0") + if past_phenoRXNs: + del media[past_phenoRXNs[-1]] + except Exception as e: + print(e, f"\nEX_{metID}_e0 is not in the model {org_model.id}") + continue + media.update({phenoRXN.id: 100}) + pheno_util.add_medium(media) + print(phenoRXN.id) + pheno_util.model.solver = solver + ### define an oxygen absorption relative to the phenotype carbon source + # O2_consumption: EX_cpd00007_e0 <= phenotype carbon source # formerly <= 2 * sum(primary carbon fluxes) + coef = org_coef.copy() + coef.update({phenoRXN.reverse_variable: 1}) + pheno_util.create_constraint( + Constraint(Zero, lb=0, ub=None, name="EX_cpd00007_e0_limitation"), + coef=coef, + ) + + ## minimize the influx of all carbonaceous exchanges, mostly 
non-phenotype compounds, at a fixed biomass growth + min_growth = float(1) # arbitrarily assigned minimal growth + pheno_util.add_minimal_objective_cons(min_growth) + phenoRXN.upper_bound = 0 + for ex in pheno_util.carbon_exchange_list(): + exMet = ex.id.replace("EX_", "").replace("_e0", "") + if exMet in phenoRXNs and exMet != metID: + ex.lower_bound = 0 + # print(f"The new bounds of {exMet} exchange are: {ex.bounds}") + pheno_util.add_objective( + Zero, + "min", + coef={ + ex.reverse_variable: 1000 if ex.id != phenoRXN.id else 1 + for ex in pheno_util.carbon_exchange_list() + }, + ) + # export_lp(pheno_util.model, f"minimize_cInFlux_{phenoRXN.id}") + sol = pheno_util.model.optimize() + if sol.status != "optimal": + pheno_util.model.remove_cons_vars(["EX_cpd00007_e0_limitation"]) + coef.update({phenoRXN.reverse_variable: 5}) + pheno_util.create_constraint( + Constraint(Zero, lb=0, ub=None, name="EX_cpd00007_e0_limitation"), + coef=coef, + ) + sol = pheno_util.model.optimize() + bioFlux_check(pheno_util.model, sol) + ### limit maximum consumption to the values from the previous minimization + for ex in pheno_util.carbon_exchange_list(): + #### (limiting the reverse_variable is more restrictive than the net flux variable) + if ex.id != phenoRXN.id: + ex.reverse_variable.ub = abs(min(0, sol.fluxes[ex.id])) + + ## maximize the phenotype yield with the previously defined growth and constraints + pheno_util.add_objective(phenoRXN.reverse_variable, "min") + # export_lp(pheno_util.model, f"maximize_phenoYield_{phenoRXN.id}") + pheno_sol = pheno_util.model.optimize() + bioFlux_check(pheno_util.model, pheno_sol) + pheno_influx = pheno_sol.fluxes[phenoRXN.id] + if pheno_influx >= 0: + if not all_phenotypes: + print( + f"The phenotype carbon source has a flux of {pheno_sol.fluxes[phenoRXN.id]}." + ) + pprint( + { + rxn: flux + for rxn, flux in pheno_sol.fluxes.items() + if flux != 0 + } + ) + # TODO gapfill the model in media the non-functioning carbon source + raise NoFluxError( + f"The (+) net flux of {pheno_influx} for the {phenoRXN.id} phenotype" + f" indicates that it is an implausible phenotype." + ) + print( + f"NoFluxError: The (+) net flux of {pheno_influx} for the {phenoRXN.id}" + " phenotype indicates that it is an implausible phenotype." 
+ ) + continue + phenoRXN.lower_bound = phenoRXN.upper_bound = pheno_influx + + ## maximize excretion of all potential carbon byproducts whose #C's < phenotype source #C's + phenotype_source_carbons = FBAHelper.rxn_mets_list(phenoRXN)[0].elements[ + "C" + ] + minimum_fluxes = {} + for carbon_source in pheno_util.carbon_exchange_list(include_unknown=False): + if ( + 0 + < FBAHelper.rxn_mets_list(carbon_source)[0].elements["C"] + < phenotype_source_carbons + ): + pheno_util.add_objective(carbon_source.flux_expression, "max") + minObj = pheno_util.model.slim_optimize() + # print(carbon_source.reaction, "\t", carbon_source.flux_expression, "\t", minObj) + if minObj > phenotype_flux_threshold: + minimum_fluxes[carbon_source.id] = minObj + # TODO limit the possible excreted compounds to only those that are defined in the media + excreted_compounds = list( + [exID for exID in minimum_fluxes.keys() if exID != "EX_cpd00011_e0"] + ) + # minimum_fluxes_df = DataFrame(data=list(minimum_fluxes.values()), index=excreted_compounds, columns=["min_flux"]) + # max_excretion_cpd = minimum_fluxes_df["minimum"].idxmin() + ### optimize the excretion of the discovered phenotype excreta + if "excreted" in phenoCPDs: + phenoCPDs["excreted"] = [ + f"EX_{cpd}_e0" for cpd in phenoCPDs["excreted"] + ] + phenoCPDs["excreted"].extend(excreted_compounds) + else: + phenoCPDs["excreted"] = excreted_compounds + pheno_excreta = [ + pheno_util.model.reactions.get_by_id(excreta) + for excreta in phenoCPDs["excreted"] + ] + pheno_util.add_objective( + sum([ex.flux_expression for ex in pheno_excreta]), "max" + ) + # export_lp(pheno_util.model, "maximize_excreta") + sol = pheno_util.model.optimize() + bioFlux_check(pheno_util.model, sol) + for ex in pheno_excreta: + ex.lower_bound = ex.upper_bound = sol.fluxes[ex.id] + + ## minimize flux of the total simulation flux through pFBA + # TODO discover why some phenotypes are infeasible with pFBA + try: + pheno_sol = pfba(pheno_util.model) + # pheno_util.add_objective(sum([rxn.flux_expression for rxn in pheno_util.e]), "min") + # pheno_sol = pheno_util.model.optimize() + except Exception as e: + print( + f"The {phenoRXN.id} phenotype of the {pheno_util.model} model is " + f"unable to be simulated with pFBA and yields a < {e} > error." + ) + sol_dict = FBAHelper.solution_to_variables_dict(pheno_sol, pheno_util.model) + simulated_growth = sum( + [ + flux + for var, flux in sol_dict.items() + if re.search(r"(^bio\d+$)", var.name) + ] + ) + if not isclose(simulated_growth, min_growth): + display( + [ + (rxn, flux) + for rxn, flux in pheno_sol.fluxes.items() + if "EX_" in rxn and flux != 0 + ] + ) + raise ObjectiveError( + f"The assigned minimal_growth of {min_growth} was not optimized" + f" during the simulation, where the observed growth was {simulated_growth}." 
+ ) + + ## store solution fluxes and update the community_members phenotypes + met_name = strip_comp(name).replace(" ", "-") + col = content["name"] + "_" + met_name + models[pheno_util.model.id]["solutions"][col] = pheno_sol + solutions.append( + models[pheno_util.model.id]["solutions"][col].objective_value + ) + met_name = met_name.replace("_", "-").replace("~", "-") + if all_phenotypes: + if "phenotypes" not in comm_members[org_model]: + comm_members[org_model]["phenotypes"] = { + met_name: {"consumed": [strip_comp(metID)]} + } + if met_name not in comm_members[org_model]["phenotypes"]: + comm_members[org_model]["phenotypes"].update( + {met_name: {"consumed": [strip_comp(metID)]}} + ) + else: + comm_members[org_model]["phenotypes"][met_name]["consumed"] = [ + strip_comp(metID) + ] + met_pheno = content["phenotypes"][met_name] + if ( + "excreted" in met_pheno + and strip_comp(metID) in met_pheno["excreted"] + ): + comm_members[org_model]["phenotypes"][met_name].update( + {"excreted": met_pheno} + ) + past_phenoRXNs.append(phenoRXN.id) + + # construct the parsed table of all exchange fluxes for each phenotype + cols = {} + ## biomass row + cols["rxn"] = ["bio"] + for content in models.values(): + for col in content["solutions"]: + cols[col] = [0] + if col not in content["solutions"]: + continue + bio_rxns = [x for x in content["solutions"][col].fluxes.index if "bio" in x] + flux = mean( + [ + content["solutions"][col].fluxes[rxn] + for rxn in bio_rxns + if content["solutions"][col].fluxes[rxn] != 0 + ] + ) + cols[col] = [flux] + ## exchange reactions rows + looped_cols = cols.copy() + looped_cols.pop("rxn") + for content in models.values(): + for ex_rxn in content["exchanges"]: + cols["rxn"].append(ex_rxn.id) + for col in looped_cols: + ### reactions that are not present in the columns are ignored + flux = ( + 0 + if ( + col not in content["solutions"] + or ex_rxn.id not in list(content["solutions"][col].fluxes.index) + ) + else content["solutions"][col].fluxes[ex_rxn.id] + ) + cols[col].append(flux) + ## construct the DataFrame + fluxes_df = DataFrame(data=cols) + fluxes_df.index = fluxes_df["rxn"] + fluxes_df.drop("rxn", axis=1, inplace=True) + fluxes_df = fluxes_df.groupby(fluxes_df.index).sum() + fluxes_df = fluxes_df.loc[(fluxes_df != 0).any(axis=1)] + fluxes_df.astype(str) + # fluxes_df.to_csv("fluxes.csv") + return fluxes_df, comm_members diff --git a/modelseedpy/community/commkineticpkg.py b/modelseedpy/community/commkineticpkg.py index 81e1479e..c84a122c 100644 --- a/modelseedpy/community/commkineticpkg.py +++ b/modelseedpy/community/commkineticpkg.py @@ -30,7 +30,8 @@ def build_package(self, kinetic_coef, community_model=None): def build_constraint(self, species): coef = { - species.biomasses[0].forward_variable: -1 * self.parameters["kinetic_coef"] + species.biomasses[0].forward_variable: -1 * self.parameters["kinetic_coef"], + species.biomasses[0].reverse_variable: self.parameters["kinetic_coef"] } for reaction in self.model.reactions: if ( diff --git a/modelseedpy/community/commphitting.py b/modelseedpy/community/commphitting.py new file mode 100644 index 00000000..939a5b21 --- /dev/null +++ b/modelseedpy/community/commphitting.py @@ -0,0 +1,2537 @@ +# -*- coding: utf-8 -*- +# from modelseedpy.fbapkg.mspackagemanager import MSPackageManager +from modelseedpy.core.exceptions import ( + FeasibilityError, + ParameterError, + ObjectAlreadyDefinedError, + NoFluxError, +) +from modelseedpy.core.optlanghelper import ( + OptlangHelper, + Bounds, + tupVariable, + tupConstraint, + 
tupObjective, + isIterable, + define_term, +) +from modelseedpy.community.datastandardization import GrowthData +from modelseedpy.core.fbahelper import FBAHelper +from modelseedpy.biochem import from_local +from scipy.constants import hour, minute +from zipfile import ZipFile, ZIP_LZMA +from optlang import Model, Objective +from time import sleep, process_time +from typing import Union, Iterable +from optlang.symbolics import Zero +from scipy.optimize import newton +from matplotlib import pyplot +from math import inf, isclose +from deepdiff import DeepDiff +from pandas import DataFrame +from itertools import chain +from pprint import pprint +from h5py import File +import numpy as np +import cobra.io + +# from cplex import Cplex +import warnings, logging, json, os, re + +logger = logging.getLogger(__name__) + + +def dict_keys_exists(dic, *keys): + result = keys[0] in dic + if keys[0] in dic: + remainingKeys = keys[1:] + if len(remainingKeys) > 0: + result = dict_keys_exists(dic[keys[0]], *remainingKeys) + return result + return result + + +def find_dic_number(dic): + for k, v in dic.items(): + if FBAHelper.isnumber(v): + return v + num = find_dic_number(dic[k]) + return num + + +def trial_contents(short_code, indices_tup, values): + matches = [ele == short_code for ele in indices_tup] + return np.array(values)[matches] + + +def dic_keys(dic): + keys = [] + if isinstance(dic, dict): + for key, value in dic.items(): + keys.append(key) + keys.extend(dic_keys(value)) + return keys + + +# define data objects +def _name(name, suffix, short_code, timestep, names): + name = "-".join( + [x for x in list(map(str, [name + suffix, short_code, timestep])) if x] + ) + if name not in names: + names.append(name) + return name + else: + pprint(names) + raise ObjectAlreadyDefinedError( + f"The object {name} is already defined for the problem." 
+ ) + + +def _export_model_json(json_model, path): + with open(path, "w") as lp: + json.dump(json_model, lp, indent=3) + + +def _met_id_parser(met): + met_id = re.sub("(\_\w\d+)", "", met) + met_id = met_id.replace("EX_", "", 1) + met_id = met_id.replace("c_", "", 1) + return met_id + + +# define an entity as a variable or a constant +def _obj_val( + primal, name, pheno, short_code, timestep, bounds, data_timestep_hr, names +): + time_hr = int(timestep) * data_timestep_hr + return ( + tupVariable(_name(name, pheno, short_code, timestep, names), Bounds=bounds) + if not primal + else primal[short_code][name + pheno][time_hr] + ) + + +def _michaelis_menten(conc, vmax, km): + return (conc * vmax) / (km + conc) + + +def clamp(val, minimum, maximum): + return min(max(val, minimum), maximum) + + +# parse primal values for use in the optimization loops +def parse_primals(primal_values, entity_labels=None, coefs=None, kcat_vals=None): + if kcat_vals: + kcat_primal = {} + for trial, content in primal_values.items(): + for primal, time_value in content.items(): + if "bin" not in primal: + continue + name, trial = primal.split("-") + number = re.search(r"(\d)", name).group() + species, pheno = re.sub(r"(bin\d_)", "", name).split("_") + if "stationary" in pheno: + continue + if species not in kcat_primal: + kcat_primal[species] = {} + if pheno not in kcat_primal[species]: + kcat_primal[species][pheno] = 0 + # kcat_(k,new) = sum_z^Z ( kcat_z * bin_k^z ) * kcat_(k,old) < 10 + if time_value == 0 and kcat_primal[species][pheno] < 10: + kcat_primal[species][pheno] += ( + coefs[int(number) - 1] * kcat_vals[species][pheno] + ) + kcat_primal[species][pheno] = clamp( + kcat_primal[species][pheno], 1e-4, 10 + ) + return kcat_primal + select_primals = {} + for trial, entities in primal_values.items(): + select_primals[trial] = {} + for entity, times in entities.items(): + # a poor man's dictionary copy + if any([label in entity for label in entity_labels]): + select_primals[trial][entity] = dict(list(times.items())) + return select_primals + + +def signal_species(signal): + return signal.split(":")[0].replace(" ", "_") + + +def _partition_coefs(initial_val, divisor): + return ( + initial_val, + initial_val / divisor, + initial_val / divisor**2, + initial_val / divisor**3, + initial_val / divisor**4, + ) + + +biomass_partition_coefs = [ + _partition_coefs(10, 10), + _partition_coefs(2, 2), + _partition_coefs(1, 3), +] + + +class CommPhitting: + + def __init__( + self, + msdb_path, + community_members: dict = None, + fluxes_df=None, + data_df=None, + carbon_conc=None, + media_conc=None, + experimental_metadata=None, + base_media=None, + solver: str = "glpk", + all_phenotypes=True, + data_paths: dict = None, + species_abundances: str = None, + ignore_trials: Union[dict, list] = None, + ignore_timesteps: list = None, + species_identities_rows=None, + significant_deviation: float = 2, + extract_zip_path: str = None, + determine_requisite_biomass: bool = True, + consumed_mets: iter = None, + ): + self.msdb = from_local(msdb_path) + self.msdb_path = msdb_path + self.solver = solver + self.all_phenotypes = all_phenotypes + self.data_paths = data_paths + self.species_abundances = species_abundances + self.ignore_trials = ignore_trials + self.ignore_timesteps = ignore_timesteps + self.species_identities_rows = species_identities_rows + self.significant_deviation = significant_deviation + self.extract_zip_path = extract_zip_path + + self.community_members = community_members + self.consumed_mets = consumed_mets or set( + [ 
+                met
+                for content in community_members.values()
+                for met in content["phenotypes"]
+            ]
+        )
+        if community_members is not None or any(
+            [x is None for x in [fluxes_df, data_df]]
+        ):
+            (
+                self.experimental_metadata,
+                data_df,
+                fluxes_df,
+                carbon_conc,
+                self.requisite_biomass,
+                self.trial_name_conversion,
+                self.data_timestep_hr,
+                simulation_timestep,
+                media_conc,
+            ) = GrowthData.process(
+                community_members, base_media, solver, all_phenotypes,
+                data_paths, species_abundances, carbon_conc, ignore_trials,
+                ignore_timesteps, species_identities_rows, significant_deviation,
+                extract_zip_path, determine_requisite_biomass,
+            )
+        self.fluxes_tup = FBAHelper.parse_df(fluxes_df)
+        self.fluxes_df = fluxes_df
+        self.data_df = data_df
+        self.default_excreta = [
+            index for index, row in fluxes_df.iterrows() if any(row > 1)
+        ]
+        self.parameters, self.variables, self.constraints = {}, {}, {}
+        self.zipped_output, self.plots, self.names = [], [], []
+        # do not clobber the metadata computed by GrowthData.process() with the default None
+        self.experimental_metadata = experimental_metadata or getattr(
+            self, "experimental_metadata", None
+        )
+        self.carbon_conc = carbon_conc
+        self.media_conc = media_conc
+
+    #################### FITTING PHASE METHODS ####################
+
+    def fit_kcat(
+        self,
+        parameters: dict = None,
+        mets_to_track: list = None,
+        rel_final_conc: dict = None,
+        zero_start: list = None,
+        abs_final_conc: dict = None,
+        graphs: list = None,
+        data_timesteps: dict = None,
+        export_zip_name: str = None,
+        export_parameters: bool = True,
+        requisite_biomass: dict = None,
+        export_lp: str = "solveKcat.lp",
+        figures_zip_name: str = None,
+        publishing=True,
+        primals_export_path=None,
+    ):
+        if export_zip_name and os.path.exists(export_zip_name):
+            os.remove(export_zip_name)
+        kcat_primal = None
+        requisite_biomass = requisite_biomass or self.requisite_biomass
+        for index, coefs in enumerate(biomass_partition_coefs):
+            # solve for growth rate constants with the previously solved biomasses
+            newSim = CommPhitting(
+                self.msdb_path, None, self.fluxes_df, self.data_df,
+                self.carbon_conc, self.media_conc, self.experimental_metadata,
+                None, self.solver, self.all_phenotypes, self.data_paths,
+                self.species_abundances, self.ignore_trials, self.ignore_timesteps,
+                self.species_identities_rows, self.significant_deviation,
+                self.extract_zip_path, True, self.consumed_mets,
+            )
+            newSim.define_problem(
+                parameters, mets_to_track, rel_final_conc, zero_start,
+                abs_final_conc, data_timesteps, export_zip_name,
+                export_parameters, export_lp, kcat_primal, coefs,
+                requisite_biomass,
+            )
+            newSim.compute(
+                graphs, export_zip_name, figures_zip_name, publishing,
+                primals_export_path or re.sub(r"(\.lp)", ".json", export_lp),
+            )
+            kcat_primal = parse_primals(
+                newSim.values, coefs=coefs, kcat_vals=newSim.parameters["kcat"]
+            )
+            pprint(kcat_primal)
+            print(f"Iteration {index + 1} is complete\n")
+        kcats = {k: val for k, val in newSim.values.items() if "kcat" in k}
+        DataFrame(kcats).T.to_csv("pheno_growth_kcat.tsv", sep="\t")
+        return kcats
+
+    def fit(
+        self,
+        parameters: dict = None,
+        mets_to_track: list = None,
+        rel_final_conc: dict = None,
+        zero_start: list = None,
+        abs_final_conc: dict = None,
+        graphs: list = None,
+        data_timesteps: dict = None,
+        export_zip_name: str = None,
+        export_parameters: bool = True,
+        requisite_biomass: dict = None,
+        export_lp: str = "CommPhitting.lp",
+        figures_zip_name: str = None,
+        publishing: bool = False,
+        primals_export_path=None,
+    ):
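        # For orientation, a hypothetical end-to-end invocation of the fitting
        # pipeline (a sketch, not part of this changeset; the database path and
        # the member dictionary are placeholders):
        #
        #     cp = CommPhitting("/path/to/ModelSEEDDatabase", community_members=members,
        #                       data_paths={"path": "experimental_data"})
        #     kcats = cp.fit_kcat(export_lp="solveKcat.lp")  # staged kcat refinement
        #     cp.fit(graphs=graphs, publishing=True)         # single final fit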
if hasattr(self, "requisite_biomass"): + requisite_biomass = self.requisite_biomass + self.define_problem( + parameters, + mets_to_track, + rel_final_conc, + zero_start, + abs_final_conc, + data_timesteps, + export_zip_name, + export_parameters, + export_lp, + None, + None, + requisite_biomass, + ) + self.compute( + graphs, + export_zip_name, + figures_zip_name, + publishing, + primals_export_path or re.sub(r"(.lp)", ".json", export_lp), + ) + + def define_b_vars(self, pheno, short_code, timestep, variables): + self.variables["b_" + pheno][short_code][timestep] = tupVariable( + _name("b_", pheno, short_code, timestep, self.names), Bounds(0, 1000) + ) + self.variables["b1_" + pheno][short_code][timestep] = tupVariable( + _name("b1_", pheno, short_code, timestep, self.names), Bounds(0, 1000) + ) + self.variables["b2_" + pheno][short_code][timestep] = tupVariable( + _name("b2_", pheno, short_code, timestep, self.names), Bounds(0, 1000) + ) + self.variables["b3_" + pheno][short_code][timestep] = tupVariable( + _name("b3_", pheno, short_code, timestep, self.names), Bounds(0, 1000) + ) + self.variables["b4_" + pheno][short_code][timestep] = tupVariable( + _name("b4_", pheno, short_code, timestep, self.names), Bounds(0, 1000) + ) + self.variables["b5_" + pheno][short_code][timestep] = tupVariable( + _name("b5_", pheno, short_code, timestep, self.names), Bounds(0, 1000) + ) + variables.extend( + [ + self.variables["b_" + pheno][short_code][timestep], + self.variables["b1_" + pheno][short_code][timestep], + self.variables["b2_" + pheno][short_code][timestep], + self.variables["b3_" + pheno][short_code][timestep], + self.variables["b4_" + pheno][short_code][timestep], + self.variables["b5_" + pheno][short_code][timestep], + ] + ) + if short_code not in self.variables[f"bin1_{pheno}"]: + self.variables[f"bin1_{pheno}"][short_code] = tupVariable( + _name("bin1_", pheno, short_code, "", self.names), + Bounds(0, 1), + "binary", + ) + self.variables[f"bin2_{pheno}"][short_code] = tupVariable( + _name("bin2_", pheno, short_code, "", self.names), + Bounds(0, 1), + "binary", + ) + self.variables[f"bin3_{pheno}"][short_code] = tupVariable( + _name("bin3_", pheno, short_code, "", self.names), + Bounds(0, 1), + "binary", + ) + self.variables[f"bin4_{pheno}"][short_code] = tupVariable( + _name("bin4_", pheno, short_code, "", self.names), + Bounds(0, 1), + "binary", + ) + self.variables[f"bin5_{pheno}"][short_code] = tupVariable( + _name("bin5_", pheno, short_code, "", self.names), + Bounds(0, 1), + "binary", + ) + variables.extend( + [ + self.variables[f"bin1_{pheno}"][short_code], + self.variables[f"bin2_{pheno}"][short_code], + self.variables[f"bin3_{pheno}"][short_code], + self.variables[f"bin4_{pheno}"][short_code], + self.variables[f"bin5_{pheno}"][short_code], + ] + ) + return variables + + def define_b_cons(self, pheno, short_code, timestep, biomass_coefs): + biomass_coefs = biomass_coefs or biomass_partition_coefs[-1] + # define the partitioned biomass groups + ## b_n{pheno,t} <= coef*b_tot{pheno,t} + self.constraints["b1c_" + pheno][short_code][timestep] = tupConstraint( + _name("b1c_", pheno, short_code, timestep, self.names), + Bounds(0, None), + { + "elements": [ + { + "elements": [ + biomass_coefs[0], + self.variables["b_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1, + self.variables["b1_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + self.constraints["b2c_" + pheno][short_code][timestep] 
= tupConstraint( + _name("b2c_", pheno, short_code, timestep, self.names), + Bounds(0, None), + { + "elements": [ + { + "elements": [ + biomass_coefs[1], + self.variables["b_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1, + self.variables["b2_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + self.constraints["b3c_" + pheno][short_code][timestep] = tupConstraint( + _name("b3c_", pheno, short_code, timestep, self.names), + Bounds(0, None), + { + "elements": [ + { + "elements": [ + biomass_coefs[2], + self.variables["b_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1, + self.variables["b3_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + self.constraints["b4c_" + pheno][short_code][timestep] = tupConstraint( + _name("b4c_", pheno, short_code, timestep, self.names), + Bounds(0, None), + { + "elements": [ + { + "elements": [ + biomass_coefs[3], + self.variables["b_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1, + self.variables["b4_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + self.constraints["b5c_" + pheno][short_code][timestep] = tupConstraint( + _name("b5c_", pheno, short_code, timestep, self.names), + Bounds(0, None), + { + "elements": [ + { + "elements": [ + biomass_coefs[4], + self.variables["b_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1, + self.variables["b5_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + + # define the comprehensive biomass constraints + ## coef*b{pheno,t} - b_n{pheno,t} - 1000*bin_n{pheno} <= 0 + self.constraints["b1c_control_" + pheno][short_code][timestep] = tupConstraint( + _name("b1c_control_", pheno, short_code, timestep, self.names), + Bounds(None, 0), + { + "elements": [ + { + "elements": [ + biomass_coefs[0], + self.variables["b_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1, + self.variables["b1_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1000, + self.variables[f"bin1_{pheno}"][short_code].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + self.constraints["b2c_control_" + pheno][short_code][timestep] = tupConstraint( + _name("b2c_control_", pheno, short_code, timestep, self.names), + Bounds(None, 0), + { + "elements": [ + { + "elements": [ + biomass_coefs[1], + self.variables["b_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1, + self.variables["b2_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1000, + self.variables[f"bin2_{pheno}"][short_code].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + self.constraints["b3c_control_" + pheno][short_code][timestep] = tupConstraint( + _name("b3c_control_", pheno, short_code, timestep, self.names), + Bounds(None, 0), + { + "elements": [ + { + "elements": [ + biomass_coefs[2], + self.variables["b_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1, + self.variables["b3_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1000, + 
self.variables[f"bin3_{pheno}"][short_code].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + self.constraints["b4c_control_" + pheno][short_code][timestep] = tupConstraint( + _name("b4c_control_", pheno, short_code, timestep, self.names), + Bounds(None, 0), + { + "elements": [ + { + "elements": [ + biomass_coefs[3], + self.variables["b_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1, + self.variables["b4_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1000, + self.variables[f"bin4_{pheno}"][short_code].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + self.constraints["b5c_control_" + pheno][short_code][timestep] = tupConstraint( + _name("b5c_control_", pheno, short_code, timestep, self.names), + Bounds(None, 0), + { + "elements": [ + { + "elements": [ + biomass_coefs[4], + self.variables["b_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1, + self.variables["b5_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1000, + self.variables[f"bin5_{pheno}"][short_code].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + + # define the binary constraints + ## b_n{pheno,t} <= 1000 - 1000*bin_n{pheno} + self.constraints["bin1c_" + pheno][short_code][timestep] = tupConstraint( + _name("bin1c_", pheno, short_code, timestep, self.names), + Bounds(0, None), + { + "elements": [ + 1000, + { + "elements": [ + -1, + self.variables["b1_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1000, + self.variables[f"bin1_{pheno}"][short_code].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + self.constraints["bin2c_" + pheno][short_code][timestep] = tupConstraint( + _name("bin2c_", pheno, short_code, timestep, self.names), + Bounds(0, None), + { + "elements": [ + 1000, + { + "elements": [ + -1, + self.variables["b2_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1000, + self.variables[f"bin2_{pheno}"][short_code].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + self.constraints["bin3c_" + pheno][short_code][timestep] = tupConstraint( + _name("bin3c_", pheno, short_code, timestep, self.names), + Bounds(0, None), + { + "elements": [ + 1000, + { + "elements": [ + -1, + self.variables["b3_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1000, + self.variables[f"bin3_{pheno}"][short_code].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + self.constraints["bin4c_" + pheno][short_code][timestep] = tupConstraint( + _name("bin4c_", pheno, short_code, timestep, self.names), + Bounds(0, None), + { + "elements": [ + 1000, + { + "elements": [ + -1, + self.variables["b4_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1000, + self.variables[f"bin4_{pheno}"][short_code].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + self.constraints["bin5c_" + pheno][short_code][timestep] = tupConstraint( + _name("bin5c_", pheno, short_code, timestep, self.names), + Bounds(0, None), + { + "elements": [ + 1000, + { + "elements": [ + -1, + self.variables["b5_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1000, + self.variables[f"bin5_{pheno}"][short_code].name, + 
], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + + # load the constraints to the model + return [ + self.constraints["b1c_" + pheno][short_code][timestep], + self.constraints["b2c_" + pheno][short_code][timestep], + self.constraints["b3c_" + pheno][short_code][timestep], + self.constraints["b4c_" + pheno][short_code][timestep], + self.constraints["b5c_" + pheno][short_code][timestep], + self.constraints["b1c_control_" + pheno][short_code][timestep], + self.constraints["b2c_control_" + pheno][short_code][timestep], + self.constraints["b3c_control_" + pheno][short_code][timestep], + self.constraints["b4c_control_" + pheno][short_code][timestep], + self.constraints["b5c_control_" + pheno][short_code][timestep], + self.constraints["bin1c_" + pheno][short_code][timestep], + self.constraints["bin2c_" + pheno][short_code][timestep], + self.constraints["bin3c_" + pheno][short_code][timestep], + self.constraints["bin4c_" + pheno][short_code][timestep], + self.constraints["bin5c_" + pheno][short_code][timestep], + ] + + def initialize_vars_cons(self, pheno, short_code): + # cvt and cvf + self.variables["cvt_" + pheno] = {} + self.variables["cvf_" + pheno] = {} + self.variables["cvt_" + pheno][short_code] = {} + self.variables["cvf_" + pheno][short_code] = {} + # total biomass and growth + self.variables["b_" + pheno] = {} + self.variables["g_" + pheno] = {} + self.variables["b_" + pheno][short_code] = {} + self.variables["g_" + pheno][short_code] = {} + self.constraints["gc_" + pheno] = {} + self.constraints["cvc_" + pheno] = {} + self.constraints["gc_" + pheno][short_code] = {} + self.constraints["cvc_" + pheno][short_code] = {} + # partitioned biomasses + self.variables["b1_" + pheno] = {} + self.variables["b2_" + pheno] = {} + self.variables["b3_" + pheno] = {} + self.variables["b4_" + pheno] = {} + self.variables["b5_" + pheno] = {} + self.variables["b1_" + pheno][short_code] = {} + self.variables["b2_" + pheno][short_code] = {} + self.variables["b3_" + pheno][short_code] = {} + self.variables["b4_" + pheno][short_code] = {} + self.variables["b5_" + pheno][short_code] = {} + ## biomass binary variables + self.variables[f"bin1_{pheno}"] = {} + self.variables[f"bin2_{pheno}"] = {} + self.variables[f"bin3_{pheno}"] = {} + self.variables[f"bin4_{pheno}"] = {} + self.variables[f"bin5_{pheno}"] = {} + self.variables[f"bin1_{pheno}"][short_code] = {} + self.variables[f"bin2_{pheno}"][short_code] = {} + self.variables[f"bin3_{pheno}"][short_code] = {} + self.variables[f"bin4_{pheno}"][short_code] = {} + self.variables[f"bin5_{pheno}"][short_code] = {} + ## biomass partition constraints + self.constraints["b1c_" + pheno] = {} + self.constraints["b2c_" + pheno] = {} + self.constraints["b3c_" + pheno] = {} + self.constraints["b4c_" + pheno] = {} + self.constraints["b5c_" + pheno] = {} + self.constraints["b1c_" + pheno][short_code] = {} + self.constraints["b2c_" + pheno][short_code] = {} + self.constraints["b3c_" + pheno][short_code] = {} + self.constraints["b4c_" + pheno][short_code] = {} + self.constraints["b5c_" + pheno][short_code] = {} + self.constraints["b1c_control_" + pheno] = {} + self.constraints["b2c_control_" + pheno] = {} + self.constraints["b3c_control_" + pheno] = {} + self.constraints["b4c_control_" + pheno] = {} + self.constraints["b5c_control_" + pheno] = {} + self.constraints["b1c_control_" + pheno][short_code] = {} + self.constraints["b2c_control_" + pheno][short_code] = {} + self.constraints["b3c_control_" + pheno][short_code] = {} + self.constraints["b4c_control_" + 
pheno][short_code] = {} + self.constraints["b5c_control_" + pheno][short_code] = {} + self.constraints[f"binc_{pheno}"] = {} + self.constraints[f"binc_{pheno}"][short_code] = {} + self.constraints["bin1c_" + pheno] = {} + self.constraints["bin2c_" + pheno] = {} + self.constraints["bin3c_" + pheno] = {} + self.constraints["bin4c_" + pheno] = {} + self.constraints["bin5c_" + pheno] = {} + self.constraints["bin1c_" + pheno][short_code] = {} + self.constraints["bin2c_" + pheno][short_code] = {} + self.constraints["bin3c_" + pheno][short_code] = {} + self.constraints["bin4c_" + pheno][short_code] = {} + self.constraints["bin5c_" + pheno][short_code] = {} + + def get_timestep_bin(self, timestep): + if timestep < self.first: + return 0 + elif timestep < self.second: + return 1 + elif timestep < self.third: + return 2 + elif timestep < self.fourth: + return 3 + return 4 + + def define_problem( + self, + parameters=None, + mets_to_track=None, + rel_final_conc=None, + zero_start=None, + abs_final_conc=None, + data_timesteps=None, + export_zip_name: str = None, + export_parameters: bool = True, + export_lp: str = "CommPhitting.lp", + primal_values=None, + biomass_coefs=None, + requisite_biomass: dict = None, + biolog_simulation=False, + export_phenotype_profiles=True, + ): + # parse the growth data + growth_tup = FBAHelper.parse_df(self.data_df, False) + self.phenotypes = list(self.fluxes_tup.columns) + self.phenotypes.extend( + [ + signal_species(signal) + "_stationary" + for signal in growth_tup.columns + if (":" in signal and "OD" not in signal) + ] + ) + self.species_list = [ + signal_species(signal) for signal in growth_tup.columns if ":" in signal + ] + num_sorted = np.sort(np.array([int(obj[1:]) for obj in set(growth_tup.index)])) + # TODO - short_codes must be distinguished for different conditions + unique_short_codes = [ + f"{growth_tup.index[0][0]}{num}" for num in map(str, num_sorted) + ] + full_times = growth_tup.values[:, growth_tup.columns.index("Time (s)")] + self.times = { + short_code: trial_contents(short_code, growth_tup.index, full_times) + for short_code in unique_short_codes + } + average_time_series = np.mean(list(self.times.values()), axis=0) + points = len(average_time_series) + self.first, self.second, self.third, self.fourth = ( + int(points * 0.1), + int(points * 0.25), + int(points * 0.45), + int(points * 0.7), + ) + self.time_ranges = { + 0: average_time_series[: self.first], + 1: average_time_series[self.first : self.second], + 2: average_time_series[self.second : self.third], + 3: average_time_series[self.third : self.fourth], + 4: average_time_series[self.fourth :], + } + + # define default values + # TODO render bcv and cvmin dependent upon temperature, and possibly trained on Carlson's data + parameters, data_timesteps = parameters or {}, data_timesteps or {} + self.parameters["data_timestep_hr"] = ( + np.mean(np.diff(np.array(list(self.times.values())).flatten())) / hour + if not hasattr(self, "data_timestep_hr") + else self.data_timestep_hr + ) + self.parameters.update( + { + "timestep_hr": self.parameters["data_timestep_hr"], + "cvct": 0.01, + "cvcf": 0.01, + "bcv": 0.01, + "cvmin": 0.01, + "kcat": 0.33, + "diffpos": 1, + "diffneg": 1, # coefficients that weight difference between experimental and predicted biomass + "stationary": 10, # the penalty coefficient for the stationary phenotype + } + ) + self.parameters.update(parameters) + # distribute kcat values to all phenotypes of all species and update from previous simulations where necessary + 
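        # For example, with hypothetical phenotype columns ["ecoli_ac", "ecoli_glc",
        # "pf_ac"] and the default scalar kcat of 0.33, the nested layout that the
        # downstream growth constraints read as parameters["kcat"][species][pheno]
        # would look roughly like (illustrative values only):
        #     {"ecoli": {"ac": 0.33, "glc": 0.33}, "pf": {"ac": 0.33}}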
self.parameters.update( + self._universalize(self.parameters, "kcat", exclude=["stationary"]) + ) + if primal_values is not None: + for species, content in self.parameters["kcat"].items(): + if species not in primal_values: + continue + for pheno, content2 in content.items(): + if pheno not in primal_values[species]: + continue + for time, val in content2.items(): + if time not in primal_values[species][pheno]: + continue + self.parameters["kcat"][species][pheno][time] = val + print(self.parameters["kcat"]) + # define the metabolites that are tracked, exchanged, and not available in the media + # TODO the default zero_start logic appears to be incorrect + self.zero_start = zero_start or [ + met + for met in self.consumed_mets + if (met not in self.carbon_conc or self.carbon_conc[met] == 0) + ] + self.rel_final_conc = rel_final_conc or { + met: 0.1 + for met, concs in self.carbon_conc.items() + if any( + [concs[short_code] > 0 for short_code in self.data_df.index.unique()] + ) + and met not in self.zero_start + } + self.abs_final_conc = abs_final_conc or {} + if mets_to_track: + self.mets_to_track = mets_to_track + elif not isinstance(rel_final_conc, dict): + self.mets_to_track = self.fluxes_tup.index + else: + self.mets_to_track = list(self.rel_final_conc.keys()) + self.zero_start + print(self.mets_to_track) + + ts_to_delete = ( + {} + ) # {short_code: full_times for short_code in unique_short_codes} + if data_timesteps: # {short_code:[times]} + for short_code, times in data_timesteps.items(): + ts_to_delete[short_code] = set(list(range(len(full_times)))) - set( + times + ) + self.times[short_code] = np.delete( + self.times[short_code], list(ts_to_delete[short_code]) + ) + + # construct the problem + objective = tupObjective( + "minimize variance and phenotypic transitions", [], "min" + ) + constraints, variables, simulated_mets = [], [], [] + time_1 = process_time() + for exID in self.fluxes_tup.index: + if exID == "bio": + continue + met_id = re.search(r"(cpd\d{5})", exID).group() + met = self.msdb.compounds.get_by_id(met_id) + if "C" not in met.elements: + continue + concID = f"c_{met_id}_e0" + simulated_mets.append(met_id) + self.variables[concID] = {} + self.constraints["dcc_" + met_id] = {} + + # define the growth rate for each metabolite and concentrations + # TODO the MM parameters may be deletable once the binned kcat method is refined + if "Vmax" and "Km" in self.parameters: + self.parameters["Vmax"].update( + self._universalize(self.parameters["Vmax"], met_id) + ) + self.parameters["Km"].update( + self._universalize(self.parameters["Km"], met_id) + ) + for short_code in unique_short_codes: + self.variables[concID][short_code] = {} + self.constraints["dcc_" + met_id][short_code] = {} + timesteps = list(range(1, len(self.times[short_code]) + 1)) + for timestep in timesteps: + ## define the concentration variables + conc_var = tupVariable( + _name(concID, "", short_code, timestep, self.names) + ) + ## constrain initial time concentrations to the media or a large default + if timestep == timesteps[0]: + initial_val = None + if met_id in self.media_conc: + initial_val = self.media_conc[met_id] + if met_id in self.zero_start: + initial_val = 0 + if dict_keys_exists(self.carbon_conc, met_id, short_code): + initial_val = self.carbon_conc[met_id][short_code] + if initial_val is not None: + conc_var = conc_var._replace( + bounds=Bounds(initial_val, initial_val) + ) + if biolog_simulation: + conc_var = conc_var._replace(bounds=Bounds(1, None)) + ## mandate complete carbon consumption + 
elif timestep == timesteps[-1] and ( + met_id in self.rel_final_conc or met_id in self.abs_final_conc + ): + if met_id in self.rel_final_conc: + final_bound = ( + self.variables[concID][short_code][1].bounds.lb + * self.rel_final_conc[met_id] + ) + if ( + met_id in self.abs_final_conc + ): # this intentionally overwrites rel_final_conc + final_bound = self.abs_final_conc[met_id] + conc_var = conc_var._replace(bounds=Bounds(0, final_bound)) + if met_id in self.zero_start: + conc_var = conc_var._replace( + bounds=Bounds(final_bound, final_bound) + ) + self.variables[concID][short_code][timestep] = conc_var + variables.append(self.variables[concID][short_code][timestep]) + for pheno in self.phenotypes: + self.constraints["dbc_" + pheno] = { + short_code: {} for short_code in unique_short_codes + } + + # define growth and biomass variables and constraints + for pheno in self.phenotypes: + for short_code in unique_short_codes: + self.initialize_vars_cons(pheno, short_code) + timesteps = list(range(1, len(self.times[short_code]) + 1)) + nth_percentile_timestep = timesteps[int(0.90 * len(timesteps))] + penalty_range = np.linspace( + self.parameters["stationary"], + self.parameters["stationary"] / 10, + len(timesteps[nth_percentile_timestep:]), + ) + timestep_excess_count = 0 + for timestep in map(int, timesteps): + variables = self.define_b_vars( + pheno, short_code, timestep, variables + ) + if short_code not in self.constraints[f"binc_{pheno}"]: + self.constraints[f"binc_{pheno}"][short_code] = tupConstraint( + _name("binc_", pheno, short_code, "", self.names), + Bounds(0, 4), + { + "elements": [ + self.variables[f"bin1_{pheno}"][short_code].name, + self.variables[f"bin2_{pheno}"][short_code].name, + self.variables[f"bin3_{pheno}"][short_code].name, + self.variables[f"bin4_{pheno}"][short_code].name, + self.variables[f"bin5_{pheno}"][short_code].name, + ], + "operation": "Add", + }, + ) + constraints.append( + self.constraints[f"binc_{pheno}"][short_code] + ) + constraints.extend( + self.define_b_cons(pheno, short_code, timestep, biomass_coefs) + ) + + ## define the growth rate variable or primal value + species, phenotype = pheno.split("_") + self.variables["g_" + pheno][short_code][timestep] = tupVariable( + _name("g_", pheno, short_code, timestep, self.names) + ) + variables.append(self.variables["g_" + pheno][short_code][timestep]) + + if "stationary" in pheno: + weight = self.parameters["stationary"] + if timestep > nth_percentile_timestep: + weight = penalty_range[timestep_excess_count] + timestep_excess_count += 1 + objective.expr.extend( + [ + { + "elements": [ + { + "elements": [ + weight, + self.variables["b_" + pheno][ + short_code + ][timestep].name, + ], + "operation": "Mul", + } + ], + "operation": "Add", + } + ] + ) + continue + # the conversion rates to and from the stationary phase + self.variables["cvt_" + pheno][short_code][timestep] = tupVariable( + _name("cvt_", pheno, short_code, timestep, self.names), + Bounds(0, 100), + ) + self.variables["cvf_" + pheno][short_code][timestep] = tupVariable( + _name("cvf_", pheno, short_code, timestep, self.names), + Bounds(0, 100), + ) + variables.extend( + [ + self.variables["cvf_" + pheno][short_code][timestep], + self.variables["cvt_" + pheno][short_code][timestep], + ] + ) + + # cvt <= bcv*b_{pheno} + cvmin + self.constraints["cvc_" + pheno][short_code][timestep] = ( + tupConstraint( + _name("cvc_", pheno, short_code, timestep, self.names), + (0, None), + { + "elements": [ + { + "elements": [ + -1, + self.variables["cvt_" + 
pheno][short_code][ + timestep + ].name, + ], + "operation": "Mul", + } + ], + "operation": "Add", + }, + ) + ) + # biomass_term = [self.parameters['bcv']*b_value + self.parameters['cvmin']] if FBAHelper.isnumber(b_value) else [ + biomass_term = [ + self.parameters["cvmin"], + { + "elements": [ + self.parameters["bcv"], + self.variables["b_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + ] + self.constraints["cvc_" + pheno][short_code][timestep].expr[ + "elements" + ].extend(biomass_term) + + # g_{pheno} = b_{pheno}*v_{pheno} + b_values = [ + self.variables["b1_" + pheno][short_code][timestep].name, + self.variables["b2_" + pheno][short_code][timestep].name, + self.variables["b3_" + pheno][short_code][timestep].name, + self.variables["b4_" + pheno][short_code][timestep].name, + self.variables["b5_" + pheno][short_code][timestep].name, + ] + self.constraints["gc_" + pheno][short_code][timestep] = ( + tupConstraint( + name=_name("gc_", pheno, short_code, timestep, self.names), + expr={ + "elements": [ + *[ + { + "elements": [ + -self.parameters["kcat"][species][ + phenotype + ], + b, + ], + "operation": "Mul", + } + for b in b_values + ], + self.variables["g_" + pheno][short_code][ + timestep + ].name, + ], + "operation": "Add", + }, + ) + ) + + constraints.extend( + [ + self.constraints["cvc_" + pheno][short_code][timestep], + self.constraints["gc_" + pheno][short_code][timestep], + ] + ) + # self.constraints["binTot_" + pheno][short_code]]) + + # define the concentration constraint + half_dt = self.parameters["data_timestep_hr"] / 2 + time_2 = process_time() + print( + f"Done with concentrations and biomass loops: {(time_2 - time_1) / 60} min" + ) + for r_index, met in enumerate(self.fluxes_tup.index): + met_id = _met_id_parser(met) + if met_id not in simulated_mets: + continue + concID = f"c_{met_id}_e0" + for short_code in unique_short_codes: + timesteps = list(range(1, len(self.times[short_code]) + 1)) + for timestep in timesteps[:-1]: + # c_{met} + dt/2*sum_k^K(n_{k,met} * (g_{pheno}+g+1_{pheno})) = c+1_{met} + next_timestep = timestep + 1 + growth_phenos = [ + [ + self.variables["g_" + pheno][short_code][ + next_timestep + ].name, + self.variables["g_" + pheno][short_code][timestep].name, + ] + for pheno in self.fluxes_tup.columns + ] + self.constraints["dcc_" + met_id][short_code][timestep] = ( + tupConstraint( + name=_name( + "dcc_", met_id, short_code, timestep, self.names + ), + expr={ + "elements": [ + self.variables[concID][short_code][timestep].name, + { + "elements": [ + -1, + self.variables[concID][short_code][ + next_timestep + ].name, + ], + "operation": "Mul", + }, + *OptlangHelper.dot_product( + growth_phenos, + heuns_coefs=half_dt + * self.fluxes_tup.values[r_index], + ), + ], + "operation": "Add", + }, + ) + ) + constraints.append( + self.constraints["dcc_" + met_id][short_code][timestep] + ) + + # define the conversion variables of every signal for every phenotype + # for signal in growth_tup.columns[2:]: + # for pheno in self.fluxes_tup.columns: + # conversion_name = "_".join([signal, pheno, "__conversion"]) + # self.variables[conversion_name] = tupVariable(conversion_name) + # variables.append(self.variables[conversion_name]) + + time_3 = process_time() + print(f"Done with DCC loop: {(time_3 - time_2) / 60} min") + species_phenos = {} + self.conversion_bounds = [5e-6, 50] + for index, org_signal in enumerate(growth_tup.columns[2:]): + # signal = org_signal.split(":")[1] + signal = org_signal.replace(":", "|") + species = 
signal_species(org_signal) + species_phenos[species] = { + None if "OD" in species else f"{species}_stationary" + } + signal_column_index = index + 2 + data_timestep = 1 + self.variables[signal + "|conversion"] = tupVariable( + signal + "|conversion", bounds=Bounds(*self.conversion_bounds) + ) + variables.append(self.variables[signal + "|conversion"]) + + self.variables[signal + "|bio"] = {} + self.variables[signal + "|diffpos"] = {} + self.variables[signal + "|diffneg"] = {} + self.variables["g_" + species] = {} + self.constraints[signal + "|bioc"] = {} + self.constraints[signal + "|diffc"] = {} + self.constraints["gc_" + species] = {} + self.constraints["totVc_" + species] = {} + self.constraints["totGc_" + species] = {} + self.constraints[signal + "|bio_finalc"] = {} + for short_code in unique_short_codes: + self.variables[signal + "|bio"][short_code] = {} + self.variables[signal + "|diffpos"][short_code] = {} + self.variables[signal + "|diffneg"][short_code] = {} + self.variables["g_" + species][short_code] = {} + self.constraints[signal + "|bioc"][short_code] = {} + self.constraints[signal + "|diffc"][short_code] = {} + self.constraints["gc_" + species][short_code] = {} + self.constraints["totVc_" + species][short_code] = {} + self.constraints["totGc_" + species][short_code] = {} + # self.constraints[signal + '|bio_finalc'][short_code] = {} + # the value entries are matched to only the timesteps that are condoned by data_timesteps + values_slice = trial_contents( + short_code, growth_tup.index, growth_tup.values + ) + if ts_to_delete: + values_slice = np.delete( + values_slice, list(ts_to_delete[short_code]), axis=0 + ) + timesteps = list(range(1, len(values_slice) + 1)) + # the last timestep is omitted since Heun's method in the modelled biomass + ## requires a future timestep, which does not exist for the last timestep + for timestep in timesteps[:-1]: + ## the user timestep and data timestep must be synchronized + if ( + int(timestep) * self.parameters["timestep_hr"] + < data_timestep * self.parameters["data_timestep_hr"] + ): + print( + f"Skipping timestep {timestep} that does not align with the user's timestep" + ) + continue + data_timestep += 1 + if data_timestep > int( + self.times[short_code][-1] / self.parameters["data_timestep_hr"] + ): + print( + f"The user-defined time exceeds the simulation time, so the DBC & diff loop is broken." 
+ ) + break + next_timestep = int(timestep) + 1 + ## the phenotype transition terms are aggregated + total_biomass, signal_sum, from_sum, to_sum = [], [], [], [] + for pheno_index, pheno in enumerate(self.phenotypes): + ### define the collections of signal and pheno terms + if species in pheno or "OD" in signal: + # if not FBAHelper.isnumber(b_values[pheno][short_code][timestep]): + signal_sum.append( + { + "operation": "Mul", + "elements": [ + -1, + self.variables["b_" + pheno][short_code][ + timestep + ].name, + ], + } + ) + # else: + # signal_sum.append(-b_values[pheno][short_code][timestep]) + ### total_biomass.append(self.variables["b_"+pheno][short_code][timestep].name) + if all( + [ + "OD" not in signal, + species in pheno, + "stationary" not in pheno, + ] + ): + species_phenos[species].add(pheno) + from_sum.append( + { + "operation": "Mul", + "elements": [ + -1, + self.variables["cvf_" + pheno][short_code][ + timestep + ].name, + ], + } + ) + to_sum.append( + self.variables["cvt_" + pheno][short_code][ + timestep + ].name + ) + for pheno in species_phenos[species]: + if "OD" in signal: + continue + # print(pheno, timestep, b_values[pheno][short_code][timestep], b_values[pheno][short_code][next_timestep]) + if "stationary" in pheno: + # b_{phenotype} - sum_k^K(es_k*cvf) + sum_k^K(pheno_bool*cvt) = b+1_{phenotype} + self.constraints["dbc_" + pheno][short_code][timestep] = ( + tupConstraint( + name=_name( + "dbc_", pheno, short_code, timestep, self.names + ), + expr={ + "elements": [*from_sum, *to_sum], + "operation": "Add", + }, + ) + ) + else: + # b_{phenotype} + dt/2*(g_{phenotype} + g+1_{phenotype}) + cvf-cvt = b+1_{phenotype} + self.constraints["dbc_" + pheno][short_code][timestep] = ( + tupConstraint( + name=_name( + "dbc_", pheno, short_code, timestep, self.names + ), + expr={ + "elements": [ + self.variables["cvf_" + pheno][short_code][ + timestep + ].name, + { + "elements": [ + half_dt, + self.variables["g_" + pheno][ + short_code + ][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + half_dt, + self.variables["g_" + pheno][ + short_code + ][next_timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1, + self.variables["cvt_" + pheno][ + short_code + ][timestep].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + ) + # if not FBAHelper.isnumber(self.variables['b_' + pheno][short_code][timestep]): + biomass_term = [ + self.variables["b_" + pheno][short_code][timestep].name, + { + "elements": [ + -1, + self.variables["b_" + pheno][short_code][ + next_timestep + ].name, + ], + "operation": "Mul", + }, + ] + # else: + # biomass_term = [b_values[pheno][short_code][timestep]-b_values[pheno][short_code][next_timestep]] + self.constraints["dbc_" + pheno][short_code][timestep].expr[ + "elements" + ].extend(biomass_term) + constraints.append( + self.constraints["dbc_" + pheno][short_code][timestep] + ) + + if not requisite_biomass or any( + [ + timestep != timesteps[-2], + signal not in requisite_biomass[short_code], + ] + ): + self.variables[signal + "|bio"][short_code][timestep] = ( + tupVariable( + _name(signal, "|bio", short_code, timestep, self.names) + ) + ) + else: + biomass_flux = requisite_biomass[short_code][signal]["bio"] + estimated_biomass = biomass_flux # * int(timestep)*self.parameters['data_timestep_hr'] + self.variables[signal + "|bio"][short_code][timestep] = ( + tupVariable( + _name(signal, "|bio", short_code, timestep, self.names), + Bounds(estimated_biomass, None), + ) + ) + self.variables[signal + 
"|diffpos"][short_code][timestep] = ( + tupVariable( + _name(signal, "|diffpos", short_code, timestep, self.names), + Bounds(0, 100), + ) + ) + self.variables[signal + "|diffneg"][short_code][timestep] = ( + tupVariable( + _name(signal, "|diffneg", short_code, timestep, self.names), + Bounds(0, 100), + ) + ) + variables.extend( + [ + self.variables[signal + "|bio"][short_code][timestep], + self.variables[signal + "|diffpos"][short_code][timestep], + self.variables[signal + "|diffneg"][short_code][timestep], + ] + ) + + # {signal}__conversion*datum = {signal}__bio + # TODO - the conversion variable must be a constant for BIOLOG conditions + self.constraints[signal + "|bioc"][short_code][timestep] = ( + tupConstraint( + name=_name( + signal, "|bioc", short_code, timestep, self.names + ), + expr={ + "elements": [ + { + "elements": [ + -1, + self.variables[signal + "|bio"][short_code][ + timestep + ].name, + ], + "operation": "Mul", + }, + { + "elements": [ + self.variables[signal + "|conversion"].name, + values_slice[timestep, signal_column_index], + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + ) + constraints.append( + self.constraints[signal + "|bioc"][short_code][timestep] + ) + + # {speces}_bio + {signal}_diffneg-{signal}_diffpos = sum_k^K(es_k*b_{phenotype}) + self.constraints[signal + "|diffc"][short_code][timestep] = ( + tupConstraint( + name=_name( + signal, "|diffc", short_code, timestep, self.names + ), + expr={ + "elements": [ + self.variables[signal + "|bio"][short_code][ + timestep + ].name, + self.variables[signal + "|diffneg"][short_code][ + timestep + ].name, + { + "elements": [ + -1, + self.variables[signal + "|diffpos"][ + short_code + ][timestep].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + ) + if all([isinstance(val, dict) for val in signal_sum]): + self.constraints[signal + "|diffc"][short_code][timestep].expr[ + "elements" + ].extend(signal_sum) + else: + raise ValueError( + f"The {signal_sum} value has unexpected contents." 
+ ) + constraints.append( + self.constraints[signal + "|diffc"][short_code][timestep] + ) + + objective.expr.extend( + [ + { + "elements": [ + { + "elements": [ + self.parameters["diffpos"], + self.variables[f"{signal}|diffpos"][ + short_code + ][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + self.parameters["diffneg"], + self.variables[f"{signal}|diffneg"][ + short_code + ][timestep].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + } + ] + ) + + time_4 = process_time() + print(f"Done with the DBC & diffc loop: {(time_4 - time_3) / 60} min") + + # construct the problem + self.problem = OptlangHelper.define_model( + "CommPhitting model", variables, constraints, objective, True + ) + self.hdf5_name = export_lp.replace(".lp", ".h5") + self.hdf5_file = File(self.hdf5_name, "w") + time_5 = process_time() + print( + f"Done with constructing the {type(self.problem)} model: {(time_5 - time_4) / 60} min" + ) + + # export contents + if export_phenotype_profiles: + phenotype_profiles_name = "phenotype_profiles.tsv" + self.fluxes_df.to_csv(phenotype_profiles_name, sep="\t") + self.zipped_output.append(phenotype_profiles_name) + if export_parameters: + parameter_name = "parameters.tsv" + DataFrame( + data=list(self.parameters.values()), + index=list(self.parameters.keys()), + columns=["values"], + ).to_csv(parameter_name, sep="\t") + self.zipped_output.append(parameter_name) + if export_lp: + if re.search(r"(\\\\/)", export_lp): + os.makedirs(os.path.dirname(export_lp), exist_ok=True) + with open(export_lp, "w") as lp: + lp.write(self.problem.to_lp()) + model_name = "CommPhitting.json" + _export_model_json(self.problem.to_json(), model_name) + self.zipped_output.extend([export_lp, model_name]) + if export_zip_name: + self.zip_name = export_zip_name + sleep(2) + with ZipFile(self.zip_name, "a", compression=ZIP_LZMA) as zp: + for file in self.zipped_output: + zp.write(file) + os.remove(file) + self.zipped_output.remove(file) + time_6 = process_time() + print(f"Done exporting the content: {(time_6 - time_5) / 60} min") + + def compute( + self, + graphs: list = None, + export_zip_name=None, + figures_zip_name=None, + publishing=False, + primals_export_path: str = "primal_values.json", + remove_empty_plots=False, + ): + print("starting optimization") + time1 = process_time() + self.values = {} + solution = self.problem.optimize() + timesteps = min(list(map(len, self.times.values()))) + fit_quality = self.problem.objective.value / timesteps + print(f"The optimization fit quality is {fit_quality}") + if "parameters.tsv" in self.zipped_output: + self.parameters["fit"] = fit_quality + parameter_name = "parameters.tsv" + DataFrame( + data=list(self.parameters.values()), + index=list(self.parameters.keys()), + columns=["values"], + ).to_csv(parameter_name, sep="\t") + with ZipFile(self.zip_name, "a", compression=ZIP_LZMA) as zp: + for file in self.zipped_output: + zp.write(file) + os.remove(file) + + # TODO approximate a threshold of good fits, and trigger black box optimization for bad fits + ## that iteratively adjust parameters until the fit metric surmounts the threshold. + + # categorize the primal values by trial and time + if "optimal" not in solution: + raise FeasibilityError( + f"The solution is sub-optimal, with a(n) {solution} status." 
+ ) + if all(np.array(list(self.problem.primal_values.values())) == 0): + raise NoFluxError("The simulation lacks any flux.") + for variable, value in self.problem.primal_values.items(): + if "v_" in variable: + self.values[variable] = value + elif "conversion" in variable or re.search(r"(bin\d)", variable): + self.values[short_code].update({variable: value}) + if value in self.conversion_bounds: + warnings.warn( + f"The conversion factor {value} optimized to a bound, which may be " + f"indicative of an error, such as improper kinetic rates." + ) + else: + basename, short_code, timestep = variable.split("-") + time_hr = int(timestep) * self.parameters["data_timestep_hr"] + self.values[short_code] = self.values.get(short_code, {}) + self.values[short_code][basename] = self.values[short_code].get( + basename, {} + ) + self.values[short_code][basename][time_hr] = value + + # export the processed primal values for graphing + # with open(primals_export_path, 'w') as out: + # json.dump(self.values, out, indent=3) + # if not export_zip_name and hasattr(self, 'zip_name'): + # export_zip_name = self.zip_name + # if export_zip_name: + # with ZipFile(export_zip_name, 'a', compression=ZIP_LZMA) as zp: + # zp.write(primals_export_path) + # os.remove(primals_export_path) + # visualize the specified information + time2 = process_time() + if graphs: + self.graph( + graphs, + export_zip_name=figures_zip_name or export_zip_name, + publishing=publishing, + remove_empty_plots=remove_empty_plots, + ) + + # parse the primal values + values_df = DataFrame(self.values) + values_index = values_df.index.tolist() + for col in values_df.columns: + trial_values = values_df[col].tolist() + ## process the times + times = [list(ele.keys()) for ele in trial_values if isinstance(ele, dict)] + max_time = max(list(map(len, times))) + for max_time_series in times: + if len(max_time_series) == max_time: + break + trial_path = f"results/primals/{col}/" + self.hdf5_file.create_dataset(f"{trial_path}/times", data=max_time_series) + ## process the data values + for index, ele in enumerate(trial_values): + dataset_name = f"{trial_path}/{values_index[index]}" + if FBAHelper.isnumber(ele): + self.hdf5_file.create_dataset(dataset_name, data=[float(ele)]) + elif isinstance(ele, dict): + self.hdf5_file.create_dataset( + dataset_name, data=list(map(float, ele.values())) + ) + self.hdf5_file[dataset_name].attrs["full_time"] = ( + len(ele.values()) == max_time + ) + + self.hdf5_file.close() + with ZipFile(self.zip_name, "a", compression=ZIP_LZMA) as zp: + zp.write(self.hdf5_name) + os.remove(self.hdf5_name) + + time3 = process_time() + print(f"Optimization completed in {(time2-time1)/60} minutes") + print(f"Graphing completed in {(time3-time2)/60} minutes") + + def load_model( + self, + mscomfit_json_path: str = None, + zip_name: str = None, + model_to_load: dict = None, + ): + if zip_name: + with ZipFile(zip_name, "r") as zp: + zp.extract(mscomfit_json_path) + if mscomfit_json_path: + with open(mscomfit_json_path, "r") as mscmft: + return json.load(mscmft) + if model_to_load: + self.problem = Model.from_json(model_to_load) + + @staticmethod + def assign_values(param, var, next_dimension, kcat=True): + dic = {var: {}} + for dim1, dim2_list in next_dimension.items(): + if isinstance(dim2_list, dict): + dic[var].update(CommPhitting.assign_values(param, dim1, dim2_list)) + else: + if kcat: + dic[var][dim1] = param + else: + dic[var][dim1] = {dim2: param for dim2 in dim2_list} + return dic + + def _universalize(self, param, var, 
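+ # NOTE (editorial): _universalize expands a scalar or per-variable parameter into the nested
+ # {var: {species: {phenotype: value}}} shape built by assign_values above; for example, a
+ # hypothetical call _universalize(0.33, "kcat") would assign 0.33 to every species_phenotype
+ # column of self.fluxes_tup.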
next_dimension=None, exclude=None, tsBin=False): + if not next_dimension: + next_dimension = {} + if not exclude: + exclude = [] + for organism in self.fluxes_tup.columns: + species, pheno = organism.split("_") + if pheno in exclude: + continue + if not tsBin: + if species in next_dimension: + next_dimension[species].append(pheno) + else: + next_dimension[species] = [pheno] + else: + if species in next_dimension: + next_dimension[species].update({pheno: self.time_ranges}) + else: + next_dimension[species] = {pheno: self.time_ranges} + if FBAHelper.isnumber(param): + return CommPhitting.assign_values(param, var, next_dimension) + elif FBAHelper.isnumber(param[var]): + return CommPhitting.assign_values(param[var], var, next_dimension) + elif isinstance(param[var], dict): + return { + var: { + dim1: {dim2: param[var][dim1] for dim2 in dim2_list} + for dim1, dim2_list in next_dimension.items() + } + } + else: + logger.critical( + f"The param (with keys {dic_keys(param)}) and var {var} are not amenable" + " to parameterizing a universal value." + ) + # {short_code: {list(timestep_info.keys())[0]: find_dic_number(param)} for short_code, timestep_info in variable.items()}} + + def adjust_color(self, color, amount=0.5): + """ + Adjusts the luminosity of the given color by multiplying the luminosity by the given amount. + Input can be a matplotlib color string, hex string, or RGB tuple. + + Examples: + >> adjust_color('g', 0.3) + >> adjust_color('#F034A3', 0.6) + >> adjust_color((.3,.55,.1), 0.5) + """ + import colorsys + import matplotlib.colors as mc + + try: + c = mc.cnames[color] + except KeyError: + c = color + c = colorsys.rgb_to_hls(*mc.to_rgb(c)) + return colorsys.hls_to_rgb(c[0], max(0, min(1, amount * c[1])), c[2]) + + def _add_plot( + self, + ax, + labels, + label, + basename, + trial, + x_axis_split, + linestyle="solid", + scatter=False, + color=None, + xs=None, + ys=None, + ): + labels.append(label or basename.split("-")[-1]) + xs = ( + xs + if xs is not None + else list(map(float, self.values[trial][basename].keys())) + ) + ys = ( + ys + if ys is not None + else list(map(float, self.values[trial][basename].values())) + ) + if scatter: + ax.scatter(xs, ys, s=10, label=labels[-1], color=color or None) + else: + ax.plot(xs, ys, label=labels[-1], linestyle=linestyle, color=color or None) + ax.set_xticks(list(map(int, xs))[::x_axis_split]) + return ax, labels + + def graph( + self, + graphs, + primal_values_filename: str = None, + primal_values_zip_path: str = None, + export_zip_name: str = None, + data_timestep_hr: float = 0.163, + publishing: bool = False, + title: str = None, + remove_empty_plots: bool = False, + ): + print(export_zip_name) + # define the default timestep ratio as 1 + data_timestep_hr = self.parameters.get("data_timestep_hr", data_timestep_hr) + timestep_ratio = data_timestep_hr / self.parameters.get( + "timestep_hr", data_timestep_hr + ) + if primal_values_filename: + if primal_values_zip_path: + with ZipFile(primal_values_zip_path, "r") as zp: + zp.extract(primal_values_filename) + with open(primal_values_filename, "r", encoding="utf-8") as primal: + self.values = json.load(primal) + + # plot the content for desired trials + x_axis_split = int(3 / data_timestep_hr / timestep_ratio) + self.plots = set() + contents = {"biomass": "b_", "all_biomass": "b_", "growth": "g_", "conc": "c_"} + mM_threshold = 1e-3 + for graph_index, graph in enumerate(graphs): + content = contents.get(graph["content"], graph["content"]) + y_label = "Variable value" + x_label = r"Time ($hr$)" + if any([x in graph["content"] for x in ["biomass",
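+ # NOTE (editorial): per the contents mapping above, recognized graph["content"] keys are
+ # "biomass"/"all_biomass" (b_ variables), "growth" (g_), and "conc" (c_); any other string
+ # is matched against variable basenames verbatim.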
"OD"]]): + total_biomasses = {name: [] for name in self.species_list} + total_biomasses.update({"OD": []}) + if "species" not in graph: + graph["species"] = self.species_list + if "biomass" in graph["content"]: + y_label = r"Biomass ($\frac{g}{L}$)" + elif "growth" in graph["content"]: + y_label = r"Biomass growth ($\frac{g}{hr}$)" + graph["experimental_data"] = graph.get("experimental_data", False) + if "painting" not in graph: + graph["painting"] = { + "OD": { + "color": "blue", + "linestyle": "solid", + "name": "Total biomass", + }, + "ecoli": {"color": "red", "linestyle": "dashed", "name": "E. coli"}, + "pf": { + "color": "green", + "linestyle": "dotted", + "name": "P. fluorescens", + }, + } + graph["parsed"] = graph.get("parsed", False) + if "phenotype" in graph and graph["phenotype"] == "*": + if "species" not in graph: + graph["species"] = self.species_list + graph["phenotype"] = set( + [ + pheno.split("_")[-1] + for pheno in self.phenotypes + if pheno.split("_")[0] in graph["species"] + ] + ) + # TODO - a species-resolved option must be developed for the paper figure + if "species" in graph and graph["species"] == "*": + graph["species"] = self.species_list + elif content == "c_" and "mets" not in graph: + print(self.mets_to_track) + graph["mets"] = self.mets_to_track + elif not any(["species" in graph, "mets" in graph]): + raise ValueError( + f"The specified graph {graph} must define species for which data will be plotted." + ) + print(f"graph_{graph_index}") + pprint(graph) + + # define figure specifications + if publishing: + pyplot.rc("axes", titlesize=22, labelsize=28) + pyplot.rc("xtick", labelsize=24) + pyplot.rc("ytick", labelsize=24) + pyplot.rc("legend", fontsize=18) + if graph["parsed"]: + parsed_graphs = {} + for species in graph["species"]: + parsed_graphs[species] = pyplot.subplots(dpi=200, figsize=(11, 7)) + else: + fig, ax = pyplot.subplots(dpi=200, figsize=(11, 7)) + yscale = "linear" + + # populate the figures + for trial, basenames in self.values.items(): + if trial not in graph["trial"]: + continue + labels = [] + for basename, values in basenames.items(): + # graph experimental and total simulated biomasses + if any([x in graph["content"] for x in ["biomass", "OD"]]): + if "b_" in basename: + vals = list(map(float, values.values())) + var_name, species, phenotype = basename.split("_") + # ic(basename) + label = f"{species}_biomass (model)" + if publishing: + species_name = graph["painting"][species]["name"] + label = f"{species_name} total (model)" + labels.append({species: label}) + if remove_empty_plots and all([v == 0 for v in vals]): + print(f"The {basename} is empty and thus is removed.") + continue + if ( + any( + [ + x in graph["content"] + for x in ["total", "biomass", "OD"] + ] + ) + or graph["species"] == self.species_list + ): # and not graph["parsed"]: + total_biomasses["OD"].append(vals) + if "OD" not in graph["content"]: + total_biomasses[species].append(vals) + if all( + [ + graph["experimental_data"], + "|bio" in basename, + ] + ): + # any([content in basename])]): # TODO - any() must include all_biomass and total + species, signal, phenotype = basename.split("|") + label = basename + if publishing: + species_name = ( + "total" + if "OD" in signal + else graph["painting"][species]["name"] + ) + label = f"Experimental {species_name} (from {signal})" + # print(basename, label, self.values[trial][basename].values()) + if remove_empty_plots and all( + self.values[trial][basename].values() == 0 + ): + print(f"The {basename} is empty and thus is 
removed.") + continue + ax, labels = self._add_plot( + ax, + labels, + label, + basename, + trial, + x_axis_split, + scatter=True, + color=self.adjust_color( + graph["painting"][species]["color"], 1.5 + ), + ) + + if content not in basename: + continue + # graph individual phenotypes + if "phenotype" in graph: + # print(graph['phenotype']) + for specie in graph["species"]: + if specie not in basename: + continue + if not any([p in basename for p in graph["phenotype"]]): + print(f"{basename} data with unknown phenotype.") + continue + if remove_empty_plots and all( + self.values[trial][basename].values() == 0 + ): + print(f"The {specie} is empty and thus is removed.") + continue + if graph["parsed"]: + fig, ax = parsed_graphs[specie] + ## define graph characteristics + label = basename.split("_")[-1] + style = "solid" + if len(graph["species"]) > 1: + label = re.sub(r"(^[a-b]+\_)", "", basename) + style = graph["painting"][specie]["linestyle"] + ax, labels = self._add_plot( + ax, labels, label, basename, trial, x_axis_split, style + ) + if graph["parsed"]: + parsed_graphs[specie] = (fig, ax) + # graph media concentration plots + elif "mets" in graph and all( + [ + any([x in basename for x in graph["mets"]]), + "c_cpd" in basename, + ] + ): + if not any( + np.array(list(self.values[trial][basename].values())) + > mM_threshold + ): + continue + if remove_empty_plots and all( + self.values[trial][basename].values() == 0 + ): + continue + label = self.msdb.compounds.get_by_id( + re.search(r"(cpd\d+)", basename).group() + ).name + ax, labels = self._add_plot( + ax, labels, label, basename, trial, x_axis_split + ) + yscale = "log" + y_label = r"Concentration ($mM$)" + + if labels: # assesses whether graph(s) were created + ## graph all of the total biomasses + if any([x in graph["content"] for x in ["OD", "biomass", "total"]]): + labeled_species = [ + label for label in labels if isinstance(label, dict) + ] + for name, vals in total_biomasses.items(): + # ic(name) + if not vals or ( + len(total_biomasses) == 2 and "OD" not in name + ): + continue + if len(total_biomasses) == 2: + specie_label = [ + graph["painting"][name]["name"] + for name in total_biomasses + if "OD" not in name + ][0] + label = f"{graph['painting'][name]['name']} ({specie_label})" + else: + label = f"{name}_biomass (model)" + if labeled_species: + for label_specie in labeled_species: + if name in label_specie: + label = label_specie[name] + break + style = ( + "solid" + if ( + len(graph["species"]) < 1 + or name not in graph["painting"] + ) + else graph["painting"][name]["linestyle"] + ) + style = "dashdot" if "model" in label else style + style = ( + "solid" + if ( + "OD" in name + and not graph["experimental_data"] + or "total" in graph["content"] + ) + else style + ) + total_biomass = sum(np.array(vals))[:-1] + xs = list(map(float, values.keys())) + if graph["parsed"]: + fig, ax = parsed_graphs[name] + self._add_plot( + ax, + labels, + label, + None, + None, + x_axis_split, + style, + False, + graph["painting"][name]["color"], + xs, + total_biomass, + ) + if graph["parsed"]: + ## process and export the parsed figures + ax.set_xlabel(x_label) + ax.set_ylabel(y_label) + ax.grid(axis="y") + ax.set_yscale(yscale) + ax.legend() + phenotype_id = graph.get("phenotype", "") + if "phenotype" in graph and not isinstance( + graph["phenotype"], str + ): + phenotype_id = ( + f"{','.join(graph['phenotype'])} phenotypes" + ) + fig_name = f'{"_".join([trial, name, phenotype_id, content])}.jpg' + fig.savefig( + fig_name, 
bbox_inches="tight", transparent=True + ) + self.plots.add(fig_name) + + if graph["parsed"]: + continue + ## process and export the non-parsed figures + phenotype_id = graph.get("phenotype", "") + if "phenotype" in graph and not isinstance(graph["phenotype"], str): + phenotype_id = f"{','.join(graph['phenotype'])} phenotypes" + + species_id = "" + if "mets" not in graph and content != "c_": + species_id = ( + graph["species"] + if isinstance(graph["species"], str) + else ",".join(graph["species"]) + ) + if "species" in graph and graph["species"] == self.species_list: + species_id = "all species" + else: + phenotype_id = f"{','.join(graph['species'])} species" + if species_id == "all species" and not phenotype_id: + phenotype_id = ",".join(graph["species"]) + + ax.set_xlabel(x_label) + ax.set_ylabel(y_label) + if "mets" in graph: + ax.set_ylim(mM_threshold) + ax.grid(axis="y") + if len(labels) > 1: + ax.legend() + else: + yscale = "linear" + ax.set_yscale(yscale) + if not publishing: + if not title: + org_content = ( + content + if content not in contents.values() + else list(contents.keys())[ + list(contents.values()).index(content) + ] + ) + this_title = f"{org_content} of {species_id} ({phenotype_id}) in the {trial} trial" + if content == "c_": + this_title = f"{org_content} in the {trial} trial" + ax.set_title(this_title) + else: + ax.set_title(title) + fig_name = ( + f'{"_".join([trial, species_id, phenotype_id, content])}.jpg' + ) + if "mets" in graph: + fig_name = f"{trial}_{','.join(graph['mets'])}_c.jpg" + fig.savefig(fig_name, bbox_inches="tight", transparent=True) + + self.plots.add(fig_name) + + # export the figures with other simulation content + if export_zip_name: + with ZipFile(export_zip_name, "a", compression=ZIP_LZMA) as zp: + for plot in self.plots: + zp.write(plot) + os.remove(plot) + + #################### ENGINEERING PHASE METHODS #################### + + def engineering(self): + if not hasattr(self, "problem"): + self.fit() # TODO - accommodate both fitting a new model and loading an existing model + + # This will capture biomass variables at all times and trials, which seems undesirable + self.problem.objective = Objective( + sum([x for x in self.problem.variables if "bio" in x.name]) + ) + + # Use a community COBRA model and CommKinetics with the fitted kinetic parameters? 
+ + def _add_phenotypes(self): + pass + + def _change_obj(self): + pass + + +class BIOLOGPhitting(CommPhitting): + def __init__( + self, + carbon_conc, + media_conc, + biolog_df, + fluxes_df, + experimental_metadata, + msdb_path, + community_members, + ): + self.biolog_df = biolog_df + self.experimental_metadata = experimental_metadata + self.carbon_conc = carbon_conc + self.media_conc = media_conc or [] + self.fluxes_df = fluxes_df + self.phenotypes = list(self.fluxes_df.columns) + self.phenotypes.extend( + [ + signal_species(signal) + "_stationary" + for signal in self.biolog_df + if ":" in signal + ] + ) + self.community_members = community_members + from modelseedpy.biochem import from_local + + self.msdb_path = msdb_path + self.msdb = from_local(msdb_path) + + def fitAll( + self, + parameters: dict = None, + rel_final_conc: float = None, + abs_final_conc: dict = None, + graphs: list = None, + data_timesteps: dict = None, + export_zip_name: str = None, + export_parameters: bool = True, + requisite_biomass: dict = None, + figures_zip_name: str = None, + publishing: bool = False, + primals_export_path: str = None, + ): + # simulate each condition + if export_zip_name and os.path.exists(export_zip_name): + os.remove(export_zip_name) + org_rel_final_conc = rel_final_conc + # total_reactions = set(list(chain.from_iterable([model.reactions for model in models_dict.values()]))) + model_abbreviations = ",".join( + [content["name"] for content in self.community_members.values()] + ) + for exp_index, experiment in self.experimental_metadata.iterrows(): + print(f"\n{exp_index} {experiment}") + display(experiment) + pheno = experiment["ModelSEED_ID"] + if not pheno: + print("The BIOLOG condition is not defined.") + continue + for model in self.community_members: + cpd = self.msdb.compounds.get_by_id(pheno) + if "C" not in cpd.elements or not any( + [re.search(pheno, rxn.id) for rxn in model.reactions] + ): + if "valid_condition" not in locals(): + valid_condition = False + continue + exp_list = [pheno] if isinstance(pheno, str) else pheno + self.community_members[model].update( + { + "phenotypes": { + re.sub(r"(-|\s)", "", experiment["condition"]): { + "consumed": exp_list + } + } + } + ) + # determine the requisite biomass for each condition based on which member consumes the compound + valid_condition = True + # skip the condition if none of the members can utilize it + if not valid_condition: + print( + f"The BIOLOG condition with {experiment['ModelSEED_ID']} is not" + f" absorbed by the {model_abbreviations} model(s)." + ) + continue + print( + f"The {experiment['ModelSEED_ID']} ({cpd.formula}) metabolite of the " + f"{experiment['condition']} condition may feed the {model_abbreviations} model(s)." + ) + if not any( + [experiment["ModelSEED_ID"] in pheno for pheno in self.phenotypes] + ): + print( + f"The {experiment['ModelSEED_ID']} ({cpd.formula}) metabolite of the " + f"{experiment['condition']} condition is not a suitable phenotype for " + f"the {model_abbreviations} model(s)."
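+ # NOTE (editorial): per the checks above, a BIOLOG condition is simulated only when its
+ # ModelSEED compound contains carbon and at least one member reaction references the compound id.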
+ ) + continue + + # for exp_index, experiment in self.experimental_metadata.iterrows(): + # the model(s) for which the condition is a suitable carbon source must be defined here + # simulate through the kinetics ranges with conditions that can be used by one of members + rel_final_conc = {experiment["ModelSEED_ID"]: org_rel_final_conc} + export_path = os.path.join( + os.getcwd(), "BIOLOG_LPs", f"{exp_index}_{','.join(exp_list)}.lp" + ) + kcat_primal = None + for coef_index, coefs in enumerate(biomass_partition_coefs): + # solve for growth rate constants with the previously solved biomasses + new_simulation = CommPhitting( + self.fluxes_df, + self.carbon_conc, + self.media_conc, + self.msdb_path, + self.biolog_df.loc[exp_index, :], + self.experimental_metadata, + ) + new_simulation.define_problem( + parameters, + exp_list, + rel_final_conc, + set( + list( + chain.from_iterable( + [ + content["excretions"] + for content in self.community_members.values() + ] + ) + ) + ), + abs_final_conc, + data_timesteps, + export_zip_name, + export_parameters, + export_path, + kcat_primal, + coefs, + requisite_biomass, + True, + ) + time1 = process_time() + primals_export_path = ( + primals_export_path or f"BIOLOG_{experiment['ModelSEED_ID']}.json" + ) + try: + new_simulation.compute( + graphs, + export_zip_name, + None, + publishing, + primals_export_path, + True, + ) + except NoFluxError as e: + print(e) + kcat_primal = parse_primals( + new_simulation.values, + coefs=coefs, + kcat_vals=new_simulation.parameters["kcat"], + ) + time2 = process_time() + print( + f"Done simulating with the coefficients for biomass partitions: {coef_index}" + f"\n{(time2 - time1) / 60} minutes" + ) + pprint(kcat_primal) + print("\n\n\n") + return {k: val for k, val in new_simulation.values.items() if "kcat" in k} diff --git a/modelseedpy/community/commscores_old.py b/modelseedpy/community/commscores_old.py new file mode 100644 index 00000000..ed0f2eed --- /dev/null +++ b/modelseedpy/community/commscores_old.py @@ -0,0 +1,1856 @@ +from modelseedpy.core.exceptions import ObjectiveError, ParameterError +from modelseedpy.community.commhelper import build_from_species_models +from modelseedpy.community.mscompatibility import MSCompatibility +from modelseedpy.core.msminimalmedia import MSMinimalMedia +from modelseedpy.community.mscommunity import MSCommunity +from modelseedpy.core.msmodelutl import MSModelUtil +from modelseedpy.core.fbahelper import FBAHelper +from modelseedpy.core.msgapfill import MSGapfill +from itertools import combinations, permutations, chain +from optlang import Variable, Constraint, Objective +from numpy import array, unique, ndarray, where, sort, array_split, nan +from collections import Counter +from deepdiff import DeepDiff # (old, new) +from typing import Iterable, Union +from pprint import pprint +from numpy.random import shuffle +from multiprocess import current_process +from math import inf +import sigfig + +# from icecream import ic +import re + +# from math import prod + +# silence deprecation warnings from DeepDiff parsing the syntrophy +import warnings + +warnings.simplefilter("ignore", category=DeprecationWarning) + +rm_comp = FBAHelper.remove_compartment + + +def _compatibilize(member_models: Iterable, printing=False): + # return member_models + models = MSCompatibility.standardize( + member_models, conflicts_file_name="exchanges_conflicts.json", printing=printing + ) + if not isinstance(member_models, (set, list, tuple)): + return models[0] + return models + + +def _load_models( + 
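+ # NOTE (editorial): _load_models returns a (member_models, community_model) pair; when no
+ # community model is supplied, one is assembled from the members via build_from_species_models.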
member_models: Iterable, com_model=None, compatibilize=True, printing=False +): + # ic(member_models, com_model, compatibilize) + if not com_model and member_models: + model = build_from_species_models(member_models, name="SMETANA_pair") + return member_models, model # (model, names=names, abundances=abundances) + # models = PARSING_FUNCTION(community_model) # TODO the individual models of a community model can be parsed + if compatibilize: + return ( + _compatibilize(member_models, printing), + _compatibilize([com_model], printing)[0], + ) + return member_models, com_model + + +def _get_media( + media=None, + com_model=None, + model_s_=None, + min_growth=None, + environment=None, + interacting=True, + printing=False, + minimization_method="minFlux", + skip_bad_media=False, +): + # ic(media, com_model, model_s_) + if com_model is None and model_s_ is None: + raise TypeError("< com_model > or < model_s_ > must be parameterized.") + if media is not None: + if model_s_ is not None and not isinstance(model_s_, (list, set, tuple)): + return media["members"][model_s_.id]["media"] + elif com_model is not None: + return media["community_media"] + return media + # model_s_ is either a singular model or a list of models + if com_model is not None: + try: + com_media, media_sol = MSMinimalMedia.determine_min_media( + com_model, + minimization_method, + min_growth, + None, + interacting, + 5, + printing, + ) + except Exception as e: + if skip_bad_media: + com_media, media_sol = None, None + else: + print(e) + if model_s_ is not None: + if not isinstance(model_s_, (list, set, tuple, ndarray)): + try: + return MSMinimalMedia.determine_min_media( + model_s_, + minimization_method, + min_growth, + environment, + interacting, + printing, + ) + except Exception as e: + if not skip_bad_media: + print(e) + return None + members_media = {} + for model in model_s_: + try: + members_media[model.id] = { + "media": MSMinimalMedia.determine_min_media( + model, + minimization_method, + min_growth, + environment, + interacting, + printing, + )[0] + } + continue + except Exception as e: + if skip_bad_media: + continue + else: + print(e) + # print(members_media) + if com_model is None: + return members_media + else: + return com_media, media_sol + return {"community_media": com_media, "members": members_media} + + +def _sigfig_check(value, sigfigs, default): + if str(value) in ["inf", "nan"]: + value = "" + if FBAHelper.isnumber(value): + return sigfig.round(value, sigfigs) + else: + return default + + +def nanFilter(value, string=True): + if isinstance(value, str) or value is None: + if string: + return value + else: + return nan + if any([value < 0, value > 1e5]): + return "" if string else nan + return value + + +class CommScores: + def __init__( + self, + member_models, + min_growth=0.1, + n_solutions=100, + environment=None, + abstol=1e-3, + media_dict=None, + printing=True, + raw_content=False, + antismash_json_path: str = None, + antismash_zip_path: str = None, + minimal_media_method="minFlux", + ): + self.min_growth = min_growth + self.abstol = abstol + self.n_solutions = n_solutions + self.printing = printing + self.raw_content = raw_content + self.antismash_json_path = antismash_json_path + self.antismash_zip_path = antismash_zip_path + + # process the models + self.models = _compatibilize(member_models) + self.community = MSModelUtil(build_from_species_models(self.models)) + ## define the environment + if environment: + if hasattr(environment, "get_media_constraints"): + ### standardize modelseed media into 
COBRApy media + environment = { + "EX_" + exID: -bound[0] + for exID, bound in environment.get_media_constraints().items() + } + self.community.add_medium(environment) + self.environment = environment + ## test growth + for model in self.models: + if model.slim_optimize() == 0: + raise ObjectiveError( + f"The model {model.id} possesses an objective value of 0 in complete media, " + "which is incompatible with minimal media computations and hence SMETANA." + ) + if self.community.model.slim_optimize() == 0: + raise ObjectiveError( + f"The community model {self.community.model.id} possesses an objective " + "value of 0 in complete media, which is incompatible with minimal " + "media computations and hence SMETANA." + ) + ## determine the minimal media for each model, including the community + self.media = ( + media_dict + if media_dict + else MSMinimalMedia.comm_media_est( + member_models, + self.community.model, + minimal_media_method, + min_growth, + self.environment, + True, + n_solutions, + printing, + ) + ) + + def all_scores( + self, + mp_score=True, + kbase_obj=None, + cobrakbase_path: str = None, + kbase_token_path: str = None, + annotated_genomes: dict = None, + ): + mro = self.mro_score() + mip = self.mip_score(interacting_media=self.media) + mp = None if not mp_score else self.mp_score() + mu = None # self.mu_score() + sc = None # self.sc_score() + smetana = None # self.smetana_score() + gyd = self.gyd_score() + fs = ( + self.fs_score() + if any( + [ + kbase_obj is not None, + annotated_genomes != [], + cobrakbase_path is not None and kbase_token_path is not None, + ] + ) + else None + ) + return { + "mro": mro, + "mip": mip, + "mp": mp, + "mu": mu, + "sc": sc, + "smetana": smetana, + "gyd": gyd, + "fs": fs, + } + + def mro_score(self): + self.mro_val = CommScores.mro( + self.models, + self.media["members"], + self.min_growth, + self.media, + self.raw_content, + self.environment, + self.printing, + True, + ) + if not self.printing: + return self.mro_val + if self.raw_content: + for pair, (interaction, media) in self.mro_val.items(): + newcomer, established = pair.split("---") + print( + f"\n(MRO) The {newcomer} media {media} possesses {interaction} shared " + f"requirements with the {established} established member." + ) + return self.mro_val + for pair, mro in self.mro_val.items(): + newcomer, established = pair.split("---") + print( + f"\nThe {newcomer} on {established} MRO score: {mro[0]} ({mro[0]*100:.2f}%). " + f"This is the percent of nutritional requirements in {newcomer} " + f"that overlap with {established} ({mro[1]}/{mro[2]})." 
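+ # NOTE (editorial): mro[0] is the fraction of the newcomer's minimal-media requirements that
+ # the established member shares, and mro[1]/mro[2] are the corresponding raw counts.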
+ ) + return self.mro_val + + def mip_score( + self, interacting_media: dict = None, noninteracting_media: dict = None + ): + interacting_media = interacting_media or self.media or None + diff, self.mip_val = CommScores.mip( + self.models, + self.community.model, + self.min_growth, + interacting_media, + noninteracting_media, + self.environment, + self.printing, + True, + ) + if not self.printing: + return self.mip_val + print( + f"\nMIP score: {self.mip_val}\t\t\t{self.mip_val} required compound(s) can be sourced via syntrophy:" + ) + if self.raw_content: + pprint(diff) + return self.mip_val + + def gyd_score(self, coculture_growth=False): + self.gyd_val = CommScores.gyd( + self.models, environment=self.environment, coculture_growth=coculture_growth + ) + if not self.printing: + return self.gyd_val + growth_type = "monocultural" if not coculture_growth else "cocultural" + for pair, score in self.gyd_val.items(): + print( + f"\nGYD score: The {growth_type} growth difference between the {pair} member models" + f" is {score} times greater than the growth of the slower member." + ) + return self.gyd_val + + def fs_score( + self, + kbase_obj=None, + cobrakbase_path: str = None, + kbase_token_path: str = None, + annotated_genomes: dict = None, + ): + self.fs_val = CommScores.fs( + self.models, kbase_obj, cobrakbase_path, kbase_token_path, annotated_genomes + ) + if not self.printing: + return self.fs_val + for pair, score in self.fs_val.items(): + print( + f"\nFS Score: The similarity of RAST functional SSO ontology " + f"terms between the {pair} members is {score}." + ) + return self.fs_val + + def mp_score(self): + print("executing MP") + self.mp_val = CommScores.mp( + self.models, + self.environment, + self.community.model, + None, + self.abstol, + self.printing, + ) + if not self.printing: + return self.mp_val + if self.raw_content: + print( + "\n(MP) The possible contributions of each member in the member media include:\n" + ) + pprint(self.mp_val) + else: + print( + "\nMP score:\t\t\tEach member can possibly contribute the following to the community:\n" + ) + for member, contributions in self.mp_val.items(): + print(member, "\t", len(contributions)) + return self.mp_val + + def mu_score(self): + member_excreta = self.mp_score() if not hasattr(self, "mp_val") else self.mp_val + self.mu_val = CommScores.mu( + self.models, + self.environment, + member_excreta, + self.n_solutions, + self.abstol, + True, + self.printing, + ) + if not self.printing: + return self.mu_val + print( + "\nMU score:\t\t\tThe fraction of solutions in which each member is the " + "syntrophic receiver that contain a respective metabolite:\n" + ) + pprint(self.mu_val) + return self.mu_val + + def sc_score(self): + self.sc_val = CommScores.sc( + self.models, + self.community.model, + self.min_growth, + self.n_solutions, + self.abstol, + True, + self.printing, + ) + if not self.printing: + return self.sc_val + print( + "\nSC score:\t\t\tThe fraction of community members who syntrophically contribute to each species:\n" + ) + pprint(self.sc_val) + return self.sc_val + + def smetana_score(self): + if not hasattr(self, "sc_val"): + self.sc_val = self.sc_score() + sc_coupling = all(v is not None for v in self.sc_val.values()) + if not hasattr(self, "mu_val"): + self.mu_val = self.mu_score() + if not hasattr(self, "mp_val"): + self.mp_val = self.mp_score() + + self.smetana = CommScores.smetana( + self.models, + self.community.model, + self.min_growth, + self.n_solutions, + self.abstol, + (self.sc_val, self.mu_val, self.mp_val), + True, +
sc_coupling, + self.printing, + ) + if self.printing: + print("\nsmetana score:\n") + pprint(self.smetana) + return self.smetana + + def antiSMASH_scores(self, antismash_json_path=None): + self.antismash = CommScores.antiSMASH( + antismash_json_path or self.antismash_json_path + ) + if not self.printing: + return self.antismash + if self.raw_content: + print( + "\n(antismash) The biosynthetic_areas, BGCs, protein_annotations, clusterBlast, and " + "num_clusterBlast from the provided antiSMASH results:\n" + ) + print( + "The 'areas' that antiSMASH determines produce biosynthetic products:" + ) + pprint(self.antismash[0]) + print("The set of biosynthetic gene clusters:") + pprint(self.antismash[1]) + print("The set of clusterblast protein annotations:") + pprint(self.antismash[2]) + print("Resistance information from clusterblast") + pprint(self.antismash[3]) + print("The number of proteins associated with resistance") + pprint(self.antismash[4]) + return self.antismash + print("\nantiSMASH scores:\n") + print( + "The community exhibited:" + f"- {len(self.antismash[0])}'areas' that antiSMASH determines produce biosynthetic products." + f"- {len(self.antismash[1])} biosynthetic gene clusters." + f"- {len(self.antismash[2])} clusterblast protein annotations." + f"- {len(self.antismash[3])} parcels of resistance information from clusterblast." + f"- {self.antismash[4]} proteins associated with resistance." + ) + return list(map(len, self.antismash[:4])) + [self.antismash[4]] + + ###### STATIC METHODS OF THE SMETANA SCORES, WHICH ARE APPLIED IN THE ABOVE CLASS OBJECT ###### + + @staticmethod + def _check_model(model_util, media, model_str, skip_bad_media): + default_media = model_util.model.medium + if media is not None: + model_util.add_medium(media) + obj_val = model_util.model.slim_optimize() + if obj_val == 0 or not FBAHelper.isnumber(obj_val): + print( + f"The {model_str} model input does not yield an operational model, and will therefore be gapfilled." 
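+ # NOTE (editorial): _check_model temporarily applies the candidate medium, probes growth with
+ # slim_optimize(), and restores the model's default medium before returning; the gapfilling
+ # fallback remains commented out below.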
+ ) + # if not skip_bad_media: return MSGapfill.gapfill(model_util.model, media) + model_util.add_medium(default_media) + return model_util.model + + @staticmethod + def _load(model, kbase_obj): + model_str = model + if len(model) == 2: + model = kbase_obj.get_from_ws(*model) + else: + model = kbase_obj.get_from_ws(model) + return model, model_str + + @staticmethod + def _determine_growths(modelUtils): + return [util.model.slim_optimize() for util in modelUtils] + + @staticmethod + def calculate_scores( + pairs, + models_media=None, + environments=None, + annotated_genomes=True, + lazy_load=False, + kbase_obj=None, + cip_score=True, + costless=True, + skip_bad_media=False, + anme_comm=False, + print_progress=False, + ): + from pandas import Series + + if isinstance(pairs, list): + ( + pairs, + models_media, + environments, + annotated_genomes, + lazy_load, + kbase_obj, + ) = pairs + series, mets = [], [] + if not isinstance(environments, (list, tuple)): + environments = [environments] + if isinstance(environments, (list, tuple)) and hasattr(environments[0], "name"): + environments = { + m.name: FBAHelper.convert_kbase_media(m, 1000) for m in environments + } + elif not isinstance(environments, dict): + environments = {f"media{i}": m for i, m in enumerate(environments)} + pid = current_process().name + model_utils = {} + count = 0 + for model1, models in pairs.items(): + if model1.id == "": + model1.id = "model1" + if lazy_load: + model1, model1_str = CommScores._load(model1, kbase_obj) + else: + model1_str = model1.id + if model1.id not in models_media: + models_media[model1.id] = { + "media": _get_media(model_s_=model1, skip_bad_media=skip_bad_media) + } + if models_media[model1.id] is None: + continue + if model1.id not in model_utils: + model_utils[model1.id] = MSModelUtil(model1) + # print(pid, model1) + for model_index, model2 in enumerate(models): + if model2.id == "": + model2.id = "model2" + if lazy_load: + model2, model2_str = CommScores._load(model2, kbase_obj) + else: + model2_str = model2.id + if model2.id not in models_media: + models_media[model2.id] = { + "media": _get_media( + model_s_=model2, skip_bad_media=skip_bad_media + ) + } + if models_media[model2.id] is None: + continue + if model2.id not in model_utils: + model_utils[model2.id] = MSModelUtil(model2) + grouping = [model1, model2] + grouping_utils = [model_utils[model1.id], model_utils[model2.id]] + modelIDs = [model.id for model in grouping] + comm_model = build_from_species_models(grouping) + community = MSCommunity(comm_model, ids=modelIDs) + comm_sol = comm_model.optimize() + print(f"{pid}~~{count}\t{modelIDs}") + for environName, environ in environments.items(): + if print_progress: + print(f"\tEnvironment\t{environName}", end="\t") + if not anme_comm: + model1 = CommScores._check_model( + model_utils[model1.id], environ, model1_str, skip_bad_media + ) + model2 = CommScores._check_model( + model_utils[model2.id], environ, model2_str, skip_bad_media + ) + # initiate the KBase output + report_dic = { + f"model{i+1}": modelID for i, modelID in enumerate(modelIDs) + } + g1, g2, comm = CommScores._determine_growths( + [model_utils[model1.id], model_utils[model2.id], community.util] + ) + g1, g2, comm = ( + _sigfig_check(g1, 5, ""), + _sigfig_check(g2, 5, ""), + _sigfig_check(comm, 5, ""), + ) + report_dic.update( + { + "media": environName, + "model1 growth": g1, + "model2 growth": g2, + "community growth": comm, + } + ) + coculture_growths = { + mem.id: comm_sol.fluxes[mem.primary_biomass.id] + for mem in 
community.members + } + report_dic.update( + { + f"coculture growth model{modelIDs.index(memID)}": growth + for memID, growth in coculture_growths.items() + } + ) + # define the MRO content + mro_values = CommScores.mro( + grouping, models_media, raw_content=True, environment=environ + ) + report_dic.update( + { + f"MRO_model{modelIDs.index(models_string.split('--')[0])+1}": f"{100*len(intersection)/len(memMedia):.3f}% ({len(intersection)}/{len(memMedia)})" + for models_string, ( + intersection, + memMedia, + ) in mro_values.items() + } + ) + mets.append({"MRO metabolites": list(mro_values.values())[0][0]}) + if print_progress: + print("MRO done", end="\t") + # define the CIP content + if cip_score: + cip_values = CommScores.cip( + modelutils=[model_utils[mem.id] for mem in grouping] + ) + report_dic.update({"CIP": cip_values[1]}) + mets[-1].update({"CIP metabolites": list(cip_values[0])}) + if print_progress: + print("CIP done", end="\t") + # define the MIP content + mip_values = CommScores.mip( + grouping, + comm_model, + 0.1, + None, + None, + environ, + print_progress, + True, + costless, + costless, + skip_bad_media, + ) + # print(mip_values) + if mip_values is not None: + report_dic.update( + { + f"MIP_model{modelIDs.index(models_name)+1}": str( + len(received) + ) + for models_name, received in mip_values[0].items() + } + ) + mets[-1].update( + { + "MIP model1 metabolites": list(mip_values[0].values())[ + 0 + ], + "MIP model2 metabolites": list(mip_values[0].values())[ + 1 + ], + } + ) + if costless: + for models_name, received in mip_values[1].items(): + report_dic[ + f"MIP_model{modelIDs.index(models_name)+1} (costless)" + ] = ( + report_dic[ + f"MIP_model{modelIDs.index(models_name)+1}" + ] + + f" ({len(received)})" + ) + del report_dic[ + f"MIP_model{modelIDs.index(models_name)+1}" + ] + if print_progress: + print("costless_MIP done", end="\t") + else: + report_dic.update( + {f"MIP_model1 (costless)": "", f"MIP_model2 (costless)": ""} + ) + mets[-1].update( + { + "MIP model1 metabolites": [None], + "MIP model2 metabolites": [None], + } + ) + if print_progress: + print("MIP done", end="\t") + # define the BSS content + bss_values = CommScores.bss( + grouping, + grouping_utils, + environments, + models_media, + skip_bad_media, + ) + report_dic.update( + { + f"BSS_model{modelIDs.index(name.split(' supporting ')[0])+1}": f"{_sigfig_check(100*val, 5, '')}%" + for name, (mets, val) in bss_values.items() + } + ) + mets[-1].update( + { + "BSS model1 metabolites": [ + met_set for met_set, val in bss_values.values() + ][0], + "BSS model2 metabolites": [ + met_set for met_set, val in bss_values.values() + ][1], + } + ) + # mets[-1].update({"bss_mets": list(bss_values[0].values())}) + if print_progress: + print("BSS done", end="\t") + # define the PC content + pc_values = CommScores.pc( + grouping, + grouping_utils, + comm_model, + None, + comm_sol, + environ, + True, + community, + ) + report_dic.update( + { + "PC_comm": _sigfig_check(pc_values[0], 5, ""), + "PC_model1": _sigfig_check( + list(pc_values[1].values())[0], 5, "" + ), + "PC_model2": _sigfig_check( + list(pc_values[1].values())[1], 5, "" + ), + "BIT": pc_values[3], + } + ) + if print_progress: + print("PC done\tBIT done", end="\t") + # print([mem.slim_optimize() for mem in grouping]) + # define the GYD content + gyd1, gyd2, g1, g2 = list( + CommScores.gyd( + grouping, + grouping_utils, + environ, + False, + community, + anme_comm, + ).values() + )[0] + report_dic.update( + { + "GYD1": _sigfig_check(gyd1, 5, ""), + "GYD2": 
_sigfig_check(gyd2, 5, ""), + } + ) + if print_progress: + print("GYD done\t\t", end="\t" if annotated_genomes else "\n") + # define the FS content + if kbase_obj is not None and annotated_genomes and not anme_comm: + fs_values = list( + CommScores.fs( + grouping, kbase_obj, annotated_genomes=annotated_genomes + ).values() + )[0] + print( + len(fs_values[0]) if fs_values[0] is not None else "NaN", + fs_values[1], + ) + report_dic.update({"FS": sigfig.round(fs_values[1], 5)}) + if fs_values is not None: + mets[-1].update({"FS features": fs_values[0]}) + if print_progress: + print("FS done\t\t") + # return a pandas Series, which can be easily aggregated with other results into a DataFrame + series.append(Series(report_dic)) + count += 1 + return series, mets + + @staticmethod + def html_report( + df, mets, export_html_path="commscores_report.html", msdb_path=None + ): + from modelseedpy.core.report import commscores_report + + return commscores_report(df, mets, export_html_path, msdb_path) + + @staticmethod + def report_generation( + all_models: iter = None, # a list of distinct lists is provided for specifying exclusive groups + pairs: dict = None, + mem_media: dict = None, + pair_limit: int = None, + exclude_pairs: list = None, + kbase_obj=None, + annotated_genomes: dict = True, # True triggers internal acquisition of the genomes, where None skips + see_media=True, + environments: iter = None, # a collection of environment dicts or KBase media objects + pool_size: int = None, + cip_score=True, + costless=True, + skip_bad_media=False, + anme_comm=False, + print_progress=False, + ): + from pandas import concat + + if pairs: + model_pairs = unique( + [ + {model1, model2} + for model1, models in pairs.items() + for model2 in models + ] + ) + elif all_models is not None: + if not isinstance(all_models[0], list): + all_models = list(set(all_models)) + model_pairs = array(list(combinations(all_models, 2))) + else: + model_pairs = [] + for models1, models2 in combinations(all_models, 2): + models1 = set(models1) + models2 = set(models2) + if len(models1) > len(models2): + larger_list = models1 + smaller_list = models2 + else: + larger_list = models2 + smaller_list = models1 + model_pairs.append( + [ + list(zip(combin, smaller_list)) + for combin in permutations(larger_list, len(smaller_list)) + ] + ) + # flatten the assembled pairs and filter duplicates + model_pairs = array( + [ + x + for x in set( + tuple(x) + for x in [ + i + for y in list(chain.from_iterable(model_pairs)) + for i in y + ] + ) + ] + ) + all_models = list(chain.from_iterable(all_models)) + if pair_limit is not None: + shuffle(model_pairs) + new_pairs = [] + for index, pair in enumerate(model_pairs): + if set(pair) not in exclude_pairs and index < pair_limit: + new_pairs.append(pair) + elif index >= pair_limit: + break + model_pairs = array(new_pairs) + if isinstance(model_pairs[0], str): + model_pairs = unique(sort(model_pairs, axis=1)) + pairs = { + first: model_pairs[where(model_pairs[:, 0] == first)][:, 1] + for first in model_pairs[:, 0] + } + else: + raise ValueError( + "Either < all_models > or < pairs > must be defined to simulate interactions." + ) + if not all_models: + all_models = list( + chain(*[list(values) for values in pairs.values()]) + ) + list(pairs.keys()) + lazy_load = len(model_pairs) > 10000 # all_models[0], (list,set,tuple)) + if lazy_load and not kbase_obj: + ValueError( + "The < kbase_obj > argument must be provided to lazy load models." 
+ ) + new_models = [] + for index, model in enumerate(all_models): + if model.id == "": + model.id = f"model_index{index}" + new_models.append(model) + all_models = new_models[:] + if not mem_media: + models_media = _get_media( + model_s_=all_models, skip_bad_media=skip_bad_media + ) + else: + models_media = mem_media.copy() + missing_models = set() + missing_modelID = [] + for model in all_models: + if model is not None and model.id not in models_media: + missing_models.add(model) + missing_modelID.append( + model if not hasattr(model, "id") else model.id + ) + if missing_models != set(): + print( + f"Media of the {missing_modelID} models are not defined, and will be calculated separately." + ) + models_media.update( + _get_media(model_s_=missing_models), skip_bad_media=skip_bad_media + ) + if see_media: + print(f"The minimal media of all members:\n{models_media}") + print(f"\nExamining the {len(list(model_pairs))} model pairs") + if pool_size is not None: + from datetime import datetime + from multiprocess import Pool + + print( + f"Loading {int(pool_size)} workers and computing the scores", + datetime.now(), + ) + pool = Pool( + int(pool_size) + ) # .map(calculate_scores, [{k: v} for k,v in pairs.items()]) + args = [ + [ + dict([pair]), + models_media, + environments, + annotated_genomes, + lazy_load, + kbase_obj, + ] + for pair in list(pairs.items()) + ] + output = pool.map(CommScores.calculate_scores, args) + series = chain.from_iterable([ele[0] for ele in output]) + mets = chain.from_iterable([ele[1] for ele in output]) + else: + series, mets = CommScores.calculate_scores( + pairs, + models_media, + environments, + annotated_genomes, + lazy_load, + kbase_obj, + cip_score, + costless, + skip_bad_media, + anme_comm, + print_progress, + ) + return concat(series, axis=1).T, mets + + @staticmethod + def mro( + member_models: Iterable = None, + mem_media: dict = None, + min_growth=0.1, + media_dict=None, + raw_content=False, + environment=None, + skip_bad_media=False, + printing=False, + compatibilized=False, + ): + """Determine the overlap of nutritional requirements (minimal media) between member organisms.""" + # determine the member minimal media if they are not parameterized + if not mem_media: + if not member_models: + raise ParameterError( + "The either member_models or minimal_media parameter must be defined." 
+ ) + member_models = ( + member_models + if compatibilized + else _compatibilize(member_models, printing) + ) + mem_media = _get_media( + media_dict, + None, + member_models, + min_growth, + environment, + printing=printing, + skip_bad_media=skip_bad_media, + ) + if "community_media" in mem_media: + mem_media = mem_media["members"] + # MROs = array(list(map(len, pairs.values()))) / array(list(map(len, mem_media.values()))) + mro_values = {} + for model1, model2 in combinations(member_models, 2): + intersection = set(mem_media[model1.id]["media"].keys()) & set( + mem_media[model2.id]["media"].keys() + ) + inter = [ex.replace("EX_", "").replace("_e0", "") for ex in intersection] + m1_media = mem_media[model1.id]["media"] + m2_media = mem_media[model2.id]["media"] + if raw_content: + mro_values.update( + { + f"{model1.id}---{model2.id}": (inter, m1_media), + f"{model2.id}---{model1.id}": (inter, m2_media), + } + ) + else: + mro_values.update( + { + f"{model1.id}---{model2.id}": (len(inter) / len(m1_media), len(inter), len(m1_media)), + f"{model2.id}---{model1.id}": (len(inter) / len(m2_media), len(inter), len(m2_media)), + "mets": inter, + } + ) + return mro_values + # return mean(list(map(len, pairs.values()))) / mean(list(map(len, mem_media.values()))) + + @staticmethod + def mip( + member_models: Iterable, + com_model=None, + min_growth=0.1, + interacting_media_dict=None, + noninteracting_media_dict=None, + environment=None, + printing=False, + compatibilized=False, + costless=False, + multi_output=False, + skip_bad_media=False, + ): + """Determine the quantity of nutrients that can be potentially sourced through syntrophy""" + member_models, community = _load_models( + member_models, com_model, not compatibilized, printing=printing + ) + # determine the interacting and non-interacting media for the specified community model + noninteracting_medium, noninteracting_sol = _get_media( + noninteracting_media_dict, + community, + None, + min_growth, + environment, + False, + skip_bad_media=skip_bad_media, + ) + if noninteracting_medium is None: + return None + if "community_media" in noninteracting_medium: + noninteracting_medium = noninteracting_medium["community_media"] + interacting_medium, interacting_sol = _get_media( + interacting_media_dict, + community, + None, + min_growth, + environment, + True, + skip_bad_media=skip_bad_media, + ) + if interacting_medium is None: + return None + if "community_media" in interacting_medium: + interacting_medium = interacting_medium["community_media"] + interact_diff = DeepDiff(noninteracting_medium, interacting_medium) + if "dictionary_item_removed" not in interact_diff: + return None + cross_fed_exIDs = [ + re.sub(r"(root\['|'\])", "", x) + for x in interact_diff["dictionary_item_removed"] + ] + # Determine each direction of the MIP score interactions + comm_util = MSModelUtil(community) + cross_fed_metIDs = [ + ex.replace("EX_", "").replace("_e0", "") for ex in cross_fed_exIDs + ] + cross_fed_copy = cross_fed_metIDs[:] + directionalMIP = {mem.id: [] for mem in member_models} + for rxn in comm_util.transport_list(): + # print(rxn.reaction, "\t", [met.id for met in rxn.metabolites if "_e0" in met.id]) + metIDs = list( + set([met.id.split("_")[0] for met in rxn.reactants]).intersection( + set([met.id.split("_")[0] for met in rxn.products]) + ) + ) + if len(metIDs) == 1: + metID = metIDs[0] + else: + if "cpd00067" in metIDs: + metIDs.remove("cpd00067") + metID = metIDs[0] + if metID not in cross_fed_metIDs: + continue +
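+ # NOTE (editorial): the compartment suffix of each transport reaction (c1, c2, ...) identifies
+ # the member model that owns it, and the sign of stoichiometry times flux below marks that
+ # member as the donor of the cross-fed metabolite.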
rxn_index = FBAHelper.compartment_index(rxn.id.split("_")[-1]) + if rxn_index == 0: + continue + mets = [met for met in rxn.metabolites if met.id == f"{metID}_c{rxn_index}"] + if mets == []: + print(f"The {metID}_c{rxn_index} is missing in {rxn.reaction}.") + continue + rxn_model = member_models[rxn_index - 1] + # comm_trans[metID] = comm_trans.get(f"{metID}_c{rxn_index}", {}) + if ( + rxn.metabolites[mets[0]] > 0 + and interacting_sol.fluxes[rxn.id] > 0 + or rxn.metabolites[mets[0]] < 0 + and interacting_sol.fluxes[rxn.id] < 0 + ): # donor + directionalMIP[rxn_model.id].append(metID) + if metID in cross_fed_copy: + cross_fed_copy.remove(metID) + continue + # if printing: print(f"{mets[0]} in {rxn.id} ({rxn.reaction}) is not assigned a receiving member.") + if cross_fed_copy != [] and printing: + print(f"Missing directions for the {cross_fed_copy} cross-fed metabolites") + outputs = [directionalMIP] + # TODO categorize all of the cross-fed substrates to examine potential associations of specific compounds + if costless: + costless_mets, numExs = CommScores.cip(member_models=member_models) + # print(list(directionalMIP.values()), costless_mets) + costlessDirectionalMIP = { + member_name: set(receive_mets).intersection(costless_mets) + for member_name, receive_mets in directionalMIP.items() + } + if not multi_output: + return costlessDirectionalMIP + outputs.append(costlessDirectionalMIP) + return outputs + + @staticmethod + def cip(modelutils=None, member_models=None): # costless interaction potential + if not modelutils: + modelutils = {MSModelUtil(model) for model in member_models} + costless_mets = set( + chain.from_iterable( + [modelutil.costless_excreta() for modelutil in modelutils] + ) + ) + return costless_mets, len(costless_mets) + + @staticmethod + def contributions(org_possible_contributions, scores, model_util, abstol): + # identify and log excreta from the solution + model_util.add_objective( + sum(ex_rxn.flux_expression for ex_rxn in org_possible_contributions) + ) + sol = model_util.model.optimize() + if sol.status != "optimal": + # exit the while loop by returning the original possible_contributions, + ## hence DeepDiff == {} and the while loop terminates + return scores, org_possible_contributions + # identify and log excreta from the solution + possible_contributions = org_possible_contributions[:] + for ex in org_possible_contributions: + if ex.id in sol.fluxes.keys() and sol.fluxes[ex.id] >= abstol: + possible_contributions.remove(ex) + scores[model_util.model.id].update([met.id for met in ex.metabolites]) + return scores, possible_contributions + + @staticmethod + def mp( + member_models: Iterable, + environment, + com_model=None, + minimal_media=None, + abstol=1e-3, + printing=False, + ): + """Discover the metabolites that each species can contribute to a community""" + community = ( + _compatibilize(com_model) + if com_model + else build_from_species_models(member_models, standardize=True) + ) + community.medium = minimal_media or MSMinimalMedia.minimize_flux(community) + scores = {} + for ( + org_model + ) in ( + member_models + ): # TODO support parsing the individual members through the MSCommunity object + model_util = MSModelUtil(org_model) + model_util.compatibilize(printing=printing) + if environment: + model_util.add_medium(environment) + scores[model_util.model.id] = set() + # determines possible member contributions in the community environment, where the excretion of media compounds is irrelevant + org_possible_contr = [ + ex_rxn + for ex_rxn in 
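+ # NOTE (editorial): MP iteratively maximizes the summed flux of the remaining candidate
+ # exchanges, records those exceeding abstol as contributions, and repeats until the candidate
+ # set stops shrinking; the leftovers are then re-checked one exchange at a time below.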
model_util.exchange_list() + if (ex_rxn.id not in community.medium and ex_rxn.upper_bound > 0) + ] + # ic(org_possible_contributions, len(model_util.exchange_list()), len(community.medium)) + scores, possible_contr = CommScores.contributions( + org_possible_contr, scores, model_util, abstol + ) + while DeepDiff(org_possible_contr, possible_contr): + print("remaining possible_contributions", len(possible_contr), end="\r") + ## optimize the sum of the remaining exchanges that have not surpassed the abstol + org_possible_contr = possible_contr[:] + scores, possible_contr = CommScores.contributions( + org_possible_contr, scores, model_util, abstol + ) + + ## individually checks the remaining possible contributions + for ex_rxn in possible_contr: + model_util.model.objective = Objective(ex_rxn.flux_expression) + sol = model_util.model.optimize() + if sol.status == "optimal" or sol.objective_value > abstol: + for met in ex_rxn.metabolites: + if met.id in scores[model_util.model.id]: + scores[model_util.model.id].remove(met.id) + print("removing", met.id) + return scores + + @staticmethod + def mu( + member_models: Iterable, + environment=None, + member_excreta=None, + n_solutions=100, + abstol=1e-3, + compatibilized=False, + printing=True, + ): + """the fractional frequency of each received metabolite amongst all possible alternative syntrophic solutions""" + # member_solutions = member_solutions if member_solutions else {model.id: model.optimize() for model in member_models} + scores = {} + member_models = ( + member_models if compatibilized else _compatibilize(member_models, printing) + ) + if member_excreta: + missing_members = [ + model for model in member_models if model.id not in member_excreta + ] + if missing_members: + print( + f"The {','.join(missing_members)} members are missing from the defined " + f"excreta list and will therefore be determined through an additional MP simulation." 
+ ) + member_excreta.update(CommScores.mp(missing_members, environment)) + else: + member_excreta = CommScores.mp( + member_models, environment, None, abstol, printing + ) + for org_model in member_models: + other_excreta = set( + chain.from_iterable( + [ + excreta + for model, excreta in member_excreta.items() + if model != org_model.id + ] + ) + ) + print(f"\n{org_model.id}\tOther Excreta", other_excreta) + model_util = MSModelUtil(org_model, True) + if environment: + model_util.add_medium(environment) + ex_rxns = { + ex_rxn: list(ex_rxn.metabolites)[0] + for ex_rxn in model_util.exchange_list() + } + print(f"\n{org_model.id}\tExtracellular reactions", ex_rxns) + variables = { + ex_rxn.id: Variable( + "___".join([model_util.model.id, ex_rxn.id]), + lb=0, + ub=1, + type="binary", + ) + for ex_rxn in ex_rxns + } + model_util.add_cons_vars(list(variables.values())) + media, solutions = [], [] + sol = model_util.model.optimize() + while sol.status == "optimal" and len(solutions) < n_solutions: + solutions.append(sol) + medium = set( + [ + ex + for ex in ex_rxns + if sol.fluxes[ex.id] < -abstol and ex in other_excreta + ] + ) + model_util.create_constraint( + Constraint( + sum([variables[ex.id] for ex in medium]), + ub=len(medium) - 1, + name=f"iteration_{len(solutions)}", + ) + ) + media.append(medium) + sol = model_util.model.optimize() + counter = Counter(chain(*media)) + scores[model_util.model.id] = { + met.id: counter[ex] / len(media) + for ex, met in ex_rxns.items() + if counter[ex] > 0 + } + return scores + + @staticmethod + def sc( + member_models: Iterable = None, + com_model=None, + min_growth=0.1, + n_solutions=100, + abstol=1e-6, + compatibilized=True, + printing=False, + ): + """Calculate the frequency of interspecies dependency in a community""" + member_models, community = _load_models( + member_models, com_model, not compatibilized, printing=printing + ) + for rxn in com_model.reactions: + rxn.lower_bound = 0 if "bio" in rxn.id else rxn.lower_bound + + # c_{rxn.id}_lb: rxn < 1000*y_{species_id} + # c_{rxn.id}_ub: rxn > -1000*y_{species_id} + variables = {} + constraints = [] + # TODO this can be converted to an MSCommunity object by looping through each index + # leverage CommKinetics + for org_model in member_models: + model_util = MSModelUtil(org_model, True) + variables[model_util.model.id] = Variable( + name=f"y_{model_util.model.id}", lb=0, ub=1, type="binary" + ) + model_util.add_cons_vars([variables[model_util.model.id]]) + for rxn in model_util.model.reactions: + if "bio" not in rxn.id: + # print(rxn.flux_expression) + lb = Constraint( + rxn.flux_expression + 1000 * variables[model_util.model.id], + name="_".join(["c", model_util.model.id, rxn.id, "lb"]), + lb=0, + ) + ub = Constraint( + rxn.flux_expression - 1000 * variables[model_util.model.id], + name="_".join(["c", model_util.model.id, rxn.id, "ub"]), + ub=0, + ) + constraints.extend([lb, ub]) + + # calculate the SCS + scores = {} + for model in member_models: + com_model_util = MSModelUtil(com_model) + com_model_util.add_cons_vars(constraints, sloppy=True) + # model growth is guaranteed while minimizing the growing members of the community + ## SMETANA_Biomass: {biomass_reactions} > {min_growth} + com_model_util.create_constraint( + Constraint( + sum( + rxn.flux_expression + for rxn in model.reactions + if "bio" in rxn.id + ), + name="SMETANA_Biomass", + lb=min_growth, + ) + ) # sloppy = True) + other_members = [other for other in member_models if other.id != model.id] + com_model_util.add_objective( + 
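+ # NOTE (editorial): the SC formulation couples each member's fluxes to a binary y_member via
+ # the 1000-scaled constraints above, enforces community biomass >= min_growth, and repeatedly
+ # minimizes the number of active partners; members with y above abstol are tallied as donors.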
sum([variables[other.id] for other in other_members]), "min" + ) + previous_constraints, donors_list = [], [] + for i in range(n_solutions): + sol = com_model.optimize() # FIXME The solution is not optimal + if sol.status != "optimal": + scores[model.id] = None + break + donors = [ + o + for o in other_members + if com_model.solver.primal_values[f"y_{o.id}"] > abstol + ] + donors_list.append(donors) + previous_con = f"iteration_{i}" + previous_constraints.append(previous_con) + com_model_util.add_cons_vars( + [ + Constraint( + sum(variables[o.id] for o in donors), + name=previous_con, + ub=len(previous_constraints) - 1, + ) + ], + sloppy=True, + ) + if i != 0: + donors_counter = Counter(chain(*donors_list)) + scores[model.id] = { + o.id: donors_counter[o] / len(donors_list) for o in other_members + } + return scores + + @staticmethod + def gyd( + member_models: Iterable = None, + model_utils: Iterable = None, + environment=None, + coculture_growth=False, + community=None, + anme_comm=False, + ): + gyds = {} + for combination in combinations(model_utils or member_models, 2): + if model_utils is None: + model1_util = MSModelUtil(combination[0], True) + model2_util = MSModelUtil(combination[1], True) + print( + f"{model1_util.model.id} ++ {model2_util.model.id}", + model1_util.model.slim_optimize(), + model2_util.model.slim_optimize(), + ) + if environment and not anme_comm: + model1_util.add_medium(environment) + model2_util.add_medium(environment) + else: + model1_util = combination[0] + model2_util = combination[1] + if not coculture_growth: + G_m1, G_m2 = CommScores._determine_growths([model1_util, model2_util]) + G_m1, G_m2 = G_m1 if FBAHelper.isnumber(str(G_m1)) else 0, ( + G_m2 if FBAHelper.isnumber(str(G_m2)) else 0 + ) + else: + community = community or MSCommunity( + member_models=[model1_util.model, model2_util.model], + ids=[mem.id for mem in member_models], + ) + community.run_fba() + member_growths = community.parse_member_growths() + G_m1, G_m2 = ( + member_growths[model1_util.model.id], + member_growths[model2_util.model.id], + ) + if G_m2 <= 0 or G_m1 <= 0: + gyds[f"{model1_util.model.id} ++ {model2_util.model.id}"] = ( + "", + "", + G_m1, + G_m2, + ) + continue + gyds[f"{model1_util.model.id} ++ {model2_util.model.id}"] = ( + abs(G_m1 - G_m2) / G_m1, + abs(G_m2 - G_m1) / G_m2, + G_m1, + G_m2, + ) + return gyds + + @staticmethod + def pc( + member_models=None, + modelutils=None, + com_model=None, + isolate_growths=None, + comm_sol=None, + environment=None, + comm_effects=True, + community=None, + interaction_threshold=0.1, + compatibilized=False, + ): + assert member_models or modelutils or community, ( + "Members must be defined through either < member_models >" + "or < modelutils > or < community >." 
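The GYD score above is a pair of growth gaps, each normalized by one member's monoculture growth. A toy sketch with hypothetical growth rates from `slim_optimize()`:

```python
# GYD tuple: (|G1-G2|/G1, |G2-G1|/G2, G1, G2); empty strings flag a member
# that cannot grow alone, mirroring the guard clause above.
def gyd_pair(g1: float, g2: float):
    if g1 <= 0 or g2 <= 0:
        return ("", "", g1, g2)
    return (abs(g1 - g2) / g1, abs(g2 - g1) / g2, g1, g2)

print(gyd_pair(0.8, 0.2))  # (0.75, 3.0, 0.8, 0.2)
```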
+ ) + member_models = ( + member_models or [mem.model for mem in modelutils] or community.members + ) + if com_model is None: + member_models, com_model = _load_models( + member_models, None, not compatibilized, printing=False + ) + community = community or MSCommunity(com_model, member_models) + if comm_sol is None: + community.util.add_medium(environment) + comm_sol = community.util.model.optimize() + model_utils = modelutils or [MSModelUtil(mem, True) for mem in member_models] + modelutils = [] + for mem in model_utils: + mem.add_medium(environment) + modelutils.append(mem) + if isolate_growths is None: + isolate_growths = {mem.id: mem.model.slim_optimize() for mem in modelutils} + pc_score = comm_sol.objective_value / sum(list(isolate_growths.values())) + if not comm_effects: + return pc_score + + comm_member_growths = { + mem.id: comm_sol.fluxes[mem.primary_biomass.id] for mem in community.members + } + comm_growth_effect = { + memID: nanFilter(comm_environ / isolate_growths[memID]) + for memID, comm_environ in comm_member_growths.items() + } + growth_diffs = array( + [nanFilter(x, False) for x in list(comm_growth_effect.values())] + ) + th_pos, th_neg = 1 + interaction_threshold, 1 - interaction_threshold + if all(growth_diffs > th_pos): + bit = "mutualism" + elif all(growth_diffs < th_neg): + bit = "competitive" + elif ((th_pos > growth_diffs) & (growth_diffs > th_neg)).all(): + bit = "neutral" + elif all(growth_diffs > th_neg) and any(growth_diffs > th_pos): + bit = "commensalism" + elif all(growth_diffs < th_pos) and any(growth_diffs < th_neg): + bit = "amensalism" + elif any(growth_diffs > th_pos) and any(growth_diffs < th_neg): + bit = "parasitism" + else: + print( + f"The relative growths {comm_growth_effect} from {comm_member_growths} coculture and" + f" {isolate_growths} monoculture are not captured." 
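The PC branch above labels the pairwise interaction from each member's coculture-to-monoculture growth ratio. A condensed sketch of the same threshold logic (hypothetical ratios):

```python
# Classify an interaction from growth ratios r_i = coculture_i / monoculture_i,
# compared against 1 +/- interaction_threshold.
import numpy as np

def classify(growth_ratios, threshold=0.1):
    r = np.asarray(growth_ratios)
    hi, lo = 1 + threshold, 1 - threshold
    if (r > hi).all():
        return "mutualism"
    if (r < lo).all():
        return "competitive"
    if ((r > lo) & (r < hi)).all():
        return "neutral"
    if (r > lo).all() and (r > hi).any():
        return "commensalism"
    if (r < hi).all() and (r < lo).any():
        return "amensalism"
    if (r > hi).any() and (r < lo).any():
        return "parasitism"
    return ""

print(classify([1.4, 0.7]))  # parasitism
```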
+ ) + bit = "" + return (pc_score, comm_growth_effect, comm_member_growths, bit) + + @staticmethod + def bss( + member_models: Iterable = None, + model_utils: Iterable = None, + environments=None, + minMedia=None, + skip_bad_media=False, + ): + def compute_score(minMedia, environment=None, index=0): + minMedia = minMedia or _get_media( + model_s_=[modelUtil.model for modelUtil in model_utils], + environment=environment, + skip_bad_media=skip_bad_media, + ) + model1_media = set( + [ + re.sub(r"(\_\w\d+$)", "", rxnID.replace("EX_", "")) + for rxnID in minMedia[model1_util.id]["media"].keys() + ] + ) + model2_media = set( + [ + re.sub(r"(\_\w\d+$)", "", rxnID.replace("EX_", "")) + for rxnID in minMedia[model2_util.id]["media"].keys() + ] + ) + model1_internal = { + rm_comp(met.id) + for rxn in model1_util.internal_list() + for met in rxn.products + } + model2_internal = { + rm_comp(met.id) + for rxn in model2_util.internal_list() + for met in rxn.products + } + bss_scores[ + f"{model1_util.id} supporting {model2_util.id} in media{index}" + ] = ( + model1_internal, + len(model2_media.intersection(model1_internal)) / len(model2_media), + ) + bss_scores[ + f"{model2_util.id} supporting {model1_util.id} in media{index}" + ] = ( + model2_internal, + len(model1_media.intersection(model2_internal)) / len(model1_media), + ) + + bss_scores = {} + for combination in combinations(model_utils or member_models, 2): + if model_utils is None: + model1_util = MSModelUtil(combination[0], True) + model2_util = MSModelUtil(combination[1], True) + model_utils = [model1_util, model2_util] + else: + model1_util = combination[0] + model2_util = combination[1] + if environments: + for index, environment in enumerate(environments): + compute_score(minMedia, environment, index) + else: + compute_score(minMedia) + return bss_scores + + @staticmethod + def mqs(): + pass + + @staticmethod + def _calculate_jaccard_score(set1, set2): + if set1 == set2: + print(f"The sets are identical, with a length of {len(set1)}.") + if len(set1.union(set2)) == 0: + return (None, None) + return ( + set1.intersection(set2), + len(set1.intersection(set2)) / len(set1.union(set2)), + ) + + @staticmethod + def get_all_genomes_from_ws( + ws_id, + kbase_object=None, + cobrakbase_repo_path: str = None, + kbase_token_path: str = None, + ): + def get_genome(genome_name): + return kbase_object.ws_client.get_objects2( + {"objects": [{"ref": f"{ws_id}/{genome_name}"}]} + )["data"][0]["data"] + + # load the kbase client instance + if not kbase_object: + import os + + os.environ["HOME"] = cobrakbase_repo_path + import cobrakbase + + with open(kbase_token_path) as token_file: + kbase_object = cobrakbase.KBaseAPI(token_file.readline()) + + # calculate the complementarity + genome_list = kbase_object.ws_client.list_objects( + { + "ids": [ws_id], + "type": "KBaseGenomes.Genome", + "minObjectID": 0, + "maxObjectID": 10000, + } + ) + genome_names = [g[1] for g in genome_list if g[1].endswith("RAST")] + return { + genome_name: set( + [ + sso + for j in get_genome(genome_name)["cdss"] + for sso in j["ontology_terms"]["SSO"].keys() + ] + ) + for genome_name in genome_names + } + + @staticmethod + def fs( + models: Iterable = None, + kbase_object=None, + cobrakbase_repo_path: str = None, + kbase_token_path: str = None, + annotated_genomes: dict = None, + printing=False, + ): + if not isinstance(annotated_genomes, dict): + if not kbase_object: + import os + + os.environ["HOME"] = cobrakbase_repo_path + import cobrakbase + + with open(kbase_token_path) as 
token_file: + kbase_object = cobrakbase.KBaseAPI(token_file.readline()) + annotated_genomes = { + model.id: kbase_object.get_from_ws(model.genome_ref) + for model in models + if hasattr(model, "genome_ref") + } + elif isinstance(annotated_genomes, list): + annotated_genomes = dict( + zip([model.id for model in models], annotated_genomes) + ) + elif models is not None: + annotated_genomes = { + k: v + for k, v in annotated_genomes.items() + if k in [model.id for model in models] + } + genome_combinations = list(combinations(annotated_genomes.keys(), 2)) + if printing: + print( + f"The Functionality Score (FS) will be calculated for {len(genome_combinations)} pairs." + ) + if not isinstance(list(annotated_genomes.values())[0], dict): + genome1_set, genome2_set = set(), set() + distances = {} + for genome1, genome2 in genome_combinations: + for j in annotated_genomes[genome1].features: + for key, val in j.ontology_terms.items(): + if key == "SSO": + genome1_set.update(val) + for j in annotated_genomes[genome2].features: + for key, val in j.ontology_terms.items(): + if key == "SSO": + genome2_set.update(val) + distances[f"{genome1} ++ {genome2}"] = ( + CommScores._calculate_jaccard_score(genome1_set, genome2_set) + ) + else: + distances = { + f"{genome1} ++ {genome2}": CommScores._calculate_jaccard_score( + set( + list(content["SSO"].keys())[0] + for dic in annotated_genomes[genome1]["cdss"] + for x, content in dic.items() + if x == "ontology_terms" and len(content["SSO"].keys()) > 0 + ), + set( + list(content["SSO"].keys())[0] + for dic in annotated_genomes[genome2]["cdss"] + for x, content in dic.items() + if x == "ontology_terms" and len(content["SSO"].keys()) > 0 + ), + ) + for genome1, genome2 in combinations(annotated_genomes.keys(), 2) + } + return distances + + @staticmethod + def smetana( + member_models: Iterable, + environment, + com_model=None, + min_growth=0.1, + n_solutions=100, + abstol=1e-6, + prior_values=None, + compatibilized=False, + sc_coupling=False, + printing=False, + ): + """Quantifies the extent of syntrophy as the sum of all exchanges in a given nutritional environment""" + member_models, community = _load_models( + member_models, com_model, compatibilized == False, printing=printing + ) + sc = None + if not prior_values: + mp = CommScores.mp(member_models, environment, com_model, abstol) + mu = CommScores.mu( + member_models, environment, mp, n_solutions, abstol, compatibilized + ) + if sc_coupling: + sc = CommScores.sc( + member_models, + com_model, + min_growth, + n_solutions, + abstol, + compatibilized, + ) + elif len(prior_values) == 3: + sc, mu, mp = prior_values + else: + mu, mp = prior_values + + smetana_scores = {} + for pairs in combinations(member_models, 2): + for model1, model2 in permutations(pairs): + if model1.id not in smetana_scores: + smetana_scores[model1.id] = {} + if not any([not mu[model1.id], not mp[model1.id]]): + sc_score = 1 if not sc_coupling else sc[model1.id][model2.id] + models_mets = list(model1.metabolites) + list(model2.metabolites) + unique_mets = set([met.id for met in models_mets]) + smetana_scores[model1.id][model2.id] = 0 + for met in models_mets: + if met.id in unique_mets: + mp_score = 0 if met.id not in mp[model1.id] else 1 + smetana_scores[model1.id][model2.id] += ( + mu[model1.id].get(met.id, 0) * sc_score * mp_score + ) + return smetana_scores + + @staticmethod + def antiSMASH(json_path=None, zip_path=None): + # TODO Scores 2, 4, and 5 are being explored for relevance to community formation and reveal specific member 
interactions/targets + # load the antiSMASH report from either the JSON or the raw ZIP, or both + from os import mkdir, listdir, path + from zipfile import ZipFile + from json import load + + if json_path: + cwd_files = listdir() + if json_path not in cwd_files and zip_path: + with ZipFile(zip_path, "r") as zip_file: + zip_file.extract(json_path) + with open(json_path, "r") as json_file: + data = load(json_file) + elif zip_path: + mkdir("extracted_antiSMASH") + with ZipFile(zip_path, "r") as zip_file: + zip_file.extractall("extracted_antiSMASH") + json_files = [ + x for x in listdir("extracted_antiSMASH") if x.endswith("json") + ] + if len(json_files) > 1: + print( + f"The antiSMASH report describes {len(json_files)} JSON files, the first of which is selected " + f"{json_files[0]} for analysis, otherwise explicitly identify the desired JSON file in the json_path parameter." + ) + with open( + path.join("extracted_antiSMASH", json_files[0]), "r" + ) as json_file: + data = load(json_file) + else: + raise ParameterError( + "Either the json_path or zip_path from the antiSMASH analysis must be provided," + " for these scores to be determined." + ) + # Parse data and scores from the antiSMASH report + biosynthetic_areas = data["records"][0]["areas"] + BGCs = set( + array( + [ + data["records"][0]["areas"][i]["products"] + for i in range(biosynthetic_areas) + ] + ).flatten() + ) + len_proteins = len( + data["records"][0]["modules"]["antismash.modules.clusterblast"][ + "knowncluster" + ]["proteins"] + ) + protein_annotations = [ + data["records"][0]["modules"]["antismash.modules.clusterblast"][ + "knowncluster" + ]["proteins"][i]["annotations"] + for i in range(len_proteins) + ] + clusterBlast = [s for s in protein_annotations if "resistance" in s] + num_clusterBlast = sum( + [item.count("resistance") for item in protein_annotations] + ) + + return ( + biosynthetic_areas, + BGCs, + protein_annotations, + clusterBlast, + num_clusterBlast, + ) diff --git a/modelseedpy/community/commscores_template.html b/modelseedpy/community/commscores_template.html new file mode 100644 index 00000000..b379568a --- /dev/null +++ b/modelseedpy/community/commscores_template.html @@ -0,0 +1,157 @@ + + + + + + CommScores Results + + + + + + + + + + + + + + + +

[static HTML template: "CommScores Results" report page; markup omitted]
+ + + + \ No newline at end of file diff --git a/modelseedpy/community/datastandardization.py b/modelseedpy/community/datastandardization.py new file mode 100644 index 00000000..026d008f --- /dev/null +++ b/modelseedpy/community/datastandardization.py @@ -0,0 +1,1193 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Aug 1 11:44:07 2022 + +@author: Andrew Freiburger +""" +from modelseedpy.community.commhelper import phenotypes +from modelseedpy.core.exceptions import ParameterError +from modelseedpy.core.optlanghelper import isIterable +from modelseedpy.core.fbahelper import FBAHelper +from optlang import Constraint +from optlang.symbolics import Zero +from scipy.constants import hour +from zipfile import ZipFile, ZIP_LZMA +from itertools import chain +from typing import Union, Iterable +from copy import deepcopy + +# from cplex import Cplex +import logging, json, os, re +from pandas import read_csv, DataFrame, ExcelFile +import numpy as np + + +import logging + +logger = logging.getLogger(__name__) + + +def isnumber(string): + try: + float(string) + except: + return False + return True + + +def _findDate(string, numerical=False): + monthNames = [ + "January", + "February", + "March", + "April", + "May", + "June", + "July", + "August", + "September", + "October", + "November", + "December", + ] + monthNums = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] + days = list(range(31, 0, -1)) # [f"{num}-" for num in list(range(31,0,-1))] + years = list(range(2010, 2025)) + list( + range(10, 25) + ) # [f"-{num}" for num in list(range(2000, 2100))] + americanDates = [ + f"{mon}-{day}-{year}" for mon in monthNums for day in days for year in years + ] + + for date in americanDates: + if re.search(date, string): + month, day, year = date.split("-") + if numerical: + return "-".join([day, month, year]) + return f"{monthNames[int(month)-1][:3]} {day}, {year}" + # # determine the month + # for monName in monthNames: + # if re.search(monName, string): + # month = monName + # break + # if not month: + # for monNum in monthNums: + # if re.search(monNum, string): + # month = monNum # maybe should be converted to the Name for standardization + # # determine the day + # for dayNum in days: + # if re.search(dayNum, string): + # day = dayNum + # break + # # determine the year + # for yearNum in years: + # if re.search(yearNum, string): + # year = yearNum + # break + # return day+month+year + + +def dict_keys_exists(dic, *keys): + if keys[0] in dic: + remainingKeys = keys[1:] + if len(remainingKeys) > 0: + dict_keys_exists(dic[keys[0]], keys[1:]) + return True + return False + + +def find_dic_number(dic): + for k, v in dic.items(): + if isnumber(v): + return v + num = find_dic_number(dic[k]) + return num + + +def default_dict_values(dic, key, default): + return default if not key in dic else dic[key] + + +def trial_contents(short_code, indices_tup, values): + matches = [ele == short_code for ele in indices_tup] + return np.array(values)[matches] + + +def _spreadsheet_extension_load(path): + if ".csv" in path: + return read_csv(path) + elif ".xls" in path: + return ExcelFile(path) + + +def _spreadsheet_extension_parse(path, raw_data, org_sheet): + if ".csv" in path: + return raw_data + elif ".xls" in path: + return raw_data.parse(org_sheet) + + +def _met_id_parser(met): + met_id = re.sub("(\_\w\d+)", "", met) + met_id = met_id.replace("EX_", "", 1) + met_id = met_id.replace("c_", "", 1) + return met_id + + +def _column_reduction(org_df): + dataframe = org_df.copy() # this prevents an irrelevant warning from pandas + 
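Note that `dict_keys_exists` defined earlier in this file discards the result of its recursive call (and passes the remaining key tuple unexpanded), so it effectively only tests the first key. A corrected sketch:

```python
# Recursive nested-key test that propagates the recursion result and
# unpacks the remaining keys.
def dict_keys_exist(dic, *keys):
    if keys[0] not in dic:
        return False
    if len(keys) > 1:
        return dict_keys_exist(dic[keys[0]], *keys[1:])
    return True

print(dict_keys_exist({"a": {"b": 1}}, "a", "b"))  # True
print(dict_keys_exist({"a": {"b": 1}}, "a", "c"))  # False
```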
dataframe.columns = map(str, dataframe.columns) + dataframe.index = dataframe["Well"] + dataframe.drop("Well", axis=1, inplace=True) + for col in dataframe.columns: + if any([x in col for x in ["Plate", "Well", "Cycle"]]): + dataframe.drop(col, axis=1, inplace=True) + dataframe.columns = list(map(int, list(map(float, dataframe.columns)))) + return dataframe + + +def _remove_trials(org_df, ignore_trials, signal, name, significant_deviation): + # refine the ignore_trials parameter + if isinstance(ignore_trials, dict): + ignore_trials["columns"] = ( + list(map(str, ignore_trials["columns"])) + if "columns" in ignore_trials + else [] + ) + ignore_trials["rows"] = ( + list(map(str, ignore_trials["rows"])) if "rows" in ignore_trials else [] + ) + ignore_trials["wells"] = ( + ignore_trials["wells"] if "wells" in ignore_trials else [] + ) + elif isIterable(ignore_trials): + if ignore_trials[0][0].isalpha() and isnumber(ignore_trials[0][1:]): + short_code = True # TODO - drop trials with respect to the short codes, and not the full codes + + dataframe = org_df.copy() # this prevents an irrelevant warning from pandas + dropped_trials = [] + for trial in dataframe.index: + if ( + isinstance(ignore_trials, dict) + and any( + [ + trial[0] in ignore_trials["rows"], + trial[1:] in ignore_trials["columns"], + trial in ignore_trials["wells"], + ] + ) + or isIterable(ignore_trials) + and trial in ignore_trials + ): + dataframe.drop(trial, axis=0, inplace=True) + dropped_trials.append(trial) + elif isIterable(ignore_trials) and trial in ignore_trials: + dataframe.drop(trial, axis=0, inplace=True) + dropped_trials.append(trial) + removed_trials = [] + if "OD" not in signal: + for trial, row in dataframe.iterrows(): + row_array = np.array(row.to_list()) + ## remove trials for which the biomass growth did not change by the determined minimum deviation + if row_array[-1] / row_array[0] < significant_deviation: + dataframe.drop(trial, axis=0, inplace=True) + removed_trials.append(trial) + if removed_trials: + print( + f"The {removed_trials} trials were removed from the {name} measurements, " + f"with their deviation over time being less than the threshold of {significant_deviation}." + ) + if dropped_trials: + print( + f"The {dropped_trials} trials were dropped from the {name} measurements " + "per the ignore_trials parameter." 
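For non-OD signals, `_remove_trials` above keeps a trial only when its final reading is at least `significant_deviation` times its first reading. A one-trial illustration with hypothetical readings:

```python
# A trial is dropped when readings[-1] / readings[0] < significant_deviation.
import numpy as np

readings = np.array([0.05, 0.06, 0.08, 0.25])
significant_deviation = 2
keep = readings[-1] / readings[0] >= significant_deviation  # 5.0 >= 2 -> True
print(keep)
```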
+ ) + return dataframe, dropped_trials + removed_trials + + +def _check_plateau(org_df, signal, name, significant_deviation, timesteps_len): + significant_deviation = max([2, significant_deviation]) + dataframe = org_df.copy() # this prevents an irrelevant warning from pandas + dropped = [] + for trial, row in dataframe.iterrows(): + row_array = np.array(row.to_list()) + values = [] + tracking = False + ## remove trials for which the biomass growth did not change by the determined minimum deviation + for index, val in enumerate(row_array): + if val / row_array[0] >= significant_deviation or tracking: + tracking = True + values.append(val) + if len(values) > timesteps_len: + del values[0] + remaining_values = list(dataframe.columns[index - timesteps_len + 1 :]) + if all( + [ + len(values) == timesteps_len, + values[-1] <= values[0], + remaining_values[0] <= remaining_values[-1] * 1.1, + ] + ): + # the entire plateau, minus the first point of plateau, are removed + dropped = remaining_values + break + if dropped: + break + if dropped: + content = f"{name} {signal}" if name != signal else signal + print( + f"The {dropped} timesteps (with {row_array[index-len(values)+1:]} values) were removed " + f"from the {content} data since the OD plateaued and is no longer valid." + ) + return dropped + + +def _remove_timesteps(org_df, ignore_timesteps, name, signal): + dataframe = org_df.copy() # this prevents an irrelevant warning from pandas + if ignore_timesteps: + dropped = [] + for col in dataframe: + if col in ignore_timesteps: + dataframe.drop(col, axis=1, inplace=True) + dropped.append(col) + if dropped == ignore_timesteps: + print( + f"The ignore_timesteps columns were dropped for the {name} {signal} data." + ) + else: + raise ParameterError( + f"The ignore_timesteps values {ignore_timesteps} " + f"were unsuccessfully dropped for the {name} {signal} data." 
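`_check_plateau` above scans each trial for a window of `timesteps_len` consecutive readings that stop increasing once growth has cleared the significance threshold, then flags the remaining timesteps for removal. A simplified standalone sketch:

```python
# Return the timestep indices from the start of a detected plateau onward:
# growth must first exceed `threshold`-fold, then a `window` of readings
# that ends no higher than it began marks the plateau.
def find_plateau(readings, threshold=2, window=3):
    values = []
    for i, val in enumerate(readings):
        if values or val / readings[0] >= threshold:
            values.append(val)
            if len(values) > window:
                values.pop(0)
            if len(values) == window and values[-1] <= values[0]:
                return list(range(i - window + 1, len(readings)))
    return []

print(find_plateau([0.1, 0.25, 0.5, 0.5, 0.49, 0.49]))  # [2, 3, 4, 5]
```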
+ ) + return dataframe, ignore_timesteps + + +def _df_construction( + name, + df_name, + ignore_trials, + ignore_timesteps, + significant_deviation, + dataframe, + row_num, + buffer_col1=True, +): + # refine the DataFrames + time_df = _column_reduction(dataframe.iloc[0::2]) + values_df = _column_reduction(dataframe.iloc[1::2]) + # display(name, time_df, values_df) + + # remove specified data trials + if ignore_trials: + values_df, removed_trials = _remove_trials( + values_df, ignore_trials, df_name, name, significant_deviation + ) + for row in removed_trials: + time_df.drop(row, axis=0, inplace=True) + + # remove specified data timesteps + if ignore_timesteps: + values_df, removed_timesteps = _remove_timesteps( + values_df, ignore_timesteps, name, df_name + ) + for col in list(map(int, removed_timesteps)): + time_df.drop(col, axis=1, inplace=True) + + # remove undefined trials + if buffer_col1: + possible_rows = [chr(ord("A") + row) for row in range(1, row_num + 1)] + for trial_code in values_df.index: + if trial_code[0] not in possible_rows: + values_df.drop(trial_code, axis=0, inplace=True) + time_df.drop(trial_code, axis=0, inplace=True) + + # process the data for subsequent operations and optimal efficiency + values_df.astype(str) + time_df.astype(str) + return time_df, values_df + + +def _find_culture(string): + matches = re.findall(r"([A-Z]{2}\+?[A-Z]*)", string) + return [m for m in matches if not any([x in m for x in ["BIOLOG", "III"]])] + + +def reverse_strip_comp(ID): + return ID.replace("~", "-") + + +def _process_csv(self, csv_path, index_col): + self.zipped_output.append(csv_path) + csv = read_csv(csv_path) + csv.index = csv[index_col] + csv.drop(index_col, axis=1, inplace=True) + csv.astype(str) + return csv + + +def add_rel_flux_cons(model, ex, phenoRXN, carbon_ratio, rel_flux=0.2): + # {ex.id}_uptakeLimit: {net_{carbonous_ex}} >= {net_{carbon_source}}*{rel_flux}*{carbon_ratio} + # The negative flux sign of influxes specifies that the carbon_source value must be lesser than the other + # carbon influx that is being constrained. 
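The coefficients set just below encode `net_exchange - rel_flux * carbon_ratio * net_source >= 0` over cobra's split forward/reverse variables. A standalone optlang sketch of the same inequality with plain variables (hypothetical names):

```python
# Net flux of each reaction is (forward - reverse); the constraint caps the
# second carbon influx relative to the phenotype's carbon source.
from optlang import Model, Variable, Constraint

ex_f, ex_r = Variable("ex_f", lb=0), Variable("ex_r", lb=0)
ph_f, ph_r = Variable("ph_f", lb=0), Variable("ph_r", lb=0)
rel_flux, carbon_ratio = 0.2, 1.0
cons = Constraint(
    (ex_f - ex_r) - rel_flux * carbon_ratio * (ph_f - ph_r),
    lb=0,
    name="ex_uptakeLimit",
)
model = Model()
model.add([cons])  # the constraint's variables are registered automatically
```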
+ cons = Constraint(Zero, lb=0, ub=None, name=f"{ex.id}_uptakeLimit") + model.add_cons_vars(cons) + cons.set_linear_coefficients( + { + ex.forward_variable: 1, + ex.reverse_variable: -1, + phenoRXN.forward_variable: -rel_flux * carbon_ratio, + phenoRXN.reverse_variable: rel_flux * carbon_ratio, + } + ) + return model, cons + + +class GrowthData: + + @staticmethod + def process( + community_members: dict, + base_media=None, + solver: str = "glpk", + all_phenotypes=True, + data_paths: dict = None, + species_abundances: str = None, + carbon_conc_series: dict = None, + ignore_trials: Union[dict, list] = None, + ignore_timesteps: list = None, + species_identities_rows=None, + significant_deviation: float = 2, + extract_zip_path: str = None, + determine_requisite_biomass=False, + ): # , msdb_path:str=None): + # define the number of rows in the experimental data + row_num = len(species_identities_rows) + if "rows" in carbon_conc_series and carbon_conc_series["rows"]: + row_num = len(list(carbon_conc_series["rows"].values())[0]) + # load and parse data and metadata + ( + media_conc, + data_timestep_hr, + simulation_time, + dataframes, + trials, + fluxes_df, + ) = GrowthData.load_data( + base_media, + community_members, + solver, + data_paths, + ignore_trials, + all_phenotypes, + ignore_timesteps, + significant_deviation, + row_num, + extract_zip_path, + ) + experimental_metadata, standardized_carbon_conc, trial_name_conversion = ( + GrowthData.metadata( + base_media, + community_members, + species_abundances, + carbon_conc_series, + species_identities_rows, + row_num, + _findDate(data_paths["path"]), + ) + ) + data_df = GrowthData.data_process(dataframes, trial_name_conversion) + requisite_biomass = ( + {} + if not determine_requisite_biomass + else GrowthData.biomass_growth( + carbon_conc_series, + fluxes_df, + data_df.index.unique(), + trial_name_conversion, + data_paths, + community_members if all_phenotypes else None, + ) + ) + return ( + experimental_metadata, + data_df, + fluxes_df, + standardized_carbon_conc, + requisite_biomass, + trial_name_conversion, + np.mean(data_timestep_hr), + simulation_time, + media_conc, + ) + + @staticmethod + def load_data( + base_media, + community_members, + solver, + data_paths, + ignore_trials, + all_phenotypes, + ignore_timesteps, + significant_deviation, + row_num, + extract_zip_path, + min_timesteps=False, + ): + # define default values + significant_deviation = significant_deviation or 0 + data_paths = data_paths or {} + ignore_timesteps = ignore_timesteps or "0:0" + start, end = ignore_timesteps.split(":") + raw_data = _spreadsheet_extension_load(data_paths["path"]) + for org_sheet, name in data_paths.items(): + if org_sheet == "path": + continue + df = _spreadsheet_extension_parse(data_paths["path"], raw_data, org_sheet) + df.columns = df.iloc[6] + df.drop(df.index[:7], inplace=True) + ## acquire the default start and end indices of ignore_timesteps + start = int(start or df.columns[0]) + end = int(end or df.columns[-1]) + break + ignore_timesteps = list(range(start, end + 1)) if start != end else None + if extract_zip_path: + with ZipFile(extract_zip_path, "r") as zp: + zp.extractall() + + # define only species for which data is defined + fluxes_df, comm_members = phenotypes( + community_members, all_phenotypes, solver=solver + ) + modeled_species = list( + v for v in data_paths.values() if ("OD" not in v and " " not in v) + ) + removed_phenotypes = [ + col + for col in fluxes_df + if not any([species in col for species in modeled_species]) + ] + 
fluxes_df.drop(removed_phenotypes, axis=1, inplace=True) + if removed_phenotypes: + print( + f"The {removed_phenotypes} phenotypes were removed " + f"since their species is not among those with data: {modeled_species}." + ) + + # determine the time range in which all datasets are significant + data_timestep_hr = [] + dataframes = {} + max_timestep_cols = [] + if min_timesteps: + for org_sheet, name in data_paths.items(): + if org_sheet == "path" or "OD" in sheet: + continue + ## define the DataFrame + sheet = org_sheet.replace(" ", "_") + df_name = f"{name}:{sheet}" + dataframes[df_name] = _spreadsheet_extension_parse( + data_paths["path"], raw_data, org_sheet + ) + dataframes[df_name].columns = dataframes[df_name].iloc[6] + dataframes[df_name].drop(dataframes[df_name].index[:7], inplace=True) + ## parse the timesteps from the DataFrame + drop_timestep_range = GrowthData._min_significant_timesteps( + dataframes[df_name], + ignore_timesteps, + significant_deviation, + ignore_trials, + df_name, + name, + ) + max_timestep_cols.append(drop_timestep_range) + ## timesteps that must be dropped for the most restrictive dataset is acquired + max_cols = max(list(map(len, max_timestep_cols))) + for ignore_timesteps in max_timestep_cols: + if len(ignore_timesteps) == max_cols: + break + + # remove trials for which the OD has plateaued + # TODO - this somehow seems to break when the requisite_biomass is ignored + for org_sheet, name in data_paths.items(): + if "OD" not in name: + continue + ## load the OD DataFrame + sheet = org_sheet.replace(" ", "_") + df_name = f"{name}:{sheet}" + dataframes[df_name] = _spreadsheet_extension_parse( + data_paths["path"], raw_data, org_sheet + ) + dataframes[df_name].columns = dataframes[df_name].iloc[6] + dataframes[df_name].drop(dataframes[df_name].index[:7], inplace=True) + ## process the OD DataFrame + data_times_df, data_values_df = _df_construction( + name, + df_name, + ignore_trials, + ignore_timesteps, + significant_deviation, + dataframes[df_name], + row_num, + ) + plateaued_times = _check_plateau( + data_values_df, name, name, significant_deviation, 3 + ) + ## define and store the final DataFrames + for col in plateaued_times: + if col in data_times_df.columns: + data_times_df.drop(col, axis=1, inplace=True) + if col in data_values_df.columns: + data_values_df.drop(col, axis=1, inplace=True) + dataframes[df_name] = (data_times_df, data_values_df) + break + + # refine the non-OD signals + for org_sheet, name in data_paths.items(): + if org_sheet == "path" or "OD" in name: + continue + sheet = org_sheet.replace(" ", "_") + df_name = f"{name}:{sheet}" + if df_name not in dataframes: + dataframes[df_name] = _spreadsheet_extension_parse( + data_paths["path"], raw_data, org_sheet + ) + dataframes[df_name].columns = dataframes[df_name].iloc[6] + dataframes[df_name].drop(dataframes[df_name].index[:7], inplace=True) + # parse the DataFrame for values + simulation_time = dataframes[df_name].iloc[0, -1] / hour + data_timestep_hr.append( + simulation_time / int(dataframes[df_name].columns[-1]) + ) + # define the times and data + data_times_df, data_values_df = _df_construction( + name, + df_name, + ignore_trials, + ignore_timesteps, + significant_deviation, + dataframes[df_name], + row_num, + ) + # display(data_times_df) ; display(data_values_df) + for col in plateaued_times: + if col in data_times_df.columns: + data_times_df.drop(col, axis=1, inplace=True) + if col in data_values_df.columns: + data_values_df.drop(col, axis=1, inplace=True) + dataframes[df_name] = 
(data_times_df, data_values_df) + + # differentiate the phenotypes for each species + trials = set( + chain.from_iterable( + [list(times.index) for times, values in dataframes.values()] + ) + ) + media_conc = ( + {} + if not base_media + else {cpd.id: cpd.concentration for cpd in base_media.mediacompounds} + ) + return ( + media_conc, + data_timestep_hr, + simulation_time, + dataframes, + trials, + fluxes_df, + ) + + @staticmethod + def _min_significant_timesteps( + full_df, ignore_timesteps, significant_deviation, ignore_trials, df_name, name + ): + # refine the DataFrames + values_df = _column_reduction(full_df.iloc[1::2]) + values_df, removed_trials = _remove_trials( + values_df, ignore_trials, df_name, name, significant_deviation + ) + timestep_range = list(set(list(values_df.columns)) - set(ignore_timesteps)) + start, end = ignore_timesteps[0], ignore_timesteps[-1] + start_index = list(values_df.columns).index(start) + end_index = list(values_df.columns).index(end) + ## adjust the customized range such that the threshold is reached. + for trial, row in values_df.iterrows(): + row_array = np.delete( + np.array(row.to_list()), list(range(start_index, end_index + 1)) + ) + ## remove trials for which the biomass growth did not change by the determined minimum deviation + while all( + [ + row_array[-1] / row_array[0] < significant_deviation, + end <= values_df.columns[-1], + start >= values_df.columns[0], + ] + ): + # print(timestep_range[0], values_df.columns[0], values_df.columns[-1], end, start) + if ( + timestep_range[0] == values_df.columns[0] + and start != values_df.columns[-1] + ): + timestep_range.append(timestep_range[-1] + 1) + start += 1 + print( + f"The end boundary for {name} is increased to {timestep_range[-1]}", + end="\r", + ) + elif ( + timestep_range[-1] == values_df.columns[-1] + and end != values_df.columns[0] + ): + timestep_range.append(timestep_range[0] - 1) + end -= 1 + print( + f"The start boundary for {name} is decreased to {timestep_range[0]}", + end="\r", + ) + else: + raise ParameterError( + f"All of the timesteps were omitted for {name}." 
+ ) + row_array = np.delete( + np.array(row.to_list()), + list( + range( + list(values_df.columns).index(start), + list(values_df.columns).index(end) + 1, + ) + ), + ) + print("\n") + return list(range(start, end + 1)) + + @staticmethod + def metadata( + base_media, + community_members, + species_abundances, + carbon_conc, + species_identities_rows, + row_num, + date, + ): + # define carbon concentrations for each trial + carbon_conc = carbon_conc or {} + carbon_conc["columns"] = default_dict_values(carbon_conc, "columns", {}) + carbon_conc["rows"] = default_dict_values(carbon_conc, "rows", {}) + column_num = len(species_abundances) + + # define the metadata DataFrame and a few columns + constructed_experiments = DataFrame( + index=[f"G{x+1}" for x in list(range(column_num * row_num))] + ) + constructed_experiments.index.name = "short_code" + base_media_path = ( + "minimal components media" if not base_media else base_media.path[0] + ) + constructed_experiments["base_media"] = [base_media_path] * ( + column_num * row_num + ) + + # define community content + # species_mets = {mem["name"]: np.array([mets["consumed"] for mets in mem["phenotypes"].values()]).flatten() + # for mem in community_members.values()} + # define the strains column + strains, additional_compounds, experiment_ids = [], [], [] + trial_name_conversion = {} + count = 1 + ## apply universal values to all trials + base_row_conc = ( + [] + if "*" not in carbon_conc + else [ + ":".join( + [met, str(carbon_conc["*"][met][0]), str(carbon_conc["*"][met][1])] + ) + for met in carbon_conc["*"] + ] + ) + members = list(mem["name"] for mem in community_members.values()) + for row in range(1, row_num + 1): + row_conc = base_row_conc[:] + trial_letter = chr(ord("A") + row) + trial_name_conversion[trial_letter] = {} + ## add rows where the initial concentration in the first trial is non-zero + for met, conc_dict in carbon_conc["rows"].items(): + if conc_dict[sorted(list(conc_dict.keys()))[row - 1]] > 0: + row_conc.append( + ":".join( + [ + met, + str(conc_dict[sorted(list(conc_dict.keys()))[row - 1]]), + str( + conc_dict[ + sorted(list(conc_dict.keys()), reverse=True)[ + -row + ] + ] + ), + ] + ) + ) + + row_concentration = ";".join(row_conc) + composition = {} + for col in range(1, column_num + 1): + ## construct the columns of information + additional_compounds.append(row_concentration) + experiment_id = [] + for member in members: + ### define the relative community abundances + composition[member] = [ + member, + f"r{species_abundances[col][member]}", + ] + ### define the member strain, where it is appropriate + if member in species_identities_rows[row]: + composition[member][ + 0 + ] += f"_{species_identities_rows[row][member]}" + ### the experimental ID is abundance+memberID + if int(composition[member][1][1:]) != 0: + experiment_id.append( + f"{composition[member][1]}_{composition[member][0]}" + ) + composition[member] = ":".join(composition[member]) + strains.append(";".join(composition[member] for member in members)) + # for row2 in row_conc: + # metID, init, end = row2.split(':') + # ### get the met_name for the corresponding match in values + # met_name = None + # for index, mets in enumerate(species_mets.values()): + # if metID in mets: + # met_name = list(species_mets.keys())[index] + # break + # if "met_name" not in locals() or not met_name: + # logger.critical(f"The specified phenotypes {species_mets} for the {members} members" + # f" does not include the consumption of the available sources" + # f" {row_conc}; hence, the 
model cannot grow.") + # content = "" + # else: + # content = f"{init}_{met_name}" + # experiment_id.append(content) + experiment_id.extend([":".join(row.split(":")[:2]) for row in row_conc]) + experiment_id = "-".join(experiment_id) + experiment_ids.append(experiment_id) + trial_name_conversion[trial_letter][str(col + 1)] = ( + "G" + str(count), + experiment_id, + ) + count += 1 + + # convert the variable concentrations to short codes + standardized_carbon_conc = {} + for met, conc in carbon_conc["rows"].items(): + standardized_carbon_conc[met] = {} + for row, val in conc.items(): + standardized_carbon_conc[met].update( + { + short_code: val + for (short_code, expID) in trial_name_conversion[row].values() + } + ) + for met, conc in carbon_conc["columns"].items(): + standardized_carbon_conc[met] = default_dict_values( + standardized_carbon_conc, met, {} + ) + for col, val in conc.items(): + for row in trial_name_conversion: + standardized_carbon_conc[met][ + trial_name_conversion[row][str(col)][0] + ] = val + + # add columns to the exported dataframe + constructed_experiments.insert(0, "trial_IDs", experiment_ids) + constructed_experiments["additional_compounds"] = additional_compounds + constructed_experiments["strains"] = strains + constructed_experiments["date"] = [date] * (column_num * row_num) + constructed_experiments.to_csv("growth_metadata.tsv", sep="\t") + return constructed_experiments, standardized_carbon_conc, trial_name_conversion + + @staticmethod + def biomass_growth( + carbon_conc, + fluxes_df, + data_df_trials, + trial_name_conversion, + data_paths, + community_members=None, + pheno_info=None, + ): + # TODO - leverage cFBA to partition metabolite consumption between the defined phenotypes + pheno_info = pheno_info or { + f"{content['name']}_{pheno}": mets + for model, content in community_members.items() + for pheno, mets in content["phenotypes"].items() + } + # invert the trial_name_conversion and data_paths keys and values + short_code_trials = { + contents[0]: row + col + for row in trial_name_conversion + for col, contents in trial_name_conversion[row].items() + } + # short_code_trials = {contents[0]:contents[1] for contents in trial_name_conversion[row].values()} + name_signal = {name: signal for signal, name in data_paths.items()} + + # calculate the 90% concentration for each carbon source + requisite_fluxes = {} + for trial in [short_code_trials[ID] for ID in data_df_trials]: + row_letter = trial[0] + col_number = trial[1:] + ## add rows where the initial concentration in the first trial is non-zero + utilized_phenos = {} + food_gradient = carbon_conc.copy() + for dimension, content in food_gradient.items(): + for met, conc_dict in content.items(): + source_conc = conc_dict[ + row_letter if dimension == "rows" else int(col_number) + ] + # print(met, source_conc) + if source_conc == 0 or f"EX_{met}_e0" not in fluxes_df.index: + continue + for pheno, val in fluxes_df.loc[f"EX_{met}_e0"].items(): + # print(pheno, val) + if val < 0: + utilized_phenos[pheno] = source_conc * 0.9 / val + total_consumed = sum(list(utilized_phenos.values())) + # print(utilized_phenos) + + display(fluxes_df) + short_code = trial_name_conversion[row_letter][col_number][0] + requisite_fluxes[short_code] = {} + excreta = {} + for pheno, flux_conversion in utilized_phenos.items(): + species, phenotype = pheno.split("_", 1) + fluxes = ( + fluxes_df.loc[:, pheno] + * abs(flux_conversion) + * abs(flux_conversion / total_consumed) + ) + 
requisite_fluxes[short_code][f"{species}|{name_signal[species]}"] = ( + fluxes[fluxes != 0] + ) + pheno = reverse_strip_comp(pheno) + if "excreted" in pheno_info[pheno]: + # print(pheno_info[pheno]["excreted"]) + excreta.update( + {met: fluxes.loc[met] for met in pheno_info[pheno]["excreted"]} + ) + ## determine the fluxes for the other members of the community through cross-feeding + participated_species = [] + for pheno, mets in pheno_info.items(): + species, phenotype = pheno.split("_", 1) + if ( + any([species in ph for ph in utilized_phenos]) + or species in participated_species + ): + continue + for met in mets["consumed"]: + exMet = f"EX_{met}_e0" + if exMet not in excreta: + continue + fluxes = ( + abs(excreta[exMet] * 0.99 / fluxes_df.loc[exMet, pheno]) + * fluxes_df.loc[:, pheno] + ) + requisite_fluxes[short_code][ + f"{species}|{name_signal[species]}" + ] = fluxes[fluxes != 0] + participated_species.append(species) + # print(requisite_fluxes) + return requisite_fluxes + + @staticmethod + def data_process(dataframes, trial_name_conversion): + short_codes, trials_list = [], [] + values, times = {}, {} # The times must capture upstream + first = True + for df_name, (times_df, values_df) in dataframes.items(): + # print(df_name) + # display(times_df) ; display(values_df) + times_tup = FBAHelper.parse_df(times_df) + average_times = np.mean(times_tup.values, axis=0) + values[df_name], times[df_name] = [], [] + for trial_code in values_df.index: + row_let, col_num = trial_code[0], trial_code[1:] + # print(trial_code, row_let, col_num) + for trial_row_values in trial_contents( + trial_code, values_df.index, values_df.values + ): + if first: + short_code, experimentalID = trial_name_conversion[row_let][ + col_num + ] + trials_list.extend([experimentalID] * len(values_df.columns)) + short_codes.extend([short_code] * len(values_df.columns)) + values[df_name].extend(trial_row_values) + times[df_name].extend(average_times) + first = False + # process the data to the smallest dataset, to accommodate heterogeneous data sizes + minVal = min(list(map(len, values.values()))) + for df_name, data in values.items(): + values[df_name] = data[:minVal] + times2 = times.copy() + for df_name, data in times2.items(): + times[df_name] = data[:minVal] + # construct the growth DataFrame + df_data = { + "trial_IDs": trials_list[:minVal], + "short_codes": short_codes[:minVal], + } + df_data.update( + {"Time (s)": np.mean(list(times.values()), axis=0)} + ) # element-wise average + df_data.update({df_name: vals for df_name, vals in values.items()}) + data_df = DataFrame(df_data) + data_df.index = data_df["short_codes"] + data_df = data_df.drop(["short_codes"], axis=1) + data_df.to_csv("growth_spectra.tsv", sep="\t") + return data_df + + +class BiologData: + + @staticmethod + def process( + data_paths, + trial_conditions_path, + community_members, + col_row_num, + member_conversions, + culture=None, + date=None, + significant_deviation=None, + solver="glpk", + msdb_path: str = None, + ): + row_num = 8 + column_num = 12 + ( + zipped_output, + data_timestep_hr, + simulation_time, + dataframes, + trials, + culture, + date, + fluxes_df, + ) = BiologData.load_data( + data_paths, + significant_deviation, + community_members, + col_row_num, + row_num, + culture, + date, + solver, + ) + experimental_metadata, standardized_carbon_conc, trial_name_conversion = ( + BiologData.metadata( + trial_conditions_path, row_num, column_num, culture, date + ) + ) + biolog_df = BiologData.data_process(dataframes, trial_name_conversion) 
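`GrowthData.biomass_growth` above assumes 90% of each supplied carbon source is consumed and rescales each utilizing phenotype's flux column by its share of the total consumption. A worked arithmetic sketch with hypothetical numbers:

```python
# One phenotype consuming one carbon source: flux_conversion converts the
# simulated uptake flux into the experiment's concentration scale.
source_conc = 10.0                                   # mM supplied
uptake_flux = -2.0                                   # simulated exchange flux
flux_conversion = source_conc * 0.9 / uptake_flux    # -4.5
total_consumed = flux_conversion                     # only one phenotype here
scale = abs(flux_conversion) * abs(flux_conversion / total_consumed)
print(scale)  # 4.5, applied to the phenotype's flux column
```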
+ requisite_biomass = BiologData.biomass_growth(biolog_df, member_conversions) + return ( + experimental_metadata, + biolog_df, + fluxes_df, + standardized_carbon_conc, + requisite_biomass, + trial_name_conversion, + np.mean(data_timestep_hr), + simulation_time, + ) + + @staticmethod + def load_data( + data_paths, + significant_deviation, + community_members, + col_row_num, + row_num, + culture, + date, + solver, + ): + zipped_output = [data_paths["path"], "fluxes.tsv"] + # determine the metabolic fluxes for each member and phenotype + # import and parse the raw CSV data + # TODO - this may be capable of emulating leveraged functions from the GrowthData object + fluxes_df = phenotypes(community_members, solver=solver) + # fluxes_df = None + data_timestep_hr = [] + dataframes = {} + raw_data = _spreadsheet_extension_load(data_paths["path"]) + significant_deviation = significant_deviation or 2 + # culture = culture or _find_culture(data_paths['path']) + culture = culture or ",".join( + [ + x + for x in data_paths.values() + if (x not in ["OD"] and not re.search(r"\w\.\w", x)) + ] + ) + date = date or _findDate(data_paths["path"]) + for org_sheet, name in data_paths.items(): + if org_sheet == "path": + continue + sheet = org_sheet.replace(" ", "_") + df_name = f"{name}:{sheet}" + if df_name not in dataframes: + dataframes[df_name] = _spreadsheet_extension_parse( + data_paths["path"], raw_data, org_sheet + ) + dataframes[df_name].columns = dataframes[df_name].iloc[col_row_num] + dataframes[df_name].drop( + dataframes[df_name].index[: col_row_num + 1], inplace=True + ) + dataframes[df_name].dropna(inplace=True) + # parse the DataFrame for values + dataframes[df_name].columns = [ + str(x).strip() for x in dataframes[df_name].columns + ] + simulation_time = dataframes[df_name].iloc[0, -1] / hour + # display(dataframes[df_name]) + data_timestep_hr.append( + simulation_time / int(float(dataframes[df_name].columns[-1])) + ) + # define the times and data + data_times_df, data_values_df = _df_construction( + name, + df_name, + None, + None, + significant_deviation, + dataframes[df_name], + row_num, + False, + ) + # display(data_times_df) ; display(data_values_df) + dataframes[df_name] = (data_times_df, data_values_df) + + # differentiate the phenotypes for each species + trials = set( + chain.from_iterable([list(df.index) for df, times in dataframes.values()]) + ) + return ( + zipped_output, + data_timestep_hr, + simulation_time, + dataframes, + trials, + culture, + date, + fluxes_df, + ) + + @staticmethod + def metadata(trial_conditions_path, row_num, column_num, culture, date): + # define the conditions for each trial + with open(trial_conditions_path) as trials: + trial_conditions = json.load(trials) + + # define the metadata DataFrame and a few columns + constructed_experiments = DataFrame() + ex_prefix = "B" + constructed_experiments.index = [ + f"{ex_prefix}{x+1}" for x in list(range(row_num * column_num)) + ] + constructed_experiments.index.name = "short_code" + + # define the strains column + experiment_ids, trial_names = [], [] + trial_name_conversion, trial_mets = {}, {} + count = 1 + ## apply universal values to all trials + for row in range(row_num): + trial_letter = chr(ord("A") + row) + trial_name_conversion[trial_letter] = {} + ## add rows where the initial concentration in the first trial is non-zero + for col in range(1, column_num + 1): + ## construct the columns of information + dataID = trial_letter + str(col) + MSID = trial_conditions[dataID]["ModelSEED_ID"] + short_code = 
ex_prefix + str(count) + + experiment_ids.append(MSID) + trial_names.append(trial_conditions[dataID]["name"]) + trial_name_conversion[trial_letter][str(col)] = (short_code, MSID) + trial_mets[MSID] = {short_code: trial_conditions[dataID]["mM"]} + count += 1 + + # add columns to the exported dataframe + constructed_experiments.insert(0, "ModelSEED_ID", experiment_ids) + constructed_experiments.insert(0, "condition", trial_names) + constructed_experiments["strain"] = [culture] * (column_num * row_num) + constructed_experiments["date"] = [date] * (column_num * row_num) + constructed_experiments.to_csv("growth_metadata.tsv", sep="\t") + return constructed_experiments, trial_mets, trial_name_conversion + + @staticmethod + def data_process(dataframes, trial_name_conversion): + short_codes, trials_list = [], [] + values, times = {}, {} # The times must capture upstream + first = True + for df_name, (times_df, values_df) in dataframes.items(): + # display(df_name, times_df, values_df) + times_tup = FBAHelper.parse_df(times_df) + # display(DataFrame(times_tup.values)) + average_times = list(np.mean(times_tup.values, axis=0)) + # print(average_times) + # print(len(average_times)) + values[df_name], times[df_name] = [], [] + for exprID in values_df.index: + row_let, col_num = exprID[0], exprID[1:] + for trial_row_values in trial_contents( + exprID, values_df.index, values_df.values + ): + if first: + short_code, experimentalID = trial_name_conversion[row_let][ + col_num + ] + trials_list.extend([experimentalID] * len(values_df.columns)) + short_codes.extend([short_code] * len(values_df.columns)) + if len(trial_row_values) != len(average_times): + print( + f"The length of the trial data {len(trial_row_values)} " + f"exceeds that of the timesteps {len(average_times)} " + f"which creates an incompatible DataFrame." 
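`BiologData.metadata` above maps each plate well (row letter plus column number) to a short code and its ModelSEED condition ID. A small illustration of the resulting lookup (hypothetical IDs):

```python
# trial_name_conversion[row_letter][column] -> (short_code, ModelSEED_ID)
trial_name_conversion = {"A": {"1": ("B1", "cpd00027"), "2": ("B2", "cpd00082")}}
short_code, ms_id = trial_name_conversion["A"]["2"]
print(short_code, ms_id)  # B2 cpd00082
```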
+ ) + values[df_name].extend(trial_row_values) + times[df_name].extend(average_times) + first = False + # process the data to the smallest dataset, to accommodate heterogeneous data sizes + minVal = min(list(map(len, values.values()))) + for df_name, data in values.items(): + values[df_name] = data[:minVal] + times2 = times.copy() + for df_name, data in times2.items(): + times[df_name] = data[:minVal] + df_data = {"trial_IDs": trials_list, "short_codes": short_codes} + df_data.update( + {"Time (s)": list(np.mean(list(times.values()), axis=0))} + ) # element-wise average + df_data.update({df_name: vals for df_name, vals in values.items()}) + biolog_df = DataFrame(df_data) + biolog_df.index = biolog_df["short_codes"] + del biolog_df["short_codes"] + biolog_df.to_csv("growth_spectra.tsv", sep="\t") + + return biolog_df + + @staticmethod + def biomass_growth(biolog_df, member_conversions): + requisite_biomass = {} + for short_code in biolog_df.index.unique(): + requisite_biomass[short_code] = {} + for signal, conversion in member_conversions.items(): + short_code_df = biolog_df[biolog_df.index == short_code] + requisite_biomass[short_code][signal] = ( + conversion + * short_code_df[signal.replace("|", ":").replace(" ", "_")].iloc[-1] + ) + return requisite_biomass diff --git a/modelseedpy/community/get_ncbi_gbff.pl b/modelseedpy/community/get_ncbi_gbff.pl new file mode 100644 index 00000000..cbeddcfc --- /dev/null +++ b/modelseedpy/community/get_ncbi_gbff.pl @@ -0,0 +1,13 @@ +use strict; + +while (<>){ + chomp ($_); + next if ($_=~/^\s*$/); + my $val = `grep $_ assembly_summary_refseq.txt |cut -f 20`; + chomp ($val); + my @p = split ("/", $val); + my $n = $p[-1]; + my $url = "${val}/${n}_genomic.gbff.gz"; + my $fpath = "${n}_genomic.gbff.gz "; + print "curl $url -o $fpath" . "\n"; +} diff --git a/modelseedpy/community/metquest_code.py b/modelseedpy/community/metquest_code.py new file mode 100644 index 00000000..d6ad31e0 --- /dev/null +++ b/modelseedpy/community/metquest_code.py @@ -0,0 +1,1162 @@ +# -*- coding: utf-8 -*- + +from __future__ import absolute_import +from collections import deque, defaultdict +import os +import glob +import sys +import warnings +from itertools import combinations +import re +import pandas as pd +import numpy as np +import cobra +import networkx as nx + +from modelseedpy.community import commhelper +from modelseedpy import MSModelUtil + +warnings.filterwarnings("ignore") + + +def _create_graph_with_internal_reaction(organismsdata): + """ + This function creates a NetworkX DiGraph object which consists of + reactions and metabolites happening inside the organisms in a community. + This makes use of the reaction information i.e., irreversible and + reversible, which is obtained from another script fetch_reactions. 
+
+    Parameters
+    ----------
+    organismsdata : dict
+        Dictionary containing the reaction information about organisms
+
+    Returns
+    -------
+    G : NetworkX DiGraph Object
+        Bipartite graph consisting of internal reactions in organisms
+    """
+    G = nx.DiGraph()
+    for modelname in organismsdata:
+        G.add_nodes_from(organismsdata[modelname]["irreversible_rxn_no"], bipartite=1)
+        G.add_nodes_from(organismsdata[modelname]["reversible_rxn_no"], bipartite=1)
+        G.add_nodes_from(organismsdata[modelname]["reversible_back_rxn_no"], bipartite=1)
+        irrev_lhs_nodes = list(set(item for sublist in organismsdata[modelname]["irreversible_lhs_nodes"] for item in sublist))
+        irrev_rhs_nodes = list(set(item for sublist in organismsdata[modelname]["irreversible_rhs_nodes"] for item in sublist))
+        rev_lhs_nodes = list(set(item for sublist in organismsdata[modelname]["reversible_lhs_nodes"] for item in sublist))
+        rev_rhs_nodes = list(set(item for sublist in organismsdata[modelname]["reversible_rhs_nodes"] for item in sublist))
+        G.add_nodes_from(irrev_lhs_nodes, bipartite=0)
+        G.add_nodes_from(irrev_rhs_nodes, bipartite=0)
+        G.add_nodes_from(rev_lhs_nodes, bipartite=0)
+        G.add_nodes_from(rev_rhs_nodes, bipartite=0)
+        # Irreversible reactions: reactant -> reaction node -> product
+        for irrevidx in range(len(organismsdata[modelname]["irreversible_rxn_no"])):
+            rxn_node = organismsdata[modelname]["irreversible_rxn_no"][irrevidx]
+            for met in organismsdata[modelname]["irreversible_lhs_nodes"][irrevidx]:
+                G.add_edge(met, rxn_node)
+            for met in organismsdata[modelname]["irreversible_rhs_nodes"][irrevidx]:
+                G.add_edge(rxn_node, met)
+        # Reversible reactions: separate forward and backward reaction nodes
+        for revidx in range(len(organismsdata[modelname]["reversible_rxn_no"])):
+            fwd_node = organismsdata[modelname]["reversible_rxn_no"][revidx]
+            back_node = organismsdata[modelname]["reversible_back_rxn_no"][revidx]
+            for met in organismsdata[modelname]["reversible_lhs_nodes"][revidx]:
+                G.add_edge(met, fwd_node)
+                G.add_edge(back_node, met)
+            for met in organismsdata[modelname]["reversible_rhs_nodes"][revidx]:
+                G.add_edge(fwd_node, met)
+                G.add_edge(met, back_node)
+    return G
+
+
+def _create_graph_with_exchange_reactions(G, orgs, namemap):
+    """
+    This function identifies the common and the non-common exchange
+    metabolites and adds them to the DiGraph object generated above.
+
+    Parameters
+    ----------
+    G : NetworkX DiGraph Object
+        Bipartite graph of reaction network from organisms
+    orgs : dict
+        Dictionary consisting of irreversible, reversible and exchange
+        reactions pertaining to the organisms. If more than one organism
+        is used, this dictionary consists of information about all the
+        organisms.
+    namemap : dict
+        Dictionary mapping the adhoc reaction names to reaction names in
+        the model
+
+    Returns
+    -------
+    G : NetworkX DiGraph Object
+        Bipartite graph consisting of internal and exchange reactions in organisms
+    namemap : dict
+        Dictionary mapping the adhoc exchange reaction names to reaction names in
+        the model
+    """
+    metabolite_exchanged = []
+    for orgnames in orgs:
+        exc_met = orgs[orgnames]["exchange_metab_nodes"]
+        metabolite_exchanged.append(exc_met)
+    # Common exchange metabolites in different organisms
+    common_exchange_metabolite = list(set.intersection(*list(map(set, metabolite_exchanged))))
+    common_exchange_metabolite.sort()
+    # Adding the common exchange metabolites to the graph
+    for orgnames in orgs:
+        renamed_exc_met = [f"{orgnames} {comexcmet}" for comexcmet in common_exchange_metabolite]
+        number_exc_met = list(range(0, len(common_exchange_metabolite)))
+        mod_exc_rxn_number = [f"Org_{orgnames} ER{str(num + 1)}" for num in number_exc_met]
+        mod_exc_rev_rxn_number = [f"Org_{orgnames} ERR{str(num + 1)}" for num in number_exc_met]
+        G.add_nodes_from(mod_exc_rxn_number, bipartite=1)
+        G.add_nodes_from(mod_exc_rev_rxn_number, bipartite=1)
+        G.add_nodes_from(common_exchange_metabolite, bipartite=0)
+        G.add_nodes_from(renamed_exc_met, bipartite=0)
+        for k in range(len(renamed_exc_met)):
+            namemap[mod_exc_rxn_number[k]] = common_exchange_metabolite[k]
+            namemap[mod_exc_rev_rxn_number[k]] = common_exchange_metabolite[k]
+            G.add_edge(renamed_exc_met[k], mod_exc_rxn_number[k])
+            G.add_edge(mod_exc_rxn_number[k], common_exchange_metabolite[k])
+            G.add_edge(common_exchange_metabolite[k], mod_exc_rev_rxn_number[k])
+            G.add_edge(mod_exc_rev_rxn_number[k], renamed_exc_met[k])
+    # Adding the non-common exchange metabolites to the graph
+    for orgnames in orgs:
+        metitems = orgs[orgnames]["exchange_metab_nodes"]
+        non_common_exc_met = list(set(metitems) - set(common_exchange_metabolite))
+        non_common_exc_met.sort()
+        renamed_non_common_exc_met = [f"{orgnames} {s}" for s in non_common_exc_met]
+        number_non_common_exc_met = list(range(0, len(non_common_exc_met)))
+        mod_non_common_exc_rxn_number = [f"Org_{orgnames} NCER{str(num + 1)}" for num in number_non_common_exc_met]
+        mod_non_common_exc_rev_rxn_number = [f"Org_{orgnames} NCERR{str(num + 1)}" for num in number_non_common_exc_met]
+        G.add_nodes_from(mod_non_common_exc_rxn_number, bipartite=1)
+        G.add_nodes_from(mod_non_common_exc_rev_rxn_number, bipartite=1)
+        G.add_nodes_from(non_common_exc_met, bipartite=0)
+        G.add_nodes_from(renamed_non_common_exc_met, bipartite=0)
+        for k in range(len(renamed_non_common_exc_met)):
+            namemap[mod_non_common_exc_rxn_number[k]] = non_common_exc_met[k]
+            namemap[mod_non_common_exc_rev_rxn_number[k]] = non_common_exc_met[k]
+            G.add_edge(renamed_non_common_exc_met[k], mod_non_common_exc_rxn_number[k])
+            G.add_edge(mod_non_common_exc_rxn_number[k], non_common_exc_met[k])
+            G.add_edge(non_common_exc_met[k], mod_non_common_exc_rev_rxn_number[k])
+            G.add_edge(mod_non_common_exc_rev_rxn_number[k], renamed_non_common_exc_met[k])
+    return G, namemap
+
+
+def create_graph(file_names, no_of_orgs):
+    """
+    This function creates bipartite graphs of the organisms based on the
+    models provided and the number of organisms. For instance, if a folder
+    has 3 model files and the number of organisms is 2, 3 (3C2) different
+    bipartite graphs are created. The graph objects and the dictionary
+    can be saved as gpickle and pickle files respectively (see the
+    commented block below).
+
+    Parameters
+    ----------
+    file_names : list
+        List of model objects
+    no_of_orgs : int
+        Number of organisms to be used for creating the DiGraph.
+
+    Returns
+    -------
+    H : list
+        List of NetworkX DiGraph objects, one bipartite graph of internal
+        and exchange reactions per organism combination
+    full_name_map : dict
+        Dictionary mapping the adhoc reaction names to reaction names in
+        the model
+    """
+    H = []
+    organisms_reaction_data, partial_name_map = segregate_reactions_from_models(file_names)
+    if organisms_reaction_data:
+        organisms_names = list(organisms_reaction_data.keys())
+        all_possible_combis = list(combinations(list(range(len(organisms_names))), int(no_of_orgs)))
+        if int(no_of_orgs) > 1 and sorted(organisms_names)[0][0] == "0":
+            all_possible_combis = all_possible_combis[: len(organisms_names) - 1]
+        if all_possible_combis:
+            for ncom in range(len(all_possible_combis)):
+                file_name = ""
+                current_combination = {}
+                for numincom in range(len(all_possible_combis[ncom])):
+                    current_combination[organisms_names[all_possible_combis[ncom][numincom]]] = organisms_reaction_data[organisms_names[all_possible_combis[ncom][numincom]]]
+                    file_name = file_name + organisms_names[all_possible_combis[ncom][numincom]] + "_"
+                H.append(_create_graph_with_internal_reaction(current_combination))
+                temp, full_name_map = _create_graph_with_exchange_reactions(H[ncom], current_combination, partial_name_map)
+                H[ncom] = temp
+                print(len(H), H[ncom])
+                print("Number of edges in graph", len(H[ncom].edges()))
+                print("Number of nodes in graph", len(H[ncom].nodes()))
+
+                # Uncomment the following code to save the graph files externally in your machine
+                # Note: Graph files can occupy a large space for large datasets
+                """
+                if os.access(path_name_with_models, os.W_OK):
+                    with open(file_name + 'namemap' + '.pickle', 'wb') as filetodump:
+                        dump(full_name_map, filetodump)
+                    nx.write_gpickle(H[ncom], file_name + '.gpickle')
+                    print('Graph and namemap saved for file(s) in', path_name_with_models)
+                """
+        else:
+            print("Number of organisms for creating a consortium graph exceeds the number of models given")
+            print("Program will now exit")
+            sys.exit()
+    else:
+        print("Cannot create graph")
+        sys.exit()
+    return H, full_name_map
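+
+# Illustrative usage (a sketch, not part of the API): `org1.xml` and `org2.xml`
+# are hypothetical SBML files readable by COBRApy.
+#     models = [cobra.io.read_sbml_model(f) for f in ("org1.xml", "org2.xml")]
+#     graphs, namemap = create_graph(models, 2)
+#     print(graphs[0].number_of_nodes(), graphs[0].number_of_edges())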
+
+
+def forward_pass(graph_object, media):
+    """
+    This function carries out a Guided Breadth First Search on a directed
+    bipartite graph starting from the entries in the seed metabolite set.
+
+    Parameters
+    ----------
+    graph_object : NetworkX DiGraph Object
+        Bipartite graph of the metabolic network
+    media : dict
+        Dictionary whose keys are the seed metabolites, including the source
+
+    Returns
+    -------
+    lower_bound_metabolite : defaultdict
+        Minimum number of steps required to reach a metabolite
+    status_dict : defaultdict
+        Dictionary pertaining to the status of every reaction - whether it
+        has been visited or not
+    scope : set
+        Set of metabolites that can be produced from the given set of
+        seed metabolites
+
+    Notes
+    -----
+    Starting with the set of seed metabolites S, the algorithm first finds
+    all the reactions from the set R whose precursor metabolites are in S.
+    Such reactions are marked visited and added to the visited reaction set.
+    Metabolites produced by these reactions are checked. The reactions where
+    these metabolites participate are then checked for the presence of all
+    their predecessors and are added to the queue. This traversal continues
+    in a breadth-first manner and stops when there are no further reactions
+    to be visited.
+    """
+    pred = graph_object.predecessors
+    succ = graph_object.successors
+    # All seed metabolites are always present, hence require 0 steps
+    lower_bound_metabolite = defaultdict(list)
+    for cpd in media:
+        lower_bound_metabolite[cpd].append(0)
+    lower_bound_reaction = defaultdict(list)
+    status_dict = defaultdict(str)
+    # Using a deque since deques have O(1) speed for appendleft() and popleft()
+    # while lists have O(n) performance for inserting and popping.
+    queue = deque([])
+    stage = 1
+    mediaMets = list(media.keys())
+    scope = set(media.keys())
+    starting_rxn_node = []
+    # First stage where starting_rxn_node list contains all the reactions
+    # which require only the seed metabolites as input
+    for starting_met_nodes in mediaMets:
+        # Essential when analysing multiple networks with the same seed metabolite
+        # set, although it would be redundant in case of a single network
+        if starting_met_nodes in graph_object:
+            for startingrxns in succ(starting_met_nodes):
+                if set(pred(startingrxns)).issubset(mediaMets):
+                    if startingrxns not in starting_rxn_node:
+                        starting_rxn_node.append(startingrxns)
+                    for metsprod in succ(startingrxns):
+                        scope.add(metsprod)
+                        if stage not in lower_bound_metabolite[metsprod]:
+                            lower_bound_metabolite[metsprod].append(stage)
+                    if stage not in lower_bound_reaction[startingrxns]:
+                        lower_bound_reaction[startingrxns].append(stage)
+    for rxn in starting_rxn_node:
+        for metabs in succ(rxn):
+            for nextrxn in succ(metabs):
+                if set(pred(nextrxn)).issubset(scope):
+                    if nextrxn not in queue:
+                        queue.append(nextrxn)
+        status_dict[rxn] = "V"
+    while queue:
+        stage += 1
+        for parentrxn in list(queue):
+            if status_dict[parentrxn] == "":
+                if stage not in lower_bound_reaction[parentrxn]:
+                    lower_bound_reaction[parentrxn].append(stage)
+                for mets in succ(parentrxn):
+                    scope.add(mets)
+                    if stage not in lower_bound_metabolite[mets]:
+                        lower_bound_metabolite[mets].append(stage)
+                    for progeny in succ(mets):
+                        if set(pred(progeny)).issubset(scope):
+                            if status_dict[progeny] != "V":
+                                if progeny not in queue:
+                                    queue.append(progeny)
+                status_dict[parentrxn] = "V"
+            elif status_dict[parentrxn] == "V":
+                for mets in succ(parentrxn):
+                    if stage not in lower_bound_metabolite[mets]:
+                        lower_bound_metabolite[mets].append(stage)
+            queue.popleft()
+    return lower_bound_metabolite, status_dict, scope
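+
+# Minimal sketch of a forward pass on a toy graph (hypothetical node names,
+# following the metabolite -> reaction -> metabolite bipartite convention):
+#     g = nx.DiGraph()
+#     g.add_edges_from([("Org1 A", "Org_Org1 IR1"), ("Org_Org1 IR1", "Org1 B")])
+#     lb_met, status, scope = forward_pass(g, {"Org1 A": (-1000, 1000)})
+#     # scope now contains "Org1 B", reached in one BFS stage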
+
+
+def find_different_reaction_types(stoi_matrix, model, current_model_name):
+    """
+    This function finds the exchange, irreversible and reversible reactions
+    in the model.
+
+    Parameters
+    ----------
+    stoi_matrix : numpy array
+        Transposed stoichiometric matrix of the model (reactions x metabolites)
+    model : COBRA model object
+        COBRA model object created from SBML models
+    current_model_name : str
+        Name which is to be prefixed against every
+        reaction/metabolite (to differentiate the entries in multiple organisms,
+        when a community model is built)
+
+    Returns
+    -------
+    exchange_met_ids : list
+        Metabolite identifiers of exchange metabolites
+    irrev_lhs_nodes : list
+        Metabolite identifiers of reactants of irreversible reactions
+    irrev_rhs_nodes : list
+        Metabolite identifiers of products of irreversible reactions
+    rev_lhs_nodes : list
+        Metabolite identifiers of reactants of reversible reactions
+    rev_rhs_nodes : list
+        Metabolite identifiers of products of reversible reactions
+    exchange_rxn_ids : list
+        Reaction identifiers of exchange reactions
+    irrev_rxn_ids : list
+        Reaction identifiers of irreversible reactions
+    rev_rxn_ids : list
+        Reaction identifiers of reversible reactions
+    """
+    xdim = np.shape(stoi_matrix)
+    reactants_of_reaction, total_metabolites_in_reaction, products_of_reaction = [], [], []
+    number_of_reactants_in_reaction, total_number_of_metabs_in_reaction = [], []
+    number_of_products_in_reaction, exchange_reaction_idx = [], []
+    reaction_identifiers, reaction_in_model, metabolite_identifiers = [], [], []
+    for metab in model.metabolites:
+        metabolite_identifiers.append(metab.id)
+    for rxns in model.reactions:
+        reaction_identifiers.append(rxns.id)
+        reaction_in_model.append(rxns.reaction)
+    for rxnidx in range(xdim[0]):
+        reactants_of_reaction.append(np.where(stoi_matrix[rxnidx] == -1))
+        total_metabolites_in_reaction.append(np.where(stoi_matrix[rxnidx] != 0))
+        products_of_reaction.append(np.where(stoi_matrix[rxnidx] == 1))
+        number_of_reactants_in_reaction.append(len(reactants_of_reaction[rxnidx][0]))
+        total_number_of_metabs_in_reaction.append(len(total_metabolites_in_reaction[rxnidx][0]))
+        number_of_products_in_reaction.append(len(products_of_reaction[rxnidx][0]))
+
+        # Case 1 - Presence of bulk metabolites in the medium
+        if reaction_in_model[rxnidx][-1] == "b":  # Assuming the bulk metabolites end in 'b'
+            if number_of_reactants_in_reaction[rxnidx] == 1 and number_of_products_in_reaction[rxnidx] == 1:
+                exchange_reaction_idx.append(rxnidx)
+        # Case 2 - Presence of exchange metabolites
+        elif number_of_reactants_in_reaction[rxnidx] == 1 and total_number_of_metabs_in_reaction[rxnidx] == 1:
+            exchange_reaction_idx.append(rxnidx)
+        elif number_of_products_in_reaction[rxnidx] == 1 and total_number_of_metabs_in_reaction[rxnidx] == 1:
+            exchange_reaction_idx.append(rxnidx)
+    exchange_met_ids, exchange_met_index, exchange_rxn_ids = [], [], []
+    for excentry in exchange_reaction_idx:
+        exchange_rxn_ids.append(reaction_identifiers[excentry])
+        if reaction_in_model[excentry][-1] == "b":
+            exchange_met_ids.append(metabolite_identifiers[np.nonzero(stoi_matrix[excentry])[0][0]])
+        else:
+            exchange_met_index.append(np.nonzero(stoi_matrix[excentry])[0].tolist()[0])
+    if exchange_met_index:
+        for metind in exchange_met_index:
+            exchange_met_ids.append(metabolite_identifiers[metind])
+    all_rxn_idx = list(range(len(reaction_in_model)))
+    internal_rxns = list(set(all_rxn_idx) ^ set(exchange_reaction_idx))
+    reversible_rxns, irreversible_rxns, rxns_lowerbound, rxns_upperbound = [], [], [], []
+    for rxns in model.reactions:
+        rxns_lowerbound.append(rxns.lower_bound)
+        rxns_upperbound.append(rxns.upper_bound)
+    for idxint in internal_rxns:
+        if rxns_lowerbound[idxint] < 0 and rxns_upperbound[idxint] >= 0:
+            reversible_rxns.append(idxint)
+        elif rxns_lowerbound[idxint] >= 0 and rxns_upperbound[idxint] >= 0:
+            irreversible_rxns.append(idxint)
+    # Irreversible reaction nodes
+    irrev_lhs_temporary, irrev_rhs_temporary, irrev_lhs_nodes, irrev_rhs_nodes, irrev_rxn_ids = [], [], [], [], []
+    for irridx in irreversible_rxns:
+        irrev_rxn_ids.append(reaction_identifiers[irridx])
+        irrev_lhs_temporary.append(np.where(stoi_matrix[irridx] < 0)[0].tolist())
+        irrev_rhs_temporary.append(np.where(stoi_matrix[irridx] > 0)[0].tolist())
+    for lhsirridx in range(len(irrev_lhs_temporary)):
+        temp_metab_list_lhs = []
+        for met_idx_lhs in irrev_lhs_temporary[lhsirridx]:
+            temp_metab_list_lhs.append(f"{current_model_name} {metabolite_identifiers[met_idx_lhs]}")
+        irrev_lhs_nodes.append(temp_metab_list_lhs)
+    for rhsirridx in range(len(irrev_rhs_temporary)):
+        temp_metab_list_rhs = []
+        for met_idx_rhs in irrev_rhs_temporary[rhsirridx]:
+            temp_metab_list_rhs.append(f"{current_model_name} {metabolite_identifiers[met_idx_rhs]}")
+        irrev_rhs_nodes.append(temp_metab_list_rhs)
+
+    # Reversible reaction nodes
+    rev_lhs_temporary, rev_rhs_temporary, rev_lhs_nodes, rev_rhs_nodes, rev_rxn_ids = [], [], [], [], []
+    for rridx in reversible_rxns:
+        rev_rxn_ids.append(reaction_identifiers[rridx])
+        rev_lhs_temporary.append(np.where(stoi_matrix[rridx] < 0)[0].tolist())
+        rev_rhs_temporary.append(np.where(stoi_matrix[rridx] > 0)[0].tolist())
+    for lhsrevidx in range(len(rev_lhs_temporary)):
+        temp_metab_list_lhs_rev = []
+        for met_idx_lhs in rev_lhs_temporary[lhsrevidx]:
+            temp_metab_list_lhs_rev.append(f"{current_model_name} {metabolite_identifiers[met_idx_lhs]}")
+        rev_lhs_nodes.append(temp_metab_list_lhs_rev)
+    for rhsrevidx in range(len(rev_rhs_temporary)):
+        temp_metab_list_rhs_rev = []
+        for met_idx_rhs in rev_rhs_temporary[rhsrevidx]:
+            temp_metab_list_rhs_rev.append(f"{current_model_name} {metabolite_identifiers[met_idx_rhs]}")
+        rev_rhs_nodes.append(temp_metab_list_rhs_rev)
+    return (
+        exchange_met_ids,
+        irrev_lhs_nodes,
+        irrev_rhs_nodes,
+        rev_lhs_nodes,
+        rev_rhs_nodes,
+        exchange_rxn_ids,
+        irrev_rxn_ids,
+        rev_rxn_ids,
+    )
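+
+# For example (hypothetical reaction strings): a row whose reaction reads
+# "glc_e <=> " has a single metabolite in total, so it is classified as an
+# exchange reaction, while "atp_c + h2o_c --> adp_c + pi_c + h_c" with a
+# lower bound of 0 is classified as irreversible.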
+
+
+def segregate_reactions_from_models(models):
+    """
+    This function gets the data pertaining to the reactions and the
+    metabolites from the models of multiple organisms. For every model,
+    the stoichiometric matrix is generated with the COBRA toolbox and the
+    reactions are segregated into exchange, irreversible and reversible
+    reactions.
+
+    Parameters
+    ----------
+    models : list
+        List of model objects
+
+    Returns
+    -------
+    all_organisms_info : dict
+        Dictionary of all model data (reaction information about all the
+        organisms)
+    namemap : dict
+        Dictionary mapping the adhoc reaction names to reaction names in
+        the model
+    """
+    all_organisms_info = {}
+    namemap = {}
+    for model in models:
+        stoi = cobra.util.array.create_stoichiometric_matrix(model)
+        current_organisms_info = {}
+        rxns_in_model, mets_in_model = [], []
+        for metab in model.metabolites:
+            mets_in_model.append(metab.id)
+        for reac in model.reactions:
+            rxns_in_model.append(reac.id)
+        stoi_matrix = stoi.T
+        (
+            exchange_nodes,
+            irrev_lhs_nodes,
+            irrev_rhs_nodes,
+            rev_lhs_nodes,
+            rev_rhs_nodes,
+            exc_name,
+            irrev_rxn_name,
+            rev_rxn_name,
+        ) = find_different_reaction_types(stoi_matrix, model, model.id)
+        current_organisms_info[model.id] = {
+            "exchange_metab_nodes": exchange_nodes,
+            "irreversible_lhs_nodes": irrev_lhs_nodes,
+            "irreversible_rhs_nodes": irrev_rhs_nodes,
+            "reversible_lhs_nodes": rev_lhs_nodes,
+            "reversible_rhs_nodes": rev_rhs_nodes,
+            "exch_rxn_name": exc_name,
+            "irrev_rxn_name": irrev_rxn_name,
+            "rev_rxn_name": rev_rxn_name,
+        }
+
+        irrev_rxn_number = []
+        for num in range(len(irrev_lhs_nodes)):
+            modified_name_irrev = f"Org_{model.id} IR" + str(num + 1)
+            irrev_rxn_number.append(modified_name_irrev)
+            namemap[modified_name_irrev] = irrev_rxn_name[num]
+
+        rev_rxn_number = []
+        for num in range(len(rev_lhs_nodes)):
+            modified_name_rev = f"Org_{model.id} RR" + str(num + 1)
+            rev_rxn_number.append(modified_name_rev)
+            namemap[modified_name_rev] = rev_rxn_name[num]
+
+        rev_back_rxn_number = []
+        for num in range(len(rev_lhs_nodes)):
+            modified_name_back_rev = f"Org_{model.id} RevBR" + str(num + 1)
+            rev_back_rxn_number.append(modified_name_back_rev)
+            namemap[modified_name_back_rev] = rev_rxn_name[num]
+
+        current_organisms_info[model.id]["reversible_rxn_no"] = rev_rxn_number
+        current_organisms_info[model.id]["irreversible_rxn_no"] = irrev_rxn_number
+        current_organisms_info[model.id]["total_nodes"] = len(exchange_nodes) + len(irrev_lhs_nodes) + len(rev_lhs_nodes)
+        current_organisms_info[model.id]["model_rxns"] = rxns_in_model
+        current_organisms_info[model.id]["reversible_back_rxn_no"] = rev_back_rxn_number
+        current_organisms_info[model.id]["metabolites"] = mets_in_model
+        all_organisms_info.update(current_organisms_info)
+    return all_organisms_info, namemap
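+
+# Illustrative sketch (hypothetical model variable): the adhoc names produced
+# here look like "Org_<model.id> IR1" and are decoded through `namemap`, e.g.
+#     info, namemap = segregate_reactions_from_models([model])
+#     namemap[f"Org_{model.id} IR1"]  # -> id of the first irreversible reaction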
+
+
+def find_relievedrxns(model, org_info, org_info_pert):
+    relieved = {i: list(set(org_info_pert[i]) - set(org_info[i])) for i in org_info_pert}
+    detailed_rel_rxns, rel_rxns_name = {}, {}
+
+    for i in model:
+        j = i.id
+        detailed_rel_rxns[j] = []
+        rel_rxns_name[j] = []
+        if len(relieved[j]):
+            rxn_ids = []
+            for r in i.reactions:
+                rxn_ids.append(r.id)
+            for rel in relieved[j]:
+                rel_rxn = i.reactions[rxn_ids.index(rel)].reaction
+                detailed_rel_rxns[j].append(rel_rxn)
+                rel_rxns_name[j].append(i.reactions[rxn_ids.index(rel)].name)
+
+    return relieved, detailed_rel_rxns, rel_rxns_name
+
+
+def find_stuckrxns(model, community, media, no_of_orgs):
+    # Constructing graphs
+    warnings.filterwarnings("ignore")
+    G, full_name_map = create_graph(community, no_of_orgs)
+    if not os.path.exists("results"):
+        os.makedirs("results")
+    all_possible_combis = list(combinations(list(range(len(community))), int(no_of_orgs)))
+    # `community` may hold model objects or name strings; compare by id either way
+    community_ids = [getattr(org, "id", org) for org in community]
+    if no_of_orgs > 1 and sorted(community_ids)[0][0] == "0":
+        all_possible_combis = all_possible_combis[: len(community) - 1]
+    org_info = {}
+    scope = {}
+    print("No. of graphs constructed: ", len(G))
+
+    # This loop finds all the stuck reactions
+    for i in range(len(all_possible_combis)):
+        lbm, sd, s = forward_pass(G[i], media)
+        for j in range(len(all_possible_combis[i])):
+            stuck, rxnNode = [], []
+            model1 = model[all_possible_combis[i][j]].id
+            visited = list(sd.keys())
+            for r in G[i].nodes:
+                if r.find(model1) >= 0:
+                    rxnNode.append(r)
+            for rxn in rxnNode:
+                if rxn in visited:
+                    continue
+                elif rxn.find("ERR") >= 0:
+                    continue
+                elif rxn.find("Org") >= 0:
+                    if (rxn[len(model1) + 5] == "I") or (rxn[len(model1) + 5] == "R"):
+                        stuck.append(rxn)
+            org_info[model1] = stuck
+            scope[model1] = s
+    return org_info, scope, full_name_map
+
+
+def decrypt_orginfo(org_info, namemap):
+    """
+    This function decrypts the rxn ids using the data in the corresponding namemap
+    :param org_info:
+    :param namemap:
+    :return:
+        org_info: a dictionary of decrypted rxn ids for each community
+    """
+    for i in org_info:
+        for j in range(len(org_info[i])):
+            org_info[i][j] = namemap[org_info[i][j]]
+    return org_info
+
+
+def make_perturbed_community(rem_org, pert_models, pert_community):
+    pert_model_ids = [i.id for i in pert_models]
+    for i in rem_org:
+        if i in pert_model_ids:
+            pert_models.remove(pert_models[pert_model_ids.index(i)])
+            pert_community.remove(pert_community[pert_model_ids.index(i)])
+            pert_model_ids.remove(i)
+
+    return pert_models, pert_community, pert_model_ids
+
+
+def perform_task(media, model, transport_rxns, pert_community, org_info_wo_trans_rxn, rem_org_list, n):
+    org_info_pert, scope_pert, namemap_pert = find_stuckrxns(model, pert_community, media, len(pert_community))
+    org_info_pert = decrypt_orginfo(org_info_pert, namemap_pert)
+    org_info_pert_wo_trans_rxn = {i: list(set(org_info_pert[i]) - set(transport_rxns)) for i in org_info_pert}
+
+    with open(f"results/Community_without_clus{str(n)}.csv", "w") as g:
+        for m in org_info_pert_wo_trans_rxn:
+            g.write(m + "," + str(len(org_info_pert_wo_trans_rxn[m])) + "\n")
+    stuck_com = stuck_pert_com = 0
+    for i in org_info_wo_trans_rxn:
+        if i not in rem_org_list:
+            stuck_com += len(org_info_wo_trans_rxn[i])
+    for i in org_info_pert_wo_trans_rxn:
+        stuck_pert_com += len(org_info_pert_wo_trans_rxn[i])
+    msi = 0 if stuck_pert_com == 0 else 1 - (stuck_com / stuck_pert_com)
+    print(f"{n}th cluster")
+    return org_info_pert, org_info_pert_wo_trans_rxn, msi
+
+
+def write_relieved_rxns(g, relieved, detailed_rel_rxns, rel_rxns_name):
+    g.write("acceptor\trelieved reactions\n")
+    for i in relieved:
+        g.write(i + "\t")
+        for j in list(set(relieved[i])):
+            g.write(j + "\t\n\t")
+        for d in list(set(rel_rxns_name[i])):
+            g.write(d + "\t\n\t")
+        for k in list(set(detailed_rel_rxns[i])):
+            g.write(k + "\t\n")
+
+
+def write_relieved_rxn_metadata(h, org_info_wo_trans_rxn, org_info_pert_wo_trans_rxn):
+    nrelieved = {}
+    for i in org_info_pert_wo_trans_rxn:
+        nrelieved[i] = len(org_info_pert_wo_trans_rxn[i]) - len(org_info_wo_trans_rxn[i])
+        if nrelieved[i]:
+            h.write(i + "," + str(len(org_info_wo_trans_rxn[i])) + "," + str(len(org_info_pert_wo_trans_rxn[i])) + "," + str(nrelieved[i]) + "\n")
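+
+# The metabolic support index computed above follows
+#     msi = 1 - (stuck reactions in the full community / stuck reactions in the perturbed community)
+# e.g. 20 stuck reactions after removing a cluster versus 15 before gives
+# msi = 1 - 15/20 = 0.25 for that cluster (illustrative numbers).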
+
+
+def find_relieved_rxn(model, media_name, org_info_single, org_info_pair):
+    """
+    This function extracts and writes the relieved rxns into a tsv file
+    :param model:
+    :param media_name: name of the media used (identifier to know what media is used when analysis is done using multiple media)
+    :param org_info_single: Dictionary containing stuck reactions of all microbes in the community
+    :param org_info_pair: Dictionary containing stuck reactions of all microbe pairs in the community
+    :return: None
+    """
+    relieved = {}
+    for org1 in model:
+        for org2 in model:
+            if org1.id + "_" + org2.id in org_info_pair.keys():
+                relieved[org1.id + "_" + org2.id] = []
+                temp = list(set(org_info_single[org1.id + "_" + org1.id]) - set(org_info_pair[org1.id + "_" + org2.id]))
+                for j in temp:
+                    relieved[org1.id + "_" + org2.id].append(j)
+            else:
+                continue
+
+    rel_rxns_name, detailed_rel_rxns = {}, {}
+    for i in model:
+        rxn_ids = [r.id for r in i.reactions]
+        for j in model:
+            org1 = i.id
+            org2 = j.id
+            if org1 + "_" + org2 in relieved.keys():
+                detailed_rel_rxns[org1 + "_" + org2] = []
+                rel_rxns_name[org1 + "_" + org2] = []
+                for rel in relieved[org1 + "_" + org2]:
+                    rel_rxn = i.reactions[rxn_ids.index(rel)].reaction
+                    detailed_rel_rxns[org1 + "_" + org2].append(rel_rxn)
+                    rel_rxns_name[org1 + "_" + org2].append(i.reactions[rxn_ids.index(rel)].name)
+
+    relieved_rxn_output_file = f"results/relieved_rxns_{media_name}_w_excrxns.tsv"
+    with open(relieved_rxn_output_file, "w") as g:
+        header = "acceptor\tdonor\trelieved reactions\n"
+        g.write(header)
+        for i in model:
+            for j in model:
+                org1 = i.id
+                org2 = j.id
+                if org1 + "_" + org2 in relieved.keys():
+                    g.write(org1 + "\t" + org2 + "\t")
+                    rel_rxns = list(set(relieved[org1 + "_" + org2]))
+                    det_rel_rxns = list(set(detailed_rel_rxns[org1 + "_" + org2]))
+                    rel_rxn_nam = list(set(rel_rxns_name[org1 + "_" + org2]))
+                    for x in rel_rxns:
+                        g.write(x + "\t\n\t\t")
+                    for d in rel_rxn_nam:
+                        g.write(d + "\t\n\t\t")
+                    for k in det_rel_rxns:
+                        g.write(k + "\t\n")
+    print("relieved reactions are written at:\n", relieved_rxn_output_file)
+
+
+def find_stuck_rxns(models, community, media, comm_size):
+    """
+    Constructs graphs using MetQuest and finds all stuck reactions in the cellular compartment
+    :param models: list of GEMs
+    :param community: list of models in the community
+    :param media: dictionary of seed metabolites
+    :param comm_size: number of organisms in a community
+    :return:
+        org_info: Dictionary containing stuck reactions of all microbes in the community
+        scope: Dictionary containing all the metabolites that can be produced by the microbes in the community
+        namemap: Dictionary containing all the decrypted rxn ids
+    """
+    warnings.filterwarnings("ignore")
+    G, full_name_map = create_graph(community, comm_size)
+    if not os.path.exists("results"):
+        os.makedirs("results")
+
+    # combinations of model indices, so each combination can index back into `models`
+    all_possible_combis = list(combinations(range(len(models)), comm_size))
+    org_info, scope, vis = {}, {}, {}
+    print("No. of graphs constructed: ", len(G))
+
+    # This loop finds all the stuck reactions
+    for i in range(len(all_possible_combis)):
+        lbm, sd, s = forward_pass(G[i], media)
+        for j in range(len(all_possible_combis[i])):
+            stuck, rxnNode = [], []
+            model1 = models[all_possible_combis[i][j]].id
+            visited = list(sd.keys())
+            for r in G[i].nodes:
+                if r.find(model1) >= 0:
+                    rxnNode.append(r)
+            for rxn in rxnNode:
+                if rxn in visited or rxn.find("ERR") >= 0:
+                    continue
+                elif rxn.find("Org") >= 0:
+                    if (rxn[len(model1) + 5] == "I") or (rxn[len(model1) + 5] == "R"):
+                        stuck.append(rxn)
+            model2 = models[all_possible_combis[i][j - 1]].id
+            org_info[model1 + "_" + model2] = stuck
+            scope[model1 + "_" + model2] = s
+            vis[model1 + "_" + model2] = visited
+    return org_info, scope, full_name_map, vis
+
+
+def decrypt_org_info(org_info, namemap):
+    """
+    This function decrypts the rxn ids using the data in the corresponding namemap
+    :param org_info:
+    :param namemap:
+    :return:
+        org_info: a dictionary of decrypted rxn ids for each community
+    """
+    for i in org_info:
+        for j in range(len(org_info[i])):
+            org_info[i][j] = namemap[org_info[i][j]]
+    return org_info
+
+
+def pMSI(models, media):
+    """
+    Calculates pairwise MSI values for the given models
+    Extracts and writes relieved reactions in every pair
+    :param models: list of model objects
+    :param media: dictionary of seed metabolites
+    :return: msi: Dictionary containing MSI values for every pair
+    """
+    # find all transport reactions
+    community_model = commhelper.build_from_species_models(models)
+    comm_util = MSModelUtil(community_model)
+    # find stuck reactions of each organism alone and of every pair
+    org_info_single, scope_sin, namemap_sin, vis = find_stuck_rxns(models, models, media, 1)
+    org_info_pair, scope_pair, namemap_pair, vis = find_stuck_rxns(models, models, media, 2)
+    # decrypt the stuck reactions
+    org_info_single = decrypt_org_info(org_info_single, namemap_sin)
+    org_info_pair = decrypt_org_info(org_info_pair, namemap_pair)
+    # Filter out the transport reactions from every stuck reaction list
+    org_info_single_wo_trans_rxn, org_info_pair_wo_trans_rxn = {}, {}
+    for i in org_info_single:
+        org_info_single_wo_trans_rxn[i] = list(set(org_info_single[i]) - set(comm_util.transport_list()))
+    for i in org_info_pair:
+        org_info_pair_wo_trans_rxn[i] = list(set(org_info_pair[i]) - set(comm_util.transport_list()))
+    # find all the relieved reactions in every pair
+    find_relieved_rxn(models, "relieved_rxns", org_info_single, org_info_pair)
+    # calculate MSI for every pair
+    msi = {}
+    for org1 in models:
+        stuck_A = len(org_info_single_wo_trans_rxn[org1.id + "_" + org1.id])
+        for org2 in models:
+            if org1.id + "_" + org2.id in org_info_pair_wo_trans_rxn.keys():
+                stuck_AUB = len(org_info_pair_wo_trans_rxn[org1.id + "_" + org2.id])
+                if stuck_A == 0:
+                    msi[org1.id + "_" + org2.id] = 0
+                else:
+                    msi[org1.id + "_" + org2.id] = 1 - (stuck_AUB / stuck_A)
+    return msi, community_model
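+
+# Sketch of the intended entry point (hypothetical models; the media keys must
+# match the graph's metabolite node names produced by create_graph):
+#     media = {"glc__D_e": (-10, 1000), "o2_e": (-1000, 1000)}
+#     msi, community_model = pMSI([model1, model2], media)
+#     msi["model1_model2"]  # support that model2 provides to model1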
+
+
+def calculate_pairwiseMSI(models, media):
+    """
+    This function calculates pairwise-MSI for all given microbes.
+
+    Creates a csv file containing the MSI values of all pairs.
+
+    Creates a tsv file containing the list of reactions relieved
+    in all acceptor microbes in the presence of corresponding donor microbes.
+
+    :param models: list of model objects
+    :param media: dictionary of seed metabolites
+    """
+    warnings.filterwarnings("ignore")
+    msi, community_model = pMSI(models, media)
+    # media may be a dictionary rather than a file path, so fall back to a
+    # generic label when there is no file name to derive the output name from
+    media_name = os.path.basename(media).replace(".txt", "") if isinstance(media, str) else "media"
+    msi_output_file = f"results/MSI_{media_name}.csv"
+    with open(msi_output_file, "w") as f:
+        header = "organism,in_the_presence,msi_value\n"
+        f.write(header)
+        for org1, org2 in combinations(models, 2):
+            if org1.id + "_" + org2.id in msi.keys():
+                f.write(f"{org1.id},{org2.id},{str(msi[org1.id + '_' + org2.id])}\n")
+    print("MSI values are written at:\n", msi_output_file)
+
+
+def calculate_higherorderMSI(models, media, clusters="individual_clusters"):
+    community_model = commhelper.build_from_species_models(models)
+    comm_util = MSModelUtil(community_model)
+    transport_rxns = comm_util.transport_list()
+    # stuck reactions in the full (unperturbed) community
+    org_info, scope, namemap = find_stuckrxns(models, models, media, len(models))
+    org_info = decrypt_orginfo(org_info, namemap)
+    org_info_wo_trans_rxn = {i: list(set(org_info[i]) - set(transport_rxns)) for i in org_info}
+
+    with open("results/community_unperturbed.csv", "w") as f:
+        for i, diff in org_info_wo_trans_rxn.items():
+            f.write(i + "," + str(len(diff)) + "\n")
+
+    if clusters == "individual_clusters":
+        rem_org_list1, rem_org_list2 = {}, {}
+        for i, mdl in enumerate(models):
+            rem_org_list1[i] = [mdl.id]
+            rem_org_list2[i] = [mdl.id]
+    else:
+        cluster_data = pd.read_csv(clusters, sep=",")
+        rem_org_list1 = cluster_data.set_index("Cluster").T.to_dict("list")
+        for n in rem_org_list1:
+            rem_org_list1[n] = [j for j in rem_org_list1[n] if pd.isna(j) is False]
+        for n in rem_org_list1:
+            rem_org_list1[n] = [cobra.io.read_sbml_model(i).id for i in rem_org_list1[n]]
+            # rem_org_list1[n] = [model_ids[model_ids.index(i)] for i in rem_org_list1[n]]
+        rem_org_list2 = rem_org_list1.copy()
+
+    for nclus in rem_org_list2:
+        rem_org_list2[nclus] = [x.replace(".xml", "") for x in rem_org_list2[nclus]]
+
+    with open("results/higher_order_msi.csv", "w") as f:
+        for n in rem_org_list1:
+            # os.chdir(path)
+            # new_models = model.copy()
+            # new_community = glob.glob('*.xml')
+            # if not new_community:
+            #     new_community = glob.glob('*.sbml')
+            # new_community.sort()
+            # The path-based construction above is kept for reference; here the
+            # perturbed community is rebuilt from the in-memory models instead.
+            new_models = list(models)
+            new_community = list(models)
+
+            pert_models, pert_community, pert_model_ids = make_perturbed_community(rem_org_list1[n], new_models, new_community)
+
+            org_info_pert, org_info_pert_wo_trans_rxn, msi = perform_task(
+                media, pert_models, transport_rxns, pert_community, org_info_wo_trans_rxn, rem_org_list2[n], n
+            )
+            for i in rem_org_list2[n]:
+                f.write("Comm,clus_" + str(n) + "#" + i + "," + str(msi) + "\n")
+
+            if msi:
+                relieved, detailed_rel_rxns, rel_rxns_name = find_relievedrxns(pert_models, org_info, org_info_pert)
+                with open(f"results/clusterKO_/data_analysis/relieved_rxns_Comm--clus{n}.tsv", "w") as g:
+                    write_relieved_rxns(g, relieved, detailed_rel_rxns, rel_rxns_name)
+                with open(f"results/clusterKO_/data_analysis/Comm--clus{n}.tsv", "w") as h:
+                    h.write("Comm--clus" + str(n) + "\n")
+                    for i in rem_org_list2[n]:
+                        h.write(i + "\n")
+                    h.write("num of rxns relieved in the below orgs in the presence of clust" + str(n) + "\n")
+                    h.write("org,unpert,clust_" + str(n) + "KO,rxns relieved\n")
+                    write_relieved_rxn_metadata(h, org_info_wo_trans_rxn, org_info_pert_wo_trans_rxn)
+                print("Comm--clus" + str(n))
+
+            new_models = list(models)
+            new_community = list(models)
+            ko_models, ko_community, model_ids = make_perturbed_community(pert_model_ids, new_models, new_community)
+            ko_org_list = [x for x in pert_model_ids]
+            if len(ko_org_list) < len(models):
+                org_info_pert, org_info_pert_wo_trans_rxn, msi = perform_task(
+                    media, ko_models, transport_rxns, ko_community, org_info_wo_trans_rxn, ko_org_list, n
+                )
+                for i in ko_community:
+                    f.write("clus_" + str(n) + "#" + i.id + ",Comm," + str(msi) + "\n")
+
+                if msi:
+                    relieved, detailed_rel_rxns, rel_rxns_name = find_relievedrxns(ko_models, org_info, org_info_pert)
+                    with open(f"results/clusterKO_/data_analysis/relieved_rxns_Comm--clus{n}.tsv", "w") as g:
+                        write_relieved_rxns(g, relieved, detailed_rel_rxns, rel_rxns_name)
+                    with open(f"results/clusterKO_/data_analysis/Comm{n}--clus.tsv", "w") as h:
+                        h.write("clus" + str(n) + "--Comm\n")
+                        for i in ko_org_list:
+                            h.write(i + "\n")
+                        h.write("num of rxns relieved in the below orgs in the presence of Comm\n")
+                        h.write("org,unpert,commKO,rxns relieved\n")
+                        write_relieved_rxn_metadata(h, org_info_wo_trans_rxn, org_info_pert_wo_trans_rxn)
+                    print("clus" + str(n) + "--Comm")
diff --git a/modelseedpy/community/mscommfitting.py b/modelseedpy/community/mscommfitting.py
new file mode 100644
index 00000000..2a42d63d
--- /dev/null
+++ b/modelseedpy/community/mscommfitting.py
@@ -0,0 +1,1091 @@
+# -*- coding: utf-8 -*-
+from modelseedpy.fbapkg.mspackagemanager import MSPackageManager
+from modelseedpy.core.exceptions import FeasibilityError
+from pandas import read_table, read_csv, DataFrame
+from optlang import Variable, Constraint, Objective, Model
+from modelseedpy.core.fbahelper import FBAHelper
+from scipy.constants import hour
+from scipy.optimize import newton
+from collections import OrderedDict
+from zipfile import ZipFile, ZIP_LZMA
+from optlang.symbolics import Zero
+from sympy.core.add import Add
+from matplotlib import pyplot
+
+# from pprint import pprint
+from time import sleep, process_time
+import numpy as np
+
+# from cplex import Cplex
+import json, os, re
+
+
+def _name(name, suffix, time, trial):
+    return "-".join([name + suffix, time, trial])
+
+
+class MSCommFitting:
+    def __init__(self):
+        (
+            self.parameters,
+            self.variables,
+            self.constraints,
+            self.dataframes,
+            self.signal_species,
+            self.values,
+        ) = ({}, {}, {}, {}, {}, {})
+        self.phenotypes_parsed_df: np.ndarray
+        self.problem: object
+        self.species_phenotypes_bool_df: object
+        self.zipped_output, self.plots = [], []
+
+    def _process_csv(self, csv_path, index_col):
+        self.zipped_output.append(csv_path)
+        csv = read_csv(csv_path)
+        csv.index = csv[index_col]
+        csv.drop(index_col, axis=1, inplace=True)
+        csv.astype(str)
+        return csv
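+
+    # Naming convention used for every variable and constraint below (illustrative):
+    #     _name("b_", "ecoli-maltose", "3", "G5")  ->  "b_ecoli-maltose-3-G5"
+    # i.e. "<prefix><phenotype>-<timestep>-<trial>"; compute() later splits each
+    # name on its last two dashes to recover these three parts.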
+
+    def load_data(
+        self,
+        community_members: dict = {},
+        kbase_token: str = None,
+        solver: str = "glpk",
+        signal_tsv_paths: dict = {},
+        phenotypes_csv_path: str = None,
+        media_conc_path: str = None,
+        species_abundance_path: str = None,
+        carbon_conc_series: dict = {},
+        ignore_trials: dict = {},
+        ignore_timesteps: list = [],
+        significant_deviation: float = 2,
+        zip_path: str = None,
+    ):
+        self.zipped_output = []
+        if zip_path:
+            with ZipFile(zip_path, "r") as zp:
+                zp.extractall()
+        if species_abundance_path:
+            self.species_abundances = self._process_csv(species_abundance_path, "trial_column")
+        if phenotypes_csv_path:
+            # process a predefined exchanges table
+            self.zipped_output.append(phenotypes_csv_path)
+            fluxes_df = read_csv(phenotypes_csv_path)
+            fluxes_df.index = fluxes_df["rxn"]
+            to_drop = [col for col in fluxes_df.columns if " " in col]
+            for col in to_drop + ["rxn"]:
+                fluxes_df.drop(col, axis=1, inplace=True)
+            print(f'The {to_drop+["rxn"]} columns were dropped from the phenotypes CSV.')
+
+            # import and process the media concentrations CSV
+            self.media_conc = self._process_csv(media_conc_path, "media_compound")
+        elif community_members:
+            # import the media for each model
+            models = OrderedDict()
+            ex_rxns: set = set()
+            species: dict = {}
+            # Using KBase media to constrain exchange reactions in model
+            for model, content in community_members.items():
+                model.solver = solver
+                ex_rxns.update(model.exchanges)
+                species.update({content["name"]: content["phenotypes"].keys()})
+                models[model] = []
+                for media in content["phenotypes"].values():
+                    with model:  # !!! Is this the correct method of parameterizing a media for a model?
+                        pkgmgr = MSPackageManager.get_pkg_mgr(model)
+                        pkgmgr.getpkg("KBaseMediaPkg").build_package(media, default_uptake=0, default_excretion=1000)
+                        models[model].append(model.optimize())
+
+            # construct the parsed table of all exchange fluxes for each phenotype
+            fluxes_df = DataFrame(
+                data={"bio": [sol.fluxes["bio1"] for solutions in models.values() for sol in solutions]},
+                columns=["rxn"]
+                + [spec + "-" + phenotype for spec, phenotypes in species.items() for phenotype in phenotypes]
+                + [spec + "-stationary" for spec in species.keys()],
+            )
+            fluxes_df.index.name = "rxn"
+            fluxes_df.drop("rxn", axis=1, inplace=True)
+            for ex_rxn in ex_rxns:
+                elements = []
+                for model, solutions in models.items():
+                    for sol in solutions:
+                        elements.append(sol.fluxes[ex_rxn.id] if ex_rxn.id in sol.fluxes else 0)
+                if any(np.array(elements) != 0):
+                    fluxes_df.loc[ex_rxn.id] = elements
+
+            # define only species for which data is defined in signal_tsv_paths
+            modeled_species = list(signal_tsv_paths.values())
+            modeled_species.remove("OD")
+            removed_phenotypes = [col for col in fluxes_df if not any([species in col for species in modeled_species])]
+            for col in removed_phenotypes:
+                fluxes_df.drop(col, axis=1, inplace=True)
+            if removed_phenotypes != []:
+                print(f"The {removed_phenotypes} phenotypes were removed since their species is not among those that are defined with data: {modeled_species}.")
+        fluxes_df.astype(str)
+        self.phenotypes_parsed_df = FBAHelper.parse_df(fluxes_df)
+        self.species_phenotypes_bool_df = DataFrame(columns=self.phenotypes_parsed_df[1])
+
+        if "columns" not in carbon_conc_series:
+            carbon_conc_series["columns"] = {}
+        if "rows" not in carbon_conc_series:
+            carbon_conc_series["rows"] = {}
+        self.carbon_conc = carbon_conc_series
+
+        self.parameters["data_timestep_hr"] = []
+        if "columns" not in ignore_trials:
+            ignore_trials["columns"] = []
+        if "rows" not in ignore_trials:
+            ignore_trials["rows"] = []
+        if "wells" not in ignore_trials:
+            ignore_trials["wells"] = []
+        ignore_trials["columns"] = list(map(str, ignore_trials["columns"]))
+        ignore_trials["rows"] = list(map(str, ignore_trials["rows"]))
+        ignore_timesteps = list(map(str, ignore_timesteps))
+        for path, name in signal_tsv_paths.items():
+            self.zipped_output.append(path)
+            signal = os.path.splitext(path)[0].split("_")[0]
+            # define the signal dataframe
+            self.signal_species[signal] = name  # {name:phenotypes}
+            self.dataframes[signal] = read_table(path)
+            self.simulation_time = self.dataframes[signal].iloc[0, -1] / hour
+            self.parameters["data_timestep_hr"].append(self.simulation_time / int(self.dataframes[signal].columns[-1]))
+            self.dataframes[signal] = self.dataframes[signal].iloc[1::2]  # excludes the times
+            self.dataframes[signal].index = self.dataframes[signal]["Well"]
+            # filter data contents
+            dropped_trials = []
+            for trial in self.dataframes[signal].index:
+                if any([
+                    trial[0] in ignore_trials["rows"],
+                    trial[1:] in ignore_trials["columns"],
+                    trial in ignore_trials["wells"],
+                ]):
+                    self.dataframes[signal].drop(trial, axis=0, inplace=True)
+                    dropped_trials.append(trial)
+            if dropped_trials != []:
+                print(f"The {dropped_trials} trials were dropped from the {name} measurements.")
+            for col in ["Plate", "Cycle", "Well"]:
+                self.dataframes[signal].drop(col, axis=1, inplace=True)
+            for col in self.dataframes[signal]:
+                if col in ignore_timesteps:
+                    self.dataframes[signal].drop(col, axis=1, inplace=True)
+            if "OD" not in signal:
+                removed_trials = []
+                for trial, row in self.dataframes[signal].iterrows():
+                    row_array = np.array(row.to_list())
+                    if row_array[-1] / row_array[0] < significant_deviation:
+                        self.dataframes[signal].drop(trial, axis=0, inplace=True)
+                        removed_trials.append(trial)
+                if removed_trials != []:
+                    print(f"The {removed_trials} trials were removed from the {name} measurements, with their deviation over time being less than the threshold of {significant_deviation}.")
+
+            # process the data for subsequent operations and optimal efficiency
+            self.dataframes[signal].astype(str)
+            self.dataframes[signal]: np.ndarray = FBAHelper.parse_df(self.dataframes[signal])
+
+            # differentiate the phenotypes for each species
+            if "OD" not in signal:
+                self.species_phenotypes_bool_df.loc[signal]: np.ndarray[int] = np.array([
+                    1 if self.signal_species[signal] in pheno else 0
+                    for pheno in self.phenotypes_parsed_df[1]
+                ])
+
+        self.parameters["data_timestep_hr"] = sum(self.parameters["data_timestep_hr"]) / len(self.parameters["data_timestep_hr"])
+        self.data_timesteps = int(self.simulation_time / self.parameters["data_timestep_hr"])
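+
+    # Illustrative call (hypothetical file names; the signal_tsv_paths mapping
+    # pairs each signal TSV with the species name whose phenotypes it measures):
+    #     fitting = MSCommFitting()
+    #     fitting.load_data(
+    #         phenotypes_csv_path="phenotypes.csv",
+    #         media_conc_path="media_concentrations.csv",
+    #         signal_tsv_paths={"ecoli_GFP.tsv": "ecoli", "OD_600.tsv": "OD"},
+    #     )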
+
+    def define_problem(
+        self,
+        parameters={},
+        zip_name: str = None,
+        export_parameters: bool = True,
+        export_lp: bool = True,
+        final_relative_carbon_conc: float = None,
+        metabolites_to_track: list = None,
+    ):
+        self.parameters.update(
+            {
+                "timestep_hr": self.parameters["data_timestep_hr"],  # Timestep size of the simulation in hours
+                "cvct": 1,  # Coefficient for the minimization of phenotype conversion to the stationary phase.
+                "cvcf": 1,  # Coefficient for the minimization of phenotype conversion from the stationary phase.
+                "bcv": 1,  # This is the highest fraction of biomass for a given species that can change phenotypes in a single time step
+                "cvmin": 0,  # This is the lowest value the limit on phenotype conversion goes,
+                "v": 1000,  # the kinetics constant that is externally adjusted
+                "carbon_sources": ["cpd00136", "cpd00179"],  # 4hb, maltose
+                "diffpos": 1,
+                "diffneg": 1,  # objective coefficients of the diffpos and diffneg variables that correspond with the components of difference between experimental and predicted biomass values
+            }
+        )
+        self.parameters.update(parameters)
+        self.problem = Model()
+        print("Solver:", type(self.problem))
+        trial: str
+        time: str
+        name: str
+        phenotype: str
+        met: str
+        obj_coef = {}
+        constraints: list = []
+        variables: list = []  # lists are orders-of-magnitude faster than numpy arrays for appending
+        self.simulation_timesteps = list(map(str, range(1, int(self.simulation_time / self.parameters["timestep_hr"]) + 1)))
+        time_1 = process_time()
+        for signal, parsed_df in self.dataframes.items():
+            for met in self.phenotypes_parsed_df[0]:
+                met_id = re.sub(r"(\_\w\d+)", "", met)
+                met_id = met_id.replace("EX_", "", 1)
+                if (not metabolites_to_track and met_id != "cpd00001") or (metabolites_to_track and met_id in metabolites_to_track):
+                    self.variables["c_" + met] = {}
+                    self.constraints["dcc_" + met] = {}
+                    initial_time = True
+                    final_time = False
+                    for time in self.simulation_timesteps:
+                        if time == self.simulation_timesteps[-1]:
+                            final_time = True
+                        self.variables["c_" + met][time] = {}
+                        self.constraints["dcc_" + met][time] = {}
+                        for trial in parsed_df[0]:
+                            # define biomass measurement conversion variables
+                            self.variables["c_" + met][time][trial] = Variable(_name("c_", met, time, trial), lb=0, ub=1000)
+                            # constrain initial time concentrations to the media or a large number if it is not explicitly defined
+                            if initial_time and not "bio" in met_id:  # !!! the value of initial_time changes
+                                initial_val = self.media_conc.at[met_id, "mM"] if met_id in list(self.media_conc.index) else 100
+                                if met_id in self.carbon_conc["rows"] and trial[0] in self.carbon_conc["rows"][met_id]:
+                                    initial_val = self.carbon_conc["rows"][met_id][trial[0]]
+                                if met_id in self.carbon_conc["columns"] and trial[1:] in self.carbon_conc["columns"][met_id]:
+                                    initial_val = self.carbon_conc["columns"][met_id][trial[1:]]
+                                self.variables["c_" + met][time][trial] = Variable(_name("c_", met, time, trial), lb=initial_val, ub=initial_val)
+                            # mandate complete carbon consumption
+                            if final_time and met_id in self.parameters["carbon_sources"]:
+                                self.variables["c_" + met][time][trial] = Variable(_name("c_", met, time, trial), lb=0, ub=0)
+                                if final_relative_carbon_conc:
+                                    self.variables["c_" + met][time][trial] = Variable(
+                                        _name("c_", met, time, trial),
+                                        lb=0,
+                                        ub=self.variables["c_" + met]["1"][trial].lb * final_relative_carbon_conc,
+                                    )
+                            variables.append(self.variables["c_" + met][time][trial])
+                        initial_time = False
+            break  # prevents duplicated variables
+        for signal, parsed_df in self.dataframes.items():
+            if "OD" not in signal:
+                for phenotype in self.phenotypes_parsed_df[1]:
+                    if self.signal_species[signal] in phenotype:
+                        self.constraints["dbc_" + phenotype] = {}
+                        for time in self.simulation_timesteps:
+                            self.constraints["dbc_" + phenotype][time] = {}
+
+        for phenotype in self.phenotypes_parsed_df[1]:
+            self.variables["cvt_" + phenotype] = {}
+            self.variables["cvf_" + phenotype] = {}
+            self.variables["b_" + phenotype] = {}
+            self.variables["g_" + phenotype] = {}
+            self.variables["v_" + phenotype] = {}
+            self.constraints["gc_" + phenotype] = {}
+            self.constraints["cvc_" + phenotype] = {}
+            for time in self.simulation_timesteps:
+                self.variables["cvt_" + phenotype][time] = {}
+                self.variables["cvf_" + phenotype][time] = {}
+                self.variables["b_" + phenotype][time] = {}
+                self.variables["g_" + phenotype][time] = {}
+                self.variables["v_" + phenotype][time] = {}
+                self.constraints["gc_" + phenotype][time] = {}
+                self.constraints["cvc_" + phenotype][time] = {}
+                # `parsed_df` refers to the last signal dataframe from the loop
+                # above, whose trial rows are shared by every signal
+                for trial in parsed_df[0]:
+                    self.variables["b_" + phenotype][time][trial] = Variable(  # predicted biomass abundance
+                        _name("b_", phenotype, time, trial), lb=0, ub=100
+                    )
+                    self.variables["g_" + phenotype][time][trial] = Variable(  # biomass growth
+                        _name("g_", phenotype, time, trial), lb=0, ub=1000
+                    )
+
+                    if "stationary" not in phenotype:
+                        self.variables["cvt_" + phenotype][time][trial] = Variable(  # conversion rate to the stationary phase
+                            _name("cvt_", phenotype, time, trial), lb=0, ub=100
+                        )
+                        self.variables["cvf_" + phenotype][time][trial] = Variable(  # conversion rate from the stationary phase
+                            _name("cvf_", phenotype, time, trial), lb=0, ub=100
+                        )
+
+                        # 0 <= -cvt + bcv*b_{phenotype} + cvmin
+                        self.constraints["cvc_" + phenotype][time][trial] = Constraint(
+                            -self.variables["cvt_" + phenotype][time][trial]
+                            + self.parameters["bcv"] * self.variables["b_" + phenotype][time][trial]
+                            + self.parameters["cvmin"],
+                            lb=0,
+                            ub=None,
+                            name=_name("cvc_", phenotype, time, trial),
+                        )
+
+                        # g_{phenotype} - b_{phenotype}*v = 0
+                        self.constraints["gc_" + phenotype][time][trial] = Constraint(
+                            self.variables["g_" + phenotype][time][trial]
+                            - self.parameters["v"] * self.variables["b_" + phenotype][time][trial],
+                            lb=0,
+                            ub=0,
+                            name=_name("gc_", phenotype, time, trial),
+                        )
+
+                        obj_coef.update({
+                            self.variables["cvf_" + phenotype][time][trial]: self.parameters["cvcf"],
+                            self.variables["cvt_" + phenotype][time][trial]: self.parameters["cvct"],
+                        })
+                        variables.extend([
+                            self.variables["cvf_" + phenotype][time][trial],
+                            self.variables["cvt_" + phenotype][time][trial],
+                        ])
+                        constraints.extend([
+                            self.constraints["cvc_" + phenotype][time][trial],
+                            self.constraints["gc_" + phenotype][time][trial],
+                        ])
+
+                    variables.extend([
+                        self.variables["b_" + phenotype][time][trial],
+                        self.variables["g_" + phenotype][time][trial],
+                    ])
+
+        # define non-concentration variables
+        half_dt = self.parameters["data_timestep_hr"] / 2
+        time_2 = process_time()
+        print(f"Done with biomass loop: {(time_2-time_1)/60} min")
+        for parsed_df in self.dataframes.values():
+            for r_index, met in enumerate(self.phenotypes_parsed_df[0]):
+                met_id = re.sub(r"(\_\w\d+)", "", met)
+                met_id = met_id.replace("EX_", "", 1)
+                if (not metabolites_to_track and "cpd00001" != met_id) or (metabolites_to_track and met_id in metabolites_to_track):
+                    for trial in parsed_df[0]:
+                        last_column = False
+                        for time in self.simulation_timesteps:
+                            next_time = str(int(time) + 1)
+                            if next_time == self.simulation_timesteps[-1]:
+                                last_column = True
+                            # c_{met} + dt*sum_k^K() - c+1_{met} = 0
+                            self.constraints["dcc_" + met][time][trial] = Constraint(
+                                self.variables["c_" + met][time][trial]
+                                - self.variables["c_" + met][next_time][trial]
+                                + np.dot(
+                                    self.phenotypes_parsed_df[2][r_index] * half_dt,
+                                    np.array([
+                                        self.variables["g_" + phenotype][time][trial]
+                                        + self.variables["g_" + phenotype][next_time][trial]
+                                        for phenotype in self.phenotypes_parsed_df[1]
+                                    ]),
+                                ),
+                                ub=0,
+                                lb=0,
+                                name=_name("dcc_", met, time, trial),
+                            )
+                            constraints.append(self.constraints["dcc_" + met][time][trial])
+                            if last_column:
+                                break
+            break  # prevents duplicated constraints
+
+        time_3 = process_time()
+        print(f"Done with metabolites loop: {(time_3-time_2)/60} min")
+        for signal, parsed_df in self.dataframes.items():
+            data_timestep = 1
+            self.variables[signal + "__conversion"] = Variable(signal + "__conversion", lb=0, ub=1000)
+            variables.append(self.variables[signal + "__conversion"])
+
+            self.variables[signal + "__bio"] = {}
+            self.variables[signal + "__diffpos"] = {}
+            self.variables[signal + "__diffneg"] = {}
+            self.constraints[signal + "__bioc"] = {}
+            self.constraints[signal + "__diffc"] = {}  # diffc is defined later
+            for time in self.simulation_timesteps:
+                if int(time) * self.parameters["timestep_hr"] >= data_timestep * self.parameters["data_timestep_hr"]:  # synchronizes user timesteps with data timesteps
+                    data_timestep += 1
+                    if int(data_timestep) > self.data_timesteps:
+                        break
+                    next_time = str(int(time) + 1)
+                    self.variables[signal + "__bio"][time] = {}
+                    self.variables[signal + "__diffpos"][time] = {}
+                    self.variables[signal + "__diffneg"][time] = {}
+                    self.constraints[signal + "__bioc"][time] = {}
+                    self.constraints[signal + "__diffc"][time] = {}
+                    for r_index, trial in enumerate(parsed_df[0]):
+                        total_biomass: Add = 0
+                        signal_sum: Add = 0
+                        from_sum: Add = 0
+                        to_sum: Add = 0
+                        for phenotype in self.phenotypes_parsed_df[1]:
+                            total_biomass += self.variables["b_" + phenotype][time][trial]
+                            val = 1 if "OD" in signal else self.species_phenotypes_bool_df.loc[signal, phenotype]
+                            signal_sum += val * self.variables["b_" + phenotype][time][trial]
+                            if all([
+                                "OD" not in signal,
+                                self.signal_species[signal] in phenotype,
+                                "stationary" not in phenotype,
+                            ]):
+                                from_sum += val * self.variables["cvf_" + phenotype][time][trial]
+                                to_sum += val * self.variables["cvt_" + phenotype][time][trial]
+                        for phenotype in self.phenotypes_parsed_df[1]:
+                            if "OD" not in signal and self.signal_species[signal] in phenotype:
+                                if "stationary" in phenotype:
+                                    # b_{phenotype} - sum_k^K(es_k*cvf) + sum_k^K(pheno_bool*cvt) - b+1_{phenotype} = 0
+                                    self.constraints["dbc_" + phenotype][time][trial] = Constraint(
+                                        self.variables["b_" + phenotype][time][trial]
+                                        - from_sum
+                                        + to_sum
+                                        - self.variables["b_" + phenotype][next_time][trial],
+                                        ub=0,
+                                        lb=0,
+                                        name=_name("dbc_", phenotype, time, trial),
+                                    )
+                                else:
+                                    # -b_{phenotype} + dt*g_{phenotype} + cvf - cvt - b+1_{phenotype} = 0
+                                    self.constraints["dbc_" + phenotype][time][trial] = Constraint(
+                                        self.variables["b_" + phenotype][time][trial]
+                                        - self.variables["b_" + phenotype][next_time][trial]
+                                        + half_dt * (
+                                            self.variables["g_" + phenotype][time][trial]
+                                            + self.variables["g_" + phenotype][next_time][trial]
+                                        )
+                                        + self.variables["cvf_" + phenotype][time][trial]
+                                        - self.variables["cvt_" + phenotype][time][trial],
+                                        ub=0,
+                                        lb=0,
+                                        name=_name("dbc_", phenotype, time, trial),
+                                    )
+                                constraints.append(self.constraints["dbc_" + phenotype][time][trial])
+
+                        self.variables[signal + "__bio"][time][trial] = Variable(_name(signal, "__bio", time, trial), lb=0, ub=1000)
+                        self.variables[signal + "__diffpos"][time][trial] = Variable(_name(signal, "__diffpos", time, trial), lb=0, ub=100)
+                        self.variables[signal + "__diffneg"][time][trial] = Variable(_name(signal, "__diffneg", time, trial), lb=0, ub=100)
+
+                        # {signal}__conversion*datum = {signal}__bio
+                        self.constraints[signal + "__bioc"][time][trial] = Constraint(
+                            self.variables[signal + "__conversion"] * parsed_df[2][r_index, int(data_timestep) - 1]
+                            - self.variables[signal + "__bio"][time][trial],
+                            name=_name(signal, "__bioc", time, trial),
+                            lb=0,
+                            ub=0,
+                        )
+
+                        # {signal}__bio - sum_k^K(es_k*b_{phenotype}) - {signal}__diffpos + {signal}__diffneg = 0
+                        self.constraints[signal + "__diffc"][time][trial] = Constraint(
+                            self.variables[signal + "__bio"][time][trial]
+                            - signal_sum
+                            - self.variables[signal + "__diffpos"][time][trial]
+                            + self.variables[signal + "__diffneg"][time][trial],
+                            name=_name(signal, "__diffc", time, trial),
+                            lb=0,
+                            ub=0,
+                        )
+
+                        obj_coef.update({
+                            self.variables[signal + "__diffpos"][time][trial]: self.parameters["diffpos"],
+                            self.variables[signal + "__diffneg"][time][trial]: self.parameters["diffneg"],
+                        })
+                        variables.extend([
+                            self.variables[signal + "__bio"][time][trial],
+                            self.variables[signal + "__diffpos"][time][trial],
+                            self.variables[signal + "__diffneg"][time][trial],
+                        ])
+                        constraints.extend([
+                            self.constraints[signal + "__bioc"][time][trial],
+                            self.constraints[signal + "__diffc"][time][trial],
+                        ])
+
+        time_4 = process_time()
+        print(f"Done with the dbc & diffc loop: {(time_4-time_3)/60} min")
+        # construct the problem
+        self.problem.add(variables)
+        self.problem.update()
+        self.problem.add(constraints)
+        self.problem.update()
+        self.problem.objective = Objective(Zero, direction="min")  # , sloppy=True)
+        self.problem.objective.set_linear_coefficients(obj_coef)
+        time_5 = process_time()
+        print(f"Done with loading the variables, constraints, and objective: {(time_5-time_4)/60} min")
+
+        # print contents
+        if export_parameters:
+            self.zipped_output.append("parameters.csv")
+            DataFrame(
+                data=list(self.parameters.values()),
+                index=list(self.parameters.keys()),
+                columns=["values"],
+            ).to_csv("parameters.csv")
+        if export_lp:
+            self.zipped_output.extend(["mscommfitting.lp", "mscommfitting.json"])
+            with open("mscommfitting.lp", "w") as lp:
+                lp.write(self.problem.to_lp())
+            with open("mscommfitting.json", "w") as lp:
+                json.dump(self.problem.to_json(), lp, indent=3)
+        if zip_name:
+            self.zip_name = zip_name
+            sleep(2)
+            with ZipFile(self.zip_name, "w", compression=ZIP_LZMA) as zp:
+                for file in self.zipped_output:
+                    zp.write(file)
+                    os.remove(file)
+
+        time_6 = process_time()
+        print(f"Done exporting the content: {(time_6-time_5)/60} min")
+
+    def compute(self, graphs: list = [], zip_name=None):
+        solution = self.problem.optimize()
+        # categorize the primal values by trial and time
+        for variable, value in self.problem.primal_values.items():
+            if "conversion" not in variable:
+                # split on the last two dashes, since phenotype names can contain "-"
+                basename, time, trial = variable.rsplit("-", 2)
+                time = int(time) * self.parameters["data_timestep_hr"]
+                if not trial in self.values:
+                    self.values[trial] = {}
+                if not basename in self.values[trial]:
+                    self.values[trial][basename] = {}
+                self.values[trial][basename][time] = value
+
+        # export the processed primal values for graphing
+        with open("primal_values.json", "w") as out:
+            json.dump(self.values, out, indent=3)
+        if not zip_name:
+            if hasattr(self, "zip_name"):
+                zip_name = self.zip_name
+        if zip_name:
+            with ZipFile(zip_name, "a", compression=ZIP_LZMA) as zp:
+                zp.write("primal_values.json")
+                os.remove("primal_values.json")
+
+        if graphs != []:
+            self.graph(graphs, zip_name=zip_name)
+
+        if "optimal" in solution:
+            print("The solution is optimal.")
+        else:
+            raise FeasibilityError(f"The solution is sub-optimal, with a {solution} status.")
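+
+    # Sketch of the full fitting workflow (assumes load_data() was called first;
+    # the graph dictionary keys follow those consumed by graph() below):
+    #     fitting.define_problem(zip_name="fitting.zip")
+    #     fitting.compute(graphs=[{"trial": "G5", "species": "ecoli",
+    #                              "phenotype": "*", "content": "biomass"}])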
+
+    def graph(
+        self,
+        graphs=[],
+        primal_values_filename: str = None,
+        primal_values_zip_path: str = None,
+        zip_name: str = None,
+        data_timestep_hr: float = 0.163,
+    ):
+        def add_plot(ax, labels, basename, trial):
+            labels.append(basename.split("-")[-1])
+            ax.plot(
+                self.values[trial][basename].keys(),
+                self.values[trial][basename].values(),
+                label=basename,
+            )
+            ax.legend(labels)
+            ax.set_xticks(list(self.values[trial][basename].keys())[:: int(2 / data_timestep_hr / timestep_ratio)])
+            return ax, labels
+
+        timestep_ratio = 1
+        if self.parameters != {}:
+            data_timestep_hr = self.parameters["data_timestep_hr"]
+            timestep_ratio = self.parameters["data_timestep_hr"] / self.parameters["timestep_hr"]
+        if primal_values_filename:
+            if primal_values_zip_path:
+                with ZipFile(primal_values_zip_path, "r") as zp:
+                    zp.extract(primal_values_filename)
+            with open(primal_values_filename, "r", encoding="utf-8") as primal:
+                self.values = json.load(primal)
+
+        # plot the content for desired trials
+        self.plots = []
+        for graph in graphs:
+            if any([x in graph["content"] for x in ["total", "OD"]]):
+                ys = []
+            print(graph)
+            pyplot.rcParams["figure.figsize"] = (11, 7)
+            pyplot.rcParams["figure.dpi"] = 150
+            fig, ax = pyplot.subplots()
+            y_label = "Variable value"
+            x_label = "Time (hr)"
+            for trial, basenames in self.values.items():
+                content = graph["content"]
+                if graph["content"] == "OD":
+                    y_label = "Biomass (g)"
+                    graph["phenotype"] = graph["species"] = "*"
+                elif "biomass" in graph["content"]:
+                    content = "b"
+                    y_label = "Biomass (g)"
+                elif graph["content"] == "growth":
+                    content = "g"
+                    y_label = "Biomass (g/hr)"
+                elif "stress-test" in graph["content"]:
+                    content = graph["content"].split("_")[1]
+                    y_label = graph["species"] + " coculture %"
+                    x_label = content + " (mM)"
+                if trial == graph["trial"]:
+                    labels: list = []
+                    for basename in basenames:
+                        # parse for non-concentration variables
+                        if any([x in graph["content"] for x in ["total", "OD"]]):
+                            if "b_" in basename:
+                                if graph["content"] == "OD":
+                                    labels.append("predicted")
+                                    label = "predicted"
+                                    xs = np.array(list(self.values[trial][basename].keys()))
+                                    ys.append(np.array(list(self.values[trial][basename].values())))
+                                elif graph["content"] == "total":
+                                    if graph["species"] in basename:
+                                        labels.append("total_biomass")
+                                        label = "total_biomass"
+                                        xs = np.array(list(self.values[trial][basename].keys()))
+                                        ys.append(np.array(list(self.values[trial][basename].values())))
+                            if "experimental_data" in graph and graph["experimental_data"]:
+                                if basename == "OD__bio":
+                                    labels.append("experimental")
+                                    exp_xs = np.array(list(self.values[trial][basename].keys()))
+                                    exp_xs = exp_xs.astype(np.float32)
+                                    exp_xs = np.around(exp_xs, 2)
+                                    ax.plot(exp_xs, list(self.values[trial][basename].values()), label="experimental")
+                                    ax.set_xticks(exp_xs[:: int(2 / data_timestep_hr / timestep_ratio)])
+                        elif graph["phenotype"] == "*" and all([x in basename for x in [graph["species"], content]]):
+                            if "total" in graph["content"]:
+                                labels = [basename]
+                                xs = np.array(list(self.values[trial][basename].keys()))
+                                ys.append(np.array(list(self.values[trial][basename].values())))
+                            else:
+                                ax, labels = add_plot(ax, labels, basename, trial)
+                            print("1")
+                        elif all([x in basename for x in [graph["species"], graph["phenotype"], content]]):
+                            ax, labels = add_plot(ax, labels, basename, trial)
+                            print("2")
+                        # concentration plots
+                        elif "EX_" in basename and graph["content"] in basename:
+                            ax, labels = add_plot(ax, labels, basename, trial)
+                            y_label = "Concentration (mM)"
+                            print("3")
+
+                    if labels != []:
+                        if any([x in graph["content"] for x in ["total", "OD"]]):
+                            xs = xs.astype(np.float32)
+                            xs = np.around(xs, 2)
+                            ax.plot(xs, sum(ys), label=label)
+                            ax.set_xticks(xs[:: int(2 / data_timestep_hr / timestep_ratio)])
+                        phenotype_id = graph["phenotype"] if graph["phenotype"] != "*" else "all phenotypes"
+                        species_id = graph["species"] if graph["species"] != "*" else "all species"
+                        ax.set_xlabel(x_label)
+                        ax.set_ylabel(y_label)
+                        if len(labels) > 1:
+                            ax.legend()
+                        ax.set_title(f'{graph["content"]} of {species_id} ({phenotype_id}) in the {trial} trial')
+                        fig_name = f'{"_".join([trial, species_id, phenotype_id, graph["content"]])}.jpg'
+                        fig.savefig(fig_name)
+                        self.plots.append(fig_name)
+
+        # combine the figures with the other content
+        if not zip_name:
+            if hasattr(self, "zip_name"):
+                zip_name = self.zip_name
+        if zip_name:
+            with ZipFile(zip_name, "a", compression=ZIP_LZMA) as zp:
+                for plot in self.plots:
+                    zp.write(plot)
+                    os.remove(plot)
+
+    def load_model(self, mscomfit_json_path: str, zip_name: str = None, class_object: bool = False):
+        if zip_name:
+            with ZipFile(zip_name, "r") as zp:
+                zp.extract(mscomfit_json_path)
+        with open(mscomfit_json_path, "r") as mscmft:
+            model = Model.from_json(json.load(mscmft))
+            if class_object:
+                self.problem = model
+            return model
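+
+    # Illustrative round-trip (assumes define_problem() exported "mscommfitting.json"
+    # into the archive named by zip_name):
+    #     problem = fitting.load_model("mscommfitting.json", zip_name="fitting.zip")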
met["name"].split("-") + if ( + any([x in name for x in self.parameters["carbon_sources"]]) + and time == self.simulation_timesteps[-1] + ): + print(met["ub"]) + met["lb"] = 0 + met["ub"] *= final_relative_carbon_conc + if previous_relative_conc: + met["ub"] /= previous_relative_conc + print(met["ub"]) + + # change Vmax values + for arg in mscomfit_json["constraints"]: + name, time, trial = arg["name"].split("-") + if "gc" in name: + arg["expression"]["args"][1]["args"] = change_param( + arg["expression"]["args"][1]["args"], vmax, time, trial + ) + + with open(mscomfit_json_path, "w") as mscmft: + json.dump(mscomfit_json, mscmft, indent=3) + with ZipFile(export_zip_name, "a", compression=ZIP_LZMA) as zp: + zp.write(mscomfit_json_path) + os.remove(mscomfit_json_path) + time_3 = process_time() + print(f"Done exporting the model: {(time_3-time_2)/60} min") + + self.problem = Model.from_json(mscomfit_json) + time_4 = process_time() + print( + f"Done loading the model: {(time_4-time_3)/60} min" + ) # ~1/2 the defining a new problem + + def introduce_km( + self, vmax, km, met, graphs, zipname, extract_zipname + ): # Good starting values to try are: vmax = 3.75; km = 2.5 : Equivalent of vmax = 0.5 because at starting maltose of 5 this is vmax/(km + [maltose]) = 3.75/(2.5+5) = 0.5 + vmax_var = {"default": -0.3} + last_conc = {} + count = 0 + while 1: # Dangerous - if there's never convergence, then this never stops + error = None + for t in self.variables["c_" + met]: + if t not in vmax_var: + vmax_var[t] = {} + if t not in last_conc: + last_conc[t] = {} + for trial in self.variables["c_" + met][t]: + if trial in last_conc[t]: + error += ( + last_conc[t][trial] + - self.variables["c_" + met][t][trial].primal + ) ** 2 + last_conc[t][trial] = self.variables["c_" + met][t][trial].primal + vmax_var[t][trial] = -1 * vmax / (km + last_conc[t][trial]) + count += 1 + # Not sure if I'm using the vmax argument right here... 
+    def introduce_km(
+        self, vmax, km, met, graphs, zipname, extract_zipname
+    ):  # Good starting values to try are vmax = 3.75 and km = 2.5, equivalent to vmax = 0.5, because at a starting maltose of 5 this is vmax/(km + [maltose]) = 3.75/(2.5+5) = 0.5
+        vmax_var = {"default": -0.3}
+        last_conc = {}
+        while True:  # NOTE: there is no iteration cap, so a non-converging fit never stops
+            error = count = 0
+            for t in self.variables["c_" + met]:
+                if t not in vmax_var:
+                    vmax_var[t] = {}
+                if t not in last_conc:
+                    last_conc[t] = {}
+                for trial in self.variables["c_" + met][t]:
+                    if trial in last_conc[t]:
+                        error += (
+                            last_conc[t][trial]
+                            - self.variables["c_" + met][t][trial].primal
+                        ) ** 2
+                    last_conc[t][trial] = self.variables["c_" + met][t][trial].primal
+                    vmax_var[t][trial] = -1 * vmax / (km + last_conc[t][trial])
+                    count += 1
+            # The Vmax argument can be either a number or a dictionary that is
+            # organized as ["time"]["trial"], matching the naming scheme of the
+            # variables and constraints
+            self.change_parameters(
+                vmax=vmax_var, export_zip_name=zipname, extract_zip_name=extract_zipname
+            )
+            self.compute(graphs, zipname)
+            if error:
+                error = (error / count) ** 0.5
+                print("Error:", error)
+                if (
+                    error < 1
+                ):  # TODO an error threshold for convergence still must be determined
+                    break
+
+    def parameter_optimization(
+        self,
+    ):
+        with ZipFile(self.zip_name, "r") as zp:
+            zp.extract("mscommfitting.json")
+        # TODO implement the optimization (e.g. a Newton-type update) over the
+        # extracted problem; this method is currently a stub.
diff --git a/modelseedpy/community/mscommunity.py b/modelseedpy/community/mscommunity.py
index be8472e2..f1ce5e75 100644
--- a/modelseedpy/community/mscommunity.py
+++ b/modelseedpy/community/mscommunity.py
@@ -125,8 +125,15 @@ def __init__(
         lp_filename=None,  # specify a filename to create an lp file
     ):
         # Setting model and package manager
-        self.model, self.lp_filename, self.pfba = model, lp_filename, pfba
-        self.pkgmgr = MSPackageManager.get_pkg_mgr(model)
+        if isinstance(model, MSModelUtil):
+            self.model = model.model
+            self.mdlutl = model
+        else:
+            self.model = model
+            self.mdlutl = MSModelUtil.get(model)
+        self.pkgmgr = MSPackageManager.get_pkg_mgr(self.model)
+        self.lp_filename = lp_filename
+        self.pfba = pfba
         self.gapfillings = {}
         # Define Data attributes as None
         self.solution = (
@@ -142,7 +149,7 @@ def __init__(
         ) = self.kinetic_coeff = self.modelseed_db_path = None
         self.species = DictList()
         # Computing data from model
-        msid_cobraid_hash = FBAHelper.msid_hash(model)
+        msid_cobraid_hash = self.mdlutl.msid_hash()
         if "cpd11416" not in msid_cobraid_hash:
             logger.critical("Could not find biomass compound")
         other_biomass_cpds = []
@@ -151,6 +158,7 @@ def __init__(
             self.biomass_cpd = biomass_cpd
             for reaction in model.reactions:
                 if self.biomass_cpd in reaction.metabolites:
+                    print(reaction.id, reaction.metabolites)
                     if (
                         reaction.metabolites[self.biomass_cpd] == 1
                         and len(reaction.metabolites) > 1
@@ -165,13 +173,14 @@ def __init__(
                 other_biomass_cpds.append(biomass_cpd)
         for biomass_cpd in other_biomass_cpds:
             species_obj = CommunityModelSpecies(self, biomass_cpd, names)
+            print(species_obj.index, species_obj.id)
             self.species.append(species_obj)
         if abundances:
             self.set_abundance(abundances)

     @staticmethod
     def build_from_species_models(
-        models, mdlid=None, name=None, names=[], abundances=None
+        models, mdlid=None, name=None, names=[], abundances=None, basemodel=None
     ):
         """Merges the input list of single species metabolic models into a community metabolic model

@@ -196,8 +205,11 @@ def build_from_species_models(
         Raises
         ------
         """
-        newmodel = Model(mdlid, name)
-        newutl = MSModelUtil(newmodel)
+        if basemodel:
+            newmodel = basemodel
+        else:
+            newmodel = Model(mdlid, name)
+        newutl = MSModelUtil.get(newmodel)
         biomass_compounds = []
         index = 1
         biomass_index = 2
@@ -230,7 +242,7 @@ def build_from_species_models(
             met.id = output[0] + "_" + output[1] + str(index)
             if met.id not in newmodel.metabolites:
                 new_metabolites.append(met)
-            if met.id == "cpd11416":
+            if newutl.metabolite_msid(met) == "cpd11416":
                 biomass_compounds.append(met)
         # Rename reactions
         for rxn in model.reactions:
diff --git a/modelseedpy/community/mskineticsfba.py b/modelseedpy/community/mskineticsfba.py
new file mode 100644
index 00000000..9309b455
--- /dev/null
+++ b/modelseedpy/community/mskineticsfba.py
@@ -0,0 +1,444 @@
+# -*- coding: utf-8 -*-
+
+from scipy.constants import milli, hour, minute, day, femto
+from modelseedpy.fbapkg.basefbapkg import 
BaseFBAPkg
+from modelseedpy import MSModelUtil
+from optlang import Constraint
+from modelseedpy.core.fbahelper import FBAHelper
+from collections import OrderedDict
+from optlang.symbolics import Zero
+from numpy import log10, nan, mean
+from warnings import warn
+from matplotlib import pyplot
+from pprint import pprint
+from datetime import date
+from math import inf
+import pandas
+import json, re, os
+
+
+def _x_axis_determination(total_time):
+    # scale the x-axis to the simulation length; the thresholds between the
+    # minute-, hour-, and day-scales are heuristic
+    time = total_time * minute
+    if time <= 600:
+        return minute, "s"
+    if time <= 7200:
+        return 1, "min"
+    if time <= day:
+        return 1 / hour, "hr"
+    return 1 / day, "days"
+
+
+def _check_datum(datum):
+    if "substituted_rate_law" not in datum:
+        print(f"RateLawError: The {datum} datum lacks a rate law.")
+        return False
+    remainder = re.sub("([0-9A-Za-z/()e\-\+\.\*\_])", "", datum["substituted_rate_law"])
+    if remainder != "":
+        print(
+            f'RateLawError: The {datum["substituted_rate_law"]}'
+            f" rate law contains unknown characters: {remainder}"
+        )
+        return False
+    return True
+
+
+class MSKineticsFBA:
+    def __init__(
+        self,
+        model,
+        warnings: bool = True,
+        verbose: bool = False,
+        printing: bool = False,
+        jupyter: bool = False,
+    ):
+        self.warnings, self.verbose, self.printing, self.jupyter = (
+            warnings,
+            verbose,
+            printing,
+            jupyter,
+        )
+        self.model_util = MSModelUtil(model)
+        self.met_ids = OrderedDict(
+            {met.id: met.id for met in self.model_util.model.metabolites}
+        )
+
+    def baseKinFBA(
+        self,
+        kinetics_path: str = None,
+        kinetics_data: dict = None,
+        initial_M: dict = None,  # a dictionary of the initial metabolite concentrations, which supplants concentrations from the defined kinetics data
+        total_min: float = 200,
+        ts_min: float = 20,
+        export_name=None,
+        export_directory=None,
+        chemostat_L: float = None,
+        feed_profile: dict = None,
+        chemostat_L_hr: float = None,
+        temperature: float = 25,
+        p_h: float = 7,
+        cell_dry_g: float = 1.44e-13,
+        cellular_L: float = 1e-18,
+        conc_figure_title="Metabolic perturbation",
+        included_mets: list = None,
+        labeled_plots=True,
+        visualize=True,
+        export=True,
+    ):
+        # define the dataframe for the time series content
+        feed_profile, constrained, self.constraints = feed_profile or {}, {}, {}
+        included_mets, self.sols = included_mets or [], []
+        self.parameters = {
+            "timesteps": int(total_min / ts_min),
+            "pH": p_h,
+            "temperature": temperature,
+        }
+        self.variables = {"elapsed_time": 0}
+        self.total_min, self.ts_min, self.minimum = total_min, ts_min, inf
+        timestep_hr = self.ts_min / (hour / minute)
+        self.constrained = OrderedDict()
+        cell_g_L = (
+            cell_dry_g / cellular_L
+        )  # https://journals.asm.org/doi/full/10.1128/AEM.64.2.688-694.1998
+
+        # define reaction kinetics and initial concentrations
+        assert (
+            kinetics_path or kinetics_data
+        ), "Either < kinetics_path > or < kinetics_data > must be provided"
+        if kinetics_path:
+            with open(kinetics_path) as data:
+                self.kinetics_data = json.load(data)
+        elif kinetics_data:
+            self.kinetics_data = kinetics_data.copy()
+        ## define the concentration, moles, and fluxes DataFrames
+        self.time = "0 min"
+        self.conc = pandas.DataFrame(
+            [0] * len(self.met_ids),
+            index=list(self.met_ids.keys()),
+            columns=[self.time],
+        )
+        self.conc.index.name = "metabolite (mM)"
+        self.moles = self.conc.copy(deep=True)
+        self.fluxes = pandas.DataFrame(
+            index=[rxn.id for rxn in self.model_util.model.reactions],
+            columns=[self.time],
+        )
+        self.fluxes.index.name = "reaction (\u0394mmol/hr*g_(dw))"  # Delta
+        ## parse the kinetics data
+        for content in self.kinetics_data.values():
+            for condition, datum in content.items():
+                if "initial_M" not in datum:
+                    continue
+                for var, conc in datum["initial_M"].items():
+                    met_id = datum["met_id"][var]
+                    if met_id in self.met_ids:
+                        self.conc.at[met_id, self.time] += conc / milli
+                    elif self.warnings:
+                        warn(
+                            f"KineticsError: The {met_id} reagent ({var}) in the"
+                            f" {datum['substituted_rate_law']} rate law is not defined by the model."
+                        )
+        ## incorporate custom initial concentrations, which overwrite values from the kinetics data
+        for met_id, conc in (initial_M or {}).items():
+            self.conc.at[met_id, self.time] = conc / milli
+        defined_concs = self.conc[self.conc[self.time] != 0][self.time].to_dict()
+        chemostat_requirements = [
+            chemostat_L is not None,
+            feed_profile != {},
+            chemostat_L_hr is not None,
+        ]
+        # execute FBA for each timestep, then calculate custom fluxes, constrain the model, and update concentrations
+        model_rxns = [rxn.id for rxn in self.model_util.model.reactions]
+        newTime = 0
+        for timestep in range(1, self.parameters["timesteps"] + 1):
+            oldTime = newTime
+            newTime = timestep * self.ts_min
+            t = timestep * timestep_hr
+            self.previous_time = f"{oldTime} min"
+            self.time = f"{newTime} min"
+            self.conc[self.time] = [float(0)] * len(self.conc.index)
+            self.fluxes[self.time] = [0] * len(self.fluxes.index)
+            ## create a metabolite constraint that prevents negative concentrations
+            for met in self.model_util.model.metabolites:
+                if met.id not in defined_concs:
+                    continue
+                if met.id not in self.constraints:
+                    self.constraints[met.id] = {}
+                coef = {}
+                for rxn in met.reactions:
+                    ### the product of the reaction stoichiometry and the timestep
+                    stoich = abs(timestep_hr * rxn.metabolites[met])
+                    coef[rxn.forward_variable], coef[rxn.reverse_variable] = (
+                        stoich,
+                        -stoich,
+                    )
+                ### build the metabolite constraint
+                if newTime - self.ts_min in self.constraints[met.id]:
+                    self.model_util.remove_cons_vars(
+                        [self.constraints[met.id][newTime - self.ts_min]]
+                    )
+                self.constraints[met.id][newTime] = Constraint(
+                    Zero, lb=0, ub=None, name=f"{met.id}_conc"
+                )
+                self.model_util.create_constraint(
+                    self.constraints[met.id][newTime], coef
+                )
+            ## calculate the flux from the kinetic rate laws
+            for rxnID in self.kinetics_data:
+                # TODO allocate the following code into a function and recursively reduce the timestep until
+                ## the concentration becomes non-negative, following the model of microBialSim. This may require
+                ## time dependency in the kinetics expression to achieve the desired behavior.
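+                # A minimal sketch of that TODO (a hypothetical helper, not yet part
+                # of this class): halve dt until the explicit-Euler update stays
+                # non-negative, mirroring microBialSim's adaptive stepping.
+                #     def _safe_step(conc0, rate, dt, min_dt=1e-6):
+                #         while conc0 + rate * dt < 0 and dt > min_dt:
+                #             dt /= 2
+                #         return max(conc0 + rate * dt, 0.0), dt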
+                if rxnID not in model_rxns:
+                    if self.warnings:
+                        warn(f"ReactionError: {rxnID} is not in the model.")
+                    continue
+                fluxes = []
+                for source in self.kinetics_data[rxnID]:
+                    datum = self.kinetics_data[rxnID][source]
+                    if not _check_datum(datum):
+                        continue
+                    ### define rate law variables; calculate flux; average or overwrite the flux based on data criteria
+                    rate_law_env = {
+                        metID: self.conc.at[metID, self.previous_time] * milli
+                        for metID in datum["mets"]
+                    }
+                    flux = eval(datum["substituted_rate_law"], {}, rate_law_env)
+                    if (
+                        "metadata" not in self.kinetics_data[rxnID][source]
+                        or self.__find_data_match(rxnID, source) == "a"
+                    ):
+                        fluxes.append(flux)
+                    else:
+                        fluxes = [flux]
+
+                flux = mean(fluxes)
+                rxn = self.model_util.model.reactions.get_by_id(rxnID)
+                rxn.bounds = (flux, flux)
+                self.fluxes.at[rxnID, self.time] = flux
+            ## execute the COBRA model
+            sol = self.model_util.model.optimize()
+            self.sols.append(sol)
+            ## add previously undefined fluxes and concentrations
+            for rxnID in self.fluxes.index:
+                if self.fluxes.at[rxnID, self.time] == 0:
+                    self.fluxes.at[rxnID, self.time] = sol.fluxes[rxnID]
+            for met in self.model_util.model.metabolites:
+                self.conc.at[met.id, self.time] = 0
+                for rxn in met.reactions:
+                    flux = self.fluxes.at[rxn.id, self.time]
+                    if flux == 0:
+                        continue
+                    self.conc.at[met.id, self.time] += (
+                        rxn.metabolites[met] * flux * timestep_hr * cell_g_L
+                    )
+            if all(chemostat_requirements):
+                self.moles[self.time] = self.conc[self.time] * milli * chemostat_L
+                self._chemostat(feed_profile, chemostat_L_hr, chemostat_L)
+            elif any(chemostat_requirements):
+                warn(
+                    "The < chemostat_L > , < feed_profile >, and < chemostat_L_hr >"
+                    " parameters must all be defined to simulate a chemostat."
+                )
+            self.variables["elapsed_time"] += self.ts_min
+            if self.printing:
+                print(
+                    f"\nObjective value (\u0394t{self.ts_min}): ",
+                    self.sols[-1].objective_value,
+                )
+
+        # identify the chemicals whose concentrations dynamically changed
+        self.changed = set(
+            [
+                met_id
+                for met_id in self.met_ids
+                if self.conc.at[met_id, "0 min"] != self.conc.at[met_id, self.time]
+            ]
+        )
+        self.unchanged = set(self.met_ids.keys()) - self.changed
+
+        # visualize concentration changes over time
+        if visualize:
+            self._visualize(conc_figure_title, included_mets, labeled_plots)
+        if export:
+            self._export(export_name, export_directory)
+        if self.verbose:
+            print(
+                f"\nChanged concentrations:\t{self.changed}",
+                f"\nConstrained reactions:\t{constrained.keys()}",
+            )
+        elif self.printing:
+            if self.jupyter:
+                pandas.set_option("display.max_rows", None)
+            display(self.conc, self.fluxes)
+            if self.unchanged == set():
+                print(
+                    "All of the metabolites changed concentration over the simulation"
+                )
+            else:
+                print(f"\nUnchanged metabolite concentrations\t{self.unchanged}")
+        return self.conc, self.fluxes
+
+    def _chemostat(self, feed_profile: dict, chemostat_L_hr, chemostat_L):
+        L_changed = chemostat_L_hr * (self.ts_min / 60)  # L/hr scaled to the timestep
+        # chemostat addition
+        for met_id, conc in feed_profile.items():
+            self.moles.at[met_id, self.time] += conc * L_changed
+            self.conc.at[met_id, self.time] = (
+                self.moles.at[met_id, self.time] / milli / chemostat_L
+            )  # normalize to the chemostat volume
+        # chemostat subtraction
+        for met in self.model_util.model.metabolites:
+            if met.compartment[0] != "e":
+                continue
+            ## update the chemical moles
+            self.moles.at[met.id, self.time] -= (
+                self.conc.at[met.id, self.time] * L_changed
+            )
+            ## define the chemical concentration
+            self.conc.at[met.id, self.time] = (
+                self.moles.at[met.id, self.time] / milli / chemostat_L
+            )
+
+    # nested functions
+    def __find_data_match(self, rxnID: str, source: str):
+        # identifies the datum whose experimental conditions most closely match the simulation conditions
+        temperature_deviation = ph_deviation = 0
+        if FBAHelper.isnumber(
+            self.kinetics_data[rxnID][source]["metadata"]["Temperature"]
+        ):
+            temp = float(self.kinetics_data[rxnID][source]["metadata"]["Temperature"])
+            temperature_deviation = (
+                abs(self.parameters["temperature"] - temp)
+                / self.parameters["temperature"]
+            )
+        if FBAHelper.isnumber(self.kinetics_data[rxnID][source]["metadata"]["pH"]):
+            pH = float(self.kinetics_data[rxnID][source]["metadata"]["pH"])
+            ph_deviation = abs(self.parameters["pH"] - pH) / self.parameters["pH"]
+
+        # equally weight the temperature and pH deviations from the simulation conditions
+        old_minimum = self.minimum
+        deviation = mean([temperature_deviation, ph_deviation])
+        self.minimum = min(deviation, self.minimum)
+        return (
+            "a" if old_minimum == self.minimum else "w"
+        )  # append or write a list of data
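+    # Worked example of the deviation score above (illustrative numbers): with a
+    # simulation at 25 C and pH 7, a datum measured at 30 C and pH 7 scores
+    # mean([|25 - 30| / 25, 0]) = 0.1. A datum that improves on the best score so
+    # far overwrites the flux list ("w"); otherwise its flux is appended ("a")
+    # and averaged with the rest.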
+    def _visualize(self, conc_fig_title, included_mets, labeled_plots):
+        # TODO construct a Vega visualization with a range bind that permits scanning over a time series
+        ## and accordingly adjusting arrowhead widths to reflect flux at the particular timestep.
+        ## The heatmap may likewise be dynamic for each timestep over a bind range.
+
+        # define the figure
+        pyplot.rcParams["figure.figsize"] = (11, 7)
+        pyplot.rcParams["figure.dpi"] = 150
+        self.figure, ax = pyplot.subplots()
+        ax.set_title(conc_fig_title)
+        ax.set_ylabel("Concentrations (mM)")
+
+        x_axis_scalar, unit = _x_axis_determination(self.total_min)
+        ax.set_xlabel(f"Time ({unit})")
+        legend_list = []
+        times = [
+            t * self.ts_min * x_axis_scalar
+            for t in range(self.parameters["timesteps"] + 1)
+        ]
+
+        # determine the plotted metabolites and the scale of the figure axis
+        bbox = (1, 1)
+        if not included_mets:
+            bbox = (1.7, 1)
+            # 1e-2 is an arbitrary concentration threshold for plotting on the figure
+            included_mets = [
+                chem
+                for chem in self.changed
+                if max(self.conc.loc[[chem]].values[0].tolist()) > 1e-2
+            ]
+
+        log_axis = False
+        minimum, maximum = inf, -inf
+        printed_concentrations = {}
+        for chem in self.changed:
+            if chem not in included_mets:
+                continue
+            concentrations = self.conc.loc[[chem]].values[0].tolist()
+            maximum = max(maximum, max([x if x > 1e-9 else 0 for x in concentrations]))
+            minimum = min(minimum, min([x if x > 1e-9 else 0 for x in concentrations]))
+            # plot chemicals with perturbed concentrations
+            ax.plot(times, concentrations)
+            if len(chem) > 25:
+                chem = f"{chem[:22]}..."  # truncate long labels for the legend
+            if not concentrations[0] < 1e-9:
+                legend_list.append(chem)
+            else:
+                legend_list.append(f"(rel) {chem}")
+
+            # design the proper location of the overlaid labels in the figure
+            if not labeled_plots:
+                continue
+            for i, conc in enumerate(concentrations):
+                if conc <= 1e-9:
+                    continue
+                x_value = i * self.ts_min * x_axis_scalar
+                vertical_adjustment = 0
+                if x_value in printed_concentrations:
+                    vertical_adjustment = (maximum - minimum) * 0.05
+                    if log_axis:
+                        vertical_adjustment = log10(maximum - minimum) / 3
+                ax.text(
+                    x_value,
+                    conc + vertical_adjustment,
+                    f"{chem} - {round(conc, 4)}",
+                    ha="left",
+                )
+                printed_concentrations[x_value] = conc
+                break
+
+        # finalize figure details
+        if maximum > 10 * minimum:
+            ax.set_yscale("log")
+        ax.set_xticks(times)
+        ax.grid(True)
+        ax.legend(
+            legend_list,
+            title="Changed chemicals",
+            loc="upper right",
+            bbox_to_anchor=bbox,
+            title_fontsize="x-large",
+            fontsize="large",
+        )
+
+    def _export(self, export_name="kineticsFBA", export_directory: str = None):
+        # define a unique simulation name and directory
+        export_name = export_name or "kineticsFBA"
+        directory = (
+            os.path.dirname(export_directory) if export_directory else os.getcwd()
+        )
+        self.parameters["simulation_path"] = self.simulation_path = os.path.join(
+            directory, export_name
+        )
+        os.makedirs(self.simulation_path, exist_ok=True)
+        # export simulation content
+        self.fluxes.to_csv(os.path.join(self.simulation_path, "fluxes.csv"))
+        self.conc.to_csv(os.path.join(self.simulation_path, "concentrations.csv"))
+        obj_vals_df = pandas.DataFrame(
+            [
+                (self.fluxes.columns[index].replace(" min", ""), sol.objective_value)
+                for index, sol in enumerate(self.sols)
+            ],
+            columns=["min", "objective_value"],
+        )
+        obj_vals_df.index = obj_vals_df["min"]
+        obj_vals_df.drop(["min"], axis=1, inplace=True)
+        obj_vals_df.to_csv(os.path.join(self.simulation_path, "objective_values.csv"))
+        # export the parameters
+        parameters_table = pandas.DataFrame(
+            list(self.parameters.items()), columns=["parameter", "value"]
+        )
+        parameters_table.to_csv(os.path.join(self.simulation_path, "parameters.csv"))
+        # export the figure
+        self.figure.savefig(
+            os.path.join(self.simulation_path, "changed_concentrations.svg")
+        )
+        if self.verbose and not self.jupyter:
+            self.figure.show()
diff --git a/modelseedpy/community/mssteadycom.py b/modelseedpy/community/mssteadycom.py
new file mode 100644
index 00000000..db851e34
--- /dev/null
+++ b/modelseedpy/community/mssteadycom.py
@@ -0,0 +1,438 @@
+from modelseedpy import FBAHelper
+from modelseedpy.core.exceptions import (
+    ObjectAlreadyDefinedError,
+    ParameterError,
+    NoFluxError,
+)
+
+# from modelseedpy.community.commhelper import build_from_species_models, CommHelper
+from optlang import Constraint, Variable
+from itertools import combinations
+from optlang.symbolics import Zero
+from pandas import DataFrame, concat
+from matplotlib import pyplot
+from numpy import array
+import networkx
+import sigfig
+import os, re
+
+
+def add_collection_item(
+    met_name,
+    normalized_flux,
+    flux_threshold,
+    ignore_mets,
+    species_collection,
+    first,
+    second,
+):
+    if flux_threshold and normalized_flux <= flux_threshold:
+        return species_collection
+    if not any([re.search(x, met_name, flags=re.IGNORECASE) for x in ignore_mets]):
+        species_collection[first][second].append(re.sub(r"(_\w\d$)", "", met_name))
+    return species_collection
+
+
+class MSSteadyCom:
+
+    @staticmethod
+    def run_fba(
+        mscommodel,
+        media,
+        pfba=False,
+        fva_reactions=None,
+        ava=False,
+        minMemGrowth: float = 1,
+        interactions=True,
+    ):
+
+        # minGrowth = Constraint(name="minMemGrowth", lb=, ub=None)
+        # mscommodel.model.add_cons_vars
+
+        # fix member abundances
+        if not mscommodel.abundances_set:
+            for member in mscommodel.members:
+                member.biomass_cpd.lb = minMemGrowth
+            all_metabolites = {mscommodel.primary_biomass.products[0]: 1}
+            all_metabolites.update(
+                {
+                    mem.biomass_cpd: 1 / len(mscommodel.members)
+                    for mem in mscommodel.members
+                }
+            )
+            mscommodel.primary_biomass.add_metabolites(all_metabolites, combine=False)
+            # TODO constrain fluxes to be proportional to the relative abundance
+
+        # TODO constrain the sum of fluxes to be proportional with the abundance
+        sol = mscommodel.run_fba(media, pfba, fva_reactions)
+        if interactions:
+            return MSSteadyCom.interactions(mscommodel, sol)
+        if ava:
+            return MSSteadyCom.abundance_variability_analysis(mscommodel, media)
+        return sol
+
+    @staticmethod
+    def abundance_variability_analysis(mscommodel, media):
+        variability = {}
+        for mem in mscommodel.members:
+            variability[mem.id] = {}
+            # minimal variability
+            mscommodel.set_objective(mem.biomasses, minimize=True)
+            variability[mem.id]["minVar"] = mscommodel.run_fba(media)
+            # maximal variability
+            mscommodel.set_objective(mem.biomasses, minimize=False)
+            variability[mem.id]["maxVar"] = mscommodel.run_fba(media)
+        return variability
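+    # Abundance variability analysis above brackets each member's feasible biomass
+    # by minimizing and then maximizing its biomass objective under the same media,
+    # i.e. an FVA-style scan over member abundances rather than reactions.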
+    @staticmethod
+    def interactions(
+        mscommodel,  # The MSCommunity object of the model (mandatory to prevent circular imports)
+        solution=None,  # the COBRA simulation solution that will be parsed and visualized
+        media=None,  # The media in which the community model will be simulated
+        # names=None, abundances=None,  # names and abundances of the community species
+        flux_threshold: int = 1,  # The threshold of normalized flux below which a reaction is not plotted
+        msdb=None,
+        msdb_path: str = None,
+        visualize: bool = True,  # specifies whether the net flux will be depicted in a network diagram
+        filename: str = "cross_feeding",  # Cross-feeding figure export name
+        export_format: str = "svg",
+        node_metabolites: bool = True,  # specifies whether the metabolites of each node will be printed
+        show_figure: bool = True,  # specifies whether the figure will be printed to the console
+        ignore_mets=None,  # cross-fed exchanges that will not be displayed in the graphs
+    ):
+        # verify that the model has a solution, and parallelize where the solver permits
+        solver = str(type(mscommodel.util.model.solver))
+        print(f"{solver} model loaded")
+        if "gurobi" in solver:
+            mscommodel.util.model.problem.Params.Threads = max(1, os.cpu_count() // 2)
+        solution = solution or mscommodel.run_fba(media)
+        if not solution:
+            raise ParameterError(
+                "A solution must be provided, from which interactions are computed."
+            )
+        if all(array(list(solution.fluxes.values)) == 0):
+            raise NoFluxError("The simulation lacks any flux.")
+
+        # Initialize data
+        metabolite_data, species_data, species_collection = (
+            {},
+            {"Environment": {}},
+            {"Environment": {}},
+        )
+        data = {"IDs": [], "Metabolites/Donor": [], "Environment": []}
+        species_list = {}
+
+        # track extracellularly exchanged metabolites
+        exchange_mets_list = mscommodel.util.exchange_mets_list()
+        for met in exchange_mets_list:
+            data["IDs"].append(met.id)
+            data["Metabolites/Donor"].append(re.sub(r"(_\w\d$)", "", met.name))
+            metabolite_data[met.id] = {"Environment": 0}
+            metabolite_data[met.id].update(
+                {individual.id: 0 for individual in mscommodel.members}
+            )
+
+        # computing net metabolite flux from each reaction
+        for individual in mscommodel.members:
+            species_data[individual.id], species_collection[individual.id] = {}, {}
+            species_list[individual.index] = individual
+            data[individual.id] = []
+            for other in mscommodel.members:
+                species_data[individual.id][other.id] = 0
+                species_collection[individual.id][other.id] = []
+            species_data["Environment"][individual.id] = species_data[individual.id][
+                "Environment"
+            ] = 0
+            species_collection["Environment"][individual.id] = []
+            species_collection[individual.id]["Environment"] = []
+
+        for rxn in mscommodel.util.model.reactions:
+            if rxn.id[0:3] == "EX_":
+                cpd = list(rxn.metabolites.keys())[0]
+                # the Environment takes the opposite perspective to the members
+                metabolite_data[cpd.id]["Environment"] += -solution.fluxes[rxn.id]
+            rxn_index = int(FBAHelper.rxn_compartment(rxn)[1:])
+            if (
+                not any([met not in exchange_mets_list for met in rxn.metabolites])
+                or rxn_index not in species_list
+            ):
+                continue
+            for met in rxn.metabolites:
+                if met.id not in metabolite_data:
+                    continue
+                metabolite_data[met.id][species_list[rxn_index].id] += (
+                    solution.fluxes[rxn.id] * rxn.metabolites[met]
+                )
+
+        # translating net metabolite flux into species interaction flux
+        ignore_mets = ignore_mets if ignore_mets is not None else ["h2o_e0", "co2_e0"]
+        for met in exchange_mets_list:
+            # Iterating through the metabolite producers
+            # TODO Why are fluxes normalized?
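+            # Illustrative arithmetic for the normalization below (numbers are
+            # hypothetical): with production fluxes {member A: 6, Environment: 4}
+            # and a consumption flux of -3 for member B, total = 10 and the A -> B
+            # interaction weight is |6 * -3| / 10 = 1.8.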
+ total = sum( + [ + max([metabolite_data[met.id][individual.id], 0]) + for individual in mscommodel.members + ] + ) + max([metabolite_data[met.id]["Environment"], 0]) + for individual in mscommodel.members: + ## calculate metabolic consumption of a species from the environment + if metabolite_data[met.id][individual.id] < Zero: + if metabolite_data[met.id]["Environment"] <= Zero: + continue + normalized_flux = ( + abs( + metabolite_data[met.id][individual.id] + * metabolite_data[met.id]["Environment"] + ) + / total + ) + species_data["Environment"][individual.id] += normalized_flux + species_collection = add_collection_item( + met.name, + normalized_flux, + flux_threshold, + ignore_mets, + species_collection, + "Environment", + individual.id, + ) + ## calculate and track metabolic donations between a member and another or the environment + elif metabolite_data[met.id][individual.id] > Zero: + for other in mscommodel.members: + ### filter against organisms that do not consume + if metabolite_data[met.id][other.id] >= Zero: + continue + normalized_flux = ( + abs( + metabolite_data[met.id][individual.id] + * metabolite_data[met.id][other.id] + ) + / total + ) + species_data[individual.id][other.id] += normalized_flux + species_collection = add_collection_item( + met.name, + normalized_flux, + flux_threshold, + ignore_mets, + species_collection, + individual.id, + other.id, + ) + ## calculate donations to the environment + if metabolite_data[met.id]["Environment"] >= Zero: + continue + normalized_flux = ( + abs( + metabolite_data[met.id][individual.id] + * metabolite_data[met.id]["Environment"] + ) + / total + ) + species_data[individual.id]["Environment"] += normalized_flux + species_collection = add_collection_item( + met.name, + normalized_flux, + flux_threshold, + ignore_mets, + species_collection, + individual.id, + "Environment", + ) + + # construct the dataframes + for metID in metabolite_data: + for individual in mscommodel.members: + data[individual.id].append(metabolite_data[metID][individual.id]) + data["Environment"].append(metabolite_data[metID]["Environment"]) + + ## process the fluxes dataframe + data["IDs"].append("zz_Environment") + data["Metabolites/Donor"].append(0) + for individual in mscommodel.members: + data[individual.id].append(species_data["Environment"][individual.id]) + data["Environment"].append(0) + for individual in mscommodel.members: + for other in mscommodel.members: + data[individual.id].append(species_data[individual.id][other.id]) + data["Environment"].append(species_data[individual.id]["Environment"]) + data["IDs"].append(f"zz_Species{individual.index}") + data["Metabolites/Donor"].append(individual.id) + + # if len(set(list(map(len, list(data.values()))))) != 1: + # print([(col, len(content)) for col, content in data.items()]) + cross_feeding_df = DataFrame(data) + cross_feeding_df.index = [ + ID.replace("_e0", "") for ID in map(str, cross_feeding_df["IDs"]) + ] + cross_feeding_df.index.name = "Metabolite/Donor ID" + cross_feeding_df.drop(["IDs", "Metabolites/Donor"], axis=1, inplace=True) + cross_feeding_df = cross_feeding_df.loc[(cross_feeding_df != 0).any(axis=1)] + cross_feeding_df.sort_index(inplace=True) + + ## process the identities dataframe + exchanged_mets = {"Environment": [" "], "Donor ID": ["Environment"]} + exchanged_mets.update({ind.id: [] for ind in mscommodel.members}) + for individual in mscommodel.members: + ### environment exchanges + exchanged_mets[individual.id].append( + "; ".join(species_collection["Environment"][individual.id]) + 
)
+            exchanged_mets["Environment"].append(
+                "; ".join(species_collection[individual.id]["Environment"])
+            )
+            ### member exchanges
+            exchanged_mets["Donor ID"].append(individual.id)
+            for other in mscommodel.members:
+                exchanged_mets[individual.id].append(
+                    "; ".join(species_collection[individual.id][other.id])
+                )
+
+        # if len(set(list(map(len, list(exchanged_mets.values()))))) != 1:
+        #     print([(col, len(content)) for col, content in exchanged_mets.items()])
+        exMets_df = DataFrame(exchanged_mets)
+        exMets_df.index = [
+            ID.replace("_e0", "") for ID in map(str, exMets_df["Donor ID"])
+        ]
+        exMets_df.index.name = "Donor ID"
+        exMets_df.drop(["Donor ID"], axis=1, inplace=True)
+        exMets_df.sort_index(inplace=True)
+        exMets_df.fillna(" ", inplace=True)
+
+        # graph the network diagram
+        if visualize:
+            MSSteadyCom.visual_interactions(
+                cross_feeding_df,
+                filename,
+                export_format,
+                msdb,
+                msdb_path,
+                show_figure,
+                node_metabolites,
+            )
+
+        return cross_feeding_df, exMets_df
+
+    @staticmethod
+    def visual_interactions(
+        cross_feeding_df,
+        filename="cross_feeding",
+        export_format="svg",
+        msdb=None,
+        msdb_path=None,
+        view_figure=True,
+        node_metabolites=True,
+    ):
+        # load the MSDB
+        assert msdb or msdb_path, (
+            "Either the MSDB object or the local MSDB path must be provided"
+        )
+        from modelseedpy.biochem import from_local
+
+        msdb = msdb or from_local(msdb_path)
+        # construct the structure of the cross-feeding DataFrame
+        if "Metabolite/Donor ID" in cross_feeding_df.columns:
+            cross_feeding_df.index = [
+                metID.replace("_e0", "")
+                for metID in cross_feeding_df["Metabolite/Donor ID"].values
+            ]
+            cross_feeding_df.index.name = "Metabolite/Donor ID"
+            cross_feeding_df.drop(
+                [col for col in cross_feeding_df.columns if "ID" in col],
+                axis=1,
+                inplace=True,
+            )
+        else:
+            cross_feeding_df.index = [
+                metID.replace("_e0", "") for metID in cross_feeding_df.index
+            ]
+        # define the cross-fed metabolites
+        cross_feeding_rows = []
+        for index, row in cross_feeding_df.iterrows():
+            positive = negative = False
+            for col, val in row.items():
+                if col not in ["Environment"]:
+                    if val > 1e-4:
+                        positive = True
+                    elif val < -1e-4:
+                        negative = True
+                if negative and positive:
+                    cross_feeding_rows.append(row)
+                    break
+        metabolites_df = concat(cross_feeding_rows, axis=1).T
+        metabolites_df.index.name = "Metabolite ID"
+        display(metabolites_df)
+        metabolites = [
+            msdb.compounds.get_by_id(metID.replace("_e0", ""))
+            for metID in metabolites_df.index.tolist()
+            if metID not in ["cpdETCM", "cpdETCMe"]
+        ]
+        # define the community members that participate in cross-feeding
+        members = metabolites_df.loc[
+            :, (metabolites_df != 0).any(axis=0)
+        ].columns.tolist()
+        members.remove("Environment")
+        members_cluster1, members_cluster2 = (
+            members[: int(len(members) / 2)],
+            members[int(len(members) / 2) :],
+        )
+
+        # TODO define a third node tier of just the environment as a rectangle that spans the width of the members,
+        ## which may alleviate much of the ambiguity about mass imbalance between the member fluxes
+        import graphviz
+
+        dot = graphviz.Digraph(filename, format=export_format)  # directed graph
+        # define nodes
+        ## top-layer members
+        # TODO hyperlink the member nodes with their Narrative link
+        dot.attr("node", shape="rectangle", color="lightblue2", style="filled")
+        for mem in members_cluster1:
+            index = members.index(mem)
+            dot.node(f"S{index}", mem)
+        ## mets in the middle layer
+        with dot.subgraph(name="mets") as mets_subgraph:
+            mets_subgraph.attr(rank="same")
+            mets_subgraph.attr("node", shape="circle", color="green", style="filled")
+            for met in metabolites:
+                mets_subgraph.node(
+                    met.abbr[:3],
+                    fixedsize="true",
+                    height="0.4",
+                    tooltip=f"{met.id} ; {met.name}",
+                    URL=f"https://modelseed.org/biochem/compounds/{met.id}",
+                )
+        ## bottom-layer members
+        with dot.subgraph(name="members") as members_subgraph:
+            members_subgraph.attr(rank="same")
+            for mem in members_cluster2:
+                index = members.index(mem)
+                # nodes must be declared within the subgraph for rank="same" to apply
+                members_subgraph.node(f"S{index}", mem)
+        # define the edges by parsing the interaction DataFrame
+        for met in metabolites:
+            row = metabolites_df.loc[met.id]
+            maxVal = max(list(row.to_numpy()))
+            for col, val in row.items():
+                if col == "Environment":
+                    continue
+                index = members.index(col)
+                # TODO color carbon sources red
+                if val > 0:
+                    dot.edge(
+                        f"S{index}",
+                        met.abbr[:3],
+                        arrowsize=f"{val / maxVal}",
+                        edgetooltip=str(val),
+                    )
+                if val < 0:
+                    dot.edge(
+                        met.abbr[:3],
+                        f"S{index}",
+                        arrowsize=f"{abs(val / maxVal)}",
+                        edgetooltip=str(val),
+                    )
+
+        # render and export the source
+        dot.render(filename, view=view_figure)
+        return dot.source
diff --git a/modelseedpy/community/steadycom_template.html b/modelseedpy/community/steadycom_template.html
new file mode 100644
index 00000000..b894c7f7
--- /dev/null
+++ b/modelseedpy/community/steadycom_template.html
@@ -0,0 +1,54 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="utf-8"/>
+    <title>SteadyCom Results</title>
+</head>

+<body>
+    <h1>SteadyCom Results</h1>
+</body>
+</html>

+ + + + \ No newline at end of file diff --git a/modelseedpy/config.cfg b/modelseedpy/config.cfg index 3aee00c7..56fd6422 100644 --- a/modelseedpy/config.cfg +++ b/modelseedpy/config.cfg @@ -1,3 +1,5 @@ +[biochem] +path = /deps/ModelSEEDDatabase/ [data] template_folder = data/templates classifier_folder = data/ml diff --git a/modelseedpy/core/__init__.py b/modelseedpy/core/__init__.py index 7e16d262..bd374a03 100644 --- a/modelseedpy/core/__init__.py +++ b/modelseedpy/core/__init__.py @@ -9,6 +9,9 @@ from modelseedpy.core.mseditorapi import MSEditorAPI, MSEquation from modelseedpy.core.msgapfill import MSGapfill from modelseedpy.core.msatpcorrection import MSATPCorrection -from modelseedpy.core.msgrowthphenotypes import MSGrowthPhenotypes +from modelseedpy.core.msgrowthphenotypes import MSGrowthPhenotypes, MSGrowthPhenotype from modelseedpy.core.msmodelutl import MSModelUtil +from modelseedpy.core.mstemplate import MSTemplateBuilder +from modelseedpy.core.msmodelreport import MSModelReport +from modelseedpy.core.annotationontology import AnnotationOntology from modelseedpy.core.exceptions import * diff --git a/modelseedpy/core/annotationontology.py b/modelseedpy/core/annotationontology.py new file mode 100644 index 00000000..05eed49d --- /dev/null +++ b/modelseedpy/core/annotationontology.py @@ -0,0 +1,455 @@ +# -*- coding: utf-8 -*- +import logging +import re +import time +import json +import sys +import pandas as pd +import cobra +from cobra import DictList +from modelseedpy.core.msgenome import MSGenome + +# from builtins import None + +logger = logging.getLogger(__name__) +logger.setLevel( + logging.INFO +) # When debugging - set this to INFO then change needed messages below from DEBUG to INFO + +# Class structure +# AnnotationOntology -> Features/Events/Terms/Ontologies +# AnnotationOntologyOntology -> Events/Terms +# AnnotationOntologyEvent -> Features/Ontology +# AnnotationOntologyFeature -> Term+Event->Evidence +# AnnotationOntologyTerm -> Ontology/Events/Featurs +# AnnotationOntologyEvidence -> -- + +allowable_score_types = [ + "probability", + "evalue", + "bitscore", + "identity", + "qalignstart", + "qalignstop", + "salignstart", + "salignstop", + "kmerhits", + "tmscore", + "rmsd", + "hmmscore", +] + +def convert_to_search_role(role): + role = role.lower() + role = re.sub("\s","",role) + role = re.sub("[\d\-]+\.[\d\-]+\.[\d\-]+\.[\d\-]*","",role) + role = re.sub("\#.*$","",role) + role = re.sub("\(ec:*\)","",role) + role = re.sub("[\(\)\[\],-]","",role) + return role + +def split_role(role): + return re.split("\s*;\s+|\s+[\@\/]\s+",role) + +class AnnotationOntologyEvidence: + def __init__(self, parent, event, term, probability=1, scores={}, ref_entity=None, entity_type=None): + self.parent = parent + self.event = event + self.term = term + self.probability = probability + self.ref_entity = ref_entity + self.entity_type = entity_type + self.scores = scores + for item in self.scores: + if item not in allowable_score_types: + logger.warning(item + " not an allowable score type!") + + def to_data(self): + output = { + "event":self.event.method, + "term":self.term.id, + "ontology":self.term.ontology.id, + "probability":self.probability + } + if self.ref_entity: + output["ref_entity"] = self.ref_entity + if self.entity_type: + output["entity_type"] = self.entity_type + if self.scores: + output["scores"] = self.scores + return output + + +class AnnotationOntologyTerm: + def __init__(self, parent, term_id, ontology): + self.id = term_id + self.parent = parent + self.ontology = ontology 
+ self.ontology.add_term(self) + self.parent.add_term(self) + self.msrxns = set() + self.events = {} + self.features = {} + + def add_msrxns(self, rxn_ids): + for rxn_id in rxn_ids: + if rxn_id[0:6] == "MSRXN:": + rxn_id = rxn_id[6:] + self.msrxns.update([rxn_id]) + + def add_event(self, event): + self.events[event.id] = event + + def add_feature(self, feature): + self.features[feature.id] = feature + + +class AnnotationOntologyOntology: + def __init__(self, parent, ontology_id): + self.id = ontology_id + self.parent = parent + self.events = {} + self.terms = {} + + def add_event(self, event): + self.events[event.id] = event + + def add_term(self, term): + self.terms[term.id] = term + + +class AnnotationOntologyFeature: + def __init__(self, parent, feature_id, type=None): + self.id = feature_id + self.parent = parent + parent.add_feature(self) + self.type = type + self.event_terms = {} + self.term_events = {} + + def add_event_term(self, event, term, scores={}, ref_entity=None, entity_type=None,probability=1): + if event.id not in self.event_terms: + self.event_terms[event.id] = {} + self.event_terms[event.id][term.id] = AnnotationOntologyEvidence( + self,event,term,probability=probability,scores=scores,ref_entity=ref_entity,entity_type=entity_type + ) + if term.id not in self.term_events: + self.term_events[term.id] = {} + self.term_events[term.id][event.id] = self.event_terms[event.id][term.id] + + def get_associated_terms( + self, + prioritized_event_list=None, + ontologies=None, + merge_all=False, + translate_to_rast=False, + ): + output = {} + for term_id in self.term_events: + term = self.parent.terms[term_id] + if not ontologies or term.ontology.id in ontologies: + if merge_all or not prioritized_event_list: + for event_id in self.term_events[term_id]: + if ( + not prioritized_event_list + or event_id in prioritized_event_list + ): + if term not in output: + output[term] = [] + output[term].append( + self.term_events[term_id][event_id].to_data() + ) + else: + for event_id in prioritized_event_list: + if event_id in self.term_events[term_id]: + rxns = self.parent.terms[term_id].msrxns + if len(rxns) > 0: + if term not in output: + output[term] = [] + output[term].append( + self.term_events[term_id][event_id].to_data() + ) + break + return output + + def get_associated_reactions( + self, prioritized_event_list=None, ontologies=None, merge_all=False + ): + output = {} + for term_id in self.term_events: + if not ontologies or self.parent.terms[term_id].ontology.id in ontologies: + if merge_all or not prioritized_event_list: + for event_id in self.term_events[term_id]: + if ( + not prioritized_event_list + or event_id in prioritized_event_list + ): + rxns = self.parent.terms[term_id].msrxns + for rxn_id in rxns: + if rxn_id not in output: + output[rxn_id] = [] + output[rxn_id].append( + self.term_events[term_id][event_id].to_data() + ) + else: + for event_id in prioritized_event_list: + if event_id in self.term_events[term_id]: + rxns = self.parent.terms[term_id].msrxns + for rxn_id in rxns: + if rxn_id not in output: + output[rxn_id] = [] + output[rxn_id].append( + self.term_events[term_id][event_id].to_data() + ) + if len(rxns) > 0: + break + return output + + +class AnnotationOntologyEvent: + def __init__( + self, + parent, + event_id, + ontology_id, + method, + method_version=None, + description=None, + timestamp=None, + ): + self.id = event_id + self.parent = parent + # Linking ontology + self.ontology = self.parent.add_ontology(ontology_id) + self.ontology.add_event(self) + if not 
description:
+            self.description = ""  # TODO
+        else:
+            self.description = description
+        self.method = method
+        self.method_version = method_version
+        self.timestamp = timestamp
+        self.features = {}
+
+    @staticmethod
+    def from_data(data, parent):
+        if "method_version" not in data:
+            data["method_version"] = None
+        if "description" not in data:
+            data["description"] = None
+        if "timestamp" not in data:
+            data["timestamp"] = None
+        self = AnnotationOntologyEvent(
+            parent,
+            data["event_id"],
+            data["ontology_id"],
+            data["method"],
+            data["method_version"],
+            data["description"],
+            data["timestamp"],
+        )
+        if "ontology_terms" in data:
+            for feature_id in data["ontology_terms"]:
+                feature = self.parent.add_feature(feature_id)
+                self.add_feature(feature)
+                for item in data["ontology_terms"][feature_id]:
+                    term = self.parent.add_term(item["term"], self.ontology)
+                    scores = {}
+                    ref_entity = None
+                    entity_type = None
+                    if "evidence" in item:
+                        if "scores" in item["evidence"]:
+                            scores = item["evidence"]["scores"]
+                        if "reference" in item["evidence"]:
+                            ref_entity = item["evidence"]["reference"][1]
+                            entity_type = item["evidence"]["reference"][0]
+                    probability = 1 / len(data["ontology_terms"][feature_id])
+                    feature.add_event_term(
+                        self, term, scores, ref_entity, entity_type, probability
+                    )
+                    if "modelseed_ids" in item:
+                        term.add_msrxns(item["modelseed_ids"])
+        return self
+
+    def add_feature(self, feature):
+        self.features[feature.id] = feature
+
+    def to_data(self):
+        data = {
+            "event_id": self.id,
+            "description": self.description,
+            "ontology_id": self.ontology.id,
+            "method": self.method,
+            "method_version": self.method_version,
+            "timestamp": self.timestamp,
+            "ontology_terms": {},
+        }
+        for feature in self.features:
+            data["ontology_terms"][feature] = {"term": None}  # TODO
+        return data
+
+
+class AnnotationOntology:
+    mdlutls = {}
+
+    @staticmethod
+    def from_kbase_data(data, genome_ref=None, data_dir=None):
+        self = AnnotationOntology(genome_ref, data_dir)
+        if "feature_types" in data:
+            self.feature_types = data["feature_types"]
+        if "events" in data:
+            for event in data["events"]:
+                self.events += [AnnotationOntologyEvent.from_data(event, self)]
+        return self
+
+    def __init__(self, genome_ref, data_dir):
+        self.genome_ref = genome_ref
+        self.events = DictList()
+        self.terms = {}
+        self.ontologies = {}
+        self.genes = {}
+        self.cdss = {}
+        self.data_dir = data_dir
+        self.noncodings = {}
+        self.feature_types = {}
+        self.term_names = {}
+        self.info = None
+
+    def get_term_name(self, term):
+        if term.ontology.id not in self.term_names:
+            self.term_names[term.ontology.id] = {}
+            if term.ontology.id in [
+                "SSO",
+                "AntiSmash",
+                "EC",
+                "TC",
+                "META",
+                "RO",
+                "KO",
+                "GO",
+            ]:
+                with open(
+                    self.data_dir + "/" + term.ontology.id + "_dictionary.json"
+                ) as json_file:
+                    ontology = json.load(json_file)
+                for item in ontology["term_hash"]:
+                    self.term_names[term.ontology.id][item] = ontology["term_hash"][
+                        item
+                    ]["name"]
+        if term.id not in self.term_names[term.ontology.id]:
+            return "Unknown"
+        return self.term_names[term.ontology.id][term.id]
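+    # Note on the two lookups below: both walk prioritized_event_list in order and,
+    # unless merge_all is set, keep only the first event whose terms resolve to
+    # ModelSEED reactions, so the event ordering expresses annotation priority.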
+    def get_gene_term_hash(
+        self,
+        prioritized_event_list=None,
+        ontologies=None,
+        merge_all=False,
+        feature_type=None,
+        translate_to_rast=True,
+    ):
+        output = {}
+        feature_hash = self.genes
+        if len(self.genes) == 0 or (feature_type == "cds" and len(self.cdss) > 0):
+            feature_hash = self.cdss
+        for feature_id in feature_hash:
+            if not feature_type or feature_type == self.feature_types[feature_id]:
+                feature = feature_hash[feature_id]
+                output[feature] = feature.get_associated_terms(
+                    prioritized_event_list, ontologies, merge_all, translate_to_rast
+                )
+        return output
+
+    def get_reaction_gene_hash(
+        self,
+        prioritized_event_list=None,
+        ontologies=None,
+        merge_all=False,
+        cds_features=False,
+        feature_type=None,
+    ):
+        output = {}
+        feature_hash = self.genes
+        if len(self.genes) == 0 or (cds_features and len(self.cdss) > 0):
+            feature_hash = self.cdss
+        for feature_id in feature_hash:
+            if not feature_type or feature_type == self.feature_types[feature_id]:
+                reactions = feature_hash[feature_id].get_associated_reactions(
+                    prioritized_event_list, ontologies, merge_all
+                )
+                for rxn_id in reactions:
+                    if rxn_id not in output:
+                        output[rxn_id] = {}
+                    if feature_id not in output[rxn_id]:
+                        output[rxn_id][feature_id] = {"probability": 0, "evidence": []}
+                    for item in reactions[rxn_id]:
+                        output[rxn_id][feature_id]["evidence"].append(item)
+        for rxn_id in output:
+            total_prob = 0
+            for feature_id in output[rxn_id]:
+                sub_total_prob = 0
+                for evidence in output[rxn_id][feature_id]["evidence"]:
+                    sub_total_prob += evidence["probability"]
+                output[rxn_id][feature_id]["probability"] = sub_total_prob
+                total_prob += sub_total_prob
+            if total_prob == 0:
+                continue  # avoid dividing by zero when no evidence carries probability
+            for feature_id in output[rxn_id]:
+                output[rxn_id][feature_id]["probability"] = (
+                    output[rxn_id][feature_id]["probability"] / total_prob
+                )
+        return output
+
+    def add_term(self, term_or_id, ontology=None):
+        if not isinstance(term_or_id, AnnotationOntologyTerm):
+            if term_or_id in self.terms:
+                return self.terms[term_or_id]
+            return AnnotationOntologyTerm(self, term_or_id, ontology)
+        if term_or_id.id in self.terms:
+            logger.critical("Term with id " + term_or_id.id + " already in annotation!")
+            return self.terms[term_or_id.id]
+        self.terms[term_or_id.id] = term_or_id
+        return term_or_id
+
+    def add_ontology(self, ontology_or_id):
+        if not isinstance(ontology_or_id, AnnotationOntologyOntology):
+            if ontology_or_id in self.ontologies:
+                return self.ontologies[ontology_or_id]
+            return AnnotationOntologyOntology(self, ontology_or_id)
+        if ontology_or_id.id in self.ontologies:
+            logger.critical(
+                "Ontology with id " + ontology_or_id.id + " already in annotation!"
+            )
+            return self.ontologies[ontology_or_id.id]
+        self.ontologies[ontology_or_id.id] = ontology_or_id
+        return ontology_or_id
+
+    def get_feature_hash(self, feature_id):
+        feature_hash = self.genes
+        if feature_id in self.feature_types:
+            if self.feature_types[feature_id] == "cds":
+                feature_hash = self.cdss
+            elif self.feature_types[feature_id] == "noncoding":
+                feature_hash = self.noncodings
+        return feature_hash
+
+    def add_feature(self, feature_or_id):
+        feature_hash = None
+        if not isinstance(feature_or_id, AnnotationOntologyFeature):
+            feature_hash = self.get_feature_hash(feature_or_id)
+            if feature_or_id in feature_hash:
+                return feature_hash[feature_or_id]
+            feature_or_id = AnnotationOntologyFeature(self, feature_or_id)
+        if not feature_hash:
+            feature_hash = self.get_feature_hash(feature_or_id.id)
+        if feature_or_id.id not in feature_hash:
+            feature_hash[feature_or_id.id] = feature_or_id
+        return feature_hash[feature_or_id.id]
+
+    def get_msgenome(
+        self,
+        prioritized_event_list=None,
+        ontologies=None,
+        merge_all=False,
+        feature_type=None,
+        translate_to_rast=True,
+    ):
+        newgenome = MSGenome.from_annotation_ontology(
+            self, prioritized_event_list, ontologies, merge_all, feature_type, translate_to_rast
+        )
+        newgenome.annoont = self
+        return newgenome
\ No newline at end of file
diff --git a/modelseedpy/core/exceptions.py b/modelseedpy/core/exceptions.py
index ce708956..e3e01211 100644
--- a/modelseedpy/core/exceptions.py
+++ b/modelseedpy/core/exceptions.py
@@ -1,6 +1,12 @@
 # -*- coding: utf-8 -*-
 # Adding a few exception classes to handle different types of errors in a central file
+class ModelSEEDError(Exception):
+    """Error in ModelSEED execution logic"""
+
+    pass
+
+
 class FeasibilityError(Exception):
     """Error in FBA formulation"""

@@ -18,3 +24,37 @@ class GapfillingError(Exception):
     """Error in model gapfilling"""

     pass
+
+
+class ObjectError(Exception):
+    """Error in the construction of a base KBase object"""
+
+    pass
+
+
+class ParameterError(Exception):
+    """Error in a parameterization"""
+
+    pass
+
+
+class ObjectAlreadyDefinedError(Exception):
+    pass
+
+
+class NoFluxError(Exception):
+    """Error for FBA solutions"""
+
+    pass
+
+
+class ObjectiveError(Exception):
+    """Erroneous assignment of a secondary objective via a constraint"""
+
+    pass
+
+
+class ModelError(Exception):
+    """Errors in a model that corrupt the simulation"""
+
+    pass
\ No newline at end of file
diff --git a/modelseedpy/core/fbabuilder.py b/modelseedpy/core/fbabuilder.py
new file mode 100644
index 00000000..b82e161d
--- /dev/null
+++ b/modelseedpy/core/fbabuilder.py
@@ -0,0 +1,1546 @@
+import logging
+
+import re
+import copy
+from optlang.symbolics import Zero, add
+from cobra.core import Gene, Metabolite, Model, Reaction
+from cobrakbase.core.kbaseobject import AttrDict
+from cobrakbase.annotation_ontology_api.annotation_ontology_apiServiceClient import (
+    annotation_ontology_api,
+)
+import modelseedpy.core.fbahelper
+
+logger = logging.getLogger(__name__)
+
+
+def build_cpd_id(str):
+    if str.startswith("M_"):
+        str = str[2:]
+    elif str.startswith("M-"):
+        str = str[2:]
+    str_fix = str
+    if "-" in str_fix:
+        str_fix = str_fix.replace("-", "__DASH__")
+    if not str == str_fix:
+        logger.debug("[Species] rename: [%s] -> [%s]", str, str_fix)
+    return str_fix
+
+
+def build_rxn_id(str):
+    if str.startswith("R_"):
+        str = str[2:]
+    elif str.startswith("R-"):
+        str = str[2:]
+    str_fix = str
+    if "-" in str_fix:
+        str_fix = str_fix.replace("-", "__DASH__")
+    if not str == str_fix:
+        logger.debug("[Reaction] rename: [%s] -> [%s]", str, 
str_fix) + return str_fix + + +# Adding a few exception classes to handle different types of errors +class ObjectError(Exception): + """Error in the construction of a base KBase object""" + + pass + + +class FeasibilityError(Exception): + """Error in FBA formulation""" + + pass + + +# New class to store functions to building and tracking new constraints and variables related to our own custom FBA formulations +class KBaseFBAUtilities: + def __init__( + self, + cobramodel, + fbamodel, + kbapi, + media=None, + default_uptake=100, + default_excretion=100, + blacklist=[], + auto_sink=["cpd02701_c", "cpd11416_c0", "cpd15302_c"], + ): + self.cobramodel = cobramodel + self.SBO_ANNOTATION = "sbo" + self.metabolites_remap = {} + self.solution_exclusion_constraints = [] + self.kbapi = kbapi + self.potential_variables = dict() + self.reversibility_binary = dict() + self.reversibility_binary_constraints = dict() + self.binary_flux_variables = dict() + self.total_flux_variables = dict() + self.total_flux_constraints = dict() + self.binary_flux_constraints = dict() + self.simple_thermo_constraints = dict() + self.metabolomics_peak_variables = dict() + self.metabolomics_peak_constraints = dict() + self.compound_flux_variables = dict() + self.compound_flux_constraints = dict() + self.metabolomics_constraints = dict() + self.media = None + self.default_uptake = default_uptake + self.default_excretion = default_excretion + self.apply_media_to_model(media, self.default_uptake, self.default_excretion) + self.blacklist = [ + "rxn12985", + "rxn00238", + "rxn07058", + "rxn05305", + "rxn00154", + "rxn09037", + "rxn10643", + "rxn11317", + "rxn05254", + "rxn05257", + "rxn05258", + "rxn05259", + "rxn05264", + "rxn05268", + "rxn05269", + "rxn05270", + "rxn05271", + "rxn05272", + "rxn05273", + "rxn05274", + "rxn05275", + "rxn05276", + "rxn05277", + "rxn05278", + "rxn05279", + "rxn05280", + "rxn05281", + "rxn05282", + "rxn05283", + "rxn05284", + "rxn05285", + "rxn05286", + "rxn05963", + "rxn05964", + "rxn05971", + "rxn05989", + "rxn05990", + "rxn06041", + "rxn06042", + "rxn06043", + "rxn06044", + "rxn06045", + "rxn06046", + "rxn06079", + "rxn06080", + "rxn06081", + "rxn06086", + "rxn06087", + "rxn06088", + "rxn06089", + "rxn06090", + "rxn06091", + "rxn06092", + "rxn06138", + "rxn06139", + "rxn06140", + "rxn06141", + "rxn06145", + "rxn06217", + "rxn06218", + "rxn06219", + "rxn06220", + "rxn06221", + "rxn06222", + "rxn06223", + "rxn06235", + "rxn06362", + "rxn06368", + "rxn06378", + "rxn06474", + "rxn06475", + "rxn06502", + "rxn06562", + "rxn06569", + "rxn06604", + "rxn06702", + "rxn06706", + "rxn06715", + "rxn06803", + "rxn06811", + "rxn06812", + "rxn06850", + "rxn06901", + "rxn06971", + "rxn06999", + "rxn07123", + "rxn07172", + "rxn07254", + "rxn07255", + "rxn07269", + "rxn07451", + "rxn09037", + "rxn10018", + "rxn10077", + "rxn10096", + "rxn10097", + "rxn10098", + "rxn10099", + "rxn10101", + "rxn10102", + "rxn10103", + "rxn10104", + "rxn10105", + "rxn10106", + "rxn10107", + "rxn10109", + "rxn10111", + "rxn10403", + "rxn10410", + "rxn10416", + "rxn11313", + "rxn11316", + "rxn11318", + "rxn11353", + "rxn05224", + "rxn05795", + "rxn05796", + "rxn05797", + "rxn05798", + "rxn05799", + "rxn05801", + "rxn05802", + "rxn05803", + "rxn05804", + "rxn05805", + "rxn05806", + "rxn05808", + "rxn05812", + "rxn05815", + "rxn05832", + "rxn05836", + "rxn05851", + "rxn05857", + "rxn05869", + "rxn05870", + "rxn05884", + "rxn05888", + "rxn05896", + "rxn05898", + "rxn05900", + "rxn05903", + "rxn05904", + "rxn05905", + "rxn05911", + 
"rxn05921", + "rxn05925", + "rxn05936", + "rxn05947", + "rxn05956", + "rxn05959", + "rxn05960", + "rxn05980", + "rxn05991", + "rxn05992", + "rxn05999", + "rxn06001", + "rxn06014", + "rxn06017", + "rxn06021", + "rxn06026", + "rxn06027", + "rxn06034", + "rxn06048", + "rxn06052", + "rxn06053", + "rxn06054", + "rxn06057", + "rxn06059", + "rxn06061", + "rxn06102", + "rxn06103", + "rxn06127", + "rxn06128", + "rxn06129", + "rxn06130", + "rxn06131", + "rxn06132", + "rxn06137", + "rxn06146", + "rxn06161", + "rxn06167", + "rxn06172", + "rxn06174", + "rxn06175", + "rxn06187", + "rxn06189", + "rxn06203", + "rxn06204", + "rxn06246", + "rxn06261", + "rxn06265", + "rxn06266", + "rxn06286", + "rxn06291", + "rxn06294", + "rxn06310", + "rxn06320", + "rxn06327", + "rxn06334", + "rxn06337", + "rxn06339", + "rxn06342", + "rxn06343", + "rxn06350", + "rxn06352", + "rxn06358", + "rxn06361", + "rxn06369", + "rxn06380", + "rxn06395", + "rxn06415", + "rxn06419", + "rxn06420", + "rxn06421", + "rxn06423", + "rxn06450", + "rxn06457", + "rxn06463", + "rxn06464", + "rxn06466", + "rxn06471", + "rxn06482", + "rxn06483", + "rxn06486", + "rxn06492", + "rxn06497", + "rxn06498", + "rxn06501", + "rxn06505", + "rxn06506", + "rxn06521", + "rxn06534", + "rxn06580", + "rxn06585", + "rxn06593", + "rxn06609", + "rxn06613", + "rxn06654", + "rxn06667", + "rxn06676", + "rxn06693", + "rxn06730", + "rxn06746", + "rxn06762", + "rxn06779", + "rxn06790", + "rxn06791", + "rxn06792", + "rxn06793", + "rxn06794", + "rxn06795", + "rxn06796", + "rxn06797", + "rxn06821", + "rxn06826", + "rxn06827", + "rxn06829", + "rxn06839", + "rxn06841", + "rxn06842", + "rxn06851", + "rxn06866", + "rxn06867", + "rxn06873", + "rxn06885", + "rxn06891", + "rxn06892", + "rxn06896", + "rxn06938", + "rxn06939", + "rxn06944", + "rxn06951", + "rxn06952", + "rxn06955", + "rxn06957", + "rxn06960", + "rxn06964", + "rxn06965", + "rxn07086", + "rxn07097", + "rxn07103", + "rxn07104", + "rxn07105", + "rxn07106", + "rxn07107", + "rxn07109", + "rxn07119", + "rxn07179", + "rxn07186", + "rxn07187", + "rxn07188", + "rxn07195", + "rxn07196", + "rxn07197", + "rxn07198", + "rxn07201", + "rxn07205", + "rxn07206", + "rxn07210", + "rxn07244", + "rxn07245", + "rxn07253", + "rxn07275", + "rxn07299", + "rxn07302", + "rxn07651", + "rxn07723", + "rxn07736", + "rxn07878", + "rxn11417", + "rxn11582", + "rxn11593", + "rxn11597", + "rxn11615", + "rxn11617", + "rxn11619", + "rxn11620", + "rxn11624", + "rxn11626", + "rxn11638", + "rxn11648", + "rxn11651", + "rxn11665", + "rxn11666", + "rxn11667", + "rxn11698", + "rxn11983", + "rxn11986", + "rxn11994", + "rxn12006", + "rxn12007", + "rxn12014", + "rxn12017", + "rxn12022", + "rxn12160", + "rxn12161", + "rxn01267", + "rxn05294", + "rxn04656", + ] + for item in blacklist: + if item not in self.blacklist: + self.blacklist.append(item) + self.auto_sink = [] + full_id = re.compile("\d+$") + for id in auto_sink: + if full_id.search(id): + self.auto_sink.append(id) + else: + for i in range(0, 100): + newid = id + str(i) + self.auto_sink.append(newid) + + self.auto_exchange = "e0" + self.sink_compounds = set() + self.demand_compounds = set() + self.exchange_compounds = set() + self.COBRA_0_BOUND = 0 + self.COBRA_DEFAULT_LB = -1000 + self.COBRA_DEFAULT_UB = 1000 + + def media_const_hash(self): + bound_hash = dict() + if not self.media == None: + for compound in self.media.mediacompounds: + bound_hash[compound.id] = { + "lb": -1 * compound.maxFlux, + "ub": -1 * compound.minFlux, + } + return bound_hash + + def apply_media_to_model( + self, media=None, 
+        default_uptake=None, default_excretion=None
+    ):
+        self.media = media
+        if default_uptake is None:
+            default_uptake = self.default_uptake
+        if default_excretion is None:
+            default_excretion = self.default_excretion
+
+        bound_hash = self.media_const_hash()
+        for reaction in self.cobramodel.reactions:
+            if reaction.id[0:3].lower() == "ex_":
+                compound = reaction.id[3:]
+                if compound[-3:] == "_e0":
+                    compound = compound[:-3]
+                if compound in bound_hash:
+                    reaction.lower_bound = bound_hash[compound]["lb"]
+                    reaction.upper_bound = bound_hash[compound]["ub"]
+                else:
+                    reaction.lower_bound = -1 * default_uptake
+                    reaction.upper_bound = default_excretion
+                reaction.update_variable_bounds()
+
+    def add_total_flux_constraints(self, reaction_filter=None):
+        for reaction in self.cobramodel.reactions:
+            if reaction_filter is None or reaction.id in reaction_filter:
+                self.total_flux_variables[reaction.id] = self.cobramodel.problem.Variable(
+                    reaction.id + "_tot", lb=0, ub=self.COBRA_DEFAULT_UB
+                )
+                self.cobramodel.add_cons_vars(self.total_flux_variables[reaction.id])
+                self.total_flux_constraints[reaction.id] = self.cobramodel.problem.Constraint(
+                    reaction.forward_variable
+                    + reaction.reverse_variable
+                    - self.total_flux_variables[reaction.id],
+                    lb=0,
+                    ub=0,
+                    name=reaction.id + "_tot",
+                )
+                self.cobramodel.add_cons_vars(self.total_flux_constraints[reaction.id])
+
+    def add_reversibility_binary_constraints(self, reaction_filter=None):
+        # Adding binary variables plus big-M constraints coupling each binary to
+        # the forward and reverse flux of its reaction
+        for reaction in self.cobramodel.reactions:
+            if reaction.id not in self.reversibility_binary and (
+                reaction_filter is None or reaction.id in reaction_filter
+            ):
+                self.reversibility_binary[reaction.id] = self.cobramodel.problem.Variable(
+                    reaction.id + "_rb", lb=0, ub=1, type="binary"
+                )
+                self.cobramodel.add_cons_vars(self.reversibility_binary[reaction.id])
+                self.reversibility_binary_constraints[reaction.id] = dict()
+                self.reversibility_binary_constraints[reaction.id]["ff"] = self.cobramodel.problem.Constraint(
+                    1000 * self.reversibility_binary[reaction.id] - reaction.forward_variable,
+                    lb=0,
+                    ub=None,
+                    name=reaction.id + "_FB",
+                )
+                self.cobramodel.add_cons_vars(
+                    self.reversibility_binary_constraints[reaction.id]["ff"]
+                )
+                self.reversibility_binary_constraints[reaction.id]["rf"] = self.cobramodel.problem.Constraint(
+                    -1000 * self.reversibility_binary[reaction.id] - reaction.reverse_variable,
+                    lb=-1000,
+                    ub=None,
+                    name=reaction.id + "_RB",
+                )
+                self.cobramodel.add_cons_vars(
+                    self.reversibility_binary_constraints[reaction.id]["rf"]
+                )
+
+    def set_objective_from_target_reaction(self, target_reaction, maximize=1):
+        target_reaction = self.cobramodel.reactions.get_by_id(target_reaction)
+        sense = "max"
+        if maximize == 0:
+            sense = "min"
+        target_objective = self.cobramodel.problem.Objective(
+            1 * target_reaction.flux_expression, direction=sense
+        )
+        self.cobramodel.objective = target_objective
+        return target_reaction
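
The two constraints registered above implement a standard big-M on/off pattern: with M = 1000, the forward flux is forced to zero when the binary is 0, and the reverse flux is forced to zero when it is 1. A minimal standalone sketch of the same pattern with bare optlang objects (all names here are illustrative, not part of modelseedpy):

from optlang import Constraint, Model, Variable

vf = Variable("rxn1_f", lb=0, ub=1000)  # forward flux
vr = Variable("rxn1_r", lb=0, ub=1000)  # reverse flux
rb = Variable("rxn1_rb", lb=0, ub=1, type="binary")
# rb = 1 permits forward flux: 1000 * rb - vf >= 0, i.e. vf <= 1000 * rb
ff = Constraint(1000 * rb - vf, lb=0, name="rxn1_FB")
# rb = 0 permits reverse flux: -1000 * rb - vr >= -1000, i.e. vr <= 1000 * (1 - rb)
rf = Constraint(-1000 * rb - vr, lb=-1000, name="rxn1_RB")
demo = Model(name="reversibility_binary_demo")
demo.add([ff, rf])
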
+    def add_simple_thermo_constraints(self):
+        # Creating potential variables for all compounds
+        for metabolite in self.cobramodel.metabolites:
+            if metabolite.id not in self.potential_variables:
+                self.potential_variables[metabolite.id] = self.cobramodel.problem.Variable(
+                    metabolite.id + "_u", lb=0, ub=1000
+                )
+                self.cobramodel.add_cons_vars(self.potential_variables[metabolite.id])
+        # Adding thermodynamic constraints
+        for reaction in self.cobramodel.reactions:
+            if (
+                reaction.id not in self.simple_thermo_constraints
+                and reaction.id[0:3].lower() != "ex_"
+                and reaction.id[0:3].lower() != "dm_"
+            ):
+                if reaction.id not in self.reversibility_binary:
+                    self.reversibility_binary[reaction.id] = self.cobramodel.problem.Variable(
+                        reaction.id + "_rb", lb=0, ub=1, type="binary"
+                    )
+                    self.cobramodel.add_cons_vars(self.reversibility_binary[reaction.id])
+                    self.reversibility_binary_constraints[reaction.id] = dict()
+                    self.reversibility_binary_constraints[reaction.id]["ff"] = self.cobramodel.problem.Constraint(
+                        1000 * self.reversibility_binary[reaction.id] - reaction.forward_variable,
+                        lb=0,
+                        ub=None,
+                        name=reaction.id + "_FB",
+                    )
+                    self.cobramodel.add_cons_vars(
+                        self.reversibility_binary_constraints[reaction.id]["ff"]
+                    )
+                    self.reversibility_binary_constraints[reaction.id]["rf"] = self.cobramodel.problem.Constraint(
+                        -1000 * self.reversibility_binary[reaction.id] - reaction.reverse_variable,
+                        lb=-1000,
+                        ub=None,
+                        name=reaction.id + "_RB",
+                    )
+                    self.cobramodel.add_cons_vars(
+                        self.reversibility_binary_constraints[reaction.id]["rf"]
+                    )
+                self.simple_thermo_constraints[reaction.id] = self.cobramodel.problem.Constraint(
+                    Zero, lb=0, ub=1000, name=reaction.id + "_therm"
+                )
+                self.cobramodel.add_cons_vars(self.simple_thermo_constraints[reaction.id])
+                self.cobramodel.solver.update()
+                const_coef = {self.reversibility_binary[reaction.id]: 1000}
+                for metabolite in reaction.metabolites:
+                    const_coef[self.potential_variables[metabolite.id]] = reaction.metabolites[
+                        metabolite
+                    ]
+                self.simple_thermo_constraints[reaction.id].set_linear_coefficients(const_coef)
+        # Updating the solver one final time
+        self.cobramodel.solver.update()
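
To unpack the constraint built above: after set_linear_coefficients it reads, for each reaction r, 0 <= 1000 * b_r + sum_j S_rj * u_j <= 1000, where S_rj is the stoichiometric coefficient of compound j in reaction r, u_j is that compound's potential variable, and b_r is the shared reversibility binary. For a simple reaction A -> B this is 0 <= 1000 * b + u_B - u_A <= 1000, so b = 1 (which permits forward flux through the paired big-M constraints) requires u_B <= u_A, and b = 0 (which permits reverse flux) requires u_B >= u_A. Flux can therefore only run "downhill" in potential, which is what excludes thermodynamically infeasible loops.
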
+    def add_intracellular_metabolomics_constraints(self, peakstring, relevant_peaks=None):
+        drain_fluxes = list()
+        peak_array = peakstring.split(";")
+        compound_reactions = dict()
+        reaction_hash = dict()
+        for reaction in self.cobramodel.reactions:
+            reaction_hash[reaction.id] = 1
+            for compound in reaction.metabolites:
+                if compound.id not in compound_reactions:
+                    compound_reactions[compound.id] = dict()
+                compound_reactions[compound.id][reaction.id] = reaction.metabolites[compound]
+        compartment_tag = re.compile(r"_[a-z]\d+$")
+        for peak in peak_array:
+            sub_array = peak.split(":")
+            if len(sub_array) > 2:
+                peakid = sub_array[0]
+                if relevant_peaks is None or peakid in relevant_peaks:
+                    coef = sub_array[1]  # peak coefficient (currently unused below)
+                    peak_coef = dict()
+                    pfound = 0
+                    for i in range(2, len(sub_array)):
+                        compound_list = []
+                        compound = sub_array[i]
+                        if compartment_tag.search(compound):
+                            compound_list = [compound]
+                        else:
+                            for ci in range(0, 1000):
+                                compound_list.append(compound + "_c" + str(ci))
+                        for compound in compound_list:
+                            if compound in compound_reactions:
+                                cfound = 0
+                                compound_coef = dict()
+                                for reaction in compound_reactions[compound]:
+                                    if (
+                                        reaction[0:3].lower() != "ex_"
+                                        and reaction[0:3].lower() != "dm_"
+                                    ):
+                                        cfound = 1
+                                        rxnobj = self.cobramodel.reactions.get_by_id(reaction)
+                                        compound_coef[rxnobj.forward_variable] = 1000
+                                        compound_coef[rxnobj.reverse_variable] = 1000
+                                if cfound == 1:
+                                    if compound not in self.compound_flux_variables:
+                                        self.compound_flux_variables[compound] = self.cobramodel.problem.Variable(
+                                            compound + "_f", lb=0, ub=1
+                                        )
+                                        self.cobramodel.add_cons_vars(
+                                            self.compound_flux_variables[compound]
+                                        )
+                                        self.compound_flux_constraints[compound] = self.cobramodel.problem.Constraint(
+                                            Zero, lb=0, ub=None, name=compound + "_flux"
+                                        )
+                                        self.cobramodel.add_cons_vars(
+                                            self.compound_flux_constraints[compound]
+                                        )
+                                    compound_coef[self.compound_flux_variables[compound]] = -1
+                                    self.cobramodel.solver.update()
+                                    self.compound_flux_constraints[compound].set_linear_coefficients(
+                                        compound_coef
+                                    )
+                                    peak_coef[self.compound_flux_variables[compound]] = 1
+                                    pfound = 1
+                                    drain_reaction = self.helper.add_drain_from_metabolite_id(
+                                        self.cobramodel, compound
+                                    )
+                                    if drain_reaction.id not in self.cobramodel.reactions:
+                                        self.cobramodel.add_reactions([drain_reaction])
+                    if pfound == 1:
+                        if peakid not in self.metabolomics_peak_variables:
+                            self.metabolomics_peak_variables[peakid] = self.cobramodel.problem.Variable(
+                                peakid, lb=0, ub=1
+                            )
+                            self.cobramodel.add_cons_vars(
+                                self.metabolomics_peak_variables[peakid]
+                            )
+                            self.metabolomics_peak_constraints[peakid] = self.cobramodel.problem.Constraint(
+                                Zero, lb=0, ub=None, name=peakid
+                            )
+                            self.cobramodel.add_cons_vars(
+                                self.metabolomics_peak_constraints[peakid]
+                            )
+                        peak_coef[self.metabolomics_peak_variables[peakid]] = -1
+                        self.cobramodel.solver.update()
+                        self.metabolomics_peak_constraints[peakid].set_linear_coefficients(peak_coef)
+
+        return drain_fluxes
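
A hedged example of the peak-string format implied by the parser above: peaks are separated by ";", and each peak is "id:coefficient:compound[:compound...]". Compound ids without a compartment suffix are expanded over _c0 through _c999. Here util stands for a hypothetical KBaseFBAUtilities instance:

peakstring = "peak1:1:cpd00002;peak2:0.5:cpd00067_c0:cpd00009"
drain_fluxes = util.add_intracellular_metabolomics_constraints(peakstring)
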
+    def convert_template_compound(self, template_compound, index, template):
+        base_id = template_compound.id.split("_")[0]
+        base_compound = template.compounds.get_by_id(base_id)
+        new_id = template_compound.id
+        new_id += str(index)
+        compartment = template_compound.templatecompartment_ref.split("/").pop()
+        compartment += str(index)
+
+        met = Metabolite(
+            new_id,
+            formula=base_compound.formula,
+            name=base_compound.name,
+            charge=template_compound.charge,
+            compartment=compartment,
+        )
+
+        met.annotation["sbo"] = "SBO:0000247"  # simple chemical: a simple, non-repetitive chemical entity
+        met.annotation["seed.compound"] = base_id
+        return met
+
+    def convert_template_reaction(self, template_reaction, index, template, for_gapfilling=1):
+        array = template_reaction.id.split("_")
+        base_id = array[0]
+        new_id = template_reaction.id
+        new_id += str(index)
+
+        lower_bound = template_reaction.maxrevflux
+        upper_bound = template_reaction.maxforflux
+
+        direction = template_reaction.GapfillDirection
+        if for_gapfilling == 0:
+            direction = template_reaction.direction
+
+        if direction == ">":
+            lower_bound = 0
+        elif direction == "<":
+            upper_bound = 0
+
+        cobra_reaction = Reaction(
+            new_id,
+            name=template_reaction.name,
+            lower_bound=lower_bound,
+            upper_bound=upper_bound,
+        )
+
+        object_stoichiometry = {}
+        for item in template_reaction.templateReactionReagents:
+            metabolite_id = item["templatecompcompound_ref"].split("/").pop()
+            template_compound = template.compcompounds.get_by_id(metabolite_id)
+            compartment = template_compound.templatecompartment_ref.split("/").pop()
+            if compartment == "e":
+                metabolite_id = metabolite_id + "0"
+            else:
+                metabolite_id = metabolite_id + str(index)
+            metabolite = self.cobramodel.metabolites.get_by_id(metabolite_id)
+            object_stoichiometry[metabolite] = item["coefficient"]
+
+        cobra_reaction.add_metabolites(object_stoichiometry)
+
+        cobra_reaction.annotation["sbo"] = "SBO:0000176"  # biochemical reaction
+        cobra_reaction.annotation["seed.reaction"] = base_id
+
+        return cobra_reaction
+
+    def build_model_extended_for_gapfilling(
+        self,
+        extend_with_template=1,
+        source_models=[],
+        input_templates=[],
+        model_penalty=1,
+        reaction_scores={},
+    ):
+        model_id = self.fbamodel["id"] + ".gf"
+
+        # Determine all indices that should be gapfilled
+        indexlist = [0] * 1000
+        compounds = self.fbamodel["modelcompounds"]
+        for compound in compounds:
+            compartment = compound["modelcompartment_ref"].split("/").pop()
+            basecomp = compartment[0:1]
+            if basecomp != "e":
+                index = compartment[1:]
+                index = int(index)
+                indexlist[index] += 1
+
+        # Iterating over all indices with more than 10 intracellular compounds:
+        gapfilling_penalties = dict()
+        for i in range(0, 1000):
+            if indexlist[i] > 10:
+                if extend_with_template == 1:
+                    new_penalties = self.temp_extend_model_index_for_gapfilling(
+                        i, input_templates
+                    )
+                    gapfilling_penalties.update(new_penalties)
+                if i < len(source_models) and source_models[i] is not None:
+                    new_penalties = self.mdl_extend_model_index_for_gapfilling(
+                        i, source_models[i], model_penalty
+                    )
+                    gapfilling_penalties.update(new_penalties)
+        # Rescaling penalties by reaction scores and saving genes
+        for reaction in gapfilling_penalties:
+            array = reaction.split("_")
+            rxnid = array[0]
+            if rxnid in reaction_scores:
+                highest_score = 0
+                for gene in reaction_scores[rxnid]:
+                    if highest_score < reaction_scores[rxnid][gene]:
+                        highest_score = reaction_scores[rxnid][gene]
+                factor = 1 - 0.9 * highest_score
+                # Note: "reaction" is a string key here, so the penalties are rescaled
+                # in place within gapfilling_penalties
+                if "reverse" in gapfilling_penalties[reaction]:
+                    gapfilling_penalties[reaction]["reverse"] = (
+                        factor * gapfilling_penalties[reaction]["reverse"]
+                    )
+                if "forward" in gapfilling_penalties[reaction]:
+                    gapfilling_penalties[reaction]["forward"] = (
+                        factor * gapfilling_penalties[reaction]["forward"]
+                    )
+        self.cobramodel.solver.update()
+        return gapfilling_penalties
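
A quick worked example of the rescaling rule above: the penalty multiplier is factor = 1 - 0.9 * highest_score, so a reaction with full gene support keeps only 10% of its base penalty (names and values here are illustrative):

reaction_scores = {"rxn00001": {"geneA": 0.5, "geneB": 1.0}}
penalty = 1.0
highest_score = max(reaction_scores["rxn00001"].values())  # 1.0
factor = 1 - 0.9 * highest_score                           # 0.1
print(factor * penalty)  # 0.1: strong gene evidence makes the reaction much cheaper to gapfill
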
+    # Possible new function to add to the KBaseFBAModelToCobraBuilder to extend a model with a source model for gapfilling for a specific index
+    def mdl_extend_model_index_for_gapfilling(self, index, source_model, model_penalty):
+        new_metabolites = {}
+        new_reactions = {}
+        new_exchange = []
+        new_demand = []
+        new_penalties = dict()
+        local_remap = {}
+
+        comp = re.compile(r"(.*_*)(.)\d+$")
+        for modelcompound in source_model.metabolites:
+            cobra_metabolite = self.convert_modelcompound(modelcompound)
+            original_id = cobra_metabolite.id
+            groups = comp.match(cobra_metabolite.compartment)
+            if groups[2] == "e":
+                cobra_metabolite.compartment = groups[1] + groups[2] + "0"
+                groups = comp.match(cobra_metabolite.id)
+                cobra_metabolite.id = groups[1] + groups[2] + "0"
+            else:
+                cobra_metabolite.compartment = groups[1] + groups[2] + str(index)
+                groups = comp.match(cobra_metabolite.id)
+                cobra_metabolite.id = groups[1] + groups[2] + str(index)
+            if (
+                cobra_metabolite.id not in self.cobramodel.metabolites
+                and cobra_metabolite.id not in new_metabolites
+            ):
+                new_metabolites[cobra_metabolite.id] = cobra_metabolite
+                if original_id in self.auto_sink:
+                    self.demand_compounds.add(cobra_metabolite.id)
+                    new_demand.append(cobra_metabolite)
+                if cobra_metabolite.compartment == self.auto_exchange:
+                    self.exchange_compounds.add(cobra_metabolite.id)
+                    new_exchange.append(cobra_metabolite)
+            if cobra_metabolite.id in self.cobramodel.metabolites:
+                cobra_metabolite = self.cobramodel.metabolites.get_by_id(cobra_metabolite.id)
+            else:  # Just in case the same compound is added twice - we want to switch the metabolite to the first new version
+                cobra_metabolite = new_metabolites[cobra_metabolite.id]
+            local_remap[original_id] = cobra_metabolite
+        # Adding all metabolites to model prior to adding reactions
+        self.cobramodel.add_metabolites(new_metabolites.values())
+
+        for modelreaction in source_model.reactions:
+            if modelreaction.id.split("_")[0] in self.blacklist:
+                continue  # was a bare "next", which is a no-op in Python
+            # cobra_reaction = self.convert_modelreaction(modelreaction)
+            cobra_reaction = modelreaction.copy()
+            groups = comp.match(cobra_reaction.id)
+            cobra_reaction.id = groups[1] + groups[2] + str(index)
+            new_penalties[cobra_reaction.id] = dict()
+            # Updating metabolites in reaction to new model
+            metabolites = cobra_reaction.metabolites
+            new_stoichiometry = {}
+            for metabolite in metabolites:
+                # Adding new coefficient:
+                new_stoichiometry[local_remap[metabolite.id]] = metabolites[metabolite]
+                # Zeroing out current coefficients
+                if local_remap[metabolite.id] != metabolite:
+                    new_stoichiometry[metabolite] = 0
+            cobra_reaction.add_metabolites(new_stoichiometry, combine=False)
+            if (
+                cobra_reaction.id not in self.cobramodel.reactions
+                and cobra_reaction.id not in new_reactions
+            ):
+                new_reactions[cobra_reaction.id] = cobra_reaction
+                new_penalties[cobra_reaction.id]["added"] = 1
+                if cobra_reaction.lower_bound < 0:
+                    new_penalties[cobra_reaction.id]["reverse"] = model_penalty
+                if cobra_reaction.upper_bound > 0:
+                    new_penalties[cobra_reaction.id]["forward"] = model_penalty
+            elif (
+                cobra_reaction.lower_bound < 0
+                and self.cobramodel.reactions.get_by_id(cobra_reaction.id).lower_bound == 0
+            ):
+                self.cobramodel.reactions.get_by_id(
+                    cobra_reaction.id
+                ).lower_bound = cobra_reaction.lower_bound
+                self.cobramodel.reactions.get_by_id(cobra_reaction.id).update_variable_bounds()
+                new_penalties[cobra_reaction.id]["reverse"] = model_penalty
+                new_penalties[cobra_reaction.id]["reversed"] = 1
+            elif (
+                cobra_reaction.upper_bound > 0
+                and self.cobramodel.reactions.get_by_id(cobra_reaction.id).upper_bound == 0
+            ):
+                self.cobramodel.reactions.get_by_id(
+                    cobra_reaction.id
+                ).upper_bound = cobra_reaction.upper_bound
+                self.cobramodel.reactions.get_by_id(cobra_reaction.id).update_variable_bounds()
+                new_penalties[cobra_reaction.id]["forward"] = model_penalty
+                new_penalties[cobra_reaction.id]["reversed"] = 1
+
+        # Only run this on new exchanges so we don't re-add existing exchanges
+        for cpd in new_exchange:
+            drain_reaction = self.helper.add_drain_from_metabolite_id(cpd.id)
+            if (
+                drain_reaction.id not in self.cobramodel.reactions
+                and drain_reaction.id not in new_reactions
+            ):
+                new_reactions[drain_reaction.id] = drain_reaction
+
+        # Only run this on new demands so we don't re-add existing demands
+        # (new_demand holds Metabolite objects here, so pass the id)
+        for cpd in new_demand:
+            drain_reaction = self.helper.add_drain_from_metabolite_id(
+                cpd.id,
+                lower_bound=self.COBRA_0_BOUND,
+                upper_bound=self.COBRA_DEFAULT_UB,
+                prefix="DM_",
+                prefix_name="Demand for ",
+                sbo="SBO:0000627",
+            )
+            if (
+                drain_reaction.id not in self.cobramodel.reactions
+                and drain_reaction.id not in new_reactions
+            ):
+                new_reactions[drain_reaction.id] = drain_reaction
+
+        # Adding all new reactions to the model at once (much faster than one at a time)
+        self.cobramodel.add_reactions(new_reactions.values())
+        return new_penalties
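
The shape of the penalty dictionaries returned by the two *_extend_* methods, reconstructed from the assignments above (ids and values are illustrative):

new_penalties = {
    "rxn00001_c0": {"added": 1, "forward": 1, "reverse": 1},  # reaction newly added to the model
    "rxn00002_c0": {"reversed": 1, "reverse": 1},             # existing reaction made reversible
}
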
+    # Possible new function to add to the KBaseFBAModelToCobraBuilder to extend a model with a template for gapfilling for a specific index
+    def temp_extend_model_index_for_gapfilling(self, index, input_templates=[]):
+        new_metabolites = {}
+        new_reactions = {}
+        new_exchange = []
+        new_demand = []
+        new_penalties = dict()
+        template = None
+        if index < len(input_templates):
+            template = input_templates[index]
+        elif index in self.fbamodel["template_refs"]:
+            template = self.kbapi.get_from_ws(self.fbamodel["template_refs"][index])
+        else:
+            template = self.kbapi.get_from_ws(self.fbamodel["template_ref"])
+
+        if template.info.type != "KBaseFBA.NewModelTemplate":
+            raise ObjectError(
+                template.info.type + " loaded when KBaseFBA.NewModelTemplate expected"
+            )
+
+        for template_compound in template.compcompounds:
+            tempindex = index
+            compartment = template_compound.templatecompartment_ref.split("/").pop()
+            if compartment == "e":
+                tempindex = 0
+
+            cobra_metabolite = self.convert_template_compound(
+                template_compound, tempindex, template
+            )
+            if (
+                cobra_metabolite.id not in self.cobramodel.metabolites
+                and cobra_metabolite.id not in new_metabolites
+            ):
+                new_metabolites[cobra_metabolite.id] = cobra_metabolite
+                self.cobramodel.add_metabolites([cobra_metabolite])
+                if cobra_metabolite.id in self.auto_sink:
+                    self.demand_compounds.add(cobra_metabolite.id)
+                    new_demand.append(cobra_metabolite.id)
+                if cobra_metabolite.compartment == self.auto_exchange:
+                    new_exchange.append(cobra_metabolite.id)
+                    self.exchange_compounds.add(cobra_metabolite.id)
+        # Adding all metabolites to model prior to adding reactions
+        self.cobramodel.add_metabolites(new_metabolites.values())
+
+        for template_reaction in template.reactions:
+            if template_reaction.id.split("_")[0] in self.blacklist:
+                continue
+            cobra_reaction = self.convert_template_reaction(
+                template_reaction, index, template, 1
+            )
+            new_penalties[cobra_reaction.id] = dict()
+            if (
+                cobra_reaction.id not in self.cobramodel.reactions
+                and cobra_reaction.id not in new_reactions
+            ):
+                # Adding any template reactions missing from the present model
+                new_reactions[cobra_reaction.id] = cobra_reaction
+                if cobra_reaction.lower_bound < 0:
+                    new_penalties[cobra_reaction.id]["reverse"] = (
+                        template_reaction.base_cost + template_reaction.reverse_penalty
+                    )
+                if cobra_reaction.upper_bound > 0:
+                    new_penalties[cobra_reaction.id]["forward"] = (
+                        template_reaction.base_cost + template_reaction.forward_penalty
+                    )
+                new_penalties[cobra_reaction.id]["added"] = 1
+            elif template_reaction.GapfillDirection == "=":
+                # Adjusting directionality as needed for existing reactions
+                new_penalties[cobra_reaction.id]["reversed"] = 1
+                if self.cobramodel.reactions.get_by_id(cobra_reaction.id).lower_bound == 0:
+                    self.cobramodel.reactions.get_by_id(
+                        cobra_reaction.id
+                    ).lower_bound = template_reaction.maxrevflux
+                    self.cobramodel.reactions.get_by_id(
+                        cobra_reaction.id
+                    ).update_variable_bounds()
+                    new_penalties[cobra_reaction.id]["reverse"] = (
+                        template_reaction.base_cost + template_reaction.reverse_penalty
+                    )
+                if self.cobramodel.reactions.get_by_id(cobra_reaction.id).upper_bound == 0:
+                    self.cobramodel.reactions.get_by_id(
+                        cobra_reaction.id
+                    ).upper_bound = template_reaction.maxforflux
+                    self.cobramodel.reactions.get_by_id(
+                        cobra_reaction.id
+                    ).update_variable_bounds()
+                    new_penalties[cobra_reaction.id]["forward"] = (
+                        template_reaction.base_cost + template_reaction.forward_penalty
+                    )
+
+        # Only run this on new exchanges so we don't re-add existing exchanges
+        for cpd_id in new_exchange:
+            drain_reaction = self.helper.add_drain_from_metabolite_id(cpd_id)
+            if drain_reaction is not None and drain_reaction.id not in new_reactions:
+                new_reactions[drain_reaction.id] = drain_reaction
+
+        # Only run this on new demands so we don't re-add existing demands
+        for cpd_id in new_demand:
+            drain_reaction = self.helper.add_drain_from_metabolite_id(
+                cpd_id, self.COBRA_0_BOUND, self.COBRA_DEFAULT_UB, "DM_", "Demand for "
+            )
+            if drain_reaction is not None and drain_reaction.id not in new_reactions:
+                new_reactions[drain_reaction.id] = drain_reaction
+
+        # Adding all new reactions to the model at once (much faster than one at a time)
+        self.cobramodel.add_reactions(new_reactions.values())
+        return new_penalties
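
For intuition on the template costs used above, each direction of a candidate reaction carries its own gapfilling cost, the template's base cost plus a direction-specific penalty (numbers here are illustrative):

base_cost, forward_penalty, reverse_penalty = 1.0, 0.5, 2.0
forward_cost = base_cost + forward_penalty  # 1.5 to gapfill in the forward direction
reverse_cost = base_cost + reverse_penalty  # 3.0 to gapfill in reverse
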
+    def convert_modelreaction(self, reaction, bigg=False):
+        mr_id = reaction.id
+        name = reaction.name
+        annotation = reaction.annotation
+        lower_bound, upper_bound = reaction.get_reaction_constraints()
+
+        id = build_rxn_id(mr_id)
+        if bigg and "bigg.reaction" in annotation:
+            id = annotation["bigg.reaction"]
+
+        gpr = reaction.get_gpr()
+
+        cobra_reaction = Reaction(
+            id, name=name, lower_bound=lower_bound, upper_bound=upper_bound
+        )
+        cobra_reaction.annotation[self.SBO_ANNOTATION] = "SBO:0000176"  # biochemical reaction
+        cobra_reaction.annotation.update(annotation)
+
+        if id.startswith("rxn"):
+            cobra_reaction.annotation["seed.reaction"] = id.split("_")[0]
+
+        cobra_reaction.add_metabolites(self.convert_modelreaction_stoichiometry(reaction))
+
+        cobra_reaction.gene_reaction_rule = reaction.gene_reaction_rule
+
+        for genes in gpr:
+            for gene in genes:
+                if gene not in self.genes:
+                    self.genes[gene] = gene
+
+        return cobra_reaction
+
+    def convert_modelcompound(self, metabolite, bigg=False):
+        formula = metabolite.formula
+        name = metabolite.name
+        charge = metabolite.charge
+        mc_id = metabolite.id
+        compartment = metabolite.compartment
+        annotation = metabolite.annotation
+
+        id = build_cpd_id(mc_id)
+
+        if bigg and "bigg.metabolite" in annotation:
+            id = annotation["bigg.metabolite"] + "_" + compartment
+            # print(id)
+
+        met = Metabolite(
+            id, formula=formula, name=name, charge=charge, compartment=compartment
+        )
+
+        met.annotation[self.SBO_ANNOTATION] = "SBO:0000247"  # simple chemical: a simple, non-repetitive chemical entity
+        if id.startswith("cpd"):
+            met.annotation["seed.compound"] = id.split("_")[0]
+        met.annotation.update(annotation)
+        return met
+
+    def convert_modelreaction_stoichiometry(self, reaction):
+        object_stoichiometry = {}
+        s = reaction.stoichiometry
+        for metabolite_id in s:
+            if metabolite_id in self.metabolites_remap:
+                object_stoichiometry[
+                    self.cobramodel.metabolites.get_by_id(
+                        self.metabolites_remap[metabolite_id]
+                    )
+                ] = s[metabolite_id]
+        return object_stoichiometry
+
+    def create_binary_variables(self, rxnobj, forward=1, reverse=1):
+        if rxnobj.id not in self.binary_flux_variables:
+            self.binary_flux_variables[rxnobj.id] = dict()
+            self.binary_flux_constraints[rxnobj.id] = dict()
+        if (
+            forward == 1
+            and rxnobj.upper_bound > 0
+            and "forward" not in self.binary_flux_variables[rxnobj.id]
+        ):
+            self.binary_flux_variables[rxnobj.id]["forward"] = self.cobramodel.problem.Variable(
+                rxnobj.id + "_fb", lb=0, ub=1, type="binary"
+            )
+            self.cobramodel.add_cons_vars(self.binary_flux_variables[rxnobj.id]["forward"])
+            self.binary_flux_constraints[rxnobj.id]["forward"] = self.cobramodel.problem.Constraint(
+                1000 * self.binary_flux_variables[rxnobj.id]["forward"]
+                - rxnobj.forward_variable,
+                lb=0,
+                ub=None,
+                name=rxnobj.id + "_fb",
+            )
+            self.cobramodel.add_cons_vars(self.binary_flux_constraints[rxnobj.id]["forward"])
+        if (
+            reverse == 1
+            and rxnobj.lower_bound < 0
+            and "reverse" not in self.binary_flux_variables[rxnobj.id]
+        ):
+            self.binary_flux_variables[rxnobj.id]["reverse"] = self.cobramodel.problem.Variable(
+                rxnobj.id + "_bb", lb=0, ub=1, type="binary"
+            )
+            self.cobramodel.add_cons_vars(self.binary_flux_variables[rxnobj.id]["reverse"])
+            self.binary_flux_constraints[rxnobj.id]["reverse"] = self.cobramodel.problem.Constraint(
+                1000 * self.binary_flux_variables[rxnobj.id]["reverse"]
+                - rxnobj.reverse_variable,  # the reverse binary must bound the reverse flux
+                lb=0,
+                ub=None,
+                name=rxnobj.id + "_bb",
+            )
+            self.cobramodel.add_cons_vars(self.binary_flux_constraints[rxnobj.id]["reverse"])
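
The method below, binary_check_gapfilling_solution, uses these indicators to extract a minimal gapfilling: because each big-M constraint forces a binary to 1 whenever its flux variable is positive, minimizing the sum of binaries selects the smallest set of active gapfilled reactions. A hedged sketch of that objective, where model is a cobra.Model and use_indicators stands for the binary variables created above (both names are illustrative):

from optlang.symbolics import Zero

objective = model.problem.Objective(Zero, direction="min")
model.objective = objective  # attach the objective before setting coefficients
objective.set_linear_coefficients({b: 1 for b in use_indicators})
solution = model.optimize()
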
+    def binary_check_gapfilling_solution(
+        self, gapfilling_penalties, add_solution_exclusion_constraint
+    ):
+        objcoef = {}
+        flux_values = self.compute_flux_values_from_variables()
+        for rxnobj in self.cobramodel.reactions:
+            if rxnobj.id in gapfilling_penalties:
+                if (
+                    "reverse" in gapfilling_penalties[rxnobj.id]
+                    and flux_values[rxnobj.id]["reverse"] > Zero
+                ):
+                    self.create_binary_variables(rxnobj, 0, 1)
+                    objcoef[self.binary_flux_variables[rxnobj.id]["reverse"]] = 1
+                if (
+                    "forward" in gapfilling_penalties[rxnobj.id]
+                    and flux_values[rxnobj.id]["forward"] > Zero
+                ):
+                    self.create_binary_variables(rxnobj, 1, 0)
+                    objcoef[self.binary_flux_variables[rxnobj.id]["forward"]] = 1
+        with self.cobramodel:
+            # Setting all gapfilled reactions not in the solution to zero
+            min_reaction_objective = self.cobramodel.problem.Objective(Zero, direction="min")
+            for rxnobj in self.cobramodel.reactions:
+                if rxnobj.id in gapfilling_penalties:
+                    if (
+                        "reverse" in gapfilling_penalties[rxnobj.id]
+                        and flux_values[rxnobj.id]["reverse"] <= Zero
+                    ):
+                        rxnobj.lower_bound = 0
+                    if (
+                        "forward" in gapfilling_penalties[rxnobj.id]
+                        and flux_values[rxnobj.id]["forward"] <= Zero
+                    ):
+                        rxnobj.upper_bound = 0
+                    rxnobj.update_variable_bounds()
+            # Setting the objective to be minimization of sum of binary variables
+            self.cobramodel.objective = min_reaction_objective
+            min_reaction_objective.set_linear_coefficients(objcoef)
+            with open("GapfillBinary.lp", "w") as out:
+                out.write(str(self.cobramodel.solver))
+            self.cobramodel.optimize()
+            flux_values = self.compute_flux_values_from_variables()
+        if add_solution_exclusion_constraint == 1:
+            self.add_binary_solution_exclusion_constraint(flux_values)
+        return flux_values
+
+    # Adds a constraint that eliminates a gapfilled solution from feasibility so a new solution can be obtained
+    def add_binary_solution_exclusion_constraint(self, flux_values):
+        count = len(self.solution_exclusion_constraints)
+        solution_coef = {}
+        solution_size = 0
+        for reaction in self.binary_flux_variables:
+            for direction in self.binary_flux_variables[reaction]:
+                if flux_values[reaction][direction] > Zero:
+                    solution_size += 1
+                    solution_coef[self.binary_flux_variables[reaction][direction]] = 1
+        if len(solution_coef) > 0:
+            new_exclusion_constraint = self.cobramodel.problem.Constraint(
+                Zero,
+                lb=None,
+                ub=(solution_size - 1),
+                name="exclusion." + str(count + 1),
+            )
+            self.cobramodel.add_cons_vars(new_exclusion_constraint)
+            self.cobramodel.solver.update()
+            new_exclusion_constraint.set_linear_coefficients(solution_coef)
+            self.solution_exclusion_constraints.append(new_exclusion_constraint)
+            return new_exclusion_constraint
+        return None
+
+    # Takes gapfilling penalties and creates an objective function minimizing gapfilled reactions
+    def create_minimal_reaction_objective(self, penalty_hash, default_penalty=0):
+        reaction_objective = self.cobramodel.problem.Objective(Zero, direction="min")
+        obj_coef = dict()
+        for reaction in self.cobramodel.reactions:
+            if reaction.id in penalty_hash:
+                # Minimizing gapfilled reactions
+                if "reverse" in penalty_hash[reaction.id]:
+                    obj_coef[reaction.reverse_variable] = abs(
+                        penalty_hash[reaction.id]["reverse"]
+                    )
+                elif default_penalty != 0:
+                    obj_coef[reaction.reverse_variable] = default_penalty
+                if "forward" in penalty_hash[reaction.id]:
+                    obj_coef[reaction.forward_variable] = abs(
+                        penalty_hash[reaction.id]["forward"]
+                    )
+                elif default_penalty != 0:
+                    obj_coef[reaction.forward_variable] = default_penalty
+            else:
+                obj_coef[reaction.forward_variable] = default_penalty
+                obj_coef[reaction.reverse_variable] = default_penalty
+
+        self.cobramodel.objective = reaction_objective
+        reaction_objective.set_linear_coefficients(obj_coef)
+
+    # This function is required to add gapfilled compounds to a KBase model so the gapfilled model can be saved
+    def convert_cobra_compound_to_kbcompound(self, cpd, kbmodel, add_to_model=1):
+        refid = "cpd00000"
+        if re.search(r"cpd\d+_[a-z]+", cpd.id):
+            refid = cpd.id
+            refid = re.sub(r"_[a-z]\d+$", "", refid)
+        cpd_data = {
+            "aliases": [],
+            "charge": cpd.charge,
+            "compound_ref": "~/template/compounds/id/" + refid,
+            "dblinks": {},
+            "formula": cpd.formula,
+            "id": cpd.id,
+            "inchikey": "ALYNCZNDIQEVRV-UHFFFAOYSA-M",  # note: hardcoded placeholder value
+            "modelcompartment_ref": "~/modelcompartments/id/" + cpd.id.split("_").pop(),
+            "name": cpd.name,  # cobra Metabolite.name is an attribute, not a method
+            "numerical_attributes": {},
+            "string_attributes": {},
+        }
+        cpd_data = AttrDict(cpd_data)
+        if add_to_model == 1:
+            kbmodel.modelcompounds.append(cpd_data)
+        return cpd_data
"gapfill_data": {}, + "maxforflux": 1000000, + "maxrevflux": 1000000, + "modelReactionProteins": [], + "modelReactionReagents": [], + "modelcompartment_ref": "~/modelcompartments/id/" + rxn.id.split("_").pop(), + "name": rxn.name, + "numerical_attributes": {}, + "probability": 0, + "protons": 0, + "reaction_ref": rxnref, + "string_attributes": {}, + } + rxn_data = AttrDict(rxn_data) + for cpd in rxn.metabolites: + if cpd.id not in kbmodel.modelcompounds: + convert_cobra_compound_to_kbcompound(cpd, kbmodel, 1) + rxn_data.modelReactionReagents.append( + { + "coefficient": rxn.metabolites[cpd], + "modelcompound_ref": "~/modelcompounds/id/" + cpd.id, + } + ) + if add_to_model == 1: + kbmodel.modelreactions.append(rxn_data) + return rxn_data + + def convert_objective_to_constraint(self, lower_bound, upper_bound): + old_obj_variable = self.cobramodel.problem.Variable( + name="old_objective_variable", lb=lower_bound, ub=upper_bound + ) + old_obj_constraint = self.cobramodel.problem.Constraint( + self.cobramodel.solver.objective.expression - old_obj_variable, + lb=0, + ub=0, + name="old_objective_constraint", + ) + self.cobramodel.add_cons_vars([old_obj_variable, old_obj_constraint]) + + def compute_flux_values_from_variables(self): + flux_values = {} + for rxnobj in self.cobramodel.reactions: + flux_values[rxnobj.id] = {} + flux_values[rxnobj.id]["reverse"] = rxnobj.reverse_variable.primal + flux_values[rxnobj.id]["forward"] = rxnobj.forward_variable.primal + return flux_values + + def compute_gapfilled_solution(self, penalties, flux_values=None): + if flux_values == None: + flux_values = self.compute_flux_values_from_variables() + output = {"reversed": {}, "new": {}} + for reaction in self.cobramodel.reactions: + if reaction.id in penalties: + if ( + flux_values[reaction.id]["forward"] > Zero + and "forward" in penalties[reaction.id] + ): + if "added" in penalties[reaction.id]: + output["new"][reaction.id] = ">" + else: + output["reversed"][reaction.id] = ">" + elif ( + flux_values[reaction.id]["reverse"] > Zero + and "reverse" in penalties[reaction.id] + ): + if "added" in penalties[reaction.id]: + output["new"][reaction.id] = "<" + else: + output["reversed"][reaction.id] = "<" + return output + + def add_gapfilling_solution_to_kbase_model(self, newmodel, penalties, media_ref): + gfid = None + if gfid == None: + largest_index = 0 + for gapfilling in newmodel.gapfillings: + current_index = gapfilling.id.split(".").pop() + if largest_index == 0 or largest_index < current_index: + largest_index = current_index + gfid = "gf." + str(largest_index + 1) + newmodel.gapfillings.append( + { + "gapfill_id": newmodel.id + "." 
+    def add_gapfilling_solution_to_kbase_model(self, newmodel, penalties, media_ref):
+        gfid = None
+        if gfid is None:
+            largest_index = 0
+            for gapfilling in newmodel.gapfillings:
+                current_index = int(gapfilling.id.split(".").pop())
+                if largest_index == 0 or largest_index < current_index:
+                    largest_index = current_index
+            gfid = "gf." + str(largest_index + 1)
+        newmodel.gapfillings.append(
+            {
+                "gapfill_id": newmodel.id + "." + gfid,
+                "id": gfid,
+                "integrated": 1,
+                "integrated_solution": "0",
+                "media_ref": media_ref,
+            }
+        )
+        for reaction in self.cobramodel.reactions:
+            if reaction.id in penalties:
+                if (
+                    reaction.forward_variable.primal > Zero
+                    and "forward" in penalties[reaction.id]
+                ):
+                    if reaction.id not in newmodel.modelreactions:
+                        self.convert_cobra_reaction_to_kbreaction(reaction, newmodel, ">", 1)
+                    gfrxn = newmodel.modelreactions.get_by_id(reaction.id)
+                    gfrxn.gapfill_data[gfid] = dict()
+                    gfrxn.gapfill_data[gfid]["0"] = [">", 1, []]
+                elif (
+                    reaction.reverse_variable.primal > Zero
+                    and "reverse" in penalties[reaction.id]
+                ):
+                    if reaction.id not in newmodel.modelreactions:
+                        self.convert_cobra_reaction_to_kbreaction(reaction, newmodel, "<", 1)
+                    gfrxn = newmodel.modelreactions.get_by_id(reaction.id)
+                    gfrxn.gapfill_data[gfid] = dict()
+                    gfrxn.gapfill_data[gfid]["0"] = ["<", 1, []]
+
+    def compute_reaction_scores(self, weigh_all_events_equally=1, weights=None):
+        reaction_genes = {}
+        if "genome_ref" in self.fbamodel:
+            anno_api = annotation_ontology_api()
+            events = anno_api.get_annotation_ontology_events(
+                {
+                    "input_ref": self.fbamodel["genome_ref"],
+                }
+            )
+            for event in events:
+                for gene in event["ontology_terms"]:
+                    if "modelseed_ids" in event["ontology_terms"][gene]:
+                        for rxn in event["ontology_terms"][gene]["modelseed_ids"]:
+                            newrxn = re.sub("^MSRXN:", "", rxn)
+                            if newrxn not in reaction_genes:
+                                reaction_genes[newrxn] = {}
+                            if gene not in reaction_genes[newrxn]:
+                                reaction_genes[newrxn][gene] = 0
+                            if weigh_all_events_equally == 1 or weights is None:
+                                reaction_genes[newrxn][gene] += 1
+                            elif event["description"] in weights:
+                                reaction_genes[newrxn][gene] += weights[event["description"]]
+                            elif event["event_id"] in weights:
+                                reaction_genes[newrxn][gene] += weights[event["event_id"]]
+                            elif event["id"] in weights:
+                                reaction_genes[newrxn][gene] += weights[event["id"]]
+        return reaction_genes
+
+    def replicate_model(self, count):
+        newmodel = Model(self.cobramodel.id + "_rep" + str(count))
+        utilities = KBaseFBAUtilities(
+            newmodel,
+            newmodel,
+            self.kbapi,
+            self.media,
+            default_uptake=self.default_uptake,
+            default_excretion=self.default_excretion,
+            blacklist=self.blacklist,
+        )
+        metabolites = []
+        reactions = []
+        metabolite_hash = {}
+        for i in range(0, count):
+            for metabolite in self.cobramodel.metabolites:
+                metabolite = metabolite.copy()
+                metabolite.id = metabolite.id + "__" + str(i)
+                metabolite_hash[metabolite.id] = metabolite
+                metabolites.append(metabolite)
+            for reaction in self.cobramodel.reactions:
+                reaction = reaction.copy()
+                reaction.id = reaction.id + "__" + str(i)
+                input_metabolites = {}
+                for metabolite in reaction.metabolites:
+                    newid = metabolite.id + "__" + str(i)
+                    input_metabolites[metabolite_hash[newid]] = reaction.metabolites[metabolite]
+                reaction.add_metabolites(input_metabolites, combine=False)
+                reactions.append(reaction)
+        newmodel.add_metabolites(metabolites)
+        newmodel.add_reactions(reactions)
+        return utilities
diff --git a/modelseedpy/core/fbahelper.py b/modelseedpy/core/fbahelper.py
index be17ec8a..502611d9 100644
--- a/modelseedpy/core/fbahelper.py
+++ b/modelseedpy/core/fbahelper.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import
-
 import logging
 from chemicals import periodic_table
 import re
@@ -12,7 +11,6 @@
 )  # !!! Gene, Metabolite, and Model are never used
 from cobra.util import solver as sutil  # !!!
sutil is never used import time -from modelseedpy.biochem import from_local from scipy.odr.odrpack import Output # !!! Output is never used from chemw import ChemMW from warnings import warn @@ -117,16 +115,26 @@ def modelseed_id_from_cobra_reaction(reaction): @staticmethod def metabolite_mw(metabolite): + fixed_masses = {"cpd11416": 1, "cpd17041": 0, "cpd17042": 0, "cpd17043": 0} + msid = FBAHelper.modelseed_id_from_cobra_metabolite(metabolite) + if msid in fixed_masses: + return fixed_masses[msid] + if not metabolite.formula: + return 0 + formula = re.sub("R\d*", "", metabolite.formula) try: - chem_mw = ChemMW() - chem_mw.mass(metabolite.formula) + chem_mw = ChemMW(printing=False) + chem_mw.mass(formula) return chem_mw.raw_mw except: - warn( + logger.warn( "The compound " + metabolite.id - + " possesses an unconventional formula {metabolite.formula}; hence, the MW cannot be computed." + + " possesses an unconventional formula " + + metabolite.formula + + "; hence, the MW cannot be computed." ) + return 0 @staticmethod def elemental_mass(): @@ -134,6 +142,8 @@ def elemental_mass(): @staticmethod def get_modelseed_db_api(modelseed_path): + from modelseedpy.biochem import from_local + return from_local(modelseed_path) @staticmethod @@ -171,7 +181,7 @@ def msid_hash(model): output = {} for met in model.metabolites: msid = FBAHelper.modelseed_id_from_cobra_metabolite(met) - if msid != None: + if msid is not None: if msid not in output: output[msid] = [] output[msid].append(met) @@ -265,6 +275,11 @@ def parse_id(cobra_obj): return (m[1], m[2], int(m[3])) return None + @staticmethod + def id_from_ref(ref): + array = ref.split("/") + return array[-1] + @staticmethod def medianame(media): if media == None: @@ -281,6 +296,54 @@ def validate_dictionary(dictionary, required_keys, optional_keys={}): dictionary[key] = optional_keys[key] return dictionary + @staticmethod + def parse_media(media): + return [cpd.id for cpd in media.data["mediacompounds"]] + + def get_reframed_model( + kbase_model, + ): + from reframed import from_cobrapy + + reframed_model = from_cobrapy(kbase_model) + if hasattr(kbase_model, "id"): + reframed_model.id = kbase_model.id + reframed_model.compartments.e0.external = True + return reframed_model + + @staticmethod + def add_vars_cons(model, vars_cons): + model.add_cons_vars(vars_cons) + model.solver.update() + return model + + @staticmethod + def update_model_media(model, media): + medium = {} + model_reactions = [rxn.id for rxn in model.reactions] + for cpd in media.data["mediacompounds"]: + ex_rxn = f"EX_{cpd.id}" + if ex_rxn not in model_reactions: + model.add_boundary( + metabolite=Metabolite(id=cpd.id, name=cpd.name, compartment="e0"), + type="exchange", + lb=cpd.minFlux, + ub=cpd.maxFlux, + ) + medium[ex_rxn] = cpd.maxFlux + model.medium = medium + return model + + @staticmethod + def filter_cobra_set(cobra_set): + unique_ids = set(obj.id for obj in cobra_set) + unique_objs = set() + for obj in cobra_set: + if obj.id in unique_ids: + unique_objs.add(obj) + unique_ids.remove(obj.id) + return unique_objs + @staticmethod def get_reframed_model( kbase_model, diff --git a/modelseedpy/core/gapfillinghelper.py b/modelseedpy/core/gapfillinghelper.py index 6c5d6afc..ed21fb00 100644 --- a/modelseedpy/core/gapfillinghelper.py +++ b/modelseedpy/core/gapfillinghelper.py @@ -1196,7 +1196,6 @@ def replicate_model(self, count): def test_reaction_additions_againt_limits(self, reactions, directions, tests): filtered_rxn = [] filtered_direction = [] - # Using "with" to ensure we don't 
alter the model with these tests model = self.cobramodel with model: diff --git a/modelseedpy/core/msatpcorrection.py b/modelseedpy/core/msatpcorrection.py index 1f91ac51..847fd1c0 100644 --- a/modelseedpy/core/msatpcorrection.py +++ b/modelseedpy/core/msatpcorrection.py @@ -1,9 +1,12 @@ # -*- coding: utf-8 -*- import logging -import itertools import cobra +import copy import json import time +import pandas as pd +from os.path import abspath as _abspath +from os.path import dirname as _dirname from optlang.symbolics import Zero, add from modelseedpy.core.rast_client import RastClient from modelseedpy.core.msgenome import normalize_role @@ -13,11 +16,42 @@ ) from cobra.core import Gene, Metabolite, Model, Reaction from modelseedpy.core.msmodelutl import MSModelUtil +from modelseedpy.core.mstemplate import MSTemplateBuilder from modelseedpy.core import FBAHelper, MSGapfill, MSMedia from modelseedpy.fbapkg.mspackagemanager import MSPackageManager +from modelseedpy.helpers import get_template logger = logging.getLogger(__name__) - +logger.setLevel( + logging.INFO +) # When debugging - set this to INFO then change needed messages below from DEBUG to INFO + +_path = _dirname(_abspath(__file__)) + +min_gap = { + "Glc.O2": 5, + "Etho.O2": 0.01, + "Ac.O2": 1, + "Pyr.O2": 3, + "Glyc.O2": 2, + "Fum.O2": 3, + "Succ.O2": 2, + "Akg.O2": 2, + "LLac.O2": 2, + "Dlac.O2": 2, + "For.O2": 1.875, + "For.NO3": 1.5, + "Pyr.NO": 2.5, + "Pyr.NO2": 2.5, + "Pyr.NO3": 2.5, + "Pyr.SO4": 2.5, +} + +default_threshold_multipiers = { + "Pyr": 2, + "Glc": 2, + "default": 1.2, +} class MSATPCorrection: @@ -25,57 +59,136 @@ class MSATPCorrection: def __init__( self, - model, - core_template, - atp_medias: list, + model_or_mdlutl, + core_template=None, + atp_medias=[], compartment="c0", - max_gapfilling=None, + max_gapfilling=10, gapfilling_delta=0, atp_hydrolysis_id=None, + load_default_medias=True, + forced_media=[], + default_media_path=None, ): """ - :param model: :param core_template: - :param atp_medias: - :param atp_objective: - :param max_gapfilling: - :param gapfilling_delta: - :param atp_hydrolysis_id: ATP Hydrolysis reaction ID, if None it will perform a SEED reaction search + :param atp_medias: list : list of additional medias to test + :param load_default_medias: Bool : load default media set + :param forced_media: list : name of medias in which ATP production should be forced + :param compartment: string : ID of compartment to test ATP in + :param max_gapfilling: string : maximum gapfilling allowed in accepted media + :param gapfilling_delta: string : difference between lowest gapfilling and current gapfilling where media will be accepted + :param atp_hydrolysis_id: string : ATP Hydrolysis reaction ID, if None it will perform a SEED reaction search """ - if isinstance(model, MSModelUtil): - self.model = model.model - self.modelutl = model + # Discerning input is model or mdlutl and setting internal links + if isinstance(model_or_mdlutl, MSModelUtil): + self.model = model_or_mdlutl.model + self.modelutl = model_or_mdlutl else: - self.model = model - self.modelutl = MSModelUtil(model) + self.model = model_or_mdlutl + self.modelutl = MSModelUtil.get(model_or_mdlutl) + # Setting atpcorrection attribute in model utl so link is bidirectional + self.modelutl.atputl = self + + if default_media_path: + self.default_media_path = default_media_path + else: + self.default_media_path = _path + "/../data/atp_medias.tsv" + self.compartment = compartment + if atp_hydrolysis_id and atp_hydrolysis_id in self.model.reactions: 
self.atp_hydrolysis = self.model.reactions.get_by_id(atp_hydrolysis_id) else: output = self.modelutl.add_atp_hydrolysis(compartment) self.atp_hydrolysis = output["reaction"] + + self.media_hash = {} self.atp_medias = [] - for media in atp_medias: - if isinstance(media, MSMedia): - self.atp_medias.append([media, 0.01]) - else: - self.atp_medias.append(media) + if load_default_medias: + self.load_default_medias() + + self.forced_media = [] + for media_id in forced_media: + for item in self.atp_medias: + if item[0].id == media_id: + print("Forced media: " + media_id) + self.forced_media.append(item[0]) + break + self.max_gapfilling = max_gapfilling self.gapfilling_delta = gapfilling_delta - self.coretemplate = core_template - self.msgapfill = MSGapfill( - self.modelutl, default_gapfill_templates=core_template - ) + + if not core_template: + self.load_default_template() + else: + self.coretemplate = core_template + + # These should stay as None until atp correction is actually run + self.msgapfill = None + self.cumulative_core_gapfilling = None + self.selected_media = None self.original_bounds = {} self.noncore_reactions = [] self.other_compartments = [] self.media_gapfill_stats = {} - self.selected_media = [] self.filtered_noncore = [] self.lp_filename = None self.multiplier = 1.2 + def get_msgapfill(self): + if self.msgapfill is None: + self.msgapfill = MSGapfill( + self.modelutl, + default_gapfill_templates=[self.coretemplate], + default_target=self.atp_hydrolysis.id, + ) + return self.msgapfill + + def load_default_template(self): + self.coretemplate = MSTemplateBuilder.from_dict( + get_template("template_core"), None + ).build() + + def load_default_medias(self): + filename = self.default_media_path + medias = pd.read_csv(filename, sep="\t", index_col=0).to_dict() + for media_id in medias: + media_d = {} + for exchange, v in medias[media_id].items(): + if v > 0: + k = exchange.split("_")[1] + media_d[k] = v + media_d["cpd00001"] = 1000 + media_d["cpd00067"] = 1000 + media = MSMedia.from_dict(media_d) + media.id = media_id + media.name = media_id + self.atp_medias.append([media, min_gap.get(media_id, 0.01)]) + + media_ids = set() + temp_medias = self.atp_medias + self.atp_medias = [] + for media in temp_medias: + if isinstance(media, list): + if media[0].id in media_ids: + raise ValueError("media ids not unique") + media_ids.add(media[0].id) + self.atp_medias.append(media) + self.media_hash[media[0].id] = media[0] + else: + if media.id in media_ids: + raise ValueError("media ids not unique") + media_ids.add(media.id) + self.atp_medias.append([media, 0.01]) + self.media_hash[media.id] = media + if "empty" not in self.media_hash: + media = MSMedia.from_dict({}) + media.id = "empty" + media.name = "empty" + self.media_hash[media.id] = media + @staticmethod def find_reaction_in_template(model_reaction, template, compartment): template_reaction = None # we save lookup result here @@ -126,6 +239,7 @@ def disable_noncore_reactions(self): self.other_compartments = [] # Iterating through reactions and disabling for reaction in self.model.reactions: + gfrxn = self.get_msgapfill().gfmodel.reactions.get_by_id(reaction.id) if reaction.id == self.atp_hydrolysis.id: continue if FBAHelper.is_ex(reaction): @@ -150,10 +264,12 @@ def disable_noncore_reactions(self): logger.debug(reaction.id + " core but reversible") self.noncore_reactions.append([reaction, "<"]) reaction.lower_bound = 0 + gfrxn.lower_bound = 0 if reaction.upper_bound > 0 and template_reaction.upper_bound <= 0: logger.debug(reaction.id + " 
core but reversible") self.noncore_reactions.append([reaction, ">"]) reaction.upper_bound = 0 + gfrxn.upper_bound = 0 else: logger.debug(f"{reaction.id} non core") if FBAHelper.rxn_compartment(reaction) != self.compartment: @@ -168,8 +284,10 @@ def disable_noncore_reactions(self): self.noncore_reactions.append([reaction, ">"]) reaction.lower_bound = 0 reaction.upper_bound = 0 + gfrxn.lower_bound = 0 + gfrxn.upper_bound = 0 - def evaluate_growth_media(self): + def evaluate_growth_media(self,no_gapfilling=False): """ Determines how much gap filling each input test media requires to make ATP @@ -177,20 +295,18 @@ def evaluate_growth_media(self): """ self.disable_noncore_reactions() self.media_gapfill_stats = {} - self.msgapfill.default_gapfill_templates = [self.coretemplate] + self.get_msgapfill().default_gapfill_templates = [self.coretemplate] if self.lp_filename: - self.msgapfill.lp_filename = self.lp_filename + self.get_msgapfill().lp_filename = self.lp_filename output = {} with self.model: self.model.objective = self.atp_hydrolysis.id - # self.model.objective = self.model.problem.Objective(Zero,direction="max") - - logger.debug( - f"ATP bounds: ({self.atp_hydrolysis.lower_bound}, {self.atp_hydrolysis.upper_bound})" - ) - # self.model.objective.set_linear_coefficients({self.atp_hydrolysis.forward_variable:1}) pkgmgr = MSPackageManager.get_pkg_mgr(self.model) + # First prescreening model for ATP production without gapfilling + media_list = [] + min_objectives = {} for media, minimum_obj in self.atp_medias: + logger.debug("evaluate media %s", media) pkgmgr.getpkg("KBaseMediaPkg").build_package(media) logger.debug("model.medium %s", self.model.medium) @@ -202,99 +318,138 @@ def evaluate_growth_media(self): solution.status, ) self.media_gapfill_stats[media] = None + output[media.id] = solution.objective_value + if ( solution.objective_value < minimum_obj or solution.status != "optimal" ): - self.media_gapfill_stats[media] = self.msgapfill.run_gapfilling( - media, self.atp_hydrolysis.id, minimum_obj - ) - # IF gapfilling fails - need to activate and penalize the noncore and try again + media_list.append(media) + min_objectives[media] = minimum_obj elif solution.objective_value >= minimum_obj: self.media_gapfill_stats[media] = {"reversed": {}, "new": {}} - logger.debug( - "gapfilling stats: %s", - json.dumps(self.media_gapfill_stats[media], indent=2), + + # Now running gapfilling on all conditions where initially there was no growth + if not no_gapfilling: + all_solutions = self.get_msgapfill().run_multi_gapfill( + media_list, + target=self.atp_hydrolysis.id, + minimum_objectives=min_objectives, + prefilter=False, + check_for_growth=False, + gapfilling_mode="Independent", + run_sensitivity_analysis=False, + integrate_solutions=False, ) + print(str(all_solutions)) + # Adding the new solutions to the media gapfill stats + for media in all_solutions: + self.media_gapfill_stats[media] = all_solutions[media] if MSATPCorrection.DEBUG: + export_data = {} + for media in self.media_gapfill_stats: + export_data[media.id] = self.media_gapfill_stats[media] with open("debug.json", "w") as outfile: - json.dump(self.media_gapfill_stats[media], outfile) + json.dump(export_data, outfile) return output - def determine_growth_media(self): + def determine_growth_media(self, max_gapfilling=None): """ Decides which of the test media to use as growth conditions for this model :return: """ + atp_att = {"tests": {}, "selected_media": {}, "core_atp_gapfilling": {}} self.selected_media = [] best_score = None for media in 
-    def determine_growth_media(self):
+    def determine_growth_media(self, max_gapfilling=None):
         """
         Decides which of the test media to use as growth conditions for this model
         :return:
         """
+        atp_att = {"tests": {}, "selected_media": {}, "core_atp_gapfilling": {}}
         self.selected_media = []
         best_score = None
         for media in self.media_gapfill_stats:
-            gfscore = 0
+            atp_att["core_atp_gapfilling"][media.id] = {
+                "score": 0,
+                "new": {},
+                "reversed": {},
+            }
             if self.media_gapfill_stats[media]:
-                gfscore = len(
+                atp_att["core_atp_gapfilling"][media.id]["score"] = len(
                     self.media_gapfill_stats[media]["new"].keys()
                 ) + 0.5 * len(self.media_gapfill_stats[media]["reversed"].keys())
-                if best_score is None or gfscore < best_score:
-                    best_score = gfscore
+                atp_att["core_atp_gapfilling"][media.id]["new"] = self.media_gapfill_stats[
+                    media
+                ]["new"]
+                atp_att["core_atp_gapfilling"][media.id]["reversed"] = self.media_gapfill_stats[
+                    media
+                ]["reversed"]
+            else:
+                atp_att["core_atp_gapfilling"][media.id] = {
+                    "score": 1000,
+                    "failed": True,
+                }
+            if (
+                best_score is None
+                or atp_att["core_atp_gapfilling"][media.id]["score"] < best_score
+            ):
+                best_score = atp_att["core_atp_gapfilling"][media.id]["score"]
+
         if self.max_gapfilling is None:
             self.max_gapfilling = best_score
-        logger.debug(f"max_gapfilling: {self.max_gapfilling}, best_score: {best_score}")
+        logger.info(f"max_gapfilling: {self.max_gapfilling}, best_score: {best_score}")
         for media in self.media_gapfill_stats:
-            gfscore = 0
-            if self.media_gapfill_stats[media]:
-                gfscore = len(
-                    self.media_gapfill_stats[media]["new"].keys()
-                ) + 0.5 * len(self.media_gapfill_stats[media]["reversed"].keys())
-
-            logger.debug(f"media gapfilling score: {media.id}: {gfscore}")
-            if gfscore <= self.max_gapfilling and gfscore <= (
+            score = atp_att["core_atp_gapfilling"][media.id]["score"]
+            if score <= self.max_gapfilling and score <= (
                 best_score + self.gapfilling_delta
             ):
                 self.selected_media.append(media)
-
-    def determine_growth_media2(self, max_gapfilling=None):
-        """
-        Decides which of the test media to use as growth conditions for this model
-        :return:
-        """
-
-        def scoring_function(media):
-            return len(self.media_gapfill_stats[media]["new"].keys()) + 0.5 * len(
-                self.media_gapfill_stats[media]["reversed"].keys()
-            )
-
-        self.selected_media = []
-        media_scores = dict(
-            (media, scoring_function(media))
-            for media in self.media_gapfill_stats
-            if self.media_gapfill_stats[media]
-        )
-        best_score = min(media_scores.values())
-        if max_gapfilling is None:
-            max_gapfilling = best_score
-        for media in media_scores:
-            score = media_scores[media]
-            logger.debug(score, best_score, max_gapfilling)
-            if score <= max_gapfilling and score <= (
-                best_score + self.gapfilling_delta
-            ):
-                self.selected_media.append(media)
+                atp_att["selected_media"][media.id] = 0
+            elif media in self.forced_media:
+                self.selected_media.append(media)
+                atp_att["selected_media"][media.id] = 0
+
+        self.modelutl.save_attributes(atp_att, "ATP_analysis")
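
A worked example of the media-selection rule above, using illustrative gapfilling stats: each new reaction costs 1 and each reversed reaction costs 0.5.

stats = {"new": {"rxn1": ">", "rxn2": "<"}, "reversed": {"rxn3": "<"}}
score = len(stats["new"]) + 0.5 * len(stats["reversed"])  # 2.5
# The media is kept when score <= max_gapfilling and
# score <= best_score + gapfilling_delta, or when it is in forced_media.
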
     def apply_growth_media_gapfilling(self):
         """
         Applies the gapfilling to all selected growth media
         :return:
         """
+        self.cumulative_core_gapfilling = []
+        # TODO: In case someone runs ATP correction twice with different parameters,
+        # before resetting this, maybe check if any of these reactions are already in
+        # the model and remove them so we're starting fresh???
         for media in self.selected_media:
-            if media in self.media_gapfill_stats and self.media_gapfill_stats[media]:
-                self.model = self.msgapfill.integrate_gapfill_solution(
-                    self.media_gapfill_stats[media]
+            stats = self.media_gapfill_stats.get(media, None)
+            if (
+                stats is not None
+                and MSGapfill.gapfill_count(self.media_gapfill_stats[media]) > 0
+            ):
+                self.get_msgapfill().integrate_gapfill_solution(
+                    stats, self.cumulative_core_gapfilling, check_for_growth=False
                 )
+                # Adding reactions to the gapfilling sensitivity structure so we can track all gapfilled reactions
+                gf_sensitivity = self.modelutl.get_attributes("gf_sensitivity", {})
+                if media.id not in gf_sensitivity:
+                    gf_sensitivity[media.id] = {}
+                if self.atp_hydrolysis.id not in gf_sensitivity[media.id]:
+                    gf_sensitivity[media.id][self.atp_hydrolysis.id] = {}
+                gf_sensitivity[media.id][self.atp_hydrolysis.id]["success"] = {}
+                for item in stats["new"]:
+                    gf_sensitivity[media.id][self.atp_hydrolysis.id]["success"][item] = {
+                        stats["new"][item]: []
+                    }
+                for item in stats["reversed"]:
+                    gf_sensitivity[media.id][self.atp_hydrolysis.id]["success"][item] = {
+                        stats["reversed"][item]: []
+                    }
+                self.modelutl.save_attributes(gf_sensitivity, "gf_sensitivity")
+        self.modelutl.save_attributes(
+            len(self.cumulative_core_gapfilling), "total_core_gapfilling"
+        )
 
     def expand_model_to_genome_scale(self):
         """Restores noncore reactions to model while filtering out reactions that break ATP
@@ -312,11 +467,11 @@
         self.restore_noncore_reactions(noncore=True, othercompartment=False)
         # Extending model with non core reactions while retaining ATP accuracy
         self.filtered_noncore = self.modelutl.reaction_expansion_test(
-            self.noncore_reactions, tests
+            self.noncore_reactions, tests, attribute_label="atp_expansion_filter"
         )
         # Removing filtered reactions
         for item in self.filtered_noncore:
-            print("Removing " + item[0].id + " " + item[1])
+            logger.info("Removing " + item[0].id + " " + item[1])
             if item[1] == ">":
                 item[0].upper_bound = 0
             else:
@@ -326,6 +481,16 @@
             self.model.remove_reactions([item[0]])
         # Restoring other compartment reactions but not the core because this would undo reaction filtering
         self.restore_noncore_reactions(noncore=False, othercompartment=True)
+        # Setting the core model attribute in the model
+        core_reactions = []
+        for reaction in self.model.reactions:
+            # check if the reaction is in the core template
+            template_reaction = self.find_reaction_in_template(
+                reaction, self.coretemplate, self.compartment[0:1]
+            )
+            if template_reaction is not None:
+                core_reactions.append(reaction.id)
+        self.modelutl.save_attributes(core_reactions, "core_reactions")
 
     def restore_noncore_reactions(self, noncore=True, othercompartment=True):
         """
@@ -352,7 +517,7 @@
             reaction.lower_bound = self.original_bounds[reaction.id][0]
             reaction.upper_bound = self.original_bounds[reaction.id][1]
 
-    def build_tests(self, multiplier=None):
+    def build_tests(self, multiplier_hash_override=None):
         """Build tests based on ATP media evaluations
 
         Parameters
         ----------
@@ -368,22 +533,66 @@
         Raises
         ------
         """
-        if multiplier is None:
-            multiplier = self.multiplier
+        # Checking whether the ATP stats have been run yet and, if not, running them
+        if not self.selected_media:
+            logger.warning(
+                "ATP tests not yet computed - running without allowing for model changes!"
+            )
+            self.evaluate_growth_media(no_gapfilling=True)
self.determine_growth_media() + self.restore_noncore_reactions() + # Applying threshold multiplier + for key in default_threshold_multipiers: + if key not in multiplier_hash_override: + multiplier_hash_override[key] = default_threshold_multipiers[key] + # Initialzing atp test attributes + atp_att = self.modelutl.get_attributes( + "ATP_analysis", + {"tests": {}, "selected_media": {}, "core_atp_gapfilling": {}}, + ) + # Initializing tests and adding empty media every time tests = [] + if "empty" in self.media_hash: + tests.append( + { + "media": self.media_hash["empty"], + "is_max_threshold": True, + "threshold": 0.00001, + "objective": self.atp_hydrolysis.id, + } + ) + atp_att["tests"]["empty"] = { + "threshold": 0.00001, + "objective": self.atp_hydrolysis.id, + } + # Setting objective to ATP hydrolysis self.model.objective = self.atp_hydrolysis.id for media in self.selected_media: + # Setting multiplier for test threshold + multiplier = multiplier_hash_override["default"] + if media.id in multiplier_hash_override: + multiplier = multiplier_hash_override[media.id] + # Constraining model exchanges for media self.modelutl.pkgmgr.getpkg("KBaseMediaPkg").build_package(media) + # Computing core ATP production obj_value = self.model.slim_optimize() - logger.debug(f"{media.name} = {obj_value}") + logger.debug(f"{media.name} = {obj_value};{multiplier}") + threshold = multiplier * obj_value + if threshold == 0: + threshold += 0.00001 tests.append( { "media": media, "is_max_threshold": True, - "threshold": multiplier * obj_value, + "threshold": threshold, "objective": self.atp_hydrolysis.id, } ) + atp_att["selected_media"][media.id] = obj_value + atp_att["tests"][media.id] = { + "threshold": multiplier * obj_value, + "objective": self.atp_hydrolysis.id, + } + # Saving test attributes to the model + self.modelutl.save_attributes(atp_att, "ATP_analysis") return tests def run_atp_correction(self): @@ -395,7 +604,7 @@ def run_atp_correction(self): self.evaluate_growth_media() self.determine_growth_media() self.apply_growth_media_gapfilling() - self.evaluate_growth_media() + # self.evaluate_growth_media() self.expand_model_to_genome_scale() return self.build_tests() diff --git a/modelseedpy/core/msbuilder.py b/modelseedpy/core/msbuilder.py index 8a65ff70..f7feda2d 100644 --- a/modelseedpy/core/msbuilder.py +++ b/modelseedpy/core/msbuilder.py @@ -1,64 +1,84 @@ # -*- coding: utf-8 -*- import logging import itertools +from enum import Enum import cobra +from modelseedpy.core.exceptions import ModelSEEDError from modelseedpy.core.rast_client import RastClient from modelseedpy.core.msgenome import normalize_role +from modelseedpy.core.mstemplate import TemplateReactionType from modelseedpy.core.msmodel import ( get_gpr_string, get_reaction_constraints_from_direction, ) -from cobra.core import Gene, Metabolite, Model, Reaction +from cobra.core import Gene, Metabolite, Model, Reaction, Group from modelseedpy.core import FBAHelper +from modelseedpy.core.msmodel import MSModel from modelseedpy.fbapkg.mspackagemanager import MSPackageManager +from modelseedpy.biochem.modelseed_biochem import ModelSEEDBiochem +from modelseedpy.biochem.modelseed_to_cobra import modelseed_to_cobra_reaction SBO_ANNOTATION = "sbo" +DEFAULT_SINKS = { + "cpd02701_c": 1000, # S-Adenosyl-4-methylthio-2-oxobutanoate + "cpd11416_c": 1000, # Biomass + "cpd15302_c": 1000, # glycogen(n-1) + "cpd03091_c": 1000, # 5'-Deoxyadenosine + "cpd01042_c": 1000, # p-Cresol +} + logger = logging.getLogger(__name__) ### temp stuff ### core_biomass = { - 
"cpd00032_c0": -1.7867, - "cpd00005_c0": -1.8225, - "cpd00169_c0": -1.496, - "cpd11416_c0": 1, - "cpd00003_c0": -3.547, - "cpd00008_c0": 41.257, - "cpd00024_c0": -1.0789, - "cpd00009_c0": 41.257, - "cpd00102_c0": -0.129, - "cpd00101_c0": -0.8977, - "cpd00236_c0": -0.8977, - "cpd00002_c0": -41.257, - "cpd00022_c0": -3.7478, - "cpd00020_c0": -2.8328, - "cpd00006_c0": 1.8225, - "cpd00001_c0": -41.257, - "cpd00072_c0": -0.0709, - "cpd00010_c0": 3.7478, - "cpd00004_c0": 3.547, - "cpd00061_c0": -0.5191, - "cpd00067_c0": 46.6265, - "cpd00079_c0": -0.205, + "cpd00032_c": -1.7867, + "cpd00005_c": -1.8225, + "cpd00169_c": -1.496, + "cpd11416_c": 1, + "cpd00003_c": -3.547, + "cpd00008_c": 41.257, + "cpd00024_c": -1.0789, + "cpd00009_c": 41.257, + "cpd00102_c": -0.129, + "cpd00101_c": -0.8977, + "cpd00236_c": -0.8977, + "cpd00002_c": -41.257, + "cpd00022_c": -3.7478, + "cpd00020_c": -2.8328, + "cpd00006_c": 1.8225, + "cpd00001_c": -41.257, + "cpd00072_c": -0.0709, + "cpd00010_c": 3.7478, + "cpd00004_c": 3.547, + "cpd00061_c": -0.5191, + "cpd00067_c": 46.6265, + "cpd00079_c": -0.205, } core_atp2 = { - "cpd00067_c0": 46.6265, - "cpd00002_c0": -41.257, - "cpd00008_c0": 41.257, - "cpd00001_c0": -41.257, - "cpd00009_c0": 41.257, + "cpd00067_c": 46.6265, + "cpd00002_c": -41.257, + "cpd00008_c": 41.257, + "cpd00001_c": -41.257, + "cpd00009_c": 41.257, } core_atp = { - "cpd00067_c0": 1, - "cpd00002_c0": -1, - "cpd00008_c0": 1, - "cpd00001_c0": -1, - "cpd00009_c0": 1, + "cpd00067_c": 1, + "cpd00002_c": -1, + "cpd00008_c": 1, + "cpd00001_c": -1, + "cpd00009_c": 1, } gramneg = { + "cpd11463_c0": 1, + "cpd00008_c0": 40, + "cpd00067_c0": 40, + "cpd00009_c0": 40, + "cpd00001_c0": -40, + "cpd00002_c0": -40, "cpd00166_c0": -0.00280615915959131, "cpd00087_c0": -0.00280615915959131, "cpd15560_c0": -0.00280615915959131, @@ -70,162 +90,100 @@ "cpd00220_c0": -0.00280615915959131, "cpd00003_c0": -0.00280615915959131, "cpd00557_c0": -0.00280615915959131, - "cpd00002_c0": -40.1101757365074, - "cpd00023_c0": -0.219088153012743, - "cpd00062_c0": -0.0908319049068452, "cpd00050_c0": -0.00280615915959131, - "cpd00008_c0": 40, "cpd00264_c0": -0.00280615915959131, "cpd00010_c0": -0.00280615915959131, "cpd15533_c0": -0.0311453449430676, - "cpd11416_c0": 1, "cpd15540_c0": -0.0311453449430676, "cpd00048_c0": -0.00280615915959131, - "cpd00035_c0": -0.427934380173264, - "cpd17042_c0": -1, "cpd00030_c0": -0.00280615915959131, "cpd00034_c0": -0.00280615915959131, - "cpd00161_c0": -0.211072732780569, "cpd00201_c0": -0.00280615915959131, "cpd00016_c0": -0.00280615915959131, "cpd00104_c0": -0.00280615915959131, - "cpd00067_c0": 40, "cpd11493_c0": -0.00280615915959131, - "cpd00051_c0": -0.246696822701341, "cpd00017_c0": -0.00280615915959131, - "cpd00357_c0": -0.0157642107352084, - "cpd17041_c0": -1, - "cpd00038_c0": -0.135406821203723, - "cpd00107_c0": -0.375388847540127, "cpd00042_c0": -0.00280615915959131, "cpd00149_c0": -0.00280615915959131, "cpd00058_c0": -0.00280615915959131, - "cpd00041_c0": -0.200830806928348, - "cpd00129_c0": -0.184354665339991, - "cpd15432_c0": -0.0250105977108944, - "cpd00052_c0": -0.0841036156544863, - "cpd00012_c0": 0.484600235732628, + "cpd03736_c0": -0.0250105977108944, "cpd15352_c0": -0.00280615915959131, - "cpd00322_c0": -0.241798510337235, - "cpd00053_c0": -0.219088153012743, "cpd00006_c0": -0.00280615915959131, "cpd00345_c0": -0.00280615915959131, "cpd00063_c0": -0.00280615915959131, - "cpd00033_c0": -0.509869786991038, - "cpd00066_c0": -0.154519490031345, - "cpd17043_c0": -1, "cpd00118_c0": 
-0.00280615915959131, - "cpd00009_c0": 39.9971938408404, "cpd15793_c0": -0.0311453449430676, - "cpd00356_c0": -0.01627686799489, "cpd01997_c0": 0.00280615915959131, - "cpd00132_c0": -0.200830806928348, - "cpd00060_c0": -0.127801422590767, "cpd00037_c0": -0.00280615915959131, - "cpd00115_c0": -0.0157642107352084, "cpd00099_c0": -0.00280615915959131, - "cpd00156_c0": -0.352233189091625, "cpd02229_c0": -0.0250105977108944, - "cpd00069_c0": -0.120676604606612, - "cpd00065_c0": -0.0472019191450218, - "cpd00241_c0": -0.01627686799489, "cpd15666_c0": 0.0250105977108944, "cpd10516_c0": -0.00280615915959131, - "cpd00084_c0": -0.0761464922056484, "cpd00056_c0": -0.00280615915959131, - "cpd00119_c0": -0.0792636000737159, - "cpd00001_c0": -35.5403092430435, "cpd03422_c0": 0.00280615915959131, "cpd00015_c0": -0.00280615915959131, - "cpd00054_c0": -0.179456352975885, "cpd00205_c0": -0.00280615915959131, - "cpd00039_c0": -0.285438020490179, "cpd00254_c0": -0.00280615915959131, + "cpd11463_c0": -0.5, + "cpd11461_c0": -0.1, + "cpd11462_c0": -0.2, } grampos = { - "cpd00241_c0": -0.0116907079028565, + "cpd11416_c0": 1, + "cpd00001_c0": -40, + "cpd00009_c0": 40, + "cpd00008_c0": 40, + "cpd00002_c0": -40, + "cpd00067_c0": 40, "cpd00017_c0": -0.00719527989638797, - "cpd00033_c0": -0.409331301687739, - "cpd00066_c0": -0.176188648374102, - "cpd17043_c0": -1, "cpd03422_c0": 0.00719527989638797, - "cpd17041_c0": -1, "cpd00557_c0": -0.00719527989638797, - "cpd00129_c0": -0.161028229793075, "cpd00166_c0": -0.00719527989638797, "cpd00030_c0": -0.00719527989638797, "cpd00087_c0": -0.00719527989638797, "cpd00015_c0": -0.00719527989638797, - "cpd00065_c0": -0.0544955586831525, - "cpd00357_c0": -0.0151844826784228, - "cpd00009_c0": 41.2498047201036, "cpd00038_c0": -0.0424026391792249, "cpd15667_c0": -0.00309563020839783, - "cpd00069_c0": -0.111039822579957, "cpd15540_c0": -0.0251172136637642, - "cpd00161_c0": -0.186841915485094, "cpd15748_c0": -0.00309563020839783, - "cpd00035_c0": -0.267560900902997, "cpd00048_c0": -0.00719527989638797, "cpd12370_c0": 0.00719527989638797, "cpd00052_c0": -0.0261242266150642, "cpd15757_c0": -0.00309563020839783, - "cpd00053_c0": -0.261005044219309, "cpd15533_c0": -0.0251172136637642, - "cpd00002_c0": -41.2913947104178, "cpd00006_c0": -0.00719527989638797, - "cpd00084_c0": -0.0569540049395353, "cpd10515_c0": -0.00719527989638797, "cpd00104_c0": -0.00719527989638797, - "cpd00051_c0": -0.193397772168782, "cpd00028_c0": -0.00719527989638797, "cpd00118_c0": -0.00719527989638797, - "cpd00107_c0": -0.347460404235438, "cpd00037_c0": -0.00719527989638797, "cpd15793_c0": -0.0251172136637642, "cpd00010_c0": -0.00719527989638797, "cpd11493_c0": -0.00719527989638797, "cpd00264_c0": -0.00719527989638797, "cpd15766_c0": -0.00309563020839783, - "cpd00041_c0": -0.14832625746843, "cpd00056_c0": -0.00719527989638797, "cpd01997_c0": 0.00719527989638797, "cpd15668_c0": -0.00309563020839783, "cpd00254_c0": -0.00719527989638797, - "cpd11416_c0": 1, "cpd02229_c0": -0.00309563020839783, "cpd00003_c0": -0.00719527989638797, - "cpd00008_c0": 41.257, - "cpd17042_c0": -1, - "cpd00023_c0": -0.261005044219309, "cpd15665_c0": -0.00309563020839783, "cpd11459_c0": -0.00309563020839783, "cpd15666_c0": 0.0123825208335913, - "cpd00115_c0": -0.0151844826784228, "cpd00050_c0": -0.00719527989638797, "cpd00063_c0": -0.00719527989638797, "cpd00205_c0": -0.00719527989638797, - "cpd00054_c0": -0.216753011604418, "cpd00042_c0": -0.00719527989638797, "cpd00034_c0": -0.00719527989638797, "cpd15500_c0": -0.00719527989638797, - 
"cpd00156_c0": -0.307715523090583, - "cpd00132_c0": -0.14832625746843, - "cpd00067_c0": -41.257, "cpd15775_c0": -0.00309563020839783, - "cpd00119_c0": -0.0819482085460939, - "cpd00060_c0": -0.11349826883634, - "cpd00001_c0": 45.354000686262, "cpd00099_c0": -0.00719527989638797, - "cpd00356_c0": -0.0116907079028565, "cpd00220_c0": -0.00719527989638797, - "cpd00322_c0": -0.27042908820211, "cpd00062_c0": -0.0282246669459237, "cpd00345_c0": -0.00719527989638797, - "cpd00012_c0": 0.184896624320595, "cpd10516_c0": -0.00719527989638797, - "cpd00039_c0": -0.323695423757071, "cpd00201_c0": -0.00719527989638797, "cpd15669_c0": -0.00309563020839783, "cpd15560_c0": -0.00719527989638797, @@ -233,9 +191,19 @@ "cpd00058_c0": -0.00719527989638797, "cpd00016_c0": -0.00719527989638797, "cpd15352_c0": -0.00719527989638797, + "cpd11463_c0": -0.5, + "cpd11461_c0": -0.1, + "cpd11462_c0": -0.2, } +class MSGenomeClass(Enum): + P = "Gram Positive" + N = "Gram Negative" + C = "Cyano" + A = "Archaea" + + def build_biomass(rxn_id, cobra_model, template, biomass_compounds, index="0"): bio_rxn = Reaction(rxn_id, "biomass", "", 0, 1000) metabolites = {} @@ -326,13 +294,143 @@ def build_gpr(cpx_gene_role): class MSBuilder: - def __init__(self, genome, template=None): + def __init__( + self, genome, template=None, name=None, ontology_term="RAST", index="0" + ): """ - for future methods with better customization + + @param genome: MSGenome + @param template: MSTemplate + @param name: + @param ontology_term: """ + if index is None or type(index) != str: + raise TypeError("index must be str") + if ontology_term is None or type(ontology_term) != str: + raise TypeError("ontology_term must be str") + self.name = name self.genome = genome self.template = template - self.search_name_to_genes, self.search_name_to_original = _aaaa(genome, "RAST") + self.genome_class = None + self.search_name_to_genes, self.search_name_to_original = _aaaa( + genome, ontology_term + ) + self.template_species_to_model_species = None + self.reaction_to_complex_sets = None + self.compartments = None + self.base_model = None + self.compartments_index = None # TODO: implement custom index by compartment + self.index = index + + def build_drains(self): + if self.template_species_to_model_species is None: + logger.warning("cannot build model drains without generating model species") + return None + if self.template.drains: + sinks = self.build_sinks() + demands = self.build_demands() + return sinks + demands + else: + # template without drain specification we build only default sinks + return self.build_sinks() + + def build_sinks(self): + if self.template_species_to_model_species is None: + logger.warning("cannot build model sinks without generating model species") + return None + if self.template.drains: + sinks = { + x.id: t[1] + for x, t in self.template.drains.items() + if t[1] > 0 and x.id in self.template_species_to_model_species + } + return [self.build_sink_reaction(x, v) for x, v in sinks.items()] + else: + # template without drain specification we build only default sinks + in_model = { + k: v + for k, v in DEFAULT_SINKS.items() + if k in self.template_species_to_model_species + } + return [self.build_sink_reaction(x, v) for x, v in in_model.items()] + + def build_demands(self): + if self.template_species_to_model_species is None: + logger.warning("cannot build model sinks without generating model species") + return None + if self.template.drains: + demands = { + x.id: t[0] + for x, t in self.template.drains.items() + if t[0] < 0 and x.id in 
self.template_species_to_model_species + } + return [self.build_demand_reaction(x, v) for x, v in demands.items()] + else: + return [] + + def build_drain_reaction( + self, + template_cpd_id, + prefix="EX_", + name_prefix="Exchange for ", + subsystem="exchanges", + lower_bound=0, + upper_bound=1000, + sbo_term="SBO:0000627", + ): + """ + SK_ for sink (SBO_0000632) DM_ for demand (SBO_0000628) EX_ for exchange (SBO_0000627) + @param template_cpd_id: + @param prefix: + @param name_prefix: + @param subsystem: + @param lower_bound: + @param upper_bound: + @param sbo_term: + @return: + """ + + if self.template_species_to_model_species is None: + logger.warning("cannot build model drains without generating model species") + return None + else: + m = self.template_species_to_model_species[template_cpd_id] + drain = Reaction( + f"{prefix}{m.id}", + f"{name_prefix}{m.name}", + subsystem, + lower_bound, + upper_bound, + ) + drain.add_metabolites({m: -1}) + drain.annotation[SBO_ANNOTATION] = sbo_term + return drain + + def build_sink_reaction(self, template_cpd_id, upper_bound): + if upper_bound <= 0: + raise ModelSEEDError("Sink reactions must have upper bound > 0") + return self.build_drain_reaction( + template_cpd_id, + "SK_", + "Sink for ", + "exchanges", + 0, + upper_bound, + "SBO:0000632", + ) + + def build_demand_reaction(self, template_cpd_id, lower_bound): + if lower_bound >= 0: + raise ModelSEEDError("Demand reactions must have lower bound < 0") + return self.build_drain_reaction( + template_cpd_id, + "DM_", + "Demand for ", + "exchanges", + lower_bound, + 0, + "SBO:0000628", + ) def _get_template_reaction_complexes(self, template_reaction): """ @@ -434,40 +532,7 @@ def get_gpr_from_template_reaction( return gpr_set @staticmethod - def _build_reaction(reaction_id, gpr_set, template, index="0", sbo=None): - template_reaction = template.reactions.get_by_id(reaction_id) - - reaction_compartment = template_reaction.compartment - metabolites = {} - - for cpd, value in template_reaction.metabolites.items(): - compartment = f"{cpd.compartment}{index}" - name = f"{cpd.name}_{compartment}" - cpd = Metabolite( - cpd.id + str(index), cpd.formula, name, cpd.charge, compartment - ) - metabolites[cpd] = value - - reaction = Reaction( - "{}{}".format(template_reaction.id, index), - "{}_{}{}".format(template_reaction.name, reaction_compartment, index), - "", - template_reaction.lower_bound, - template_reaction.upper_bound, - ) - - gpr_str = build_gpr2(gpr_set) if gpr_set else "" - reaction.add_metabolites(metabolites) - if gpr_str and len(gpr_str) > 0: - reaction.gene_reaction_rule = gpr_str # get_gpr_string(gpr_ll) - - reaction.annotation["seed.reaction"] = template_reaction.reference_id - if sbo: - reaction.annotation[SBO_ANNOTATION] = sbo - return reaction - - @staticmethod - def build_exchanges(model, extra_cell="e0"): + def add_exchanges_to_model(model, extra_cell="e0"): """ Build exchange reactions for the "extra_cell" compartment :param model: Cobra Model @@ -494,15 +559,25 @@ def build_exchanges(model, extra_cell="e0"): return reactions_exchanges @staticmethod - def build_biomasses(model, template, index): + def get_or_create_metabolite( + model, template, cpd_base_id, compartment="c", index=0 + ): + if isinstance(index, int): + index = str(index) + full_id = cpd_base_id + "_" + compartment + index + if full_id not in model.metabolites: + pass + return model.metabolites.get_by_id(full_id) + + def build_static_biomasses(self, model, template): res = [] if template.name.startswith("CoreModel"): - 
res.append(build_biomass("bio1", model, template, core_biomass, index)) - res.append(build_biomass("bio2", model, template, core_atp, index)) + res.append(self.build_biomass("bio1", model, template, core_biomass)) + res.append(self.build_biomass("bio2", model, template, core_atp)) if template.name.startswith("GramNeg"): - res.append(build_biomass("bio1", model, template, gramneg, index)) + res.append(self.build_biomass("bio1", model, template, gramneg)) if template.name.startswith("GramPos"): - res.append(build_biomass("bio1", model, template, grampos, index)) + res.append(self.build_biomass("bio1", model, template, grampos)) return res def auto_select_template(self): @@ -513,9 +588,10 @@ def auto_select_template(self): from modelseedpy.helpers import get_template, get_classifier from modelseedpy.core.mstemplate import MSTemplateBuilder - genome_classifier = get_classifier("knn_ACNP_RAST_filter") - genome_class = genome_classifier.classify(self.genome) + genome_classifier = get_classifier("knn_ACNP_RAST_filter_01_17_2023") + self.genome_class = genome_classifier.classify(self.genome) + # TODO: update with enum MSGenomeClass template_genome_scale_map = { "A": "template_gram_neg", "C": "template_gram_neg", @@ -530,68 +606,351 @@ def auto_select_template(self): } if ( - genome_class in template_genome_scale_map - and genome_class in template_core_map + self.genome_class in template_genome_scale_map + and self.genome_class in template_core_map ): self.template = MSTemplateBuilder.from_dict( - get_template(template_genome_scale_map[genome_class]) + get_template(template_genome_scale_map[self.genome_class]) ).build() elif self.template is None: - raise Exception(f"unable to select template for {genome_class}") + raise Exception(f"unable to select template for {self.genome_class}") - return genome_class + return self.genome_class - def build_metabolic_reactions(self, index="0", allow_incomplete_complexes=True): - metabolic_reactions = {} + def generate_reaction_complex_sets(self, allow_incomplete_complexes=True): + self.reaction_to_complex_sets = {} for template_reaction in self.template.reactions: gpr_set = self.get_gpr_from_template_reaction( template_reaction, allow_incomplete_complexes ) if gpr_set: - metabolic_reactions[template_reaction.id] = gpr_set + self.reaction_to_complex_sets[template_reaction.id] = gpr_set logger.debug("[%s] gpr set: %s", template_reaction.id, gpr_set) - reactions = list( - map( - lambda x: self._build_reaction( - x[0], x[1], self.template, index, "SBO:0000176" - ), - metabolic_reactions.items(), + return self.reaction_to_complex_sets + + """ + def _build_reaction(self, reaction_id, gpr_set, template, index="0", sbo=None): + template_reaction = template.reactions.get_by_id(reaction_id) + + reaction_compartment = template_reaction.compartment + metabolites = {} + + for cpd, value in template_reaction.metabolites.items(): + compartment = f"{cpd.compartment}{index}" + name = f"{cpd.name}_{compartment}" + cpd = Metabolite( + cpd.id + str(index), cpd.formula, name, cpd.charge, compartment ) + metabolites[cpd] = value + + reaction = Reaction( + "{}{}".format(template_reaction.id, index), + "{}_{}{}".format(template_reaction.name, reaction_compartment, index), + "", + template_reaction.lower_bound, + template_reaction.upper_bound, ) + gpr_str = build_gpr2(gpr_set) if gpr_set else "" + reaction.add_metabolites(metabolites) + if gpr_str and len(gpr_str) > 0: + reaction.gene_reaction_rule = gpr_str # get_gpr_string(gpr_ll) + + reaction.annotation["seed.reaction"] = 
template_reaction.reference_id + if sbo: + reaction.annotation[SBO_ANNOTATION] = sbo + return reaction + """ + + def build_complex_groups(self, complex_sets): + """ + Builds complex Group from complex sets computed from template and genome + Example: {'cpx00700': {'ftr01608': {'b3177'}}, 'cpx01370': {'ftr01607': {'b0142'}}} + @param complex_sets: + @return: + """ + group_complexes = {} + for complex_set in complex_sets: + for complex_id in complex_set: + if ( + complex_id not in group_complexes + and complex_id in self.template.complexes + ): + cpx = self.template.complexes.get_by_id(complex_id) + g = Group(complex_id) + g.notes["complex_source"] = cpx.source + for role, (t, o) in cpx.roles.items(): + if role.id in complex_set[complex_id]: + g.notes[f"complex_subunit_note_{role.id}"] = role.name + g.notes[f"complex_subunit_optional_{role.id}"] = ( + 1 if o else 0 + ) + g.notes[f"complex_subunit_triggering_{role.id}"] = ( + 1 if t else 0 + ) + g.notes[f"complex_subunit_features_{role.id}"] = ";".join( + sorted(list(complex_set[complex_id][role.id])) + ) + group_complexes[g.id] = g + + return group_complexes + + def build_metabolic_reactions(self): + if self.base_model is None: + raise ModelSEEDError( + "unable to generate metabolic reactions without base model" + ) + if self.reaction_to_complex_sets is None: + raise ModelSEEDError( + "unable to generate metabolic reactions without generate complex sets" + ) + + if self.template_species_to_model_species is None: + self.template_species_to_model_species = {} + if self.compartments is None: + self.compartments = {} + + reactions = [] + for rxn_id, complex_set in self.reaction_to_complex_sets.items(): + template_reaction = self.template.reactions.get_by_id(rxn_id) + for m in template_reaction.metabolites: + if m.compartment not in self.compartments: + self.compartments[ + m.compartment + ] = self.template.compartments.get_by_id(m.compartment) + if m.id not in self.template_species_to_model_species: + model_metabolite = m.to_metabolite(self.index) + self.template_species_to_model_species[m.id] = model_metabolite + self.base_model.add_metabolites([model_metabolite]) + reaction = template_reaction.to_reaction(self.base_model, self.index) + gpr_str = build_gpr2(complex_set) if complex_set else "" + if gpr_str and len(gpr_str) > 0: + reaction.gene_reaction_rule = gpr_str + reaction.annotation[SBO_ANNOTATION] = "SBO:0000176" + reaction.notes["modelseed_complex"] = ";".join(sorted(list(complex_set))) + reactions.append(reaction) + return reactions + def build_from_annotaton_ontology( + self, + model_or_id, + anno_ont, + index="0", + allow_all_non_grp_reactions=False, + annotate_with_rast=False, + biomass_classic=False, + biomass_gc=0.5, + add_non_template_reactions=True, + prioritized_event_list=None, + ontologies=None, + merge_all=True, + convert_to_sso=True, + ): + # Build base model without annotation + self.search_name_to_orginal = {} + self.search_name_to_genes = {} + gene_term_hash = anno_ont.get_gene_term_hash( + prioritized_event_list, ontologies, merge_all, convert_to_sso + ) + residual_reaction_gene_hash = {} + for gene in gene_term_hash: + for term in gene_term_hash[gene]: + if term.ontology.id == "SSO": + name = anno_ont.get_term_name(term) + f_norm = normalize_role(name) + if f_norm not in self.search_name_to_genes: + self.search_name_to_genes[f_norm] = set() + self.search_name_to_orginal[f_norm] = set() + self.search_name_to_orginal[f_norm].add(name) + self.search_name_to_genes[f_norm].add(gene.id) + else: + for rxn_id in term.msrxns: + 
if rxn_id not in residual_reaction_gene_hash: + residual_reaction_gene_hash[rxn_id] = {} + if gene not in residual_reaction_gene_hash[rxn_id]: + residual_reaction_gene_hash[rxn_id][gene] = [] + residual_reaction_gene_hash[rxn_id][gene] = gene_term_hash[ + gene + ][term] + + model_or_id = self.build( + model_or_id, + index, + allow_all_non_grp_reactions, + annotate_with_rast, + biomass_classic, + biomass_gc, + ) + for rxn in model_or_id.reactions: + probability = None + for gene in rxn.genes(): + annoont_gene = anno_ont.get_feature(gene.id) + if annoont_gene and annoont_gene in gene_term_hash: + for term in gene_term_hash[annoont_gene]: + if rxn.id[0:-3] in term.msrxns: + for item in gene_term_hash[gene][term]: + if "probability" in item.scores: + if ( + not probability + or item.scores["probability"] > probability + ): + probability = item.scores["probability"] + if hasattr(rxn, "probability"): + rxn.probability = probability + + reactions = [] + modelseeddb = ModelSEEDBiochem.get() + for rxn_id in residual_reaction_gene_hash: + if rxn_id + "_c0" not in model_or_id.reactions: + reaction = None + template_reaction = None + if rxn_id + "_c" in self.template.reactions: + template_reaction = self.template.reactions.get_by_id(rxn_id + "_c") + elif rxn_id in modelseeddb.reactions: + msrxn = modelseeddb.reactions.get_by_id(rxn_id) + template_reaction = msrxn.to_template_reaction({0: "c", 1: "e"}) + if template_reaction: + for m in template_reaction.metabolites: + if m.compartment not in self.compartments: + self.compartments[ + m.compartment + ] = self.template.compartments.get_by_id(m.compartment) + if m.id not in self.template_species_to_model_species: + model_metabolite = m.to_metabolite(self.index) + self.template_species_to_model_species[ + m.id + ] = model_metabolite + self.base_model.add_metabolites([model_metabolite]) + reaction = template_reaction.to_reaction( + self.base_model, self.index + ) + gpr = "" + probability = None + for gene in residual_reaction_gene_hash[rxn_id]: + for item in residual_reaction_gene_hash[rxn_id][gene]: + if "probability" in item["scores"]: + if ( + not probability + or item["scores"]["probability"] > probability + ): + probability = item["scores"]["probability"] + if len(gpr) > 0: + gpr += " or " + gpr += gene.id + if hasattr(rxn, "probability"): + reaction.probability = probability + reaction.gene_reaction_rule = gpr + reaction.annotation[SBO_ANNOTATION] = "SBO:0000176" + reactions.append(reaction) + if not reaction: + print("Reaction ", rxn_id, " not found in template or database!") + + model_or_id.add_reactions(reactions) + return model_or_id + def build_non_metabolite_reactions( - self, cobra_model, index="0", allow_all_non_grp_reactions=False + self, cobra_model, allow_all_non_grp_reactions=False ): - reactions_no_gpr = [] - reactions_in_model = set(map(lambda x: x.id, cobra_model.reactions)) - metabolites_in_model = set(map(lambda x: x.id, cobra_model.metabolites)) - for rxn in self.template.reactions: - if rxn.type == "universal" or rxn.type == "spontaneous": - reaction = self._build_reaction( - rxn.id, {}, self.template, index, "SBO:0000176" - ) - reaction_metabolite_ids = set( - map(lambda x: x.id, set(reaction.metabolites)) - ) + if self.base_model is None: + raise ModelSEEDError( + "unable to generate metabolic reactions without base model" + ) + if self.reaction_to_complex_sets is None: + raise ModelSEEDError( + "unable to generate metabolic reactions without generate complex sets" + ) + + if self.template_species_to_model_species is None: + 
self.template_species_to_model_species = {} + if self.compartments is None: + self.compartments = {} + + reactions = [] + for template_reaction in self.template.reactions: + rxn_type = template_reaction.type + if ( + rxn_type == "universal" + or rxn_type == "spontaneous" + or rxn_type == TemplateReactionType.UNIVERSAL + or rxn_type == TemplateReactionType.SPONTANEOUS + ): + reaction_metabolite_ids = {m.id for m in template_reaction.metabolites} if ( - len(metabolites_in_model & reaction_metabolite_ids) > 0 + len( + set(self.template_species_to_model_species) + & reaction_metabolite_ids + ) + > 0 or allow_all_non_grp_reactions - ) and reaction.id not in reactions_in_model: - reaction.annotation["seed.reaction"] = rxn.id - reactions_no_gpr.append(reaction) + ): + for m in template_reaction.metabolites: + if m.compartment not in self.compartments: + self.compartments[ + m.compartment + ] = self.template.compartments.get_by_id(m.compartment) + if m.id not in self.template_species_to_model_species: + model_metabolite = m.to_metabolite(self.index) + self.template_species_to_model_species[ + m.id + ] = model_metabolite + self.base_model.add_metabolites([model_metabolite]) + + reaction = template_reaction.to_reaction( + self.base_model, self.index + ) + reaction.annotation[SBO_ANNOTATION] = "SBO:0000672" + # if template_reaction.type == "spontaneous": + # reaction.annotation[SBO_ANNOTATION] = "SBO:0000176" + + if reaction.id not in cobra_model.reactions: + reactions.append(reaction) - return reactions_no_gpr + return reactions + + def build_biomass(self, rxn_id, cobra_model, template, biomass_compounds): + bio_rxn = Reaction(rxn_id, "biomass", "", 0, 1000) + metabolites = {} + for template_cpd_id in biomass_compounds: + if template_cpd_id in self.template_species_to_model_species: + model_species_id = self.template_species_to_model_species[ + template_cpd_id + ].id + cpd = cobra_model.metabolites.get_by_id(model_species_id) + metabolites[cpd] = biomass_compounds[template_cpd_id] + else: + template_cpd = template.compcompounds.get_by_id(template_cpd_id[:-1]) + m = template_cpd.to_metabolite(self.index) + metabolites[m] = biomass_compounds[template_cpd_id] + self.template_species_to_model_species[template_cpd_id] = m + cobra_model.add_metabolites([m]) + bio_rxn.add_metabolites(metabolites) + bio_rxn.annotation[SBO_ANNOTATION] = "SBO:0000629" + return bio_rxn def build( self, - model_id, + model_or_id, index="0", allow_all_non_grp_reactions=False, annotate_with_rast=True, + biomass_classic=False, + biomass_gc=0.5, + add_reaction_from_rast_annotation=True, ): + """ + + @param model_or_id: a string ID to build from cobra.core.Model otherwise a type of cobra.core.Model + as Base Model + @param index: + @param allow_all_non_grp_reactions: + @param annotate_with_rast: + @param biomass_classic: + @param biomass_gc: + @return: + """ + self.index = index if annotate_with_rast: rast = RastClient() @@ -604,28 +963,83 @@ def build( if self.template is None: self.auto_select_template() - cobra_model = Model(model_id) - cobra_model.add_reactions(self.build_metabolic_reactions(index=index)) - cobra_model.add_reactions( - self.build_non_metabolite_reactions( - cobra_model, index, allow_all_non_grp_reactions - ) + cobra_model = model_or_id + if type(model_or_id) == str: + from cobra.core import Model + + cobra_model = Model(model_or_id) + + self.base_model = cobra_model + + self.generate_reaction_complex_sets() + complex_groups = self.build_complex_groups( + self.reaction_to_complex_sets.values() ) - 
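Stepping back from the hunk-level changes, a minimal end-to-end usage sketch of the reworked builder might look as follows; the FASTA path is a placeholder, and automatic template selection assumes the classifier data is installed:

```python
# Hypothetical usage of the refactored MSBuilder.build(); not taken from the PR.
from modelseedpy import MSBuilder, MSGenome

genome = MSGenome.from_fasta("my_genome.faa")  # placeholder protein FASTA
builder = MSBuilder(genome, template=None)     # template auto-selected when None
model = builder.build(
    "my_model",                       # a string id creates a fresh cobra Model
    index="0",
    allow_all_non_grp_reactions=False,
    annotate_with_rast=True,          # calls the RAST service for annotations
)
# build() now also attaches complex Groups, drains/sinks, template biomasses,
# and per-compartment metadata in model.notes.
```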
self.build_exchanges(cobra_model) + if add_reaction_from_rast_annotation: + metabolic_reactions = self.build_metabolic_reactions() + cobra_model.add_reactions(metabolic_reactions) + + non_metabolic_reactions = self.build_non_metabolite_reactions( + cobra_model, allow_all_non_grp_reactions + ) + cobra_model.add_reactions(non_metabolic_reactions) + cobra_model.add_groups(list(complex_groups.values())) + self.add_exchanges_to_model(cobra_model) + + biomass_reactions = [] + for rxn_biomass in self.template.biomasses: + reaction = rxn_biomass.build_biomass( + cobra_model, index, biomass_classic, biomass_gc + ) + for m in reaction.metabolites: + if "modelseed_template_id" in m.notes: + self.template_species_to_model_species[ + m.notes["modelseed_template_id"] + ] = m + biomass_reactions.append(reaction) + + if len(biomass_reactions) > 0: + for rxn in biomass_reactions: + if rxn.id not in cobra_model.reactions: + cobra_model.add_reactions([rxn]) + cobra_model.objective = biomass_reactions[0].id + + """ if ( self.template.name.startswith("CoreModel") or self.template.name.startswith("GramNeg") or self.template.name.startswith("GramPos") ): - cobra_model.add_reactions( - self.build_biomasses(cobra_model, self.template, index) - ) + gc = 0.5 + if hasattr(self.genome,"info"): + gc = float(self.genome.info.metadata["GC content"]) + print("Genome custom GC:",gc) + for bio in self.template.biomasses: + bio.build_biomass(cobra_model, index, classic=False, GC=gc,add_to_model=True) cobra_model.objective = "bio1" + """ - reactions_sinks = [] + reactions_sinks = self.build_drains() + cobra_model.add_reactions(reactions_sinks) + + compartment_data = {} + for cmp_id, data in self.compartments.items(): + cmp_index_id = f"{cmp_id}{self.index}" + compartment_data[cmp_index_id] = data.name + kbase_compartment_data_key = f"kbase_compartment_data_{cmp_index_id}" + kbase_compartment_data = { + "pH": data.ph, + "potential": 0, + "compartmentIndex": self.index, + } + cobra_model.notes[kbase_compartment_data_key] = kbase_compartment_data + + cobra_model.compartments = compartment_data + + """ for cpd_id in ["cpd02701_c0", "cpd11416_c0", "cpd15302_c0"]: - if cpd_id in cobra_model.metabolites: + if cpd_id in cobra_model.metabolites: m = cobra_model.metabolites.get_by_id(cpd_id) rxn_exchange = Reaction( "SK_" + m.id, "Sink for " + m.name, "exchanges", 0, 1000 @@ -633,7 +1047,7 @@ def build( rxn_exchange.add_metabolites({m: -1}) rxn_exchange.annotation[SBO_ANNOTATION] = "SBO:0000627" reactions_sinks.append(rxn_exchange) - cobra_model.add_reactions(reactions_sinks) + """ return cobra_model @@ -646,33 +1060,30 @@ def build_full_template_model(template, model_id=None, index="0"): :param index: index for the metabolites :return: """ - model = Model(model_id if model_id else template.id) + model = MSModel(model_id if model_id else template.id, template=template) all_reactions = [] for rxn in template.reactions: - reaction = MSBuilder._build_reaction( - rxn.id, {}, template, index, "SBO:0000176" - ) + reaction = rxn.to_reaction(model, index) reaction.annotation["seed.reaction"] = rxn.id all_reactions.append(reaction) model.add_reactions(all_reactions) - model.add_reactions(MSBuilder.build_exchanges(model)) + MSBuilder.add_exchanges_to_model(model) if template.name.startswith("CoreModel"): bio_rxn1 = build_biomass("bio1", model, template, core_biomass, index) bio_rxn2 = build_biomass("bio2", model, template, core_atp, index) model.add_reactions([bio_rxn1, bio_rxn2]) model.objective = "bio1" - if 
template.name.startswith("GramNeg"): - bio_rxn1 = build_biomass("bio1", model, template, gramneg, index) - model.add_reactions([bio_rxn1]) - model.objective = "bio1" - if template.name.startswith("GramPos"): - bio_rxn1 = build_biomass("bio1", model, template, grampos, index) - model.add_reactions([bio_rxn1]) - model.objective = "bio1" + else: + for bio in template.biomasses: + bio.build_biomass( + model, index, classic=False, GC=0.5, add_to_model=True + ) + if "bio1" in model.reactions: + model.objective = "bio1" reactions_sinks = [] - for cpd_id in ["cpd02701_c0", "cpd11416_c0", "cpd15302_c0"]: + for cpd_id in ["cpd02701_c0", "cpd11416_c0", "cpd15302_c0", "cpd03091_c0"]: if cpd_id in model.metabolites: m = model.metabolites.get_by_id(cpd_id) rxn_exchange = Reaction( @@ -694,10 +1105,15 @@ def build_metabolic_model( allow_all_non_grp_reactions=False, annotate_with_rast=True, gapfill_model=True, + classic_biomass=False, ): builder = MSBuilder(genome, template) model = builder.build( - model_id, index, allow_all_non_grp_reactions, annotate_with_rast + model_id, + index, + allow_all_non_grp_reactions, + annotate_with_rast, + classic_biomass, ) # Gapfilling model if gapfill_model: diff --git a/modelseedpy/core/msensemble.py b/modelseedpy/core/msensemble.py new file mode 100755 index 00000000..3ab43eed --- /dev/null +++ b/modelseedpy/core/msensemble.py @@ -0,0 +1,297 @@ +# -*- coding: utf-8 -*- +import logging +import re +import time +import json +import sys +import pandas as pd +import cobra +import random +from cobra.core.dictlist import DictList +from optlang.symbolics import Zero, add +from modelseedpy.fbapkg.mspackagemanager import MSPackageManager +from modelseedpy.core.msmodelutl import MSModelUtil +from modelseedpy.core.msfba import MSFBA +from modelseedpy.core.msatpcorrection import MSATPCorrection + +# from builtins import None + +logger = logging.getLogger(__name__) +logger.setLevel( + logging.INFO +) # When debugging - set this to INFO then change needed messages below from DEBUG to INFO + +class MSEnsemble: + @staticmethod + def from_models(models): + #Converting models to MSModelUtil + if not isinstance(model_or_mdlutl, MSModelUtil): + for (i,mdl) in enumerate(models): + models[i] = MSModelUtil.get(mdl) + #Cloning the first model as a starting point + clone_model = cobra.io.json.from_json(cobra.io.json.to_json(models[0].model)) + clone_mdlutl = MSModelUtil.get(clone_model) + ensemble = MSEnsemble(clone_mdlutl) + ensemble.rebuild_from_models(models) + + def from_annotation(model_or_mdlutl,reaction_probability_hash,sample_count=100): + #Create genome from probabilities + mdl = MSBuilder(genome,template).build(base_model, '0', False, False) + mdl.template = self.gs_template + mdlutl = MSModelUtil.get(mdl) + ensemble = MSEnsemble(mdlutl) + ensemble.build_ensemble(reaction_probability_hash, gpr_level_sampling, sample_count) + + def __init__(self,model_or_mdlutl,reaction_probabilities=None): + # Discerning input is model or mdlutl and setting internal links + if isinstance(model_or_mdlutl, MSModelUtil): + self.model = model_or_mdlutl.model + self.mdlutl = model_or_mdlutl + else: + self.model = model_or_mdlutl + self.mdlutl = MSModelUtil.get(model_or_mdlutl) + attributes = self.mdlutl.get_attributes() + if "ensemble" not in attributes: + self.data = { + "size": 0, + "reactions": {} + } + for rxn in self.model.reactions: + self.data["reactions"][rxn.id] = { + "presence": "", + "gapfilling":"", + "genes": {} + } + for gene in rxn.genes: + self.data["reactions"][rxn.id]["genes"][gene.id] = 
{ + "presence": "" + } + if reaction_probabilities: + self.reset_reaction_probabilities(reaction_probabilities) + logger.warning("Input model is not an ensemble model. You will need to run build_ensemble() to create an ensemble model.") + else: + self.data = attributes["ensemble"] + + def reset_reaction_probabilities(self,reaction_probability_hash,clear_existing=False): + #clear_existing: if true, clear existing probabilities before setting new ones + if clear_existing: + for rxnid in self.data["reactions"]: + self.data["reactions"][rxnid]["probability"] = 0 + for geneid in self.data["reactions"][rxnid]["genes"]: + self.data["reactions"][rxnid]["genes"][geneid]["probability"] = 0 + #Overwriting reaction probabilities from input hash + for rxnid in reaction_probability_hash: + if rxnid in self.model.reactions: + rxnobj = self.model.reactions.get_by_id(rxnid) + if rxnid not in self.data["reactions"]: + self.data["reactions"][rxnid] = {"presence":"","genes":{}} + if "probability" in reaction_probability_hash[rxnid]: + self.data["reactions"][rxnid]["probability"] = reaction_probability_hash[rxnid]["probability"] + if "genes" in reaction_probability_hash[rxnid]: + for geneid in reaction_probability_hash[rxnid]["genes"]: + #if geneid in rxnobj.genes: + self.data["reactions"][rxnid]["genes"][geneid] = {"presence":"","probability":reaction_probability_hash[rxnid]["genes"][geneid]} + + def rebuild_from_models(self,models):#DONE + #Clearing existing data + self.data["ATP_analysis"] = {"core_atp_gapfilling":{},"selected_media":{},"tests":{}} + for rxnid in self.data["reactions"]: + self.data["reactions"][rxnid]["presence"] = "" + self.data["reactions"][rxnid]["gapfilling"] = "" + if "genes" in self.data["reactions"][rxnid]: + for geneid in self.data["reactions"][rxnid]["genes"]: + self.data["reactions"][rxnid]["genes"][geneid]["presence"] = "" + else: + self.data["reactions"][rxnid]["genes"] = {} + #Building presence strings from models + self.data["size"] = len(models) + for (i,mdlutl) in enumerate(models): + attributes = mdlutl.get_attributes() + if "ATP_analysis" in attributes: + if "core_atp_gapfilling" in attributes["ATP_analysis"]: + for media in attributes["ATP_analysis"]["core_atp_gapfilling"]: + if media not in self.data["ATP_analysis"]["core_atp_gapfilling"]: + self.data["ATP_analysis"]["core_atp_gapfilling"][media] = [] + for j in range(i): + self.data["ATP_analysis"]["core_atp_gapfilling"][media].append(None) + self.data["ATP_analysis"]["core_atp_gapfilling"][media].append(attributes["ATP_analysis"]["core_atp_gapfilling"][media]) + + if "selected_media" in attributes["ATP_analysis"]: + for media in attributes["ATP_analysis"]["selected_media"]: + if media not in self.data["ATP_analysis"]["selected_media"]: + self.data["ATP_analysis"]["selected_media"][media] = [] + for j in range(i): + self.data["ATP_analysis"]["selected_media"][media].append(None) + self.data["ATP_analysis"]["selected_media"][media].append(attributes["ATP_analysis"]["selected_media"][media]) + if "tests" in attributes["ATP_analysis"]: + for media in attributes["ATP_analysis"]["tests"]: + if media not in self.data["ATP_analysis"]["tests"]: + self.data["ATP_analysis"]["tests"][media] = {"objective":attributes["ATP_analysis"]["tests"][media]["objective"],"threshold":[]} + for j in range(i): + self.data["ATP_analysis"]["tests"][media]["threshold"].append(None) + self.data["ATP_analysis"]["tests"][media]["threshold"].append(attributes["ATP_analysis"]["tests"][media]["threshold"]) + add_reactions = [] + for rxn in 
mdlutl.model.reactions: + if rxn.id not in self.mdlutl.model.reactions: + add_reactions.append(rxn) + if rxn.id not in self.data["reactions"]: + self.data["reactions"][rxn.id] = { + "presence":'0' * i, + "genes":{} + } + self.data["reactions"][rxn.id]["presence"] += "1" + for gene in rxn.genes: + if gene.id not in self.data["reactions"][rxn.id]["genes"]: + self.data["reactions"][rxn.id]["genes"][gene.id] = '0' * i + self.data["reactions"][rxn.id]["genes"][gene.id] += "1" + self.mdlutl.model.add_reactions(add_reactions) + #Updating GPR of base model + for rxnid in self.data["reactions"]: + rxn = self.mdlutl.model.reactions.get_by_id(rxnid) + rxn.gene_reaction_rule = " or ".join(self.data["reactions"][rxnid]["genes"].keys()) + #Computing probabilities from presence if missing + for rxnid in self.ensemble_data["reactions"]: + if "probabilty" not in self.ensemble_data["reactions"][rxnid]: + self.ensemble_data["reactions"][rxnid]["probabilty"] = self.ensemble_data["reactions"][rxnid]["presence"].count('1')/len(self.ensemble_data["reactions"][rxnid]["presence"]) + for geneid in self.ensemble_data["reactions"][rxnid]["genes"]: + if "probabilty" not in self.ensemble_data["reactions"][rxnid]["genes"][geneid]: + self.ensemble_data["reactions"][rxnid]["genes"][geneid]["probabilty"] = self.ensemble_data["reactions"][rxnid]["genes"][geneid]["presence"].count('1')/len(self.ensemble_data["reactions"][rxnid]["genes"][geneid]["presence"]) + + def sample_from_probabilities(self,reaction_probabilities=None,from_reaction_probabilities=False,sample_count=1000): + #Overwriting reaction probabilities if provided + if reaction_probabilities: + self.reset_reaction_probabilities(reaction_probabilities) + self.data["size"] = sample_count + #Scrolling through ensemble data with probabilities + for rxnid in self.data["reactions"]: + if "probability" not in self.data["reactions"][rxnid]: + logger.critical("Reaction probability missing for "+rxnid+"!") + return None + if rxnid not in self.mdlutl.model.reactions: + logger.critical("Reaction probability for "+rxnid+" but reaction not in base model!") + return None + rxn = self.mdlutl.model.reactions.get_by_id(rxnid) + #Clearing existing data + self.data["reactions"][rxnid]["presence"] = "" + self.data["reactions"][rxnid]["gapfilling"] = "" + #Loading gene-level data + if "genes" not in self.data["reactions"][rxnid]: + self.data["reactions"][rxnid]["genes"] = {} + for gene in rxn.genes: + if gene.id not in self.data["reactions"][rxnid]["genes"] or "probability" not in self.data["reactions"][rxnid]["genes"][gene.id]: + logger.warning("Reaction "+rxnid+" has gene "+gene.id+" but no associated probability data!") + self.data["reactions"][rxnid]["genes"][gene.id] = {"presence":"","probablity":1} + self.data["reactions"][rxnid]["genes"][gene.id]["presence"] = "" + #Sampling from probabilities + for i in range(sample_count): + for rxnid in self.data["reactions"]: + present = False + if from_reaction_probabilities or len(self.data["reactions"][rxnid]["genes"]) == 0: + if random.uniform(0,1) < self.data["reactions"][rxnid]["probability"]: + present = True + else: + for geneid in self.data["reactions"][rxnid]["genes"]: + if random.uniform(0,1) < self.data["reactions"][rxnid]["genes"][geneid]["probability"]: + present = True + self.data["reactions"][rxnid]["genes"][geneid]["presence"] += "1" + else: + self.data["reactions"][rxnid]["genes"][geneid]["presence"] += "0" + if present: + self.data["reactions"][rxnid]["presence"] += "1" + else: + self.data["reactions"][rxnid]["presence"] 
+= "0" + #Updating reaction probabilities from presence data + count = 0 + for item in self.data["reactions"][rxnid]["presence"]: + if item == "1": + count += 1 + self.data["reactions"][rxnid]["probability"] = count/len(self.data["reactions"][rxnid]["presence"]) + #Saving ensemble data in model attributes + return self.save_ensemble_model() + + def unpack_models(self,model_list=None): + output_models = [None]*self.size + for i in range(self.size): + if not model_list or i in model_list: + clone_mdl = cobra.io.json.from_json(cobra.io.json.to_json(self.model)) + clone_mdl_utl = MSModelUtil.get(clone_mdl) + remove_reactions = [] + for rxn in clone_mdl_utl.model.reactions: + if rxn.id in self.data["reactions"]: + if self.data["reactions"][rxn.id][i] == "0": + remove_reactions.append(rxn) + else: + new_genes = [] + for gene in rxn.genes: + if gene.id in self.data["reactions"][rxn.id]["genes"]: + if self.data["reactions"][rxn.id]["genes"][gene.id][i] == "1": + new_genes.append(gene) + rxn.gene_reaction_rule = " or ".join([gene.id for gene in new_genes]) + else: + logger.warning("Ensemble model contains reaction not included in ensemble data. Removing reaction "+rxn.id+" from ensemble model.") + remove_reactions.append(rxn) + clone_mdl.remove_reactions(remove_reactions) + if "ATP_analysis" in self.data: + attributes = clone_mdl_utl.get_attributes() + attributes["ATP_analysis"] = {"core_atp_gapfilling":{},"selected_media":{},"tests":{}} + for media in self.data["ATP_analysis"]["core_atp_gapfilling"]: + if self.data["ATP_analysis"]["core_atp_gapfilling"][media][i] != None: + attributes["ATP_analysis"]["core_atp_gapfilling"][media] = self.data["ATP_analysis"]["core_atp_gapfilling"][media][i] + for media in self.data["ATP_analysis"]["selected_media"]: + if self.data["ATP_analysis"]["selected_media"][media][i] != None: + attributes["ATP_analysis"]["selected_media"][media] = self.data["ATP_analysis"]["selected_media"][media][i] + for media in self.data["ATP_analysis"]["tests"]: + if self.data["ATP_analysis"]["tests"][media]["threshold"][i] != None: + attributes["ATP_analysis"]["tests"][media] = { + "objective":self.data["ATP_analysis"]["tests"][media]["objective"], + "threshold":self.data["ATP_analysis"]["tests"]["threshold"][media][i] + } + clone_mdl_utl.save_attributes(attributes) + output_models[i] = clone_mdl_utl + return output_models + + def save_ensemble_model(self): + self.mdlutl.save_attributes(self.data,"ensemble") + return self.mdlutl + + def run_fba(self,media,objective,maximize,gene_ko=[],reaction_ko=[],pfba=True,fva=True): + msfba = MSFBA(self.model,media,objective,maximize,gene_ko,reaction_ko,pfba,fva,clone=True) + msfba.run() + models = self.unpack_models() + #Iterating over each model to run FBA on each + for mdlutl in models: + subfba = MSFBA(mdlutl,media,objective,maximize,gene_ko,reaction_ko,pfba,fva,clone=False) + subfba.run() + msfba.add_secondary_solution(subfba.primary_solution,subfba.fva_results) + return msfba + + def run_atp_method( + self, + core_template=None, + atp_medias=[], + compartment="c0", + max_gapfilling=10, + gapfilling_delta=0, + atp_hydrolysis_id=None, + load_default_medias=True, + forced_media=[], + default_media_path=None, + ): + models = self.unpack_models() + for mdlutl in models: + atpcorrection = MSATPCorrection( + core_template, + atp_medias, + compartment, + max_gapfilling, + gapfilling_delta, + atp_hydrolysis_id, + load_default_medias, + forced_media, + default_media_path + ) + tests = atpcorrection.run_atp_correction() + 
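The ensemble bookkeeping that rebuild_from_models() relies on is easiest to see with a toy presence string; the data below is invented for illustration:

```python
# How MSEnsemble encodes membership: one character per ensemble member,
# "1" if the reaction is present in that member (toy data, not from the PR).
presence = "1101100110"  # reaction present in 6 of 10 member models
probability = presence.count("1") / len(presence)  # -> 0.6

# unpack_models() keeps a reaction in member i only when presence[i] == "1"
member_index = 2
keep_reaction = presence[member_index] == "1"  # False: absent from member 2
```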
self.rebuild_from_models(models)
+
+    def run_gapfilling(self):
+        pass
\ No newline at end of file
diff --git a/modelseedpy/core/msfba.py b/modelseedpy/core/msfba.py
new file mode 100644
index 00000000..2a86f8ee
--- /dev/null
+++ b/modelseedpy/core/msfba.py
@@ -0,0 +1,230 @@
+# -*- coding: utf-8 -*-
+import logging
+import re
+import traceback
+import cobra
+from cobra.flux_analysis import pfba
+from cobra.flux_analysis import flux_variability_analysis
+from modelseedpy.core.msmodelutl import MSModelUtil
+
+logger = logging.getLogger(__name__)
+
+class MSFBA:
+    def __init__(self, model_or_mdlutl, media, objective_reactions={"bio1": 1}, maximize=True, gene_ko=[], reaction_ko=[], pfba=True, fva=True, clone=True, primary_solution=None, id=None):
+        if isinstance(model_or_mdlutl, MSModelUtil):
+            model_or_mdlutl = model_or_mdlutl.model
+        if clone:
+            model_or_mdlutl = cobra.io.json.from_json(cobra.io.json.to_json(model_or_mdlutl))
+        self.model = model_or_mdlutl
+        self.mdlutl = MSModelUtil.get(model_or_mdlutl)
+        self.media = media
+        self.objective_reactions = objective_reactions
+        self.maximize = maximize
+        self.gene_ko = gene_ko
+        self.reaction_ko = reaction_ko
+        self.pkgmgr = self.mdlutl.pkgmgr
+        self.apply_parameters()
+        self.primary_solution = primary_solution
+        self.secondary_solutions = None
+        self.fva = fva
+        self.pfba = pfba
+        self.fva_results = None
+        self.secondary_fva = None
+        if id is None:
+            id = self.mdlutl.model.id + ".fba"
+        self.id = id
+
+    def build_objective(self):
+        sense = "max"
+        if not self.maximize:
+            sense = "min"
+        obj = self.model.problem.Objective(0, direction=sense)
+        objcoef = {}
+        for rxnid in self.objective_reactions:
+            if rxnid in self.model.reactions:
+                rxn = self.model.reactions.get_by_id(rxnid)
+                objcoef[rxn.forward_variable] = self.objective_reactions[rxnid]
+                objcoef[rxn.reverse_variable] = -1 * self.objective_reactions[rxnid]
+            else:
+                logger.warning(f"Objective reaction {rxnid} not found in model")
+        # The objective must be attached to the model before coefficients take effect
+        self.model.objective = obj
+        self.model.objective.set_linear_coefficients(objcoef)
+
+    def apply_parameters(self):
+        self.pkgmgr.getpkg("KBaseMediaPkg").build_package(self.media)
+        for gene in self.gene_ko:
+            if gene in self.model.genes:
+                self.model.genes.get_by_id(gene).knock_out()
+            else:
+                logger.warning(f"KO gene {gene} not found in model")
+        for rxn in self.reaction_ko:
+            if rxn in self.model.reactions:
+                self.model.reactions.get_by_id(rxn).knock_out()
+            else:
+                logger.warning(f"KO reaction {rxn} not found in model")
+
+    def run(self):
+        if self.pfba:
+            self.primary_solution = pfba(self.model)
+        else:
+            self.primary_solution = self.model.optimize()
+        if self.fva:
+            self.fva_results = flux_variability_analysis(self.model)
+
+    def add_secondary_solution(self, solution, fva=None):
+        if self.secondary_solutions is None:
+            self.secondary_solutions = []
+        self.secondary_solutions.append(solution)
+        if fva:
+            if self.secondary_fva is None:
+                self.secondary_fva = []
+            self.secondary_fva.append(fva)
+
+    def get_variable_class(self, variable_min, variable_max):
+        variable_class = "Unknown"
+        if variable_min is None or variable_max is None:
+            return variable_class
+        if variable_min == 0 and variable_max == 0:
+            variable_class = "Blocked"
+        elif variable_min > 0 and variable_max > 0:
+            variable_class = "Positive"
+        elif variable_min >= 0 and variable_max > 0:
+            variable_class = "Positive variable"
+        elif variable_min < 0 and variable_max < 0:
+            variable_class = "Negative"
+        elif variable_min < 0 and variable_max <= 0:
+            variable_class = "Negative variable"
+        else:
+            variable_class = "Variable"
+        return variable_class
+
+    def 
generate_kbase_data(self,fbamodel_ref,media_ref): + output = { + "FBABiomassVariables": [], + "FBACompoundBounds": [], + "FBACompoundVariables": [], + "FBAConstraints": [], + "FBADeletionResults": [], + "FBAMetaboliteProductionResults": [], + "FBAMinimalMediaResults": [], + "FBAMinimalReactionsResults": [], + "FBAPromResults": [], + "FBAReactionBounds": [], + "FBAReactionVariables": [], + "FBATintleResults": [], + "MFALog": "", + "PROMKappa": 1, + "QuantitativeOptimizationSolutions": [], + "__VERSION__": 1, + "additionalCpd_refs": [], + "allReversible": 0, + "biomassRemovals": {}, + "biomassflux_objterms": {"bio1": 1}, + "calculateReactionKnockoutSensitivity": 0, + "comboDeletions": 0, + "compoundflux_objterms": {}, + "decomposeReversibleDrainFlux": 0, + "decomposeReversibleFlux": 0, + "defaultMaxDrainFlux": 0, + "defaultMaxFlux": 1000, + "defaultMinDrainFlux": -1000, + "drainfluxUseVariables": 0, + "fbamodel_ref": fbamodel_ref, + "findMinimalMedia": 0, + "fluxMinimization": 1, + "fluxUseVariables": 0, + "fva": 0, + "gapfillingSolutions": [], + "geneKO_refs": [], + "id": self.id, + "inputfiles": {}, + "maximizeActiveReactions": 0, + "maximizeObjective": 1, + "media_list_refs": [], + "media_ref": media_ref, + "minimizeErrorThermodynamicConstraints": 0, + "minimize_reaction_costs": {}, + "minimize_reactions": 0, + "noErrorThermodynamicConstraints": 0, + "numberOfSolutions": 1, + "objectiveConstraintFraction": 0.1, + "objectiveValue": self.primary_solution.objective_value, + "other_objectives": [], + "outputfiles": {}, + "parameters": { + "Auxotrophy metabolite list": "", + "Beachhead metabolite list": "", + "minimum_target_flux": "0.01", + "save phenotype fluxes": "0", + "suboptimal solutions": "1", + }, + "quantitativeOptimization": 0, + "reactionKO_refs": [], + "reactionflux_objterms": {}, + "simpleThermoConstraints": 0, + "thermodynamicConstraints": 0, + "uptakeLimits": {}, + } + + for rxn in self.model.reactions: + flux = 0 + if rxn.id in self.primary_solution.fluxes: + flux = self.primary_solution.fluxes[rxn.id] + min_flux = rxn.lower_bound + max_flux = rxn.upper_bound + if self.fva_results and rxn.id in self.fva_results: + min_flux, max_flux = self.fva_results[rxn.id] + other_mins= [] + other_maxes = [] + other_fluxes = [] + if self.secondary_solutions: + for sol in self.secondary_solutions: + if rxn.id in sol.fluxes: + other_fluxes.append(sol.fluxes[rxn.id]) + else: + other_fluxes.append(0) + if self.secondary_fva: + othermin = rxn.lower_bound + othermax = rxn.upper_bound + for fva in self.secondary_fva: + if rxn.id in fva: + othermin, othermax = fva[rxn.id] + other_mins.append(othermin) + other_maxes.append(othermax) + variable_class = self.get_variable_class(min_flux, max_flux) + variable_data = { + "class": variable_class, + "lowerBound": rxn.lower_bound, + "max": max_flux, + "min": min_flux, + "upperBound": rxn.upper_bound, + "other_max": other_maxes, + "other_min": other_mins, + "other_values": other_fluxes, + "value": flux, + "variableType": "flux" + } + variable_key = "FBAReactionVariables" + if rxn.id.startswith("EX_"): + lower = variable_data["lowerBound"] + variable_data["lowerBound"] = -1 * variable_data["upperBound"] + variable_data["upperBound"] = -1 * lower + lower = variable_data["min"] + variable_data["min"] = -1 * variable_data["max"] + variable_data["max"] = -1 * lower + variable_data["value"] = -1 * variable_data["value"] + variable_data["variableType"] = "drainflux" + variable_data["modelcompound_ref"] = "~/fbamodel/modelcompounds/id/" + rxn.id[3:] + variable_key = 
"FBACompoundVariables" + elif rxn.id.startswith("bio"): + variable_data["variableType"] = "biomassflux" + variable_data["biomass_ref"] = "~/fbamodel/biomasses/id/" + rxn.id + variable_key = "FBABiomassVariables" + else: + variable_data["modelreaction_ref"] = "~/fbamodel/modelreactions/id/" + rxn.id + variable_data["exp_state"] = "unknown" + variable_data["biomass_dependencies"] = [] + variable_data["coupled_reactions"] = [] + variable_data["expression"] = 0 + variable_data["scaled_exp"] = 0 + output[variable_key].append(variable_data) + return output \ No newline at end of file diff --git a/modelseedpy/core/msfbareport.py b/modelseedpy/core/msfbareport.py new file mode 100644 index 00000000..df5c34bb --- /dev/null +++ b/modelseedpy/core/msfbareport.py @@ -0,0 +1,636 @@ +# -*- coding: utf-8 -*- +import pandas as pd +import logging +import os +import re +import jinja2 +from os.path import dirname +from pandas.io.formats.style import Styler +from modelseedpy.core.msmodelutl import MSModelUtil + +module_path = dirname(os.path.abspath(__file__)) + +logger = logging.getLogger(__name__) +logger.setLevel( + logging.INFO +) # When debugging - set this to INFO then change needed messages below from DEBUG to INFO + + +class MSModelReport: + def __init__(self, model_or_mdlutl): + if isinstance(model_or_mdlutl, MSModelUtil): + self.model = model_or_mdlutl.model + self.modelutl = model_or_mdlutl + else: + self.model = model_or_mdlutl + self.modelutl = MSModelUtil.get(model_or_mdlutl) + + def generate_reports(self, report_path, multi_tab_report_path): + self.build_report(report_path) + self.build_multitab_report(multi_tab_report_path) + + # Helper function to build overview data + def build_overview_data(self): + # Get the number of compartments + number_compartments = len( + set([metabolite.compartment for metabolite in self.model.metabolites]) + ) + + # Extract gapfilling information + core_gapfilling_media = [] + gapfilling_media = [] + gf_sensitivity = self.modelutl.attributes.get("gf_sensitivity", None) + if gf_sensitivity: + for media in gf_sensitivity: + if ( + "bio1" in self.modelutl.attributes["gf_sensitivity"][media] + and "success" + in self.modelutl.attributes["gf_sensitivity"][media]["bio1"] + ): + gapfilling_media.append(media) + if ( + "rxn00062_c0" in self.modelutl.attributes["gf_sensitivity"][media] + and "success" + in self.modelutl.attributes["gf_sensitivity"][media]["rxn00062_c0"] + ): + core_gapfilling_media.append(media) + + # Count the number of gapfills + number_gapfills = len(gapfilling_media) + + # Convert the lists to strings + core_gapfilling_str = ( + "; ".join(core_gapfilling_media) + if core_gapfilling_media + else "No core gapfilling needed." + ) + gapfilling_media_str = ( + "; ".join(gapfilling_media) + if gapfilling_media + else "No genome-scale gapfilling." 
+ ) + + overview = { + "Model ID": self.model.id, + "Full Gapfilling and ATP Analysis Report": "TBD", # You may replace 'TBD' with actual data when available + "Genome Scale Template": self.model.notes.get( + "kbase_template_refs", "Data Not Available" + ), + "Core Gapfilling Media": core_gapfilling_str, + "Gapfilling Media": gapfilling_media_str, + "Source Genome": self.model.notes.get( + "kbase_genome_ref", "Data Not Available" + ), + "Total Number of reactions": self.modelutl.nonexchange_reaction_count(), + "Number compounds": len(self.model.metabolites), + "Number compartments": number_compartments, + "Number biomass": len( + [ + rxn + for rxn in self.model.reactions + if rxn.annotation.get("sbo") == "SBO:0000629" + ] + ), + "Number gapfills": number_gapfills, + } + return overview + + # Helper function for extracting gapfilling data + def extract_gapfilling_data(self, gf_sensitivity): + if gf_sensitivity is None: + return [], {} + + gapfilling_dict = {} + gapfilling_summary = {} + + for media, media_data in gf_sensitivity.items(): + for target, target_data in media_data.items(): + gf_data = target_data.get("success", {}) + if isinstance(gf_data, dict): + for reaction_id, reaction_data in gf_data.items(): + for direction, metabolites in reaction_data.items(): + # If metabolites is None, set to empty string + if metabolites is None: + metabolites = "" + + # Extract both IDs and Names for Gapfilling Sensitivity + sensitivity_ids = [] + sensitivity_names = [] + if isinstance(metabolites, (list, tuple)): + for met_id in metabolites: + sensitivity_ids.append(met_id) + met_name = ( + self.model.metabolites.get_by_id(met_id).name + if met_id in self.model.metabolites + else met_id + ) + sensitivity_names.append(met_name) + else: + metabolites = str(metabolites) + entry = { + "reaction_id": reaction_id, + "reaction_name": self.model.reactions.get_by_id( + reaction_id + ).name + if reaction_id in self.model.reactions + else reaction_id, + "media": media, + "direction": direction, + "target": target, + "gapfilling_sensitivity_id": "; ".join(sensitivity_ids) + if sensitivity_ids + else metabolites, + "gapfilling_sensitivity_name": "; ".join( + sensitivity_names + ) + if sensitivity_names + else metabolites, + } + + # Update the summary dictionary + if reaction_id not in gapfilling_summary: + gapfilling_summary[reaction_id] = [] + gapfilling_summary[reaction_id].append( + f"{media}: {direction}" + ) + + # Check if reaction_id is already in dictionary + if reaction_id in gapfilling_dict: + # Update the media + existing_entry = gapfilling_dict[reaction_id] + existing_media = existing_entry["media"].split("; ") + if media not in existing_media: + existing_media.append(media) + existing_entry["media"] = "; ".join(existing_media) + else: + gapfilling_dict[reaction_id] = entry + + return list(gapfilling_dict.values()), gapfilling_summary + + # transform data to be used in tabular format to use in build_model_report + def transform_gapfilling_data(self, gapfilling_data): + transformed_data = [] + for entry in gapfilling_data: + row = [ + entry["reaction_id"], + entry["reaction_name"], + entry["media"], + entry["direction"], + entry["target"], + entry["gapfilling_sensitivity_id"], + entry["gapfilling_sensitivity_name"], + ] + transformed_data.append(row) + return transformed_data + + # Extract ATP analysis data + def extract_atp_analysis_data(self, atp_analysis, atp_expansion_filter): + entries = [] + if atp_analysis and "core_atp_gapfilling" in atp_analysis: + for media, data in 
atp_analysis["core_atp_gapfilling"].items(): + score = data.get("score", None) + new_reactions = [ + "{}: {}".format(k, v) for k, v in data.get("new", {}).items() + ] + reversed_reactions = [ + "{}: {}".format(k, v) for k, v in data.get("reversed", {}).items() + ] + atp_production = "Not integrated" + if ( + "selected_media" in atp_analysis + and media in atp_analysis["selected_media"] + ): + atp_production = atp_analysis["selected_media"][media] + + # Extracting the "Filtered Reactions" in the required format + filtered_reactions = [] + for k, v in atp_expansion_filter.get(media, {}).items(): + if isinstance(v, dict): + for sub_k, sub_v in v.items(): + if isinstance(sub_v, dict): + for reaction, direction_dict in sub_v.items(): + direction = list(direction_dict.keys())[0] + filtered_reactions.append( + f"{reaction}: {direction}" + ) + filtered_reactions_str = "; ".join(filtered_reactions) + + if score is not None: + entries.append( + { + "media": media, + "no_of_gapfilled_reactions": score, + "atp_production": atp_production, + "gapfilled_reactions": "; ".join(new_reactions), + "reversed_reaction_by_gapfilling": "; ".join( + reversed_reactions + ), + "filtered_reactions": filtered_reactions_str, + } + ) + # Sorting the entries based on the 'no_of_gapfilled_reactions' column + entries.sort(key=lambda x: x["no_of_gapfilled_reactions"]) + return entries + + # Extract ATP production data for the ATP Analysis tab + def extract_atp_production_data(self, atp_analysis): + atp_production_dict = {} + if atp_analysis: + selected_media = atp_analysis.get("selected_media", {}) + core_atp_gapfilling = atp_analysis.get("core_atp_gapfilling", {}) + + # First, process selected_media + for media, value in selected_media.items(): + atp_production_dict[media] = round(value, 2) + + # Next, process core_atp_gapfilling for media not in selected_media + for media, data in core_atp_gapfilling.items(): + if media not in atp_production_dict: + if data.get("failed"): + atp_production_dict[media] = "failed" + else: + # If the media was not processed in selected_media and it's not failed, set as 'Not Integrated' + atp_production_dict[media] = "Not Integrated" + + return atp_production_dict + + def build_multitab_report(self, output_path): + + # Build overview data + overview_data = self.build_overview_data() + + # Get gf_sensitivity attribute from the model + gf_sensitivity = self.modelutl.attributes.get("gf_sensitivity", None) + + # Extract gapfilling data + gapfilling_entries, gapfilling_reaction_summary = self.extract_gapfilling_data( + gf_sensitivity + ) + + # Check if ATP_analysis attribute is present in the model + atp_analysis = self.modelutl.attributes.get("ATP_analysis", None) + if atp_analysis: + atp_expansion_filter = self.modelutl.attributes.get( + "atp_expansion_filter", {} + ) + atp_analysis_entries = self.extract_atp_analysis_data( + atp_analysis, atp_expansion_filter + ) + else: + atp_analysis_entries = [] + + # Initialize context dictionary + context = { + "overview": overview_data, + "reactions": [], + "compounds": [], + "genes": [], + "biomass": [], + "gapfilling": gapfilling_entries, # Populated with gapfilling data + "atpanalysis": atp_analysis_entries, # Populated with ATP analysis data + } + + print("Module Path:", module_path + "/../data/") + + exchanges = {r.id for r in self.model.exchanges} + + # Identify biomass reactions using SBO annotation + biomass_reactions_ids = { + rxn.id + for rxn in self.model.reactions + if rxn.annotation.get("sbo") == "SBO:0000629" + } + + # Reactions Tab + for 
rxn in self.model.reactions: + if rxn.id not in exchanges and rxn.id not in biomass_reactions_ids: + equation = rxn.build_reaction_string(use_metabolite_names=True) + rxn_data = { + "id": rxn.id, + "name": rxn.name, + "equation": equation, + "genes": rxn.gene_reaction_rule, + "gapfilling": "; ".join( + gapfilling_reaction_summary.get(rxn.id, []) + ), # Empty list results in an empty string + } + context["reactions"].append(rxn_data) + + # Compounds Tab + for cpd in self.model.metabolites: + cpd_data = { + "id": cpd.id, + "name": cpd.name, + "formula": cpd.formula, + "charge": cpd.charge, + "compartment": cpd.compartment, + } + context["compounds"].append(cpd_data) + + # Genes Tab + for gene in self.model.genes: + gene_data = { + "gene": gene.id, + "reactions": "; ".join([rxn.id for rxn in gene.reactions]), + } + context["genes"].append(gene_data) + + # Biomass Tab + if biomass_reactions_ids: + for biomass_rxn_id in biomass_reactions_ids: + biomass_rxn = self.model.reactions.get_by_id(biomass_rxn_id) + for metabolite, coefficient in biomass_rxn.metabolites.items(): + compound_id = metabolite.id + compound_name = metabolite.name.split("_")[0] + compartment = compound_id.split("_")[-1] + + biomass_data = { + "biomass_reaction_id": biomass_rxn.id, + "biomass_compound_id": compound_id, + "name": compound_name, + "coefficient": coefficient, + "compartment": compartment, + } + context["biomass"].append(biomass_data) + else: + print("No biomass reactions found in the model.") + + # Gapfilling Tab + gf_sensitivity = self.modelutl.attributes.get("gf_sensitivity", None) + gapfilling_data = self.extract_gapfilling_data(gf_sensitivity) + context["gapfilling"] = gapfilling_entries + + # Extract ATP Production Data + atp_production_data = self.extract_atp_production_data(atp_analysis) + + # Populate the 'atpanalysis' context with ATP production data + for entry in context["atpanalysis"]: + media = entry["media"] + entry["atp_production"] = atp_production_data.get(media, None) + + # Diagnostics + unique_biomass_rxns = biomass_reactions_ids + print(f"Unique biomass reactions identified: {len(unique_biomass_rxns)}") + print(f"Biomass Reaction IDs: {', '.join(unique_biomass_rxns)}") + + print("\nFirst 2 reactions:") + for rxn in context["reactions"][:2]: + print(rxn) + + print("\nFirst 2 compounds:") + for cpd in context["compounds"][:2]: + print(cpd) + + print("\nFirst 2 genes:") + for gene in context["genes"][:2]: + print(gene) + + print("\nFirst 2 biomass compounds:") + for bm in context["biomass"][:2]: + print(bm) + + print("\nFirst 2 gapfilling entries:") + for gf in context["gapfilling"][:2]: + print(gf) + + print("\nFirst 2 ATP Analysis entries:") + for entry in context["atpanalysis"][:2]: + print(entry) + + # Render with template + env = jinja2.Environment( + loader=jinja2.FileSystemLoader(module_path + "/../data/"), + autoescape=jinja2.select_autoescape(["html", "xml"]), + ) + html = env.get_template("ModelReportTemplate.html").render(context) + directory = dirname(output_path) + os.makedirs(directory, exist_ok=True) + with open(output_path, "w") as f: + f.write(html) + + def build_report(self, output_path): + """Builds model HTML report for the Model Summary table + Parameters + ---------- + model : cobra.Model + Model to use to build the report + """ + + # 1. Utilize the build_overview_data method + model_summary_data = self.build_overview_data() + # Remove the unwanted entry + model_summary_data.pop("Full Gapfilling and ATP Analysis Report", None) + # 2. 
Transform the dictionary into a list of tuples + model_summary_list = [(key, value) for key, value in model_summary_data.items()] + # 3. Convert to DataFrame + model_summary_df = pd.DataFrame(model_summary_list, columns=["", ""]) + + # Style the DataFrame (as was done previously) + model_summary_df_styled = model_summary_df.style.hide( + axis="index" + ).set_table_styles( + [ + { + "selector": "th", + "props": [ + ("border", "none"), + ("background-color", "white"), + ("font-family", "Oxygen"), + ("font-size", "14px"), + ("line-height", "20px"), + ], + }, + { + "selector": "td", + "props": [ + ("border", "none"), + ("font-family", "Oxygen"), + ("font-size", "14px"), + ("line-height", "20px"), + ], + }, + { + "selector": "tr:nth-child(even)", + "props": [("background-color", "white")], + }, + { + "selector": "tr:nth-child(odd)", + "props": [("background-color", "#f2f2f2")], + }, + ] + ) + + # Fetching the gapfilling sensitivity data + gf_sensitivity = self.modelutl.attributes.get("gf_sensitivity", None) + gapfilling_data = self.extract_gapfilling_data(gf_sensitivity) + gapfilling_list = self.transform_gapfilling_data(gapfilling_data[0]) + + # Convert the gapfilling_list to a DataFrame + gapfillings_analysis_df = pd.DataFrame( + gapfilling_list, + columns=[ + "Reaction ID", + "Reaction Name", + "Media", + "Direction", + "Target", + "Gapfilling Sensitivity ID", + "Gapfilling Sensitivity Name", + ], + ) + + # Apply style to Gapfillings Analysis DataFrame + gapfillings_analysis_df_styled = gapfillings_analysis_df.style.hide( + axis="index" + ).set_table_styles( + [ + { + "selector": "th", + "props": [ + ("border", "none"), + ("background-color", "white"), + ("font-family", "Oxygen"), + ("font-size", "14px"), + ("line-height", "20px"), + ], + }, + { + "selector": "td", + "props": [ + ("border", "none"), + ("font-family", "Oxygen"), + ("font-size", "14px"), + ("line-height", "20px"), + ], + }, + { + "selector": "tr:nth-child(even)", + "props": [("background-color", "white")], + }, + { + "selector": "tr:nth-child(odd)", + "props": [("background-color", "#f2f2f2")], + }, + ] + ) + + # Legend for Gapfillings Analysis + annotations_text_gapfillings = """ +
  • Reaction ID: The identifier of the reaction.
  • Reaction Name: The name of the reaction.
  • Media: The media used by gapfilling.
  • Direction: The direction of the reaction. Can be ">" for forward, "<" for reverse, or "=" for both directions.
  • Target: The reaction selected as the objective function target for the gapfilling optimization problem. Targets here can be the model's biomass reaction, commonly named "bio1" for models created by this app. Alternatively, the "rxn00062" (ATP Production) reaction is shown for cases where gapfilling was applied to guarantee ATP production in a given media. When reactions are gapfilled for ATP production, we recommend checking the full Core ATP Analysis in the table below.
  • Gapfilling Sensitivity ID and Name: Gapfilling is necessary when compounds in the biomass objective function cannot be produced by the model. For each reaction, we list the biomass compound(s) that cannot be synthesized by the model without gapfilling. In cases where gapfilling fails, there are two possible scenarios: 1) FBF (failed before filtering): the gapfilling immediately failed, even before the ATP-breaking reactions were filtered out, meaning this objective CANNOT be satisfied with the entire current database; 2) FAF (failed after filtering): the gapfilling succeeded before filtering but failed after the reactions that break ATP were filtered out, which tells you definitively that the ATP filtering caused the gapfilling to fail.
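To make the last two legend entries concrete, here is a minimal sketch of the gf_sensitivity model attribute that this report traverses. The media IDs, reaction IDs, and compound IDs below are hypothetical placeholders; only the nesting (media, then target, then status, then reaction, then direction) mirrors the extraction and test code in this diff.

    # Sketch of the gf_sensitivity attribute layout (hypothetical values).
    # media ID -> target reaction -> status -> gapfilled reaction -> direction
    # -> biomass compounds that the reaction makes producible.
    gf_sensitivity = {
        "Carbon-D-Glucose": {
            "bio1": {
                "success": {
                    "rxn05459_c0": {">": ["cpd00065_c0"]},
                },
            },
        },
        "Sulfate-Hydrogen": {
            "rxn00062_c0": {
                # failed before filtering; value holds the output of
                # find_unproducible_biomass_compounds for this target
                "FBF": {},
            },
        },
    }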
+ """ + + # Extract ATP analysis data + atp_analysis = self.modelutl.attributes.get("ATP_analysis", None) + atp_expansion_filter = self.modelutl.attributes.get("atp_expansion_filter", {}) + atp_analysis_entries = self.extract_atp_analysis_data( + atp_analysis, atp_expansion_filter + ) + + # Convert the atp_analysis_entries list to a DataFrame + atp_analysis_df = pd.DataFrame(atp_analysis_entries) + + # Apply style to ATP Analysis DataFrame + atp_analysis_df_styled = atp_analysis_df.style.hide( + axis="index" + ).set_table_styles( + [ + { + "selector": "th", + "props": [ + ("border", "none"), + ("background-color", "white"), + ("font-family", "Oxygen"), + ("font-size", "14px"), + ("line-height", "20px"), + ], + }, + { + "selector": "td", + "props": [ + ("border", "none"), + ("font-family", "Oxygen"), + ("font-size", "14px"), + ("line-height", "20px"), + ], + }, + { + "selector": "tr:nth-child(even)", + "props": [("background-color", "white")], + }, + { + "selector": "tr:nth-child(odd)", + "props": [("background-color", "#f2f2f2")], + }, + ] + ) + + # Legend for ATP Analysis + annotations_text_atp_analysis = """ +
  • No. of gapfilled reactions: The number of reactions added by the gapfilling process.
  • Media: The media in which the reaction takes place.
  • ATP Production: ATP production by the core metabolism model.
  • Gapfilled Reactions: Reactions added during the gapfilling process.
  • Reversed Reactions by Gapfilling: Reactions that were reversed during the gapfilling process.
  • Filtered Reactions: Reactions that were filtered out during the analysis. When adding a reaction would lead to a large increase in ATP production or to an infinite energy loop, we filter that reaction out of the gapfilling database and prevent it from being added to the model.
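For orientation, this is a minimal sketch of the ATP_analysis model attribute from which the columns above are extracted. All media IDs, reaction IDs, and numbers are hypothetical placeholders; only the key structure (selected_media plus core_atp_gapfilling entries with score/new/reversed/failed) mirrors the extraction code in this diff.

    # Sketch of the ATP_analysis attribute (hypothetical values).
    atp_analysis = {
        "selected_media": {
            "Glc/O2": 17.5,  # integrated media -> ATP production value
        },
        "core_atp_gapfilling": {
            "Glc/O2": {"score": 0, "new": {}, "reversed": {}},
            "Ac/O2": {
                "score": 2,                        # no. of gapfilled reactions
                "new": {"rxn05488_c0": ">"},       # gapfilled reactions
                "reversed": {"rxn00459_c0": "<"},  # reversed reactions
            },
            "SO4/H2": {"failed": True},  # reported as "failed" ATP production
        },
    }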
+ """ + + # ATP analysis explanation text + explanation_text_atp_analysis = """ +

During model reconstruction, we analyze the genome's core metabolism draft model (the model without gapfilling) to assess its energy biosynthesis capabilities. The goal of this analysis is to ensure that the core metabolism model is able to produce ATP before we expand the model to the genome scale. This step is designed to prevent gapfilling from introducing reactions that create energy-generating loops. The tests are conducted on a large collection of minimal conditions, with the goal of simulating the model's capability to produce energy with different electron donor, electron acceptor, and carbon source combinations.

When the draft core metabolism model is capable of producing ATP in at least one of the test media, no gapfilling reactions from this analysis are added to the model. We still report the gapfilling requirements for the test media formulations that fail to produce ATP with the draft core model, but we only integrate these solutions into the model when no test media succeeds in producing ATP. In that case, the integrated gapfilling solution(s) will be displayed in the "Gapfilling Analysis" table above, with the "rxn00062" (ATP Production) reaction as the "Target" objective function.

The goal is to display the test results for all media to provide clues about the metabolic capabilities of the genome(s). When many reactions are required for growth on the SO4 testing media conditions, this can be a good indicator that the organism is not capable of performing sulfate reduction. On the other hand, when only one gapfilled reaction is required for ATP production in a given media, multiple scenarios can be considered: 1) the organism cannot grow on the test condition, and we correctly did not add the reaction to the model; 2) the source genome annotation may be missing a specific gene function; 3) there may be an issue with the model reconstruction database. We hope this data helps you make more informed decisions about reactions that may need to be manually curated in the model. In cases where it is known from the literature or from unpublished experimental results that an organism is capable of producing ATP in a given media condition that requires gapfilling in this analysis, you can use the "Force ATP media" parameter in the reconstruction app to ensure those reactions are integrated into the model.

+ """ + + # Save the data to HTML with the styled DataFrames and the legends + directory = os.path.dirname(output_path) + os.makedirs(directory, exist_ok=True) + with open(output_path, "w", encoding="utf-8") as f: + f.write('') + f.write("

Model Summary

") + f.write(model_summary_df_styled.render(escape=False)) + f.write("

") + f.write("

Gapfillings Analysis

") + + # Check for Gapfillings Analysis data + if not gapfillings_analysis_df.empty: + f.write(gapfillings_analysis_df_styled.render(escape=False)) + f.write(f"

Legend:

{annotations_text_gapfillings}") + else: + f.write( + "

Warning: No Gapfillings Analysis data available for this model.

" + ) + + f.write("

Core ATP Analysis

") + + # Check for ATP Analysis data + if not atp_analysis_df.empty: + f.write(atp_analysis_df_styled.render(escape=False)) + f.write(f"

Legend:

{annotations_text_atp_analysis}") + f.write(explanation_text_atp_analysis) + else: + f.write( + "

Warning: No Core ATP Analysis data available for this model.
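For orientation, here is a hypothetical usage sketch of the new MSModelReport class; the model file path and output paths are placeholders, and the sketch assumes a model that already carries the gf_sensitivity and ATP_analysis attributes referenced above.

    # Hypothetical usage sketch of MSModelReport (paths are placeholders).
    import cobra
    from modelseedpy.core.msfbareport import MSModelReport

    model = cobra.io.load_json_model("my_model.json")  # placeholder model
    report = MSModelReport(model)
    report.build_report("reports/summary.html")               # single-page summary
    report.build_multitab_report("reports/full_report.html")  # multi-tab report
    # or both at once:
    # report.generate_reports("reports/summary.html", "reports/full_report.html")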

" + ) diff --git a/modelseedpy/core/msgapfill.py b/modelseedpy/core/msgapfill.py old mode 100644 new mode 100755 index 6544d74c..221cd3cb --- a/modelseedpy/core/msgapfill.py +++ b/modelseedpy/core/msgapfill.py @@ -1,66 +1,96 @@ +#!/usr/bin/python # -*- coding: utf-8 -*- import logging -import itertools # !!! the import is never used - -logger = logging.getLogger(__name__) - import cobra import re +import json +import numpy as np +import pandas as pd +from optlang.symbolics import Zero, add from modelseedpy.core import FBAHelper # !!! the import is never used from modelseedpy.fbapkg.mspackagemanager import MSPackageManager from modelseedpy.core.msmodelutl import MSModelUtil -from modelseedpy.fbapkg.gapfillingpkg import default_blacklist from modelseedpy.core.exceptions import GapfillingError +from collections import defaultdict + + +logger = logging.getLogger(__name__) +logger.setLevel( + logging.INFO # WARNING +) # When debugging - set this to INFO then change needed messages below from DEBUG to INFO class MSGapfill: + @staticmethod + def gapfill_count(solution): + total = 0 + if "new" in solution: + total += len(solution["new"]) + if "reversed" in solution: + total += len(solution["reversed"]) + return total + def __init__( self, - model, + model_or_mdlutl, default_gapfill_templates=[], default_gapfill_models=[], test_conditions=[], reaction_scores={}, blacklist=[], + atp_gapfilling=False, + minimum_obj=0.01, + default_excretion=100, + default_uptake=0, + default_target=None, + base_media = None, + base_media_target_element = "C" ): - if isinstance(model, MSModelUtil): - self.model = model.model - self.modelutl = model + # Discerning input is model or mdlutl and setting internal links + if isinstance(model_or_mdlutl, MSModelUtil): + self.model = model_or_mdlutl.model + self.mdlutl = model_or_mdlutl else: - self.model = model - self.modelutl = MSModelUtil(model) + self.model = model_or_mdlutl + self.mdlutl = MSModelUtil.get(model_or_mdlutl) + # Setting gapfilling attribute in model utl so link is bidirectional + if not atp_gapfilling: + self.mdlutl.gfutl = self self.auto_sink = [ + "cpd01042", "cpd02701", "cpd11416", "cpd15302", + "cpd03091", ] # the cpd11416 compound is filtered during model extension with templates - self.gfmodel = self.lp_filename = self.last_solution = None + # Cloning model to create gapfilling model + self.gfmodel = cobra.io.json.from_json(cobra.io.json.to_json(self.model)) + self.gfmodelutl = MSModelUtil.get(self.gfmodel) + # Getting package manager for gapfilling model + self.gfpkgmgr = MSPackageManager.get_pkg_mgr(self.gfmodelutl) + # Setting target from input + if default_target: + self.default_target = default_target + self.gfmodel.objective = self.gfmodel.problem.Objective( + self.gfmodel.reactions.get_by_id(default_target).flux_expression, + direction="max", + ) + # Setting parameters for gapfilling + self.lp_filename = self.last_solution = None self.model_penalty = 1 + self.default_minimum_objective = minimum_obj self.default_gapfill_models = default_gapfill_models self.default_gapfill_templates = default_gapfill_templates self.gapfill_templates_by_index, self.gapfill_models_by_index = {}, {} self.gapfill_all_indecies_with_default_templates = True self.gapfill_all_indecies_with_default_models = True - self.blacklist = list(set(default_blacklist + blacklist)) + self.blacklist = list(set(blacklist)) self.test_condition_iteration_limit = 10 self.test_conditions = test_conditions self.reaction_scores = reaction_scores - - def run_gapfilling( - self, - media=None, 
- target=None, - minimum_obj=0.01, - binary_check=False, - prefilter=True, - ): - if target: - self.model.objective = self.model.problem.Objective( - self.model.reactions.get_by_id(target).flux_expression, direction="max" - ) - self.gfmodel = cobra.io.json.from_json(cobra.io.json.to_json(self.model)) - pkgmgr = MSPackageManager.get_pkg_mgr(self.gfmodel) - pkgmgr.getpkg("GapfillingPkg").build_package( + self.cumulative_gapfilling = [] + # Building gapfilling package + self.gfpkgmgr.getpkg("GapfillingPkg").build_package( { "auto_sink": self.auto_sink, "model_penalty": self.model_penalty, @@ -70,84 +100,734 @@ def run_gapfilling( "gapfill_models_by_index": self.gapfill_models_by_index, "gapfill_all_indecies_with_default_templates": self.gapfill_all_indecies_with_default_templates, "gapfill_all_indecies_with_default_models": self.gapfill_all_indecies_with_default_models, - "default_excretion": 100, - "default_uptake": 100, + "default_excretion": default_excretion, + "default_uptake": default_uptake, "minimum_obj": minimum_obj, "blacklist": self.blacklist, "reaction_scores": self.reaction_scores, "set_objective": 1, + "base_media": base_media, + "base_media_target_element":base_media_target_element } ) - pkgmgr.getpkg("KBaseMediaPkg").build_package(media) - # Filtering breaking reactions out of the database - if prefilter and self.test_conditions: - pkgmgr.getpkg("GapfillingPkg").filter_database_based_on_tests( - self.test_conditions + def test_gapfill_database(self, media, target=None, before_filtering=True): + # Testing if gapfilling can work before filtering + if target: + self.gfpkgmgr.getpkg("GapfillingPkg").set_base_objective(target,None) + else: + target = str(self.gfmodel.objective) + target = target.split(" ")[0] + target = target[13:] + #Setting media + self.gfpkgmgr.getpkg("KBaseMediaPkg").build_package(media) + if self.gfpkgmgr.getpkg("GapfillingPkg").test_gapfill_database(): + return True + if self.gfpkgmgr.getpkg("GapfillingPkg").test_solution.status == 'infeasible': + return False + gf_sensitivity = {} + if target != "rxn00062_c0": + gf_sensitivity = self.mdlutl.get_attributes("gf_sensitivity", {}) + if media.id not in gf_sensitivity: + gf_sensitivity[media.id] = {} + if target not in gf_sensitivity[media.id]: + gf_sensitivity[media.id][target] = {} + filter_msg = " " + note = "FAF" + if before_filtering: + filter_msg = " before filtering " + note = "FBF" + gf_sensitivity[media.id][target][ + note + ] = self.mdlutl.find_unproducible_biomass_compounds(target) + if target != "rxn00062_c0": + self.mdlutl.save_attributes(gf_sensitivity, "gf_sensitivity") + logger.warning( + "No gapfilling solution found" + + filter_msg + + "for " + + media.id + + " activating " + + target + ) + return False + + def prefilter(self,test_conditions=None,growth_conditions=[],use_prior_filtering=False,base_filter_only=False): + """Prefilters the database by removing any reactions that break specified ATP tests + Parameters + ---------- + test_conditions : [] + List of conditions to be tested when filtering the gapfilling database. 
If not specified, the test_conditions attribute will be used + """ + if not test_conditions: + test_conditions = self.test_conditions + if self.test_conditions: + logger.debug(f"PREFILTERING WITH {str(len(growth_conditions))} GROWTH CONDITIONS") + base_filter = None + if use_prior_filtering: + base_filter = self.mdlutl.get_attributes("gf_filter", {}) + self.gfpkgmgr.getpkg("GapfillingPkg").filter_database_based_on_tests( + self.test_conditions, + growth_conditions=growth_conditions, + base_filter=base_filter, + base_filter_only=base_filter_only + ) + gf_filter = self.gfpkgmgr.getpkg("GapfillingPkg").modelutl.get_attributes( + "gf_filter", {} ) + base_filter = self.mdlutl.get_attributes("gf_filter", {}) + for media_id in gf_filter: + base_filter[media_id] = gf_filter[media_id] + + def run_gapfilling( + self, + media=None, + target=None, + minimum_obj=None, + binary_check=False, + prefilter=True, + ): + """Run gapfilling on a single media condition to force the model to achieve a nonzero specified objective + Parameters + ---------- + media : MSMedia + Media in which the model should be gapfilled + target : string + Name or expression describing the reaction or combination of reactions to the optimized + minimum_obj : double + Value to use for the minimal objective threshold that the model must be gapfilled to achieve + binary_check : bool + Indicates if the solution should be checked to ensure it is minimal in the number of reactions involved + prefilter : bool + Indicates if the gapfilling database should be prefiltered using the tests provided in the MSGapfill constructor before running gapfilling + """ + # Setting target and media if specified + if not target: + target = self.default_target + if not minimum_obj: + minimum_obj = self.default_minimum_objective + self.gfpkgmgr.getpkg("GapfillingPkg").set_base_objective(target,minimum_obj) + if media: + self.gfpkgmgr.getpkg("GapfillingPkg").set_media(media) + + # Testing if gapfilling can work before filtering + if not self.test_gapfill_database(media,target,before_filtering=prefilter): + return None + + # Filtering + if prefilter: + self.prefilter(growth_conditions=[{ + "media": media, + "is_max_threshold": False, + "threshold": minimum_obj, + "objective": target, + }]) + if not self.test_gapfill_database(media,target,before_filtering=False): + return None + # Printing the gapfilling LP file if self.lp_filename: - with open(self.lp_filename, "w") as out: - out.write(str(self.gfmodel.solver)) + pass + #with open(self.lp_filename, "w") as out: + # out.write(str(self.gfmodel.solver)) + + # Running gapfil/ling and checking solution sol = self.gfmodel.optimize() logger.debug( - "gapfill solution objective value %f (%s) for media %s", - sol.objective_value, - sol.status, - media, + f"gapfill solution objective value {sol.objective_value} ({sol.status}) for media {media}" ) - if sol.status != "optimal": - logger.debug("No solution found for %s", media) + logger.warning("No solution found for %s", media) return None - self.last_solution = pkgmgr.getpkg("GapfillingPkg").compute_gapfilled_solution() + # Computing solution and ensuring all tests still pass + self.last_solution = self.gfpkgmgr.getpkg( + "GapfillingPkg" + ).compute_gapfilled_solution() if self.test_conditions: - self.last_solution = pkgmgr.getpkg("GapfillingPkg").run_test_conditions( + self.last_solution = self.gfpkgmgr.getpkg( + "GapfillingPkg" + ).run_test_conditions( self.test_conditions, self.last_solution, self.test_condition_iteration_limit, ) if self.last_solution is None: - 
logger.debug( - "No solution could be found that satisfied all \ - specified test conditions in specified iterations!" + logger.warning( + "no solution could be found that satisfied all specified test conditions in specified iterations!" ) return None + + # Running binary check to reduce solution to minimal reaction solution if binary_check: - return pkgmgr.getpkg("GapfillingPkg").binary_check_gapfilling_solution() + self.last_solution = self.gfpkgmgr.getpkg( + "GapfillingPkg" + ).binary_check_gapfilling_solution() + + # Setting last solution data + self.last_solution["media"] = media + self.last_solution["target"] = target + self.last_solution["minobjective"] = minimum_obj + self.last_solution["binary_check"] = binary_check return self.last_solution + + def run_global_gapfilling( + self, + medias, + targets, + thresholds, + binary_check=False, + prefilter=True, + ): + """Run gapfilling on a single media condition to force the model to achieve a nonzero specified objective + Parameters + ---------- + medias : [MSMedia] + Media in which the model should be gapfilled + targets : [string] + Name or expression describing the reaction or combination of reactions to the optimized + thresholds : [double] + Value to use for the minimal objective threshold that the model must be gapfilled to achieve + binary_check : bool + Indicates if the solution should be checked to ensure it is minimal in the number of reactions involved + prefilter : bool + Indicates if the gapfilling database should be prefiltered using the tests provided in the MSGapfill constructor before running gapfilling + check_for_growth : bool + Indicates if the model should be checked to ensure that the resulting gapfilling solution produces a nonzero objective + """ + # Testing if gapfilling can work before filtering + final_media = [] + final_targets = [] + final_thresholds = [] + growth_conditions = [] + for i,media in enumerate(medias): + if self.test_gapfill_database(media,targets[i],before_filtering=True): + final_media.append(media) + final_targets.append(targets[i]) + final_thresholds.append(thresholds[i]) + growth_conditions.append({ + "media": media, + "is_max_threshold": False, + "threshold": thresholds[i], + "objective": targets[i], + }) + # Filtering + if prefilter: + self.prefilter(growth_conditions=growth_conditions) + medias = [] + targets = [] + thresholds = [] + for i,media in enumerate(final_media): + if self.test_gapfill_database(media,final_targets[i],before_filtering=True): + medias.append(media) + targets.append(targets[i]) + thresholds.append(thresholds[i]) + #If none of the media conditions can be gapfilled, then return None + if len(medias) == 0: + return None + #Instantiating all models to be merged + merged_model = None + model_list = [] + pkgmgrs = {} + for i,media in enumerate(medias): + model_cpy = self.gfmodel.copy() + pkgmgrs[model_cpy] = MSPackageManager.get_pkg_mgr(model_cpy) + #Creating max flux variables + pkgmgrs[model_cpy].getpkg("GapfillingPkg").create_max_flux_variables() + #Setting the objective + pkgmgrs[model_cpy].getpkg("GapfillingPkg").set_base_objective(targets[i],thresholds[i]) + #Setting the media + pkgmgrs[model_cpy].getpkg("GapfillingPkg").set_media(media) + if i == 0: + merged_model = model_cpy + else: + model_list.append(model_cpy) + #Merging all models + gfpkg = pkgmgrs[merged_model].getpkg("GapfillingPkg") + pkgmgrs[merged_model].getpkg("ProblemReplicationPkg").build_package({ + "models":model_list, + "shared_variable_packages":{ + gfpkg : ["rmaxf","fmaxf"] + } + }) + #Setting 
the objective + reaction_objective = merged_model.problem.Objective(Zero, direction="min") + obj_coef = dict() + for reaction in merged_model.reactions: + if reaction.id in gfpkg.gapfilling_penalties: + if reaction.id[0:3] != "EX_": + if "reverse" in gfpkg.gapfilling_penalties[reaction.id]: + if reaction.id in gfpkg.maxflux_variables: + if "reverse" in gfpkg.maxflux_variables[reaction.id]: + obj_coef[gfpkg.maxflux_variables[reaction.id]["reverse"]] = abs( + gfpkg.gapfilling_penalties[reaction.id]["reverse"] + ) + if "forward" in gfpkg.gapfilling_penalties[reaction.id]: + if reaction.id in gfpkg.maxflux_variables: + if "forward" in gfpkg.maxflux_variables[reaction.id]: + obj_coef[gfpkg.maxflux_variables[reaction.id]["forward"]] = abs( + gfpkg.gapfilling_penalties[reaction.id]["forward"] + ) + merged_model.objective = reaction_objective + reaction_objective.set_linear_coefficients(obj_coef) + gfpkg.parameters["gfobj"] = self.model.objective + + # Printing the gapfilling LP file + if self.lp_filename: + pass + #with open(self.lp_filename, "w") as out: + # out.write(str(merged_model.solver)) - def integrate_gapfill_solution(self, solution): - for rxn_id in solution["reversed"]: - rxn = self.model.reactions.get_by_id(rxn_id) - if solution["reversed"][rxn_id] == ">": - rxn.upper_bound = 100 + # Running gapfilling and checking solution + sol = merged_model.optimize() + logger.debug( + f"gapfill solution objective value {sol.objective_value} ({sol.status}) for media {media}" + ) + if sol.status != "optimal": + logger.warning("No solution found for %s", media) + return None + + # Computing solution and ensuring all tests still pass + self.last_solution = {"new":{},"reversed":{},"media":medias[0],"target":targets[0],"minobjective":thresholds[0],"binary_check":False} + flux_values = {} + for rxn in self.model.reactions: + flux_values[rxn.id] = { + "reverse": self.gfpkgmgr.getpkg("GapfillingPkg").maxflux_variables[reaction.id]["reverse"].primal, + "forward": self.gfpkgmgr.getpkg("GapfillingPkg").maxflux_variables[reaction.id]["forward"].primal + } + self.gfpkgmgr.getpkg("GapfillingPkg").compute_gapfilled_solution(flux_values) + return self.last_solution + + def run_multi_gapfill( + self, + media_list, + target=None, + target_hash={}, + minimum_objectives={}, + default_minimum_objective=None, + binary_check=False, + prefilter=True, + check_for_growth=True, + gapfilling_mode="Sequential", + run_sensitivity_analysis=True, + integrate_solutions=True, + remove_unneeded_reactions=True + ): + """Run gapfilling across an array of media conditions ultimately using different integration policies: simultaneous gapfilling, independent gapfilling, cumulative gapfilling + Parameters + ---------- + media_list : [MSMedia] + List of the medias in which the model should be gapfilled + target : string + Name or expression describing the reaction or combination of reactions to the optimized + minimum_objectives : {string - media ID : double - minimum objective value} + Media-specific minimal objective thresholds that the model must be gapfilled to achieve + default_minimum_objective : double + Default value to use for the minimal objective threshold that the model must be gapfilled to achieve + binary_check : bool + Indicates if the solution should be checked to ensure it is minimal in the number of reactions involved + prefilter : bool + Indicates if the gapfilling database should be prefiltered using the tests provided in the MSGapfill constructor before running gapfilling + check_for_growth : bool + Indicates if the 
model should be checked to ensure that the resulting gapfilling solution produces a nonzero objective + gapfilling_mode : string + Indicates the integration policy to be used: Global, Independent, and Cumulative + run_sensitivity_analysis : bool + Indicates if sensitivity analysis should be run on the gapfilling solution to determine biomass dependency + """ + #If not integrating, backing up and replacing self.mdlutl + oldmdlutl = self.mdlutl + if not integrate_solutions: + self.model = cobra.io.json.from_json(cobra.io.json.to_json(self.model)) + self.mdlutl = MSModelUtil.get(self.model) + #Setting the default minimum objective + if default_minimum_objective == None: + default_minimum_objective = self.default_minimum_objective + self.gfpkgmgr.getpkg("GapfillingPkg").parameters["minimum_obj"] = default_minimum_objective + #Checking that each media to ensure gapfilling works before filtering + for media in media_list: + currtarget = target + if media in target_hash: + currtarget = target_hash[media] + if not self.test_gapfill_database(media,currtarget,before_filtering=True): + #Remove media that fail initial test + print("Removing ungapfillable media "+media.id) + media_list.remove(media) + #If there are no media left, don't run gapfilling + if len(media_list) == 0: + return None + #Running prefiltering once for all media if specified. Rememeber - filtering does not care about the target or media - it is just a set of tests that are run on the database + if prefilter: + growth_conditions=[] + for media in media_list: + minimum_obj = default_minimum_objective + if media in minimum_objectives: + minimum_obj = minimum_objectives[media] + currtarget = target + if media in target_hash: + currtarget = target_hash[media] + growth_conditions.append({ + "media": media, + "is_max_threshold": False, + "threshold": minimum_obj, + "objective": currtarget, + }) + self.prefilter(growth_conditions=growth_conditions) + #Iterating over all media and running gapfilling + solution_dictionary = {} + cumulative_solution = [] + targets = [] + thresholds = [] + for item in media_list: + currtarget=target + if media in target_hash: + targets.append(target_hash[media]) else: - rxn.lower_bound = -100 - for rxn_id in solution["new"]: - rxn = self.gfmodel.reactions.get_by_id(rxn_id) - rxn = rxn.copy() - self.model.add_reactions([rxn]) - coreid = re.sub(r"_[a-z]\d+$", "", rxn_id) - if coreid in self.reaction_scores: - bestgene = None - for gene in self.reaction_scores[coreid]: - if ( - not bestgene - or self.reaction_scores[coreid][gene] - > self.reaction_scores[coreid][bestgene] - ): - bestgene = gene - rxn = self.model.reactions.get_by_id(rxn_id) - rxn.gene_reaction_rule = bestgene - if solution["new"][rxn_id] == ">": - rxn.upper_bound = 100 + targets.append(target) + #Determining the minimum objective for the current media + minimum_obj = default_minimum_objective + if item in minimum_objectives: + minimum_obj = minimum_objectives[item] + thresholds.append(minimum_obj) + #Implementing specified gapfilling mode + if gapfilling_mode == "Independent" or gapfilling_mode == "Sequential": + solution = self.run_gapfilling( + item, + currtarget, + minimum_obj, + binary_check, + False, + ) + #If there is a solution, go ahead and integrate it into the model + if solution: + solution_dictionary[item] = self.integrate_gapfill_solution( + solution, + cumulative_solution=cumulative_solution, + remove_unneeded_reactions=remove_unneeded_reactions, + check_for_growth=check_for_growth, + gapfilling_mode=gapfilling_mode + ) + #If we are 
doing cumulative gapfilling, then we need adjust the gapfilling objective so it no longer penalizes using the current solution reactions + if gapfilling_mode == "Sequential": + self.gfpkgmgr.getpkg("GapfillingPkg").compute_gapfilling_penalties(exclusion_solution=cumulative_solution,reaction_scores=self.reaction_scores) + self.gfpkgmgr.getpkg("GapfillingPkg").build_gapfilling_objective_function() + if gapfilling_mode == "Global": + #Now we run simultaneous gapfilling on a combination of all our various gapfilled models + full_solution = self.run_global_gapfilling( + media_list, + targets, + thresholds, + binary_check, + False, + check_for_growth, + ) + #Now we integrate the full solution into the model for every media which effectively determines which reactions are needed for each media + for i,item in enumerate(media_list): + full_solution["media"] = item + full_solution["target"] = targets[i] + full_solution["minobjective"] = thresholds[i] + #In this case we donot remove unnneeded reactions from the model because they may be needed for other media + solution_dictionary[item] = self.integrate_gapfill_solution( + full_solution, + cumulative_solution=cumulative_solution, + remove_unneeded_reactions=False, + check_for_growth=check_for_growth, + gapfilling_mode=gapfilling_mode + ) + #Now we remove reactions uneeded for any of the specified media conditions + #These is a danger here that the integration step will put a reaction into a solution that subsequently gets removed at this step. This is something to look out for + unneeded = self.mdlutl.test_solution( + cumulative_solution, + targets, + media_list, + thresholds=[0.1], + remove_unneeded_reactions=True, + do_not_remove_list=[] + )#Returns reactions in cumulative solution that are not needed for growth + elif gapfilling_mode == "Sequential": + #Restoring the gapfilling objective function + self.gfpkgmgr.getpkg("GapfillingPkg").compute_gapfilling_penalties(reaction_scores=self.reaction_scores) + self.gfpkgmgr.getpkg("GapfillingPkg").build_gapfilling_objective_function() + #Running sensitivity analysis once on the cumulative solution for all media + #with open("datacache/solutions.json", 'w') as f: + #json.dump(solution_dictionary,f,indent=4,skipkeys=True) + if run_sensitivity_analysis: + logger.info( + "Gapfilling sensitivity analysis running" + ) + #First aggregating all unique reactions with a media for each + reaction_media_hash = {} + solution_rxn_types = ["new","reversed"] + media_reaction_hash = {} + for media in solution_dictionary: + if solution_dictionary[media]["growth"] > 0: + for rxn_type in solution_rxn_types: + for rxn_id in solution_dictionary[media][rxn_type]: + if rxn_id not in reaction_media_hash: + reaction_media_hash[rxn_id] = {} + if solution_dictionary[media][rxn_type][rxn_id] not in reaction_media_hash[rxn_id]: + reaction_media_hash[rxn_id][solution_dictionary[media][rxn_type][rxn_id]] = media + if media not in media_reaction_hash: + media_reaction_hash[media] = {} + media_reaction_hash[media][rxn_id] = solution_dictionary[media][rxn_type][rxn_id] + #Running sensitivity analysis on minimal reactions in each media + rxn_sensitivity_hash = {} + for media in media_reaction_hash: + test_solution = [] + for rxn in media_reaction_hash[media]: + test_solution.append([rxn, media_reaction_hash[media][rxn]]) + self.mdlutl.pkgmgr.getpkg("KBaseMediaPkg").build_package(media) + sensitivity_results = self.mdlutl.find_unproducible_biomass_compounds( + target, test_solution + ) + for rxn in sensitivity_results: + if rxn not in 
rxn_sensitivity_hash: + rxn_sensitivity_hash[rxn] = {} + for dir in sensitivity_results[rxn]: + rxn_sensitivity_hash[rxn][dir] = sensitivity_results[rxn][dir] + #Building gapfilling sensitivity output + gf_sensitivity = self.mdlutl.get_attributes("gf_sensitivity", {}) + for media in solution_dictionary: + if media.id not in gf_sensitivity: + gf_sensitivity[media.id] = {} + if target not in gf_sensitivity[media.id]: + gf_sensitivity[media.id][target] = {} + if solution_dictionary[media]["growth"] > 0: + gf_sensitivity[media.id][target]["success"] = {} + for rxn_type in solution_rxn_types: + for rxn_id in solution_dictionary[media][rxn_type]: + if rxn_id not in gf_sensitivity[media.id][target]["success"]: + gf_sensitivity[media.id][target]["success"][rxn_id] = {} + gf_sensitivity[media.id][target]["success"][rxn_id][solution_dictionary[media][rxn_type][rxn_id]] = rxn_sensitivity_hash[rxn_id][solution_dictionary[media][rxn_type][rxn_id]] + else: + gf_sensitivity[media.id][target]["failure"] = {} + self.mdlutl.save_attributes(gf_sensitivity, "gf_sensitivity") + #Restoring backedup model + self.mdlutl = oldmdlutl + self.model = oldmdlutl.model + #Returning the solution dictionary + return solution_dictionary + + def integrate_gapfill_solution( + self,solution,cumulative_solution=[],remove_unneeded_reactions=False,check_for_growth=True,gapfilling_mode="Sequential" + ): + """Integrating gapfilling solution into model + Parameters + ---------- + solution : dict + Specifies the reactions to be added to the model to implement the gapfilling solution + cumulative_solution : list + Optional array to cumulatively track all reactions added to the model when integrating multiple solutions + remove_unneeded_reactions : bool + Indicate where unneeded reactions should be removed from the model + check_for_growth : bool + Indicate if the model should be checked to ensure that the resulting gapfilling solution produces a nonzero objective + gapfilling_mode : Cumulative, Independent, Simultaneous + Specify what the gapfilling mode is because this determines how integration is performed + """ + logger.info(f"Initial solution: {str(solution)}") + original_objective = self.mdlutl.model.objective + self.mdlutl.model.objective = solution["target"] + self.mdlutl.model.objective.direction = "max" + #If gapfilling mode is independent, we should remove the cumulative solution from the model before integrating the current solution + if gapfilling_mode == "Independent": + for item in cumulative_solution: + rxn = self.model.reactions.get_by_id(item[0]) + if item[1] == ">": + rxn.upper_bound = 0 + else: + rxn.lower_bound = 0 + new_cumulative_reactions = [] + #Converting the solution to list + list_solution = self.mdlutl.convert_solution_to_list(solution) + for item in list_solution: + if item[0] not in self.model.reactions: + logger.debug(f"adding reaction: {str(item[0])}") + #Copying and adding the reaction to the model + rxn = self.gfmodel.reactions.get_by_id(item[0]) + rxn = rxn.copy() + self.model.add_reactions([rxn]) + #Clearing current bounds because we only want to add reaction in the direction it was gapfilled in + rxn.upper_bound = 0 rxn.lower_bound = 0 + logger.info(f"integrating rxn: {item[0]}") + rxn = self.model.reactions.get_by_id(item[0]) + #Setting genes if the reaction has no genes + if len(rxn.genes) == 0: + #Setting genes from reaction scores in we have them + coreid = re.sub(r"_[a-z]\d+$", "", item[0]) + if coreid in self.reaction_scores: + logger.debug(f"Found reaction scores for coreid: {coreid}") + 
bestgene = None + bestscore = None + for gene in self.reaction_scores[coreid]: + score = None + if isinstance(self.reaction_scores[coreid][gene], dict): + score = self.reaction_scores[coreid][gene]["probability"] + else: + score = self.reaction_scores[coreid][gene] + if ( + not bestgene + or score + > bestscore + ): + bestgene = gene + bestscore = score + rxn = self.model.reactions.get_by_id(item[0]) + logger.debug(f"Assigning gene to reaction: {item[0]} {bestgene}") + rxn.gene_reaction_rule = bestgene + rxn.notes["new_genes"] = bestgene + print("Assigning gene to reaction: "+item[0]+" "+bestgene) + #Setting bounds according to the direction the reaction was gapfilled in + if item[1] == ">": + rxn.upper_bound = 100 else: - rxn.upper_bound = 0 rxn.lower_bound = -100 - return self.model + #Adding reaction to cumulative solution if it is not already there + if not self.mdlutl.find_item_in_solution(cumulative_solution,item): + new_cumulative_reactions.append([item[0], item[1],item[2]]) + #Testing the full cumulative solution to see which reactions are needed for current media/target + full_solution = cumulative_solution + new_cumulative_reactions + logger.info(f"Full solution: {str(full_solution)}") + #Setting up structure to store the finalized solution for this media/target + current_media_target_solution = {"growth":0,"media":solution["media"],"target":solution["target"],"minobjective":solution["minobjective"],"binary_check":solution["binary_check"] ,"new":{},"reversed":{}} + #If gapfilling is independent, we only check the specific solution + if gapfilling_mode == "Independent": + unneeded = self.mdlutl.test_solution(list_solution,[solution["target"]],[solution["media"]],[solution["minobjective"]],remove_unneeded_reactions,do_not_remove_list=cumulative_solution)#Returns reactions in input solution that are not needed for growth + for item in list_solution: + if not self.mdlutl.find_item_in_solution(unneeded,item): + current_media_target_solution[item[2]][item[0]] = item[1] + if not self.mdlutl.find_item_in_solution(cumulative_solution,item): + cumulative_solution.append(item) + #elif not remove_unneeded_reactions and not self.mdlutl.find_item_in_solution(cumulative_solution,item): + # cumulative_solution.append(item) + logger.info(f"Cumulative media target solution: {str(current_media_target_solution)}") + else: + unneeded = self.mdlutl.test_solution(full_solution,[solution["target"]],[solution["media"]],[solution["minobjective"]],remove_unneeded_reactions,do_not_remove_list=cumulative_solution)#Returns reactions in input solution that are not needed for growth + for item in cumulative_solution: + if not self.mdlutl.find_item_in_solution(unneeded,item): + current_media_target_solution[item[2]][item[0]] = item[1] + for item in new_cumulative_reactions: + if not self.mdlutl.find_item_in_solution(unneeded,item): + current_media_target_solution[item[2]][item[0]] = item[1] + cumulative_solution.append(item) + #elif not remove_unneeded_reactions: + # cumulative_solution.append(item) + logger.info(f"Unneeded: {str(unneeded)}") + logger.info(f"Cumulative: {str(self.cumulative_gapfilling)}") + #Checking that the final integrated model grows + if check_for_growth: + self.mdlutl.pkgmgr.getpkg("KBaseMediaPkg").build_package(solution["media"]) + current_media_target_solution["growth"] = self.mdlutl.model.slim_optimize() + logger.info(f"Growth: {str(current_media_target_solution['growth'])} {solution['media'].id}") + # Adding the gapfilling solution data to the model, which is needed for saving the 
model in KBase + self.mdlutl.add_gapfilling(solution) + # Testing which gapfilled reactions are needed to produce each reactant in the objective function + self.cumulative_gapfilling.extend(cumulative_solution) + return current_media_target_solution + + def compute_reaction_weights_from_expression_data(self, omics_data, annoont): + """Computing reaction weights based on input gene-level omics data + Parameters + ---------- + omics_data : pandas dataframe with genes as rows and conditions as columns + Specifies the reactions to be added to the model to implement the gapfilling solution + annoont : annoont object + Contains reaction, feature id, ontologies, probabilities. Restructured into dataframe in function + Returns : + A dictionary with Rxns as the keys and calculated result as the value. + """ + + ### Restructure annoont into Dataframe + rows_list = [] + for reaction, genes in annoont.get_reaction_gene_hash(feature_type="gene").items(): + for gene, gene_info in genes.items(): + # Initialize the row with 'Gene' and 'Reactions' + row = {"Gene": gene, "Reactions": reaction} + # Loop through each evidence in the gene's evidence list + for evidence in gene_info["evidence"]: + # Construct column name from the event and ontology for uniqueness + column_name = f"{evidence['ontology']}" + if column_name in row: + row[column_name] = f"{row[column_name]}, {evidence['term']}" + else: + row[column_name] = evidence["term"] + rows_list.append(row) + restructured_anoot = pd.DataFrame(rows_list) + + ### Integrate Omics, set weights, find indexes for features + feature_ids_set = set(omics_data.index) + + # Find indices where 'Gene' values are in 'feature_ids' + # isin method returns a boolean series that is True where tbl_supAno['Gene'] is in feature_ids_set + mask = restructured_anoot["Gene"].isin(feature_ids_set) + # Get the indices of True values in the mask + idx_measuredGene = mask[mask].index.tolist() + # Calculate the dimensions for the measuredGeneScore array + num_genes = len(restructured_anoot["Gene"]) + num_columns = len(restructured_anoot.columns[2:]) + # Initialize the measuredGeneScore array with zeros + measuredGeneScore = np.zeros((num_genes, num_columns)) + measuredGeneScore[idx_measuredGene, :] = 1 + num_weights = len(restructured_anoot.columns[3:]) + w = np.repeat(1 / num_weights, num_weights) + + ### Calculate Weights and generate the reaction/weight hash + num_cols = len(restructured_anoot.columns[2:]) + w = np.full((num_cols, 1), 1 / num_cols) + p = np.zeros(len(restructured_anoot["Reactions"])) + # computed_weights is the rxn_hash ({rxn: weight, ...}) + computed_weights = {} + + # Precompute gene reaction lookups + gene_reaction_lookup = {} + for idx, row in restructured_anoot.iterrows(): + gene = row["Gene"] + reaction = row["Reactions"] + if gene in gene_reaction_lookup: + gene_reaction_lookup[gene].append(reaction) + else: + gene_reaction_lookup[gene] = [reaction] + + for rxn in range(0, len(restructured_anoot)): + substr_rxns = [rxn for rxn in restructured_anoot["Reactions"][[rxn]]] + # Get the indices of the rows where the condition is True + mask = restructured_anoot["Reactions"] == substr_rxns[0] + idx_gene = mask[mask].index + nAG = 0 + nMG = 0 + nCG = 0 + + if len(idx_gene) > 0: + # number of genes that map to a reaction + nAG = len(idx_gene) + for iGene in range(0, nAG): + subset = restructured_anoot.iloc[idx_gene[iGene], 2:].to_numpy() + # Checking for non-empty elements in the subset + non_empty_check = np.vectorize(lambda x: x is not None and x == x)( + subset + ) + 
# Finding the maximum value between the non-empty check and the corresponding row in measuredGeneScore + max_value = np.maximum( + non_empty_check, measuredGeneScore[idx_gene[iGene], :] + ) + # Multiplying by the weight and adding to nMG + nMG += max(sum((max_value * w))) + selected_gene = restructured_anoot["Gene"].iloc[idx_gene[iGene]] + + # Finding reactions associated with genes that contain the selected gene + associated_reactions = gene_reaction_lookup.get(selected_gene, []) + + # Checking if there are more than one unique reactions + if len(associated_reactions) > 1: + nCG += 1 + + p[rxn] = (nMG / nAG) * (1 / (1 + (nCG / nAG))) + + # Add item to output rxn hash dictionary + computed_weights[restructured_anoot.iloc[rxn, 0]] = p[rxn] + + return computed_weights @staticmethod def gapfill( diff --git a/modelseedpy/core/msgenome.py b/modelseedpy/core/msgenome.py index f052130d..03c2b08c 100644 --- a/modelseedpy/core/msgenome.py +++ b/modelseedpy/core/msgenome.py @@ -15,8 +15,14 @@ def normalize_role(s): def read_fasta(f, split=DEFAULT_SPLIT, h_func=None): - with open(f, "r") as fh: - return parse_fasta_str(fh.read(), split, h_func) + if f.endswith(".gz"): + import gzip + + with gzip.open(f, "rb") as fh: + return parse_fasta_str(fh.read().decode("utf-8"), split, h_func) + else: + with open(f, "r") as fh: + return parse_fasta_str(fh.read(), split, h_func) def parse_fasta_str(faa_str, split=DEFAULT_SPLIT, h_func=None): @@ -48,7 +54,7 @@ def parse_fasta_str(faa_str, split=DEFAULT_SPLIT, h_func=None): class MSFeature: - def __init__(self, feature_id, sequence, description=None): + def __init__(self, feature_id, sequence, description=None, aliases=[]): """ @param feature_id: identifier for the protein coding feature @@ -60,7 +66,7 @@ def __init__(self, feature_id, sequence, description=None): self.seq = sequence self.description = description # temporary replace with proper parsing self.ontology_terms = {} - self.aliases = [] + self.aliases = aliases def add_ontology_term(self, ontology_term, value): """ @@ -78,6 +84,9 @@ def add_ontology_term(self, ontology_term, value): class MSGenome: def __init__(self): self.features = DictList() + self.id = None + self.annoont = None + self.scientific_name = None def add_features(self, feature_list: list): """ @@ -96,6 +105,28 @@ def add_features(self, feature_list: list): self.features += feature_list + def create_new_feature(self,id,sequence): + newftr = MSFeature(id,sequence) + self.add_features([newftr]) + return newftr + + @staticmethod + def from_annotation_ontology( + annoont, prioritized_event_list=None, ontologies=None, merge_all=False,feature_type=None, translate_to_rast=True + ): + gene_hash = annoont.get_gene_term_hash() + genome = MSGenome() + features = [] + for gene in gene_hash: + feature = MSFeature(gene.id,"") + features.append(feature) + for term in gene_hash[gene]: + feature.add_ontology_term(term.ontology.id, term.id) + if term.ontology.id == "SSO": + feature.add_ontology_term("RAST",annoont.get_term_name(term)) + genome.add_features(features) + return genome + @staticmethod def from_fasta( filename, contigs=0, split="|", h_func=None @@ -104,6 +135,20 @@ def from_fasta( genome.features += read_fasta(filename, split, h_func) return genome + def to_fasta(self, filename, l=80, fn_header=None): + with open(filename, "w") as fh: + for feature in self.features: + h = f">{feature.id}\n" + if fn_header: + h = fn_header(feature) + fh.write(h) + lines = [ + feature.seq[i : i + l] + "\n" for i in range(0, len(feature.seq), l) + ] + for line in 
lines: + fh.write(line) + return filename + @staticmethod def from_dna_fasta(filename): pass @@ -116,7 +161,17 @@ def from_protein_sequences_hash(sequences): return genome def alias_hash(self): - return {alias: gene for gene in self.features for alias in gene.aliases} + output = {} + for gene in self.features: + for alias in gene.aliases: + #Check if alias is a list + if isinstance(alias,list): + if alias[1] not in output: + output[alias[1]] = gene + else: + if alias not in output: + output[alias] = gene + return output def search_for_gene(self, query): if query in self.features: diff --git a/modelseedpy/core/msgrowthphenotypes.py b/modelseedpy/core/msgrowthphenotypes.py old mode 100644 new mode 100755 index 6c30bb2a..986eb72e --- a/modelseedpy/core/msgrowthphenotypes.py +++ b/modelseedpy/core/msgrowthphenotypes.py @@ -9,136 +9,322 @@ from modelseedpy.core.msgapfill import MSGapfill logger = logging.getLogger(__name__) +logger.setLevel( + logging.INFO +) # When debugging - set this to INFO then change needed messages below from DEBUG to INFO +zero_threshold = 0.0000001 class MSGrowthPhenotype: def __init__( self, id, media=None, - growth=None, + experimental_value=None, gene_ko=[], additional_compounds=[], parent=None, name=None, + type="growth" ): self.id = id self.name = name if name == None: self.name = self.id - self.growth = growth + self.experimental_value = experimental_value self.media = media self.gene_ko = gene_ko self.gapfilling = None self.additional_compounds = additional_compounds self.parent = parent + self.type = type - def build_media(self): + def build_media(self, include_base_media=True): + """Builds media object to use when simulating the phenotype + Parameters + ---------- + include_base_media : bool + Indicates whether to include the base media for the phenotype set in the formulation + """ cpd_hash = {} for cpd in self.additional_compounds: cpd_hash[cpd] = 100 full_media = MSMedia.from_dict(cpd_hash) - if self.media != None: + if self.media: full_media.merge(self.media, overwrite_overlap=False) - if self.parent != None and self.parent.base_media != None: - full_media.merge(parent.base_media, overwrite_overlap=False) + if include_base_media: + if self.parent and self.parent.base_media: + full_media.merge(self.parent.base_media, overwrite_overlap=False) return full_media def simulate( self, - modelutl, - growth_threshold=0.001, + model_or_mdlutl, + multiplier=3, add_missing_exchanges=False, save_fluxes=False, - pfba=False, + save_reaction_list=False, + ignore_experimental_data=False, + baseline_objective=0.01, + flux_coefficients=None, ): - if not isinstance(modelutl, MSModelUtil): - modelutl = MSModelUtil(modelutl) - media = self.build_media() - output = {"growth": None, "class": None, "missing_transports": []} + """Simulates a single phenotype + Parameters + ---------- + model_or_modelutl : Model | MSModelUtl + Model to use to run the simulations + add_missing_exchanges : bool + Boolean indicating if exchanges for compounds mentioned explicitly in phenotype media should be added to the model automatically + multiplier : double + Indicates a multiplier to use for positive growth above the growth on baseline media + save_fluxes : bool + Indicates if the fluxes should be saved and returned with the results + pfba : bool + Runs pFBA to compute fluxes after initially solving for growth + ignore_experimental_data : bool + Indicates if existing growth data in the phenotype should be ignored when computing the class of the simulated phenotype + """ + modelutl = 
model_or_mdlutl
+        if not isinstance(model_or_mdlutl, MSModelUtil):
+            modelutl = MSModelUtil.get(model_or_mdlutl)
+
+        # Setting the objective from the phenotype type - this will add missing exchanges for the primary compound for uptake and excretion phenotypes
+        missing_transporters = []
+        objstring = modelutl.set_objective_from_phenotype(self, missing_transporters)
+
+        # Creating the output data structure and returning if the objective cannot be created
+        output = {
+            "objective_value": 0,
+            "class": "N",
+            "missing_transports": missing_transporters,
+            "objective": objstring,
+            "baseline_objective": baseline_objective,
+        }
+        if objstring == None:
+            return output
+
+        # Building full media and adding missing exchanges
+        full_media = self.build_media()
+
+        # Adding missing exchanges
+        if add_missing_exchanges:
+            output["missing_transports"].extend(modelutl.add_missing_exchanges(full_media))
+
+        # Getting baseline growth
+        if objstring != None and output["baseline_objective"] == None and self.parent:
+            output["baseline_objective"] = self.parent.baseline_objective(modelutl, objstring)
+        if output["baseline_objective"] < 1e-5:
+            output["baseline_objective"] = 0.01
+
+        # Building specific media and setting compound exception list
+        if self.parent and self.parent.atom_limits and len(self.parent.atom_limits) > 0:
+            reaction_exceptions = []
+            specific_media = self.build_media(False)
+            for mediacpd in specific_media.mediacompounds:
+                ex_hash = mediacpd.get_mdl_exchange_hash(modelutl)
+                for mdlcpd in ex_hash:
+                    reaction_exceptions.append(ex_hash[mdlcpd])
+            modelutl.pkgmgr.getpkg("ElementUptakePkg").build_package(
+                self.parent.atom_limits, exception_reactions=reaction_exceptions
+            )
+
+        # Applying media
+        if self.parent:
+            modelutl.pkgmgr.getpkg("KBaseMediaPkg").build_package(
+                full_media, self.parent.base_uptake, self.parent.base_excretion
+            )
+        else:
+            modelutl.pkgmgr.getpkg("KBaseMediaPkg").build_package(full_media, 0, 1000)
+
+        with modelutl.model:
+            # Applying gene knockouts
+            for gene in self.gene_ko:
+                if gene in modelutl.model.genes:
+                    geneobj = modelutl.model.genes.get_by_id(gene)
+                    geneobj.knock_out()
+
+            # Optimizing model
+            if '1_objc' in modelutl.model.constraints:
+                constraint = modelutl.model.constraints['1_objc']
+                modelutl.model.remove_cons_vars([constraint])
+            solution = modelutl.model.optimize()
+            output["objective_value"] = solution.objective_value
+            if solution.objective_value != None and solution.objective_value > 0:
+                if flux_coefficients == None:
+                    solution = cobra.flux_analysis.pfba(modelutl.model)
+                else:
+                    # modelutl.printlp(lpfilename="lpfiles/gapfill.lp")
+                    modelutl.pkgmgr.getpkg("ObjConstPkg").build_package(0.1, None)
+                    coefobj = modelutl.model.problem.Objective(0, direction="min")
+                    modelutl.model.objective = coefobj
+                    obj_coef = {}
+                    for rxn in flux_coefficients:
+                        rxnid = rxn
+                        direction = "="
+                        if rxn[0:1] == ">" or rxn[0:1] == "<":
+                            direction = rxn[0:1]
+                            rxnid = rxn[1:]
+                        if rxnid in modelutl.model.reactions:
+                            rxnobj = modelutl.model.reactions.get_by_id(rxnid)
+                            if direction == ">" or direction == "=":
+                                obj_coef[rxnobj.forward_variable] = flux_coefficients[rxn]
+                            if direction == "<" or direction == "=":
+                                obj_coef[rxnobj.reverse_variable] = flux_coefficients[rxn]
+                    coefobj.set_linear_coefficients(obj_coef)
+                    solution = modelutl.model.optimize()
+                    modelutl.pkgmgr.getpkg("ObjConstPkg").clear()
+                if save_reaction_list:
+                    output["reactions"] = []
+                if save_fluxes:
+                    output["fluxes"] = solution.fluxes
+                output["gapfill_count"] = 0
+                output["reaction_count"] = 0
+                for reaction in modelutl.model.reactions:
+                    if reaction.id in solution.fluxes:
+                        flux = solution.fluxes[reaction.id]
+                        if abs(flux) > zero_threshold:
+                            output["reaction_count"] += 1
+                            if reaction.id[0:3] != "bio" and reaction.id[0:3] != "EX_" and reaction.id[0:3] != "DM_" and len(reaction.genes) == 0:
+                                output["gapfill_count"] += 1
+                            if save_reaction_list and flux > zero_threshold:
+                                output["reactions"].append(">" + reaction.id)
+                            elif save_reaction_list:
+                                output["reactions"].append("<" + reaction.id)
+
+        # Determining phenotype class
+        if output["objective_value"] != None and output["objective_value"] >= output["baseline_objective"] * multiplier:
+            output["positive"] = True
+            if not self.experimental_value or ignore_experimental_data:
+                output["class"] = "P"
+            elif self.experimental_value > 0:
                 output["class"] = "CP"
-            else:
+            elif self.experimental_value == 0:
                 output["class"] = "FP"
         else:
-            if self.growth > 0:
+            output["positive"] = False
+            if self.experimental_value == None or ignore_experimental_data:
+                output["class"] = "N"
+            elif self.experimental_value > 0:
                 output["class"] = "FN"
-            else:
+            elif self.experimental_value == 0:
                 output["class"] = "CN"
         return output

     def gapfill_model_for_phenotype(
         self,
-        modelutl,
-        default_gapfill_templates,
+        msgapfill,
         test_conditions,
-        default_gapfill_models=[],
-        blacklist=[],
-        growth_threshold=0.001,
+        multiplier=10,
         add_missing_exchanges=False,
     ):
-        if not isinstance(modelutl, MSModelUtil):
-            modelutl = MSModelUtil(modelutl)
-        self.gapfilling = MSGapfill(
-            modelutl.model,
-            default_gapfill_templates,
-            default_gapfill_models,
-            test_conditions,
-            modelutl.reaction_scores(),
-            blacklist,
+        """Gapfills the model to permit this single phenotype to be positive
+        Parameters
+        ----------
+        msgapfill : MSGapfill
+            Fully configured gapfilling object
+        add_missing_exchanges : bool
+            Boolean indicating if exchanges for compounds mentioned explicitly in phenotype media should be added to the model automatically
+        multiplier : double
+            Indicates a multiplier to use for positive growth above the growth on baseline media
+        objective : string
+            Expression for objective to be activated by gapfilling
+        """
+        # First simulate the model without gapfilling to assess ungapfilled growth
+        output = self.simulate(
+            msgapfill.mdlutl, multiplier, add_missing_exchanges
         )
-        media = self.build_media()
-        if add_missing_exchanges:
-            modelutl.add_missing_exchanges(media)
-        for gene in self.gene_ko:
-            if gene in modelutl.model.genes:
-                geneobj = modelutl.model.genes.get_by_id(gene)
-                geneobj.knock_out()
-        gfresults = self.gapfilling.run_gapfilling(media, None)
-        if gfresults is None:
+        if output["objective_value"] >= output["baseline_objective"] * multiplier:
+            # No gapfilling needed - the original model grows without gapfilling
+            return {
+                "reversed": {},
+                "new": {},
+                "media": self.build_media(),
+                "target":
output["objective"], + "minobjective": output["baseline_objective"] * multiplier, + "binary_check": False, + } + + # Now pulling the gapfilling configured model from MSGapfill + gfmodelutl = MSModelUtil.get(msgapfill.gfmodel) + # Saving the gapfill objective because this will be replaced when the simulation runs + gfobj = gfmodelutl.model.objective + # Running simulate on gapfill model to add missing exchanges and set proper media and uptake limit constraints + output = self.simulate( + gfmodelutl, multiplier=multiplier, add_missing_exchanges=add_missing_exchanges + ) + # If the gapfilling model fails to achieve the minimum growth, then no solution exists + if output["objective_value"] < output["baseline_objective"] * multiplier: logger.warning( "Gapfilling failed with the specified model, media, and target reaction." ) - return self.gapfilling.integrate_gapfill_solution(gfresults) + return None + + # Running the gapfilling itself + full_media = self.build_media() + with gfmodelutl.model: + # Applying gene knockouts + for gene in self.gene_ko: + if gene in gfmodelutl.model.genes: + geneobj = gfmodelutl.model.genes.get_by_id(gene) + geneobj.knock_out() + + gfresults = self.gapfilling.run_gapfilling( + full_media, None, minimum_obj=output["baseline_objective"] * multiplier + ) + if gfresults is None: + logger.warning( + "Gapfilling failed with the specified model, media, and target reaction." + ) + + return gfresults class MSGrowthPhenotypes: - def __init__(self, base_media=None, base_uptake=0, base_excretion=1000): + def __init__( + self, base_media=None, base_uptake=0, base_excretion=1000, global_atom_limits={} + ): self.base_media = base_media self.phenotypes = DictList() self.base_uptake = base_uptake self.base_excretion = base_excretion + self.atom_limits = global_atom_limits + self.baseline_objective_data = {} + self.cached_based_growth = {} @staticmethod - def from_compound_hash(compounds, base_media, base_uptake=0, base_excretion=1000): - growthpheno = MSGrowthPhenotypes(base_media, base_uptake, base_excretion) + def from_compound_hash( + compounds, + base_media=None, + base_uptake=0, + base_excretion=1000, + global_atom_limits={}, + type="growth" + ): + growthpheno = MSGrowthPhenotypes( + base_media, base_uptake, base_excretion, global_atom_limits + ) new_phenos = [] for cpd in compounds: - newpheno = MSGrowthPhenotype(cpd, None, compounds[cpd], [], [cpd]) + newpheno = MSGrowthPhenotype(cpd,media=None,experimental_value=compounds[cpd],gene_ko=[],additional_compounds=[cpd],type=type) new_phenos.append(newpheno) growthpheno.add_phenotypes(new_phenos) return growthpheno @staticmethod - def from_kbase_object(data, kbase_api): - growthpheno = MSGrowthPhenotypes(None, 0, 1000) + def from_kbase_object( + data, + kbase_api, + base_media=None, + base_uptake=0, + base_excretion=1000, + global_atom_limits={}, + ): + growthpheno = MSGrowthPhenotypes( + base_media, base_uptake, base_excretion, global_atom_limits + ) new_phenos = [] for pheno in data["phenotypes"]: media = kbase_api.get_from_ws(pheno["media_ref"], None) @@ -149,16 +335,25 @@ def from_kbase_object(data, kbase_api): for added_cpd in pheno["additionalcompound_refs"]: added_compounds.append(added_cpd.split("/").pop()) newpheno = MSGrowthPhenotype( - pheno["id"], media, pheno["normalizedGrowth"], geneko, added_compounds + media.info.id, media, pheno["normalizedGrowth"], geneko, added_compounds ) new_phenos.append(newpheno) growthpheno.add_phenotypes(new_phenos) return growthpheno @staticmethod - def from_kbase_file(filename, 
kbase_api): + def from_kbase_file( + filename, + kbase_api, + base_media=None, + base_uptake=0, + base_excretion=1000, + global_atom_limits={}, + ): # TSV file with the following headers:media mediaws growth geneko addtlCpd - growthpheno = MSGrowthPhenotypes(base_media, 0, 1000) + growthpheno = MSGrowthPhenotypes( + base_media, base_uptake, base_excretion, global_atom_limits + ) headings = [] new_phenos = [] with open(filename) as f: @@ -190,8 +385,16 @@ def from_kbase_file(filename, kbase_api): return growthpheno @staticmethod - def from_ms_file(filename, basemedia, base_uptake=0, base_excretion=100): - growthpheno = MSGrowthPhenotypes(base_media, base_uptake, base_excretion) + def from_ms_file( + filename, + base_media=None, + base_uptake=0, + base_excretion=100, + global_atom_limits={}, + ): + growthpheno = MSGrowthPhenotypes( + base_media, base_uptake, base_excretion, global_atom_limits + ) df = pd.read_csv(filename) required_headers = ["Compounds", "Growth"] for item in required_headers: @@ -211,6 +414,15 @@ def from_ms_file(filename, basemedia, base_uptake=0, base_excretion=100): growthpheno.add_phenotypes(new_phenos) return growthpheno + def build_super_media(self): + super_media = None + for pheno in self.phenotypes: + if not super_media: + super_media = pheno.build_media() + else: + super_media.merge(pheno.build_media(), overwrite_overlap=False) + return super_media + def add_phenotypes(self, new_phenotypes): keep_phenos = [] for pheno in new_phenotypes: @@ -220,76 +432,276 @@ def add_phenotypes(self, new_phenotypes): additions = DictList(keep_phenos) self.phenotypes += additions + def baseline_objective(self, model_or_mdlutl, objective): + """Simulates all the specified phenotype conditions and saves results + Parameters + ---------- + model_or_modelutl : Model | MSModelUtl + Model to use to run the simulations + """ + # Discerning input is model or mdlutl and setting internal links + modelutl = model_or_mdlutl + if not isinstance(model_or_mdlutl, MSModelUtil): + modelutl = MSModelUtil.get(model_or_mdlutl) + # Checking if base growth already computed + if modelutl in self.cached_based_growth: + if objective in self.cached_based_growth[modelutl]: + return self.cached_based_growth[modelutl][objective] + else: + self.cached_based_growth[modelutl] = {} + # Setting objective + modelutl.objective = objective + # Setting media + modelutl.pkgmgr.getpkg("KBaseMediaPkg").build_package( + self.base_media, self.base_uptake, self.base_excretion + ) + # Adding uptake limits + if len(self.atom_limits) > 0: + modelutl.pkgmgr.getpkg("ElementUptakePkg").build_package(self.atom_limits) + # Simulating + self.cached_based_growth[modelutl][objective] = modelutl.model.slim_optimize() + return self.cached_based_growth[modelutl][objective] + def simulate_phenotypes( self, - model, - biomass, + model_or_mdlutl, + multiplier=3, add_missing_exchanges=False, - correct_false_negatives=False, - template=None, - growth_threshold=0.001, save_fluxes=False, + save_reaction_list=False, + gapfill_negatives=False, + msgapfill=None, + test_conditions=None, + ignore_experimental_data=False, + flux_coefficients=None, + recall_phenotypes=True ): - model.objective = biomass - modelutl = MSModelUtil(model) + """Simulates all the specified phenotype conditions and saves results + Parameters + ---------- + model_or_mdlutl : Model | MSModelUtl + Model to use to run the simulations + multiplier : double + Indicates a multiplier to use for positive growth above the growth on baseline media + add_missing_exchanges : bool + 
Boolean indicating if exchanges for compounds mentioned explicitly in phenotype media should be added to the model automatically + save_fluxes : bool + Indicates if the fluxes should be saved and returned with the results + ignore_experimental_data : bool + Indicates if existing growth data in the phenotype set should be ignored when computing the class of a simulated phenotype + """ + # Discerning input is model or mdlutl and setting internal links + modelutl = model_or_mdlutl + if not isinstance(model_or_mdlutl, MSModelUtil): + modelutl = MSModelUtil.get(model_or_mdlutl) + # Establishing output of the simulation method summary = { - "Label": ["Accuracy", "CP", "CN", "FP", "FN"], - "Count": [0, 0, 0, 0, 0], + "Label": ["Accuracy", "CP", "CN", "FP", "FN", "P", "N"], + "Count": [0, 0, 0, 0, 0, 0, 0], } data = { "Phenotype": [], - "Observed growth": [], - "Simulated growth": [], + "Observed objective": [], + "Simulated objective": [], "Class": [], "Transports missing": [], "Gapfilled reactions": [], + "Gapfilling score": None, } + # Running simulations + gapfilling_solutions = {} + totalcount = 0 + datahash = {} for pheno in self.phenotypes: - with model: - result = pheno.simulate( - modelutl, growth_threshold, add_missing_exchanges, save_fluxes - ) # Result should have "growth" and "class" - if result["class"] == "FN" and correct_false_negatives: - pheno.gapfill_model_for_phenotype(modelutl, [template], None) - if pheno.gapfilling.last_solution != None: - list = [] - for rxn_id in pheno.gapfilling.last_solution["reversed"]: - list.append( - pheno.gapfilling.last_solution["reversed"][rxn_id] - + rxn_id - ) - for rxn_id in pheno.gapfilling.last_solution["new"]: - list.append( - pheno.gapfilling.last_solution["new"][rxn_id] + rxn_id - ) - data["Gapfilled reactions"].append(";".join(list)) - else: - data["Gapfilled reactions"].append(None) + result = pheno.simulate( + modelutl, + multiplier, + add_missing_exchanges, + save_fluxes, + save_reaction_list=save_reaction_list, + ignore_experimental_data=ignore_experimental_data, + flux_coefficients=flux_coefficients + ) + datahash[pheno.id] = result + data["Class"].append(result["class"]) + data["Phenotype"].append(pheno.id) + data["Observed objective"].append(pheno.experimental_value) + data["Simulated objective"].append(result["objective_value"]) + data["Transports missing"].append(";".join(result["missing_transports"])) + if result["class"] == "CP": + summary["Count"][1] += 1 + summary["Count"][0] += 1 + totalcount += 1 + elif result["class"] == "CN": + summary["Count"][2] += 1 + summary["Count"][0] += 1 + totalcount += 1 + elif result["class"] == "FP": + summary["Count"][3] += 1 + totalcount += 1 + elif result["class"] == "FN": + summary["Count"][4] += 1 + totalcount += 1 + elif result["class"] == "P": + summary["Count"][5] += 1 + elif result["class"] == "N": + summary["Count"][6] += 1 + # Gapfilling negative growth conditions + if gapfill_negatives and result["class"] in ["N", "FN", "CN"]: + gapfilling_solutions[pheno] = pheno.gapfill_model_for_phenotype( + msgapfill, + test_conditions, + multiplier, + add_missing_exchanges, + ) + if gapfilling_solutions[pheno] != None: + data["Gapfilling score"] = 0 + list = [] + for rxn_id in gapfilling_solutions[pheno]["reversed"]: + list.append( + gapfilling_solutions[pheno]["reversed"][rxn_id] + rxn_id + ) + data["Gapfilling score"] += 0.5 + for rxn_id in gapfilling_solutions[pheno]["new"]: + list.append(gapfilling_solutions[pheno]["new"][rxn_id] + rxn_id) + data["Gapfilling score"] += 1 + data["Gapfilled 
reactions"].append(";".join(list)) else: data["Gapfilled reactions"].append(None) - result = pheno.simulate( - modelutl, growth_threshold, add_missing_exchanges, save_fluxes - ) # Result should have "growth" and "class" - data["Class"].append(result["class"]) - data["Phenotype"].append(pheno.id) - data["Observed growth"].append(pheno.growth) - data["Simulated growth"].append(result["growth"]) - data["Transports missing"].append( - ";".join(result["missing_transports"]) - ) - if result["class"] == "CP": - summary["Count"][1] += 1 - summary["Count"][0] += 1 - if result["class"] == "CN": - summary["Count"][2] += 1 - summary["Count"][0] += 1 - if result["class"] == "FP": - summary["Count"][3] += 1 - if result["class"] == "FN": - summary["Count"][4] += 1 - - summary["Count"][0] = summary["Count"][0] / len(self.phenotypes) + else: + data["Gapfilled reactions"].append(None) + if totalcount == 0: + summary["Count"][0] = None + else: + summary["Count"][0] = summary["Count"][0] / totalcount sdf = pd.DataFrame(summary) df = pd.DataFrame(data) - logger.info(df) - return {"details": df, "summary": sdf} + self.adjust_phenotype_calls(df) + return {"details": df, "summary": sdf,"data":datahash} + + def adjust_phenotype_calls(self,data,baseline_objective=0.01): + lowest = data["Simulated objective"].min() + if baseline_objective < lowest: + lowest = baseline_objective + highest = data["Simulated objective"].max() + threshold = (highest-lowest)/2+lowest + if highest/(lowest+0.000001) < 1.5: + threshold = highest + grow = 0 + nogrow = 0 + change = 0 + for (i,item) in data.iterrows(): + oldclass = item["Class"] + if item["Simulated objective"] >= threshold: + grow += 1 + if item["Class"] == "N": + data.loc[i, 'Class'] = "P" + change += 1 + elif item["Class"] == "FN": + data.loc[i, 'Class'] = "CP" + change += 1 + elif item["Class"] == "CN": + data.loc[i, 'Class'] = "FP" + change += 1 + else: + nogrow += 1 + if item["Class"] == "P": + data.loc[i, 'Class'] = "N" + change += 1 + elif item["Class"] == "CP": + data.loc[i, 'Class'] = "FN" + change += 1 + elif item["Class"] == "FP": + data.loc[i, 'Class'] = "CN" + change += 1 + + def fit_model_to_phenotypes( + self, + msgapfill, + multiplier, + correct_false_positives=False, + minimize_new_false_positives=True, + atp_safe=True, + integrate_results=True, + global_gapfilling=True, + ): + + """Simulates all the specified phenotype conditions and saves results + Parameters + ---------- + msgapfill : MSGapfill + Gapfilling object used for the gapfilling process + correct_false_positives : bool + Indicates if false positives should be corrected + minimize_new_false_positives : bool + Indicates if new false positivies should be avoided + integrate_results : bool + Indicates if the resulting modifications to the model should be integrated + """ + + # Running simulations + positive_growth = [] + negative_growth = [] + for pheno in self.phenotypes: + with model: + result = pheno.simulate( + modelutl, + multiplier, + add_missing_exchanges, + save_fluxes, + ) + # Gapfilling negative growth conditions + if gapfill_negatives and result["class"] in ["N", "FN", "CN"]: + negative_growth.append(pheno.build_media()) + elif gapfill_negatives and result["class"] in ["P", "FP", "CP"]: + positive_growth.append(pheno.build_media()) + + # Create super media for all + super_media = self.build_super_media() + # Adding missing exchanges + msgapfill.gfmodel.add_missing_exchanges(super_media) + # Adding elemental constraints + self.add_elemental_constraints() + # Getting ATP tests + + # Filtering 
database for ATP tests + + # Penalizing database to avoid creating false positives + + # Building additional tests from current correct negatives + + # Computing base-line growth + + # Computing growth threshold + + # Running global gapfill + + # Integrating solution + + def gapfill_all_phenotypes( + self, + model_or_mdlutl, + msgapfill=None, # Needed if the gapfilling object in model utl is not initialized + threshold=None, + add_missing_exchanges=False, + ): + mdlutl = MSModelUtil.get(model_or_mdlutl) + # if msgapfill: + # mdlutl.gfutl = msgapfill + # if not mdlutl.gfutl: + # logger.critical( + # "Must either provide a gapfilling object or provide a model utl with an existing gapfilling object" + # ) + # media_list = [] + # for pheno in self.phenotypes: + # + # + # output = mdlutl.gfutl.run_multi_gapfill( + # media_list, + # default_minimum_objective=growth_threshold + # target=mdlutl.primary_biomass(), + # + # binary_check=False, + # prefilter=True, + # check_for_growth=True, + # ) diff --git a/modelseedpy/core/msmedia.py b/modelseedpy/core/msmedia.py index 488aad57..22fa91f3 100644 --- a/modelseedpy/core/msmedia.py +++ b/modelseedpy/core/msmedia.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import logging from cobra.core.dictlist import DictList +from modelseedpy.core.msmodelutl import MSModelUtil logger = logging.getLogger(__name__) @@ -22,6 +23,18 @@ def minFlux(self): # TODO: will be removed later just for old methods return -self.upper_bound + def get_mdl_exchange_hash(self, model_or_mdlutl): + modelutl = model_or_mdlutl + if not isinstance(model_or_mdlutl, MSModelUtil): + modelutl = MSModelUtil.get(model_or_mdlutl) + mets = modelutl.find_met(self.id) + output = {} + exchange_hash = modelutl.exchange_hash() + for met in mets: + if met in exchange_hash: + output[met] = exchange_hash[met] + return output + class MSMedia: def __init__(self, media_id, name=""): @@ -62,6 +75,12 @@ def get_media_constraints(self, cmp="e0"): media[met_id] = (compound.lower_bound, compound.upper_bound) return media + def find_mediacpd(self, cpd_id): + for cpd in self.mediacompounds: + if cpd.id == cpd_id: + return cpd + return None + def merge(self, media, overwrite_overlap=False): new_cpds = [] for cpd in media.mediacompounds: diff --git a/modelseedpy/core/msminimalmedia.py b/modelseedpy/core/msminimalmedia.py new file mode 100644 index 00000000..8c2f6e2d --- /dev/null +++ b/modelseedpy/core/msminimalmedia.py @@ -0,0 +1,696 @@ +from modelseedpy.core.exceptions import ObjectiveError, FeasibilityError +from modelseedpy.fbapkg.reactionusepkg import ReactionUsePkg +from modelseedpy.core.fbahelper import FBAHelper +from modelseedpy.core.msmodelutl import MSModelUtil +from modelseedpy.fbapkg.basefbapkg import BaseFBAPkg +from itertools import combinations, permutations, chain +from optlang import Variable, Constraint +from cobra.medium import minimal_medium +from optlang.symbolics import Zero +from math import isclose, inf, factorial +#from deepdiff import DeepDiff +from time import process_time +from pprint import pprint +import logging +import json, re + +logger = logging.getLogger(__name__) + + +def _exchange_solution(sol_dict): + if isinstance(list(sol_dict.keys())[0], str): + return { + rxn: abs(flux) + for rxn, flux in sol_dict.items() + if "EX_" in rxn and flux < 0 + } + elif hasattr(list(sol_dict.keys())[0], "id"): + return { + rxn.id: abs(flux) + for rxn, flux in sol_dict.items() + if "EX_" in rxn.id and flux < 0 + } + return { + rxn.name: abs(flux) + for rxn, flux in sol_dict.items() + if "EX_" in 
rxn.name and flux < 0 + } + + +def _model_growth(sol_dict): + return sum( + [flux for var, flux in sol_dict.items() if re.search(r"(^bio\d+$)", var.name)] + ) + + +def _var_to_ID(var): + rxnID = var.name + if "_ru" in rxnID: + rxnID = rxnID.replace("_ru", "") + return rxnID + + +def _compatibilize(org_models, printing=False): + from commscores import GEMCompatibility + + return GEMCompatibility.standardize( + org_models, + conflicts_file_name="standardization_corrections.json", + printing=printing, + ) + + +def verify(org_model, min_media): + model2 = org_model.copy() + model2.medium = min_media + return model2.optimize() + + +def bioFlux_check(model, sol=None, sol_dict=None, min_growth=0.1): + sol_dict = sol_dict or FBAHelper.solution_to_variables_dict(sol, model) + # print({k:v for k,v in sol_dict.items() if v > 1E-8}) + simulated_growth = max( + sum( + [ + flux + for var, flux in sol_dict.items() + if re.search(r"(^bio\d+$)", var.name) + ] + ), + sol.objective_value, + ) + if simulated_growth < min_growth * 0.9999: + raise ObjectiveError( + f"The assigned minimal_growth of {min_growth} was not maintained during the simulation," + f" where the observed growth value was {simulated_growth}." + ) + if sol.status != "optimal": + display(sol) + return sol_dict + + +def minimizeFlux_withGrowth(model_util, min_growth, obj): + model_util.add_minimal_objective_cons(min_growth) + model_util.add_objective(obj, "min") + # print(model_util.model.objective) + # print([(cons.lb, cons.expression) for cons in model_util.model.constraints if "min" in cons.name]) + sol = model_util.model.optimize() + # print(sol.objective_value) + sol_dict = bioFlux_check(model_util.model, sol, min_growth=min_growth) + return sol, sol_dict + + +class MSMinimalMedia: + + @staticmethod + def _influx_objective(model_util, interacting): + rxns = ( + model_util.exchange_list() if interacting else model_util.transport_list() + ) + influxes = [] + for rxn in rxns: + if any( + ["e0" in met.id for met in rxn.reactants] + ): # this is essentially every exchange + influxes.append(rxn.reverse_variable) + elif any( + ["e0" in met.id for met in rxn.products] + ): # this captures edge cases or transporters + influxes.append(rxn.forward_variable) + else: + logger.critical( + f"The reaction {rxn} lacks exchange metabolites, which indicates an error." + ) + return influxes + + @staticmethod + def minimize_flux( + org_model, min_growth=None, environment=None, interacting=True, printing=True + ): + """minimize the total in-flux of exchange reactions in the model""" + if org_model.slim_optimize() == 0: + raise ObjectiveError( + f"The model {org_model.id} possesses an objective value of 0 in complete media, " + "which is incompatible with minimal media computations." 
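The `minimize_flux` routine above pins growth at a floor and minimizes the summed uptake (reverse) flux over exchanges. A condensed standalone sketch of the same formulation in plain cobrapy, assuming the bundled "textbook" E. coli core test model rather than `MSModelUtil`:

```python
# Sketch: minimize total influx subject to growth >= min_growth.
from cobra.io import load_model

model = load_model("textbook")  # small E. coli core model shipped with cobrapy
min_growth = 0.1

with model:
    # pin growth at or above the floor
    model.reactions.get_by_id("Biomass_Ecoli_core").lower_bound = min_growth
    # minimize summed uptake flux across all exchange reactions
    influx = sum(ex.reverse_variable for ex in model.exchanges)
    model.objective = model.problem.Objective(influx, direction="min")
    sol = model.optimize()
    min_media = {ex.id: -sol.fluxes[ex.id]
                 for ex in model.exchanges if sol.fluxes[ex.id] < 0}

print(min_media)  # exchange id -> minimal uptake rate
```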
+ ) + model_util = MSModelUtil(org_model, True) + model_util.add_medium(environment or model_util.model.medium) + # define the MILP + min_growth = ( + model_util.model.slim_optimize() + if min_growth is None + else min(min_growth, model_util.model.slim_optimize()) + ) + # min_flux = MSMinimalMedia._min_consumption_objective(model_util, interacting) + media_exchanges = MSMinimalMedia._influx_objective(model_util, interacting) + # parse the minimal media + sol, sol_dict = minimizeFlux_withGrowth( + model_util, min_growth, sum(media_exchanges) + ) + min_media = _exchange_solution(sol_dict) + total_flux = sum([abs(flux) for flux in min_media.values()]) + simulated_sol = verify(org_model, min_media) + if simulated_sol.status != "optimal": + raise FeasibilityError( + f"The simulation was not optimal, with a status of {simulated_sol.status}" + ) + if printing: + print( + f"The minimal flux media for {org_model.id} consists of {len(min_media)} compounds and a {total_flux} total influx," + f" with a growth value of {simulated_sol.objective_value}" + ) + return min_media, sol + + @staticmethod + def _min_consumption_objective(model_util, interacting): + rxns = ( + model_util.exchange_list() if interacting else model_util.transport_list() + ) + vars = {} + for rxn in rxns: + cons_name = rxn.id + "_bin" + if cons_name in model_util.model.constraints: + print( + f"The {cons_name} constraint already exists in " + f"{model_util.model.id} and thus is skipped.\n" + ) + continue + + # define the variable + var_name = rxn.id + "_ru" + if var_name in model_util.model.variables: + print( + f"The {var_name} variable already exists in " + f"{model_util.model.id} and thus is skipped.\n" + ) + continue + vars[rxn.id] = Variable(var_name, lb=0, ub=1, type="binary") + model_util.add_cons_vars([vars[rxn.id]]) + # bin_flux: {rxn_bin}*1000 >= {rxn_rev_flux} + model_util.create_constraint( + Constraint(Zero, lb=0, ub=None, name=cons_name), + coef={vars[rxn.id]: 1000, rxn.reverse_variable: -1}, + ) + return vars + + @staticmethod + def conserved_exchanges(): + pass + + @staticmethod + def relative_media(): + pass + + @staticmethod + def minimize_components( + org_model, + min_growth=0.1, + environment=None, + interacting=True, + solution_limit=5, + printing=True, + ): + """minimize the quantity of metabolites that are consumed by the model""" + if org_model.slim_optimize() == 0: + raise ObjectiveError( + f"The model {org_model.id} possesses an objective value of 0 in complete media, " + "which is incompatible with minimal media computations." 
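The `_min_consumption_objective` helper above is a standard big-M indicator construction: one binary per exchange is forced to 1 whenever that exchange carries uptake flux, so minimizing the sum of binaries minimizes the number of consumed compounds. A minimal sketch of that constraint using cobrapy's solver interface (the helper name is illustrative):

```python
# Sketch of the bin_flux trick: big_m * use - reverse_flux >= 0,
# so any uptake flux on rxn forces the binary "use" to 1.
def add_use_indicator(model, rxn, big_m=1000):
    use = model.problem.Variable(rxn.id + "_ru", lb=0, ub=1, type="binary")
    cons = model.problem.Constraint(
        big_m * use - rxn.reverse_variable, lb=0, name=rxn.id + "_bin"
    )
    model.add_cons_vars([use, cons])
    return use

# usage: count-of-consumed-compounds objective over all exchanges
# use_vars = [add_use_indicator(model, ex) for ex in model.exchanges]
# model.objective = model.problem.Objective(sum(use_vars), direction="min")
```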
+ ) + model_util = MSModelUtil(org_model, True) + model_util.add_timeout(10) + print("Minimal Components media") + if environment: + model_util.add_medium(environment) + # ic(org_model, min_growth, solution_limit) + model_util.add_minimal_objective_cons( + min_growth + ) # , model_util.model.reactions.bio1.flux_expression) + # print(model_util.model.constraints[-1]) + # define the binary variable and constraint + time1 = process_time() + variables = { + "ru": MSMinimalMedia._min_consumption_objective(model_util, interacting) + } + model_util.add_objective(sum(variables["ru"].values()), "min") + time2 = process_time() + print(f"\nDefinition of minimum objective time: {(time2 - time1)/60} mins") + + # determine each solution + # interdependencies = {} + solution_dicts, min_media = [], [0] * 1000 + sol = ( + model_util.model.optimize() + ) # TODO This is the troublesome line that occasionally refuses to solve + if "optimal" not in sol.status: + raise FeasibilityError( + f"The simulation for minimal uptake in {model_util.model.id} was {sol.status}." + ) + time3 = process_time() + broken = False + while ( + not broken + and sol.status == "optimal" + and len(solution_dicts) < solution_limit + ): + print(f"Iteration {len(solution_dicts)}", end="\r") + sol_dict = FBAHelper.solution_to_variables_dict(sol, model_util.model) + ## ensure that the minimal growth is respected + simulated_growth = _model_growth(sol_dict) + if simulated_growth < min_growth * 0.9999: + raise ObjectiveError( + f"The minimal growth of {min_growth} was not maintained; " + f"the simulation achieved {simulated_growth} growth." + ) + sol_rxns_dict = FBAHelper.solution_to_rxns_dict(sol, model_util.model) + solution_dicts.append(sol_dict) + sol_media = _exchange_solution(sol_rxns_dict) + min_media = sol_media if len(sol_media) < len(min_media) else min_media + ## omit the solution from future searches + model_util.create_constraint( + Constraint( + Zero, + lb=None, + ub=len(sol_dict) - 1, + name=f"exclude_sol{len(solution_dicts)}", + ), + sol_dict, + ) + + # search the permutation space by omitting previously investigated solution_dicts + # sol_exchanges = [rxn for rxn in sol_dict if "EX_" in rxn.name] + # interdependencies[count] = MSMinimalMedia._examine_permutations( + # model, sol_exchanges, variables, sol_dict, count, interacting) + try: + sol = model_util.model.optimize() + except: + broken = True + if broken: + break + if not solution_dicts: + raise FeasibilityError("The model was not feasibly simulated.") + min_media = {rxn: flux for rxn, flux in min_media.items()} + simulated_sol = verify(org_model, min_media) + if simulated_sol.status != "optimal": + raise FeasibilityError( + f"The predicted media {min_media} is not compatible with its model {org_model.id}, " + f"and possesses a(n) {simulated_sol.status} status." 
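`minimize_components` enumerates alternative minimal media by cutting off each incumbent integer solution: after every solve, a new constraint caps the sum of the currently active binaries at one less than their count, so the next solve must differ. A sketch of that exclusion loop, assuming `use_vars` are binary indicators like those in the previous sketch:

```python
# Sketch of the solution-exclusion ("no-good cut") loop.
def enumerate_uptake_sets(model, use_vars, limit=5):
    solutions = []
    sol = model.optimize()
    while sol.status == "optimal" and len(solutions) < limit:
        active = [v for v in use_vars if v.primal > 0.5]
        solutions.append({v.name for v in active})
        # forbid this exact support: at most len(active) - 1 may stay active
        cut = model.problem.Constraint(
            sum(active), ub=len(active) - 1, name=f"exclude_sol{len(solutions)}"
        )
        model.add_cons_vars([cut])
        sol = model.optimize()
    return solutions
```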
+ ) + time6 = process_time() + print(f"Optimization time: {(time6-time3)/60} mins") + return min_media + + @staticmethod + def _knockout(org_model, rxnVar, variables, sol_dict, sol_index, interacting): + # knockout the specified exchange + knocked_model_utl = MSModelUtil(org_model, True) + knocked_model_utl, vars = MSMinimalMedia._min_consumption_objective( + knocked_model_utl, interacting + ) + coef = {rxnVar: 0} + if interacting: + coef.update( + { + variables["ru"][_var_to_ID(rxnVar2)]: 1 + for rxnVar2 in sol_dict + if rxnVar != rxnVar2 and "EX_" in rxnVar2.name + } + ) + else: + coef.update( + { + variables["ru"][_var_to_ID(rxnVar2)]: 1 + for rxnVar2 in sol_dict + if ( + rxnVar != rxnVar2 + and any(["_e0" in met.id for met in rxnVar2.metabolites]) + ) + } + ) + knocked_model_utl.create_constraint( + Constraint(Zero, lb=0.1, ub=None, name=f"{rxnVar.name}-sol{sol_index}"), + coef, + ) + return knocked_model_utl.optimize() + + @staticmethod + def _examine_permutations( + model, exchange_ids_to_explore, variables, sol_dict, sol_index, interacting + ): + for index, ex in enumerate(exchange_ids_to_explore): + print( + f"{ex.name}: {index}/{len(exchange_ids_to_explore)-1} exchanges to explore" + ) + sol_dict_sans_ex = sol_dict.copy() + sol_dict_sans_ex.pop(ex) + # interdependencies[sol_index][exID] = MSMinimalMedia._examine_permutations( + # exID, sol_dict, sol_index, variables, sol_dict_sans_ex) + interdependencies = {} + + ## explore permutations after removing the selected variable + diff = DeepDiff( + sol_dict_sans_ex, + FBAHelper.solution_to_dict( + MSMinimalMedia._knockout( + model, ex, variables, sol_dict, sol_index, interacting + ) + ), + ) + if ( + diff + ): # the addition of new exchanges or altered exchange fluxes are detected after the removed exchange + print(diff) + for key, changes in diff.items(): + # for change in changes: + # print(change) + changed_reactions = [ + re.search("(?<=\[')(.+)(?='\])", change).group() + for change in changes + ] + # this dictionary should be parsed into a list of substitute metabolites and a list of functionally coupled reactions + for exchange in [rxn for rxn in changed_reactions if "EX_" in rxn]: + interdependencies[sol_index][exchange] = ( + MSMinimalMedia._examine_permutations( + model, + exchange_ids_to_explore, + variables, + sol_dict, + sol_index + 1, + interacting, + ) + ) + # coef = {variables["met"][exID]: 0 for cpd in new_mets.keys()} + # coef.update({variables["met"][exID]: 1 for exID in sol_dict if exID not in new_mets.keys()}) + # cpd_name = "_".join(new_mets.keys()) + new_sol = model.optimize() + new_sol_dict = FBAHelper.solution_to_variables_dict(new_sol, model) + new_sol_exchanges = [rxn for rxn in sol_dict if "EX_" in rxn.name] + if new_sol.status != "optimal": + return interdependencies + MSMinimalMedia._examine_permutations( + model, + new_sol_exchanges, + variables, + new_sol_dict, + sol_index + 1, + interacting, + ) + return interdependencies + + @staticmethod + def determine_min_media( + model, + minimization_method="minFlux", + min_growth=None, + environment=None, + interacting=True, + solution_limit=5, + printing=True, + ): + if minimization_method == "minFlux": + return MSMinimalMedia.minimize_flux( + model, min_growth, environment, interacting, printing + ) + if minimization_method == "minComponents": + return minimal_medium(model, min_growth, minimize_components=True) + # return MSMinimalMedia.minimize_components( + # model, min_growth, environment, interacting, solution_limit, printing) + if minimization_method == 
"jenga": + return MSMinimalMedia.jenga_method(model, printing=printing) + + @staticmethod + def comm_media_est( + models, + comm_model, + minimization_method="minComponents", + min_growth=0.1, + environment=None, + interacting=True, + n_solutions=5, + printing=False, + ): + media = {"community_media": {}, "members": {}} + # print("com_media_est") + for org_model in models: + model_util = MSModelUtil(org_model, True) + # print(model_util.model.optimize()) + if environment: + # print(environment) + model_util.add_medium(environment) + # reactions = [rxn.name for rxn in model.variables] + # duplicate_reactions = DeepDiff(sorted(set(reactions)), sorted(reactions)) + # if duplicate_reactions: + # logger.critical(f'CodeError: The model {model.id} contains {duplicate_reactions}' + # f' that compromise the model.') + media["members"][model_util.model.id] = { + "media": MSMinimalMedia.determine_min_media( + model_util.model, + minimization_method, + min_growth, + environment, + interacting, + n_solutions, + printing, + ), + "solution": FBAHelper.solution_to_dict(model_util.model.optimize()), + } + if minimization_method == "jenga": + media["community_media"] = FBAHelper.sum_dict( + media["members"][model_util.model.id]["media"], + media["community_media"], + ) + if comm_model: + comm_util = MSModelUtil(comm_model) + if environment: + comm_util.add_medium(environment) + # if minimization_method == "jenga": + # print("Community models are too excessive for direct assessment via the JENGA method; " + # "thus, the community minimal media is estimated as the combination of member media.") + # return media + media["community_media"] = MSMinimalMedia.determine_min_media( + comm_model, + minimization_method, + min_growth, + environment, + interacting, + n_solutions, + printing, + ) + return media + + @staticmethod + def interacting_comm_media( + models, + comm_model, + minimization_method="jenga", + min_growth=0.1, + media=None, + environment=None, + printing=True, + ): + # define the community minimal media + media = media or MSMinimalMedia.comm_media_est( + models, + comm_model, + min_growth, + minimization_method, + environment, + printing=printing, + ) + org_media = media["community_media"].copy() + original_time = process_time() + # remove exchanges that can be satisfied by cross-feeding + for model in models: + for rxnID, flux in media["members"][model.id]["solution"].items(): + if ( + rxnID in media["community_media"] and flux > 0 + ): ## outflux in solutions + stoich = list( + model.reactions.get_by_id(rxnID).metabolites.values() + )[0] + media["community_media"][rxnID] += ( + flux * stoich + ) ## the cytoplasmic removal is captured by negative reactant stoich + media["community_media"] = { + ID: flux for ID, flux in media["community_media"].items() if flux > 0 + } # influx in media + syntrophic_diff = DeepDiff(org_media, media["community_media"]) + changed_quantity = ( + 0 + if not syntrophic_diff + else len(list(chain(*[v for v in list(dict(syntrophic_diff).values())]))) + ) + if printing: + print( + f"Syntrophic fluxes examined after {(process_time() - original_time) / 60} minutes, " + f"with {changed_quantity} change(s): {syntrophic_diff}" + ) + return media + + @staticmethod + def jenga_method( + org_model, + org_media=None, + conserved_cpds: list = None, + export=True, + printing=True, + compatibilize=False, + environment=None, + ): + # copy and compatibilize the parameter objects + if org_model.slim_optimize() == 0: + raise ObjectiveError( + f"The model {org_model.id} possesses an objective 
value of 0 in complete media, "
+                "which is incompatible with minimal media computations."
+            )
+        copied_model = org_model.copy()
+        copied_model.medium = environment or copied_model.medium
+        if compatibilize:
+            copied_model = _compatibilize(copied_model)
+        original_media = org_media or MSMinimalMedia.minimize_components(copied_model)
+        # {cpd.replace("EX_", ""): flux for cpd, flux in .items()}
+
+        # identify removable compounds
+        original_time = process_time()
+        copied_model.medium = original_media
+        original_obj_value = org_model.optimize().objective_value
+        redundant_cpds = set()
+        for cpd in original_media:
+            new_media = original_media.copy()
+            new_media.pop(cpd)
+            copied_model.medium = new_media
+            sol_obj_val = copied_model.slim_optimize()
+            if isclose(sol_obj_val, original_obj_value, abs_tol=1e-4):
+                redundant_cpds.add(cpd)
+            else:
+                logger.debug(
+                    f"The {sol_obj_val} objective value after the removal of {cpd} "
+                    f"does not match the original objective value of {original_obj_value}."
+                )
+        if not redundant_cpds:
+            logger.debug(
+                "None of the media components were determined to be removable."
+            )
+            return original_media
+        if len(redundant_cpds) > 9:
+            import sigfig
+
+            num_permuts = sigfig.round(
+                factorial(len(redundant_cpds)), sigfigs=2, format="sci"
+            )
+            raise FeasibilityError(
+                f"The model {copied_model.id} contains {len(redundant_cpds)} removable"
+                f" compounds, which yields {num_permuts} permutations and is untenable for computation."
+                " Select a different minimal media method such as 'minFlux' or 'minComponents'."
+            )
+
+        # vet all permutation removals of the redundant compounds
+        permuts = [p for p in permutations(redundant_cpds)]
+        if printing:
+            print(
+                f"The {len(permuts)} permutations of the {redundant_cpds} redundant compounds, "
+                "determined with an absolute tolerance of 1e-4, will be examined."
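The redundancy screen above removes each media component one at a time and flags components whose removal leaves the objective value unchanged within tolerance. Condensed into a standalone helper (names are illustrative, behavior matches the loop above):

```python
# Sketch: single-knockout screen for removable media components.
from math import isclose

def redundant_components(model, media, ref_obj, tol=1e-4):
    redundant = set()
    for cpd in media:
        with model:  # bound changes from the medium setter revert on exit
            model.medium = {k: v for k, v in media.items() if k != cpd}
            if isclose(model.slim_optimize(), ref_obj, abs_tol=tol):
                redundant.add(cpd)
    return redundant
```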
+ ) + permut_results, failed_permut_starts = [], [] + best = 0 + for perm_index, permut in enumerate(permuts): + print(f"{perm_index+1}/{len(permuts)}", end="\r") + successful_removal = 0 + permut_segments = [permut[:index] for index in range(len(permut), 2, -1)] + ## eliminate previously discovered failures and successes, respectively + if any([seg in failed_permut_starts for seg in permut_segments]): + continue + if best >= len(permut) / 2 and any( + [ + set(permut[: best - 1]) == set(list(success)[: best - 1]) + for success in permut_results + ] + ): + continue + new_media = original_media.copy() + for cpd in permut: + ### parameterize and simulate the community + new_media.pop(cpd) + copied_model.medium = new_media + sol = copied_model.optimize() + if not isclose(sol.objective_value, original_obj_value, abs_tol=1e-7): + failed_permut_starts.append(permut[: successful_removal + 1]) + break + successful_removal += 1 + + if successful_removal >= best: + if successful_removal > best: + best = successful_removal + permut_results = [] + permut_removable = permut[ + :best + ] # slice only the elements that are removable + if permut_removable not in permut_results: + permut_results.append(permut_removable) + if printing: + print(permut_removable) + print("best:", best) + + # filter to only the most minimal media + unique_combinations, unique_paths = [], [] + for removal_path in permut_results: + path_permutations = permutations(removal_path) + if all([path in permut_results for path in path_permutations]): + for com in combinations(removal_path, len(removal_path)): + com = set(com) + if com not in unique_combinations: + unique_combinations.append(com) + else: + unique_paths.append(removal_path) + if unique_combinations and printing: + print("Unique combinations:") + print(len(unique_combinations), unique_combinations) + if unique_paths and printing: + print("Unique paths:") + print(len(unique_paths), unique_paths) + + # further remove compounds from the media, while defaulting to the removal with the largest ID values + best_removals = {} + possible_removals = unique_combinations + unique_paths + if conserved_cpds: + possible_removals = [ + opt + for opt in possible_removals + if not any(cpd in conserved_cpds for cpd in opt) + ] + best = -inf + for removal in possible_removals: + cpdID_sum = sum( + [ + int(cpd.split("_")[1].replace("cpd", "") if "cpd" in cpd else 500) + for cpd in removal + ] + ) + if cpdID_sum > best: + best = cpdID_sum + best_removals = {best: [removal]} + elif cpdID_sum == best: + best_removals[best].append(removal) + ## arbitrarily select the first removal from those that both maximize the summed cpdID and avoid conserved compounds + media = FBAHelper.remove_media_compounds( + original_media, list(best_removals.values())[0][0], printing + ) + if printing: + print(best_removals) + pprint(media) + + # communicate results + jenga_media = media.copy() + jenga_difference = DeepDiff(original_media, jenga_media) + changed_quantity = ( + 0 if not jenga_difference else len(list(jenga_difference.values())[0]) + ) + if printing: + print( + f"Jenga fluxes examined after {(process_time()-original_time)/60} minutes, " + f"with {changed_quantity} change(s): {jenga_difference}" + ) + if export: + export_name = copied_model.id + "_media.json" + with open(export_name, "w") as out: + json.dump(media, out, indent=3) + return media diff --git a/modelseedpy/core/msmodel.py b/modelseedpy/core/msmodel.py old mode 100644 new mode 100755 index baaa0315..5c38e501 --- a/modelseedpy/core/msmodel.py 
+++ b/modelseedpy/core/msmodel.py @@ -1,10 +1,8 @@ # -*- coding: utf-8 -*- import logging import re +import traceback from cobra.core import Model -from pyeda.inter import ( - expr, -) # wheels must be specially downloaded and installed for Windows https://www.lfd.uci.edu/~gohlke/pythonlibs/#pyeda logger = logging.getLogger(__name__) @@ -113,6 +111,7 @@ def get_set_set(expr_str): # !!! this currently returns dictionaries, not sets? return {frozenset({str(x) for x in dnf.inputs})} else: return {frozenset({str(x) for x in o.inputs}) for o in dnf.xs} + return {} class MSModel(Model): @@ -120,27 +119,27 @@ def __init__(self, id_or_model=None, genome=None, template=None): """ Class representation for a ModelSEED model. """ - super().__init__(self, id_or_model) + super().__init__(id_or_model) if genome: - self.genome_object = genome + self._genome = genome if template: - self.template_object = template + self._template = template @property def template(self): - return self.template_object + return self._template @template.setter def template(self, template): - self.template_object = template + self._template = template @property def genome(self): - return self.genome_object + return self._genome @genome.setter def genome(self, genome): - self.genome_object = genome + self._genome = genome def _set_genome_to_model(self, genome): # TODO: implement genome assignment checks if features matches genes diff --git a/modelseedpy/core/msmodelreport.py b/modelseedpy/core/msmodelreport.py new file mode 100644 index 00000000..2d980e38 --- /dev/null +++ b/modelseedpy/core/msmodelreport.py @@ -0,0 +1,636 @@ +# -*- coding: utf-8 -*- +import pandas as pd +import logging +import os +import re +import jinja2 +from os.path import dirname +from pandas.io.formats.style import Styler +from modelseedpy.core.msmodelutl import MSModelUtil + +module_path = dirname(os.path.abspath(__file__)) + +logger = logging.getLogger(__name__) +logger.setLevel( + logging.INFO +) # When debugging - set this to INFO then change needed messages below from DEBUG to INFO + + +class MSModelReport: + def __init__(self, model_or_mdlutl): + if isinstance(model_or_mdlutl, MSModelUtil): + self.model = model_or_mdlutl.model + self.modelutl = model_or_mdlutl + else: + self.model = model_or_mdlutl + self.modelutl = MSModelUtil.get(model_or_mdlutl) + + def generate_reports(self, report_path, multi_tab_report_path): + self.build_report(report_path) + self.build_multitab_report(multi_tab_report_path) + + # Helper function to build overview data + def build_overview_data(self): + # Get the number of compartments + number_compartments = len( + set([metabolite.compartment for metabolite in self.model.metabolites]) + ) + + # Extract gapfilling information + core_gapfilling_media = [] + gapfilling_media = [] + gf_sensitivity = self.modelutl.attributes.get("gf_sensitivity", None) + if gf_sensitivity: + for media in gf_sensitivity: + if ( + "bio1" in self.modelutl.attributes["gf_sensitivity"][media] + and "success" + in self.modelutl.attributes["gf_sensitivity"][media]["bio1"] + ): + gapfilling_media.append(media) + if ( + "rxn00062_c0" in self.modelutl.attributes["gf_sensitivity"][media] + and "success" + in self.modelutl.attributes["gf_sensitivity"][media]["rxn00062_c0"] + ): + core_gapfilling_media.append(media) + + # Count the number of gapfills + number_gapfills = len(gapfilling_media) + + # Convert the lists to strings + core_gapfilling_str = ( + "; ".join(core_gapfilling_media) + if core_gapfilling_media + else "No core gapfilling needed." 
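Hypothetical usage of the new report module defined in this file, assuming a loaded cobra model; the output paths are illustrative only:

```python
from modelseedpy.core.msmodelreport import MSModelReport

report = MSModelReport(model)  # accepts a cobra Model or an MSModelUtil
report.build_report("reports/model_summary.html")          # single-table summary
report.build_multitab_report("reports/model_full.html")    # tabbed full report
```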
+ ) + gapfilling_media_str = ( + "; ".join(gapfilling_media) + if gapfilling_media + else "No genome-scale gapfilling." + ) + + overview = { + "Model ID": self.model.id, + "Full Gapfilling and ATP Analysis Report": "TBD", # You may replace 'TBD' with actual data when available + "Genome Scale Template": self.model.notes.get( + "kbase_template_refs", "Data Not Available" + ), + "Core Gapfilling Media": core_gapfilling_str, + "Gapfilling Media": gapfilling_media_str, + "Source Genome": self.model.notes.get( + "kbase_genome_ref", "Data Not Available" + ), + "Total Number of reactions": self.modelutl.nonexchange_reaction_count(), + "Number compounds": len(self.model.metabolites), + "Number compartments": number_compartments, + "Number biomass": len( + [ + rxn + for rxn in self.model.reactions + if rxn.annotation.get("sbo") == "SBO:0000629" + ] + ), + "Number gapfills": number_gapfills, + } + return overview + + # Helper function for extracting gapfilling data + def extract_gapfilling_data(self, gf_sensitivity): + if gf_sensitivity is None: + return [], {} + + gapfilling_dict = {} + gapfilling_summary = {} + + for media, media_data in gf_sensitivity.items(): + for target, target_data in media_data.items(): + gf_data = target_data.get("success", {}) + if isinstance(gf_data, dict): + for reaction_id, reaction_data in gf_data.items(): + for direction, metabolites in reaction_data.items(): + # If metabolites is None, set to empty string + if metabolites is None: + metabolites = "" + + # Extract both IDs and Names for Gapfilling Sensitivity + sensitivity_ids = [] + sensitivity_names = [] + if isinstance(metabolites, (list, tuple)): + for met_id in metabolites: + sensitivity_ids.append(met_id) + met_name = ( + self.model.metabolites.get_by_id(met_id).name + if met_id in self.model.metabolites + else met_id + ) + sensitivity_names.append(met_name) + else: + metabolites = str(metabolites) + entry = { + "reaction_id": reaction_id, + "reaction_name": self.model.reactions.get_by_id( + reaction_id + ).name + if reaction_id in self.model.reactions + else reaction_id, + "media": media, + "direction": direction, + "target": target, + "gapfilling_sensitivity_id": "; ".join(sensitivity_ids) + if sensitivity_ids + else metabolites, + "gapfilling_sensitivity_name": "; ".join( + sensitivity_names + ) + if sensitivity_names + else metabolites, + } + + # Update the summary dictionary + if reaction_id not in gapfilling_summary: + gapfilling_summary[reaction_id] = [] + gapfilling_summary[reaction_id].append( + f"{media}: {direction}" + ) + + # Check if reaction_id is already in dictionary + if reaction_id in gapfilling_dict: + # Update the media + existing_entry = gapfilling_dict[reaction_id] + existing_media = existing_entry["media"].split("; ") + if media not in existing_media: + existing_media.append(media) + existing_entry["media"] = "; ".join(existing_media) + else: + gapfilling_dict[reaction_id] = entry + + return list(gapfilling_dict.values()), gapfilling_summary + + # transform data to be used in tabular format to use in build_model_report + def transform_gapfilling_data(self, gapfilling_data): + transformed_data = [] + for entry in gapfilling_data: + row = [ + entry["reaction_id"], + entry["reaction_name"], + entry["media"], + entry["direction"], + entry["target"], + entry["gapfilling_sensitivity_id"], + entry["gapfilling_sensitivity_name"], + ] + transformed_data.append(row) + return transformed_data + + # Extract ATP analysis data + def extract_atp_analysis_data(self, atp_analysis, atp_expansion_filter): + 
entries = [] + if atp_analysis and "core_atp_gapfilling" in atp_analysis: + for media, data in atp_analysis["core_atp_gapfilling"].items(): + score = data.get("score", None) + new_reactions = [ + "{}: {}".format(k, v) for k, v in data.get("new", {}).items() + ] + reversed_reactions = [ + "{}: {}".format(k, v) for k, v in data.get("reversed", {}).items() + ] + atp_production = "Not integrated" + if ( + "selected_media" in atp_analysis + and media in atp_analysis["selected_media"] + ): + atp_production = atp_analysis["selected_media"][media] + + # Extracting the "Filtered Reactions" in the required format + filtered_reactions = [] + for k, v in atp_expansion_filter.get(media, {}).items(): + if isinstance(v, dict): + for sub_k, sub_v in v.items(): + if isinstance(sub_v, dict): + for reaction, direction_dict in sub_v.items(): + direction = list(direction_dict.keys())[0] + filtered_reactions.append( + f"{reaction}: {direction}" + ) + filtered_reactions_str = "; ".join(filtered_reactions) + + if score is not None: + entries.append( + { + "media": media, + "no_of_gapfilled_reactions": score, + "atp_production": atp_production, + "gapfilled_reactions": "; ".join(new_reactions), + "reversed_reaction_by_gapfilling": "; ".join( + reversed_reactions + ), + "filtered_reactions": filtered_reactions_str, + } + ) + # Sorting the entries based on the 'no_of_gapfilled_reactions' column + entries.sort(key=lambda x: x["no_of_gapfilled_reactions"]) + return entries + + # Extract ATP production data for the ATP Analysis tab + def extract_atp_production_data(self, atp_analysis): + atp_production_dict = {} + if atp_analysis: + selected_media = atp_analysis.get("selected_media", {}) + core_atp_gapfilling = atp_analysis.get("core_atp_gapfilling", {}) + + # First, process selected_media + for media, value in selected_media.items(): + atp_production_dict[media] = round(value, 2) + + # Next, process core_atp_gapfilling for media not in selected_media + for media, data in core_atp_gapfilling.items(): + if media not in atp_production_dict: + if data.get("failed"): + atp_production_dict[media] = "failed" + else: + # If the media was not processed in selected_media and it's not failed, set as 'Not Integrated' + atp_production_dict[media] = "Not Integrated" + + return atp_production_dict + + def build_multitab_report(self, output_path): + + # Build overview data + overview_data = self.build_overview_data() + + # Get gf_sensitivity attribute from the model + gf_sensitivity = self.modelutl.attributes.get("gf_sensitivity", None) + + # Extract gapfilling data + gapfilling_entries, gapfilling_reaction_summary = self.extract_gapfilling_data( + gf_sensitivity + ) + + # Check if ATP_analysis attribute is present in the model + atp_analysis = self.modelutl.attributes.get("ATP_analysis", None) + if atp_analysis: + atp_expansion_filter = self.modelutl.attributes.get( + "atp_expansion_filter", {} + ) + atp_analysis_entries = self.extract_atp_analysis_data( + atp_analysis, atp_expansion_filter + ) + else: + atp_analysis_entries = [] + + # Initialize context dictionary + context = { + "overview": overview_data, + "reactions": [], + "compounds": [], + "genes": [], + "biomass": [], + "gapfilling": gapfilling_entries, # Populated with gapfilling data + "atpanalysis": atp_analysis_entries, # Populated with ATP analysis data + } + + print("Module Path:", module_path + "/../data/") + + exchanges = {r.id for r in self.model.exchanges} + + # Identify biomass reactions using SBO annotation + biomass_reactions_ids = { + rxn.id + for rxn in 
self.model.reactions + if rxn.annotation.get("sbo") == "SBO:0000629" + } + + # Reactions Tab + for rxn in self.model.reactions: + if rxn.id not in exchanges and rxn.id not in biomass_reactions_ids: + equation = rxn.build_reaction_string(use_metabolite_names=True) + rxn_data = { + "id": rxn.id, + "name": rxn.name, + "equation": equation, + "genes": rxn.gene_reaction_rule, + "gapfilling": "; ".join( + gapfilling_reaction_summary.get(rxn.id, []) + ), # Empty list results in an empty string + } + context["reactions"].append(rxn_data) + + # Compounds Tab + for cpd in self.model.metabolites: + cpd_data = { + "id": cpd.id, + "name": cpd.name, + "formula": cpd.formula, + "charge": cpd.charge, + "compartment": cpd.compartment, + } + context["compounds"].append(cpd_data) + + # Genes Tab + for gene in self.model.genes: + gene_data = { + "gene": gene.id, + "reactions": "; ".join([rxn.id for rxn in gene.reactions]), + } + context["genes"].append(gene_data) + + # Biomass Tab + if biomass_reactions_ids: + for biomass_rxn_id in biomass_reactions_ids: + biomass_rxn = self.model.reactions.get_by_id(biomass_rxn_id) + for metabolite, coefficient in biomass_rxn.metabolites.items(): + compound_id = metabolite.id + compound_name = metabolite.name.split("_")[0] + compartment = compound_id.split("_")[-1] + + biomass_data = { + "biomass_reaction_id": biomass_rxn.id, + "biomass_compound_id": compound_id, + "name": compound_name, + "coefficient": coefficient, + "compartment": compartment, + } + context["biomass"].append(biomass_data) + else: + print("No biomass reactions found in the model.") + + # Gapfilling Tab + gf_sensitivity = self.modelutl.attributes.get("gf_sensitivity", None) + gapfilling_data = self.extract_gapfilling_data(gf_sensitivity) + context["gapfilling"] = gapfilling_entries + + # Extract ATP Production Data + atp_production_data = self.extract_atp_production_data(atp_analysis) + + # Populate the 'atpanalysis' context with ATP production data + for entry in context["atpanalysis"]: + media = entry["media"] + entry["atp_production"] = atp_production_data.get(media, None) + + # Diagnostics + unique_biomass_rxns = biomass_reactions_ids + print(f"Unique biomass reactions identified: {len(unique_biomass_rxns)}") + print(f"Biomass Reaction IDs: {', '.join(unique_biomass_rxns)}") + + print("\nFirst 2 reactions:") + for rxn in context["reactions"][:2]: + print(rxn) + + print("\nFirst 2 compounds:") + for cpd in context["compounds"][:2]: + print(cpd) + + print("\nFirst 2 genes:") + for gene in context["genes"][:2]: + print(gene) + + print("\nFirst 2 biomass compounds:") + for bm in context["biomass"][:2]: + print(bm) + + print("\nFirst 2 gapfilling entries:") + for gf in context["gapfilling"][:2]: + print(gf) + + print("\nFirst 2 ATP Analysis entries:") + for entry in context["atpanalysis"][:2]: + print(entry) + + # Render with template + env = jinja2.Environment( + loader=jinja2.FileSystemLoader(module_path + "/../data/"), + autoescape=jinja2.select_autoescape(["html", "xml"]), + ) + html = env.get_template("ModelReportTemplate.html").render(context) + directory = dirname(output_path) + os.makedirs(directory, exist_ok=True) + with open(output_path, "w") as f: + f.write(html) + + def build_report(self, output_path): + """Builds model HTML report for the Model Summary table + Parameters + ---------- + model : cobra.Model + Model to use to build the report + """ + + # 1. 
Utilize the build_overview_data method + model_summary_data = self.build_overview_data() + # Remove the unwanted entry + model_summary_data.pop("Full Gapfilling and ATP Analysis Report", None) + # 2. Transform the dictionary into a list of tuples + model_summary_list = [(key, value) for key, value in model_summary_data.items()] + # 3. Convert to DataFrame + model_summary_df = pd.DataFrame(model_summary_list, columns=["", ""]) + + # Style the DataFrame (as was done previously) + model_summary_df_styled = model_summary_df.style.hide( + axis="index" + ).set_table_styles( + [ + { + "selector": "th", + "props": [ + ("border", "none"), + ("background-color", "white"), + ("font-family", "Oxygen"), + ("font-size", "14px"), + ("line-height", "20px"), + ], + }, + { + "selector": "td", + "props": [ + ("border", "none"), + ("font-family", "Oxygen"), + ("font-size", "14px"), + ("line-height", "20px"), + ], + }, + { + "selector": "tr:nth-child(even)", + "props": [("background-color", "white")], + }, + { + "selector": "tr:nth-child(odd)", + "props": [("background-color", "#f2f2f2")], + }, + ] + ) + + # Fetching the gapfilling sensitivity data + gf_sensitivity = self.modelutl.attributes.get("gf_sensitivity", None) + gapfilling_data = self.extract_gapfilling_data(gf_sensitivity) + gapfilling_list = self.transform_gapfilling_data(gapfilling_data[0]) + + # Convert the gapfilling_list to a DataFrame + gapfillings_analysis_df = pd.DataFrame( + gapfilling_list, + columns=[ + "Reaction ID", + "Reaction Name", + "Media", + "Direction", + "Target", + "Gapfilling Sensitivity ID", + "Gapfilling Sensitivity Name", + ], + ) + + # Apply style to Gapfillings Analysis DataFrame + gapfillings_analysis_df_styled = gapfillings_analysis_df.style.hide( + axis="index" + ).set_table_styles( + [ + { + "selector": "th", + "props": [ + ("border", "none"), + ("background-color", "white"), + ("font-family", "Oxygen"), + ("font-size", "14px"), + ("line-height", "20px"), + ], + }, + { + "selector": "td", + "props": [ + ("border", "none"), + ("font-family", "Oxygen"), + ("font-size", "14px"), + ("line-height", "20px"), + ], + }, + { + "selector": "tr:nth-child(even)", + "props": [("background-color", "white")], + }, + { + "selector": "tr:nth-child(odd)", + "props": [("background-color", "#f2f2f2")], + }, + ] + ) + + # Legend for Gapfillings Analysis + annotations_text_gapfillings = """ +
+        <ul>
+            <li><b>Reaction ID:</b> The identifier of the reaction.</li>
+            <li><b>Reaction Name:</b> The name of the reaction.</li>
+            <li><b>Media:</b> The media used for the gapfilling.</li>
+            <li><b>Direction:</b> The direction of the reaction. Can be ">" for forward, "<" for reverse, or "=" for both directions.</li>
+            <li><b>Target:</b> The reaction selected as the objective function target for the gapfilling optimization problem. Targets here can be the model's biomass reaction, commonly named "bio1" for models created by this app.
+            Alternatively, the "rxn00062" (ATP Production) reaction is shown for cases where gapfilling was applied to guarantee ATP production in a given media.
+            When reactions are gapfilled for ATP production, we recommend checking the full Core ATP Analysis in the table below.</li>
+            <li><b>Gapfilling Sensitivity ID and Name:</b> Gapfilling is necessary when compounds in the biomass objective function cannot be produced by the model.
+            For each reaction, we list the biomass compound(s) that cannot be synthesized by the model without gapfilling.
+            In cases where gapfilling fails, there are two possible scenarios:
+            1) FBF (failed before filtering): the gapfilling immediately failed, even before the ATP-breaking reactions were filtered out. This means the objective CANNOT be satisfied with the entire current database.
+            2) FAF (failed after filtering): the gapfilling succeeded before filtering, but failed after the reactions that break ATP were filtered out. This indicates that the ATP filtering caused the gapfilling to fail.</li>
+        </ul>
+ """ + + # Extract ATP analysis data + atp_analysis = self.modelutl.attributes.get("ATP_analysis", None) + atp_expansion_filter = self.modelutl.attributes.get("atp_expansion_filter", {}) + atp_analysis_entries = self.extract_atp_analysis_data( + atp_analysis, atp_expansion_filter + ) + + # Convert the atp_analysis_entries list to a DataFrame + atp_analysis_df = pd.DataFrame(atp_analysis_entries) + + # Apply style to ATP Analysis DataFrame + atp_analysis_df_styled = atp_analysis_df.style.hide( + axis="index" + ).set_table_styles( + [ + { + "selector": "th", + "props": [ + ("border", "none"), + ("background-color", "white"), + ("font-family", "Oxygen"), + ("font-size", "14px"), + ("line-height", "20px"), + ], + }, + { + "selector": "td", + "props": [ + ("border", "none"), + ("font-family", "Oxygen"), + ("font-size", "14px"), + ("line-height", "20px"), + ], + }, + { + "selector": "tr:nth-child(even)", + "props": [("background-color", "white")], + }, + { + "selector": "tr:nth-child(odd)", + "props": [("background-color", "#f2f2f2")], + }, + ] + ) + + # Legend for ATP Analysis + annotations_text_atp_analysis = """ +
+        <ul>
+            <li><b>No. of gapfilled reactions:</b> The number of reactions added by the gapfilling process.</li>
+            <li><b>Media:</b> The media in which the reaction takes place.</li>
+            <li><b>ATP Production:</b> ATP production by the core metabolism model.</li>
+            <li><b>Gapfilled Reactions:</b> Reactions added during the gapfilling process.</li>
+            <li><b>Reversed Reaction by Gapfilling:</b> Reactions that were reversed during the gapfilling process.</li>
+            <li><b>Filtered Reactions:</b> Reactions that were filtered out during the analysis. When adding a reaction would lead to a large increase in ATP production or an infinite energy loop, we filter that reaction out of the gapfilling database and prevent it from being added to the model.</li>
+        </ul>
+ """ + + # ATP analysis explanation text + explanation_text_atp_analysis = """ +

+        <p>During model reconstruction, we analyze the genome's core metabolism draft model (the model before gapfilling) to assess its energy biosynthesis capabilities.
+        The goal of this analysis is to ensure that the core metabolism model is able to produce ATP before we expand the model to the genome scale.
+        This step is designed to prevent gapfilling from introducing reactions that create energy-generating loops.
+        The tests are conducted on a large collection of minimal conditions, with the goal of simulating the model's capability to produce energy with different combinations of electron donors, electron acceptors, and carbon sources.</p>
+        <p>When the draft model of the core metabolism is capable of producing ATP in at least one of the test media, no gapfilling reactions from this analysis are added to the model. While we still report the gapfilling requirements for the test media formulations in which the draft core model fails to produce ATP, we only integrate these solutions into the model when no test media succeeds in producing ATP.
+        In this case, the integrated gapfilling solution(s) are displayed in the "Gapfilling Analysis" table above, with "rxn00062" (ATP Production) as the "Target" objective function.</p>
+        <p>The goal is to display the test results for all media to provide clues about the metabolic capabilities of the genome(s). When many reactions are required for growth on the SO4 test media conditions, this can be a good indicator that the organism is not capable of performing sulfate reduction.
+        On the other hand, when only one gapfilled reaction is required for ATP production in a given media, multiple scenarios can be considered:
+        1) the organism cannot grow on the test condition, and we correctly did not add the reaction to the model; 2) the source genome annotation may be missing a specific gene function; 3) there may be an issue with the model reconstruction database. We hope this data helps you make more informed decisions about reactions that may need to be manually curated in the model.
+        In cases where it is known from the literature or from unpublished experimental results that an organism is capable of producing ATP in a given media condition that requires gapfilling in this analysis, you can use the "Force ATP media" parameter in the reconstruction app to ensure those reactions are integrated into the model.</p>
+        """
+
+        # Save the data to HTML with the styled DataFrames and the legends
+        directory = os.path.dirname(output_path)
+        os.makedirs(directory, exist_ok=True)
+        with open(output_path, "w", encoding="utf-8") as f:
+            f.write('<meta charset="UTF-8">')
+            f.write("<h2>Model Summary</h2>")
+            f.write(model_summary_df_styled.to_html(escape=False))
+            f.write("<br><br>")
+            f.write("<h2>Gapfillings Analysis</h2>")
+
+            # Check for Gapfillings Analysis data
+            if not gapfillings_analysis_df.empty:
+                f.write(gapfillings_analysis_df_styled.to_html(escape=False))
+                f.write(f"<h3>Legend:</h3>{annotations_text_gapfillings}")
+            else:
+                f.write(
+                    "<p><i>Warning: No Gapfillings Analysis data available for this model.</i></p>"
+                )
+
+            f.write("<h2>Core ATP Analysis</h2>")
+
+            # Check for ATP Analysis data
+            if not atp_analysis_df.empty:
+                f.write(atp_analysis_df_styled.to_html(escape=False))
+                f.write(f"<h3>Legend:</h3>{annotations_text_atp_analysis}")
+                f.write(explanation_text_atp_analysis)
+            else:
+                f.write(
+                    "<p><i>Warning: No Core ATP Analysis data available for this model.</i></p>
" + ) diff --git a/modelseedpy/core/msmodelutl.py b/modelseedpy/core/msmodelutl.py old mode 100644 new mode 100755 index c9f5996a..aa6b099e --- a/modelseedpy/core/msmodelutl.py +++ b/modelseedpy/core/msmodelutl.py @@ -2,93 +2,534 @@ import logging import re import time +import json +import sys +import pandas as pd +import cobra from cobra import Model, Reaction, Metabolite +from optlang.symbolics import Zero +from cobra.flux_analysis import pfba from modelseedpy.fbapkg.mspackagemanager import MSPackageManager +from modelseedpy.biochem.modelseed_biochem import ModelSEEDBiochem +from modelseedpy.core.fbahelper import FBAHelper +from multiprocessing import Value + +# from builtins import None logger = logging.getLogger(__name__) +logger.setLevel( + logging.INFO +) # When debugging - set this to INFO then change needed messages below from DEBUG to INFO +core_rxns = { + "rxn00994_c0": "<", + "rxn00151_c0": ">", + "rxn24606_c0": ">", + "rxn00161_c0": ">", + "rxn14426_c0": ">", + "rxn00762_c0": "=", + "rxn05145_c0": ">", + "rxn00871_c0": ">", + "rxn01236_c0": "<", + "rxn05226_c0": ">", + "rxn01116_c0": "=", + "rxn00251_c0": "=", + "rxn05602_c0": "=", + "rxn09001_c0": ">", + "rxn00995_c0": ">", + "rxn14419_c0": ">", + "rxn14420_c0": ">", + "rxn24607_c0": "=", + "rxn00324_c0": "<", + "rxn01334_c0": "=", + "rxn05209_c0": "=", + "rxn00611_c0": "=", + "rxn00544_c0": "<", + "rxn01121_c0": ">", + "rxn03249_c0": "=", + "rxn00392_c0": "=", + "rxn05581_c0": "=", + "rxn00990_c0": ">", + "rxn00985_c0": "=", + "sul00004_c0": "=", + "rxn00160_c0": ">", + "rxn00615_c0": ">", + "rxn09003_c0": ">", + "rxn00083_c0": ">", + "rxn05493_c0": "=", + "rxn00248_c0": "=", + "rxn00678_c0": "=", + "rxn00558_c0": "=", + "rxn02376_c0": "=", + "rxn24608_c0": ">", + "rxn14424_c0": ">", + "rxn09174_c0": "=", + "rxn03250_c0": "=", + "rxn00162_c0": ">", + "rxn00549_c0": ">", + "rxn00779_c0": ">", + "rxn05573_c0": ">", + "rxn00506_c0": ">", + "rxn14425_c0": ">", + "rxn01872_c0": "=", + "rxn01996_c0": "=", + "rxn00507_c0": ">", + "rxn08528_c0": "=", + "rxn24609_c0": "=", + "rxn03884_c0": ">", + "rxn05488_c0": "=", + "rxn03079_c0": "=", + "rxn24610_c0": "=", + "rxn00178_c0": ">", + "rxn08793_c0": ">", + "rxn01130_c0": ">", + "rxn00512_c0": "<", + "rxn08355_c0": ">", + "rxn02342_c0": ">", + "rxn02314_c0": "=", + "rxn39373_c0": "=", + "rxn31759_c0": "=", + "rxn11937_c0": "<", + "rxn46184_c0": "=", + "rxn01123_c0": ">", + "rxn14421_c0": ">", + "rxn00379_c0": ">", + "rxn08734_c0": ">", + "rxn00668_c0": "=", + "rxn14418_c0": ">", + "rxn10570_c0": "=", + "rxn05553_c0": ">", + "rxn09295_c0": ">", + "rxn05759_c0": "=", + "rxn01343_c0": ">", + "rxn00545_c0": ">", + "rxn00250_c0": "=", + "rxn00785_c0": "=", + "rxn00305_c0": ">", + "rxn01387_c0": "=", + "rxn00974_c0": "=", + "rxn00604_c0": ">", + "rxn00875_c0": ">", + "rxn05528_c0": ">", + "rxn00623_c0": "<", + "rxn13974_c0": "<", + "rxn00770_c0": "=", + "rxn08900_c0": ">", + "rxn05468_c0": ">", + "rxn00199_c0": ">", + "rxn00499_c0": "=", + "rxn06493_c0": "=", + "rxn01275_c0": ">", + "rxn14412_c0": ">", + "rxn01106_c0": "=", + "rxn08428_c0": "=", + "rxn00777_c0": "=", + "rxn03644_c0": "=", + "rxn14414_c0": ">", + "rxn01480_c0": "=", + "rxn06526_c0": "=", + "rxn00543_c0": "=", + "rxn01115_c0": ">", + "rxn01870_c0": "=", + "rxn00677_c0": "=", + "rxn00799_c0": "=", + "rxn08975_c0": ">", + "rxn03240_c0": "=", + "rxn05312_c0": "<", + "rxn08558_c0": ">", + "sul00008_c0": ">", + "rxn01187_c0": ">", + "rxn00171_c0": "=", + "rxn15383_c0": ">", + "rxn00224_c0": "=", + "rxn03127_c0": "=", + "rxn01834_c0": 
"=", + "rxn24613_c0": "=", + "rxn14428_c0": "<", + "rxn08689_c0": "=", + "rxn02527_c0": ">", + "rxn00336_c0": ">", + "rxn05040_c0": ">", + "rxn08783_c0": ">", + "rxn14427_c0": ">", + "rxn00616_c0": "=", + "rxn05313_c0": ">", + "rxn03020_c0": "=", + "rxn11322_c0": "=", + "rxn00206_c0": "<", + "rxn09167_c0": ">", + "rxn10122_c0": ">", + "rxn00763_c0": "=", + "rxn06299_c0": "=", + "rxn05561_c0": "=", + "rxn08966_c0": "=", + "rxn10471_c0": "=", + "rxn15962_c0": "<", + "rxn00786_c0": "=", + "rxn00157_c0": "<", + "rxn00216_c0": "=", + "rxn00077_c0": "=", + "rxn01241_c0": "=", + "rxn01100_c0": "=", + "rxn00748_c0": ">", + "rxn00935_c0": "=", + "rxn00548_c0": "=", + "rxn08557_c0": ">", + "rxn05466_c0": "=", + "rxn08655_c0": ">", + "rxn00441_c0": ">", + "rxn01476_c0": ">", + "rxn02168_c0": "=", + "rxn00569_c0": "<", + "rxn17445_c0": ">", + "rxn01274_c0": ">", + "rxn00006_c0": "<", + "rxn08792_c0": ">", + "rxn08691_c0": "=", + "sul00003_c0": "=", + "rxn04794_c0": "=", + "rxn00568_c0": "<", + "rxn00225_c0": "=", + "rxn09318_c0": "=", + "rxn01057_c0": "=", + "rxn00247_c0": ">", + "rxn00285_c0": "=", + "rxn09004_c0": "=", + "rxn24612_c0": "=", + "rxn00371_c0": ">", + "rxn00159_c0": ">", + "rxn01333_c0": "=", + "rxn01388_c0": "=", + "rxn02480_c0": "=", + "rxn02167_c0": ">", + "rxn08971_c0": ">", + "rxn00612_c0": "=", + "rxn01806_c0": ">", + "rxn00148_c0": "<", + "rxn00122_c0": ">", + "rxn05469_c0": "=", + "rxn00265_c0": ">", + "rxn00330_c0": "<", + "rxn00602_c0": "<", + "rxn08179_c0": ">", + "rxn09269_c0": ">", + "rxn01200_c0": "=", + "rxn08556_c0": ">", + "rxn05627_c0": ">", + "rxn08656_c0": ">", + "rxn00097_c0": "=", + "rxn05319_c0": "=", + "rxn03085_c0": "=", + "rxn08178_c0": ">", + "rxn00747_c0": "=", + "rxn05559_c0": "=", + "rxn09314_c0": ">", + "rxn15961_c0": "=", + "rxn08976_c0": ">", + "rxn00172_c0": "<", + "rxn00868_c0": "<", + "rxn08173_c0": "=", + "rxn00102_c0": "=", + "rxn09272_c0": ">", + "rxn03126_c0": "=", + "sul00002_c0": "=", + "rxn01871_c0": "<", + "rxn00500_c0": "=", + "rxn00175_c0": ">", + "rxn00459_c0": "=", + "rxn24611_c0": "=", + "rxn09008_c0": "=", + "rxn00173_c0": "=", + "rxn33011_c0": "=", + "rxn08901_c0": ">", + "rxn00782_c0": "<", + "rxn03643_c0": "=", + "rxn08527_c0": "=", + "rxn00869_c0": "<", + "rxn05651_c0": "=", + "rxn10126_c0": ">", + "rxn00874_c0": "=", + "rxn10577_c0": ">", + "rxn00001_c0": ">", + "sul00010_c0": ">", + "rxn05625_c0": "=", + "rxn00670_c0": "=", + "rxn00147_c0": ">", + "rxn00288_c0": ">", + "rxn06777_c0": "=", + "rxn01452_c0": "<", + "rxn08518_c0": ">", + "rxn14422_c0": ">", + "rxn01477_c0": ">", + "rxn08350_c0": "=", + "rxn00256_c0": "<", + "rxn08977_c0": ">", + "rxn00781_c0": "=", + "rxn05467_c0": "=", + "rxn00011_c0": "<", + "rxn39175_c0": "=", + "rxn14423_c0": ">", + "rxn40505_c0": "=" +} -def metabolite_msid(metabolite): - if re.search("^(cpd\d+)", metabolite.id): - m = re.search("^(cpd\d+)", metabolite.id) - return m[1] - for anno in metabolite.annotation: - if isinstance(metabolite.annotation[anno], list): - for item in metabolite.annotation[anno]: - if re.search("^(cpd\d+)", item): - m = re.search("^(cpd\d+)", item) - return m[1] - elif re.search("^(cpd\d+)", metabolite.annotation[anno]): - m = re.search("^(cpd\d+)", metabolite.annotation[anno]) - return m[1] - return None - - -def reaction_msid(reaction): - if re.search("^(rxn\d+)", reaction.id): - m = re.search("^(rxn\d+)", reaction.id) - return m[1] - for anno in reaction.annotation: - if isinstance(reaction.annotation[anno], list): - for item in reaction.annotation[anno]: - if 
re.search("^(rxn\d+)", item): - m = re.search("^(rxn\d+)", item) - return m[1] - elif re.search("^(rxn\d+)", reaction.annotation[anno]): - m = re.search("^(rxn\d+)", reaction.annotation[anno]) +class MSModelUtil: + mdlutls = {} + + @staticmethod + def metabolite_msid(metabolite): + if re.search("^(cpd\d+)", metabolite.id): + m = re.search("^(cpd\d+)", metabolite.id) return m[1] - return None + for anno in metabolite.annotation: + if isinstance(metabolite.annotation[anno], list): + for item in metabolite.annotation[anno]: + if re.search("^(cpd\d+)", item): + m = re.search("^(cpd\d+)", item) + return m[1] + elif re.search("^(cpd\d+)", metabolite.annotation[anno]): + m = re.search("^(cpd\d+)", metabolite.annotation[anno]) + return m[1] + return None + @staticmethod + def reaction_msid(reaction): + if re.search("^(rxn\d+)", reaction.id): + m = re.search("^(rxn\d+)", reaction.id) + return m[1] + for anno in reaction.annotation: + if isinstance(reaction.annotation[anno], list): + for item in reaction.annotation[anno]: + if re.search("^(rxn\d+)", item): + m = re.search("^(rxn\d+)", item) + return m[1] + elif re.search("^(rxn\d+)", reaction.annotation[anno]): + m = re.search("^(rxn\d+)", reaction.annotation[anno]) + return m[1] + return None -def stoichiometry_to_string(stoichiometry): - reactants = [] - products = [] - for met in stoichiometry: - coef = stoichiometry[met] - if not isinstance(met, str): - if metabolite_msid(met) == "cpd00067": - met = None - else: - met = met.id - if met != None: - if coef < 0: - reactants.append(met) - else: - products.append(met) - reactants.sort() - products.sort() - return [ - "+".join(reactants) + "=" + "+".join(products), - "+".join(products) + "=" + "+".join(reactants), - ] + @staticmethod + def stoichiometry_to_string(stoichiometry): + reactants = [] + products = [] + for met in stoichiometry: + coef = stoichiometry[met] + if not isinstance(met, str): + if MSModelUtil.metabolite_msid(met) == "cpd00067": + met = None + else: + met = met.id + if met != None: + if coef < 0: + reactants.append(met) + else: + products.append(met) + reactants.sort() + products.sort() + return [ + "+".join(reactants) + "=" + "+".join(products), + "+".join(products) + "=" + "+".join(reactants), + ] + @staticmethod + def search_name(name): + name = name.lower() + name = re.sub(r"_[a-z]\d*$", "", name) + name = re.sub(r"\W+", "", name) + return name -def search_name(name): - name = name.lower() - name = re.sub(r"_[a-z]\d*$", "", name) - name = re.sub(r"\W+", "", name) - return name + @staticmethod + def get(model, create_if_missing=True): + if isinstance(model, MSModelUtil): + return model + if model in MSModelUtil.mdlutls: + return MSModelUtil.mdlutls[model] + elif create_if_missing: + MSModelUtil.mdlutls[model] = MSModelUtil(model) + return MSModelUtil.mdlutls[model] + else: + return None + + @staticmethod + def build_from_kbase_json_file(filename, kbaseapi): + """ + Builds an MSModelUtil object from a KBase JSON file. + + Args: + filename (str): The path to the KBase JSON file. + kbaseapi (KBaseAPI): An instance of the KBase API. + Returns: + An MSModelUtil object representing the contents of the KBase JSON file. 
+ """ + factory = kbaseapi.KBaseObjectFactory() + model = factory.build_object_from_file(filename, "KBaseFBA.FBAModel") + return MSModelUtil(model) -class MSModelUtil: def __init__(self, model): self.model = model self.pkgmgr = MSPackageManager.get_pkg_mgr(model) + self.wsid = None self.atputl = None self.gfutl = None self.metabolite_hash = None self.search_metabolite_hash = None self.test_objective = None + self.reaction_scores = None self.score = None + self.breaking_reaction = None + self.integrated_gapfillings = [] + self.attributes = {} + self.atp_tests = None + self.reliability_scores = None + if hasattr(self.model, "computed_attributes"): + if self.model.computed_attributes: + self.attributes = self.model.computed_attributes + if "pathways" not in self.attributes: + self.attributes["pathways"] = {} + if "auxotrophy" not in self.attributes: + self.attributes["auxotrophy"] = {} + if "fbas" not in self.attributes: + self.attributes["fbas"] = {} + ########I/O functions + @staticmethod + def from_cobrapy_json(filename): + model = cobra.io.load_json_model(filename) + return MSModelUtil(model) + + def save_model(self, filename): + """ + Saves the associated cobrapy model to a json file + + Parameters + ---------- + filename: name of the file the model should be saved to + """ + cobra.io.save_json_model(self.model, filename) + def printlp(self, lpfilename="debug.lp"): with open(lpfilename, "w") as out: out.write(str(self.model.solver)) + def print_solutions(self, solution_hash,filename="reaction_solutions.csv"): + records = [] + for rxn in self.model.reactions: + record = {"id":rxn.id,"name":rxn.name,"equation":rxn.build_reaction_string(use_metabolite_names=True)} + records.append(record) + for key in solution_hash: + record[key] = solution_hash[key].fluxes[rxn.id] + df = pd.DataFrame.from_records(records) + df.to_csv(filename) + + ########FBA utility functions + def set_media(self, media): + """ + Sets the media of the model from a media object or dictionary + + Parameters + ---------- + media: MSMedia object | dict : media object or dictionary with media formulation + """ + if isinstance(media, dict): + from modelseedpy.core.msmedia import MSMedia + media = MSMedia.from_dict(media) + self.pkgmgr.getpkg("KBaseMediaPkg").build_package(media) + + ########Functions related to ATP gapfilling method + def get_atputl(self,atp_media_filename=None,core_template=None,gapfilling_delta=0,max_gapfilling=0,forced_media=[],remake_atputil=False): + """ + Returns and creates, if needed, an atp correction object for the model + + Parameters + ---------- + core_template (optional) : MSTemplate object with core reactions + atp_media_filename (optional) : string to tsv file with ATP media formulations + gapfilling_delta (optional) : maximum difference in gapfilling to accept ATP condition + max_gapfilling (optional) : maximum gapfilling allowable to accept an ATP growth condition + forced_media (optional) : list of media in which model MUST make ATP + + Returns + ------- + MSATPCorrection : Object for ATP correction + + Raises + ------ + """ + if not self.atputl or remake_atputil: + from modelseedpy.core.msatpcorrection import MSATPCorrection + self.atputl = MSATPCorrection( + self,core_template,[], + load_default_medias=True, + max_gapfilling=max_gapfilling, + gapfilling_delta=gapfilling_delta, + forced_media=forced_media, + default_media_path=atp_media_filename + ) + self.atputl = MSATPCorrection(self.model) + return self.atputl + + def 
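
As a usage aside, here is a minimal, hypothetical round trip through the I/O helpers defined in this class (`from_cobrapy_json`, `set_media`, `save_model`). The file paths and the media formulation are placeholders:

```python
# Hypothetical usage of the MSModelUtil I/O helpers defined in this class.
from modelseedpy.core.msmodelutl import MSModelUtil

# from_cobrapy_json wraps cobra.io.load_json_model and returns an MSModelUtil.
mdlutl = MSModelUtil.from_cobrapy_json("e_coli_core.json")

# set_media accepts either an MSMedia object or a plain dict, which it converts
# via MSMedia.from_dict before applying the KBaseMediaPkg constraints.
mdlutl.set_media({"glc__D": (-10, 1000), "o2": (-1000, 1000)})

print(mdlutl.model.slim_optimize())  # objective value under the applied media
mdlutl.save_model("e_coli_core.constrained.json")
```
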
get_atp_tests(self,core_template=None,atp_media_filename=None,recompute=False,remake_atputil=False): + """ + Attempts to get ATP tests from attributes and failing that compute denovo using MSATPCorrection + + Parameters + ---------- + core_template (optional) : MSTemplate object with core reactions + atp_media_filename (optional) : string to tsv file with ATP media formulations + + Returns + ------- + list<{"media":obj media,"is_max_threshold":bool,"threshold":float,"objective":string}> + List of test specifications + + Raises + ------ + """ + #Creating MSATPCorrection object which we need regardless + atpcorrection = self.get_atputl(core_template=core_template,atp_media_filename=atp_media_filename,remake_atputil=remake_atputil) + #Returning cached tests if available + if self.atp_tests and not recompute: + return self.atp_tests + #Attempting to pull ATP tests from attributes + if not recompute: + print("Getting tests from attributes") + atp_analysis = self.get_attributes("ATP_analysis",None) + if atp_analysis: + if "tests" in atp_analysis: + self.atp_tests = [] + for item in atp_analysis["tests"]: + if item in atpcorrection.media_hash: + self.atp_tests.append({ + "media":atpcorrection.media_hash[item], + "is_max_threshold":True, + "threshold":atp_analysis["tests"][item]["threshold"], + "objective":atp_analysis["tests"][item]["objective"] + }) + return self.atp_tests + else: + logger.warning("tests attribute missing in ATP analysis. Must recalculate ATP tests!") + else: + logger.warning("ATP analysis attributes missing. Must recalculate ATP tests!") + #If recompute called for or if attributes are missing, recompute tests + if not core_template: + logger.warning("Cannot recompute ATP tests without a core template!") + return None + self.atp_tests = atpcorrection.build_tests() + return self.atp_tests + + def compute_automated_reaction_scores(self): + """ + Computes reaction scores automatically from model data + :return: + """ + self.reaction_scores = {} + def build_metabolite_hash(self): self.metabolite_hash = {} self.search_metabolite_hash = {} for met in self.model.metabolites: + if len(met.id.split("_")) == 2: + self.add_name_to_metabolite_hash(met.id.split("_")[0],met) self.add_name_to_metabolite_hash(met.id, met) self.add_name_to_metabolite_hash(met.name, met) for anno in met.annotation: @@ -101,33 +542,47 @@ def build_metabolite_hash(self): def add_name_to_metabolite_hash(self, name, met): if name not in self.metabolite_hash: self.metabolite_hash[name] = [] - self.metabolite_hash[name].append(met) - sname = search_name(name) + if met not in self.metabolite_hash[name]: + self.metabolite_hash[name].append(met) + sname = MSModelUtil.search_name(name) if sname not in self.search_metabolite_hash: self.search_metabolite_hash[sname] = [] - self.search_metabolite_hash[sname].append(met) + if met not in self.search_metabolite_hash[sname]: + self.search_metabolite_hash[sname].append(met) - def find_met(self, name): + def find_met(self, name, compartment=None): if self.metabolite_hash == None: self.build_metabolite_hash() if name in self.metabolite_hash: - return self.metabolite_hash[name] - sname = search_name(name) + if not compartment: + return self.metabolite_hash[name] + for met in self.metabolite_hash[name]: + array = met.id.split("_") + if array[1] == compartment or met.compartment == compartment: + return [met] + return [] + sname = MSModelUtil.search_name(name) if sname in self.search_metabolite_hash: - return self.search_metabolite_hash[sname] - logger.info(name, " not found in 
model!") + if not compartment: + return self.search_metabolite_hash[sname] + for met in self.search_metabolite_hash[sname]: + array = met.id.split("_") + if array[1] == compartment or met.compartment == compartment: + return [met] + return [] + logger.info(name + " not found in model!") return [] def rxn_hash(self): output = {} for rxn in self.model.reactions: - strings = stoichiometry_to_string(rxn.metabolites) + strings = MSModelUtil.stoichiometry_to_string(rxn.metabolites) output[strings[0]] = [rxn, 1] output[strings[1]] = [rxn, -1] return output def find_reaction(self, stoichiometry): - output = stoichiometry_to_string(stoichiometry) + output = MSModelUtil.stoichiometry_to_string(stoichiometry) atpstring = output[0] rxn_hash = self.rxn_hash() if atpstring in rxn_hash: @@ -137,7 +592,7 @@ def find_reaction(self, stoichiometry): def msid_hash(self): output = {} for cpd in self.model.metabolites: - msid = metabolite_msid(cpd) + msid = MSModelUtil.metabolite_msid(cpd) if msid != None: if msid not in output: output[msid] = [] @@ -151,6 +606,130 @@ def exchange_list(self): exchange_reactions.append(reaction) return exchange_reactions + def nonexchange_reaction_count(self): + count = 0 + for reaction in self.model.reactions: + if ( + reaction.id[:3] != "EX_" + and reaction.id[:3] != "SK_" + and reaction.id[:3] != "DM_" + and reaction.id[:3] != "bio" + ): + if reaction.upper_bound > 0 or reaction.lower_bound < 0: + count += 1 + return count + + def reaction_scores(self): + return {} + + ################################################################################# + # Functions related to phenotype simultion + # Design philosophy: the phenotype types should be aware of phenotype data and + # agnostic to the model, so this code handles how to simulate a phenotype in a + # model. This code sets the model objective based on the phenotype type and adds + # the appropriate exchange reactions. 
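
To make the design philosophy above concrete, here is a hedged sketch of the phenotype object this code expects. Only the attributes that `set_objective_from_phenotype` below actually reads are stubbed, and the IDs and values are hypothetical:

```python
# Hypothetical stand-in for a phenotype object, carrying only the attributes
# that set_objective_from_phenotype reads: id, type, additional_compounds.
from types import SimpleNamespace

growth_phenotype = SimpleNamespace(
    id="pheno1",
    type="growth",                       # sets the objective to "bio1"
    additional_compounds=[],
)
uptake_phenotype = SimpleNamespace(
    id="pheno2",
    type="uptake",                       # maximizes uptake via an exchange
    additional_compounds=["cpd00027"],   # D-glucose, by ModelSEED ID
)

# With an MSModelUtil instance in hand, the call would look like:
# objective_str = mdlutl.set_objective_from_phenotype(uptake_phenotype)
```
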
+ ################################################################################# + def set_objective_from_phenotype(self,phenotype,missing_transporters=[],create_missing_compounds=False): + if phenotype.type == "growth": + if "bio1" in self.model.reactions: + self.model.objective = "bio1" + else: + logger.critical(phenotype.id+": growth phenotype but could not find biomass reaction!") + return None + if phenotype.type == "uptake" or phenotype.type == "excretion": + uptake = excretion = 0 + if phenotype.type == "uptake": + uptake = 1000 + else: + excretion = 1000 + if len(phenotype.additional_compounds) == 0: + logger.critical(phenotype.id+": can't set uptake or excretion objective without additional compounds specified!") + return None + first = True + for cpd in phenotype.additional_compounds: + exid = "EX_"+cpd+"_e0" + if exid not in self.model.reactions: + exid = "EX_"+cpd+"_c0" + if exid not in self.model.reactions: + exmets = self.find_met(cpd,"c0") + if len(exmets) == 0: + if create_missing_compounds: + exmets = [Metabolite(cpd+"_c0",name=cpd+"_c0",compartment="c0")] + self.model.add_metabolites(exmets) + else: + logger.warning(phenotype.id+": could not find metabolite for "+cpd) + return None + self.add_exchanges_for_metabolites(exmets,uptake=uptake,excretion=excretion) + missing_transporters.append(cpd) + if first: + self.model.objective = exid + first = False + else: + self.model.objective += exid + if phenotype.type == "excretion": + for reaction in self.model.reactions: + if reaction.objective_coefficient != 0: + reaction.objective_coefficient = -1*reaction.objective_coefficient + self.model.objective.direction = 'max' + return str(self.model.objective) + + ################################################################################# + # Functions related to exchanges and transport reactions + ################################################################################# + def add_transport_and_exchange_for_metabolite(self, met,direction="=",prefix="trans",override=False): + #If met is a string, attempt to find the associated metabolite + if isinstance(met,str): + mets = self.find_met(met) + if len(mets) == 0: + logger.critical("Metabolite "+met+" not found in model") + return None + met = mets[0] + #Breaking down the ID to see the compartment and index - ID must take form _ + output = MSModelUtil.parse_id(met) + if not output: + logger.critical("Transport metabolite ID " + met.id + " not in proper format") + return None + (baseid,compartment,index) = output + #Checking if exchange already exists + if baseid+"_e0" in self.model.metabolites and not override: + logger.critical("Transport reaction appears to already exist for " + met.id+". 
Override if transport still desired.") + return None + elif baseid+"_e0" not in self.model.metabolites: + exmet = Metabolite(baseid+"_e0",name=met.name+"_e0",compartment="e0",charge=met.charge,formula=met.formula) + self.model.add_metabolites([exmet]) + else: + exmet = self.model.metabolites.get_by_id(baseid+"_e0") + #Checking charge so transport will be charge balanced + hmet = None + exhmet = None + if met.charge != 0: + #Finding H+ compound in model: + output = self.find_met("cpd00067",compartment+str(index)) + if len(output) > 0: + hmet = output[0] + output = self.find_met("cpd00067","e0") + if len(output) > 0: + exhmet = output[0] + if not hmet or not exhmet: + logger.warning("No H+ metabolite found in model") + stoich = {met:-1,exmet:1} + if met.charge != 0 and hmet and exhmet: + stoich[hmet] = met.charge + stoich[exhmet] = -1*met.charge + transport = Reaction(prefix + met.id + "_"+compartment+str(index)) + transport.name = "Charge-nuetral transport for " + met.name + transport.add_metabolites(stoich) + transport.annotation["sbo"] = "SBO:0000185" + transport.upper_bound = 0 + transport.lower_bound = 0 + if direction == ">" or direction == "=": + transport.upper_bound = 1000 + if direction == "<" or direction == "=": + transport.lower_bound = -1000 + self.model.add_reactions([transport]) + self.add_exchanges_for_metabolites([exmet],0,1000) + return transport + def exchange_hash(self): exchange_reactions = {} exlist = self.exchange_list() @@ -161,7 +740,7 @@ def exchange_hash(self): else: logger.warn("Nonstandard exchange reaction ignored:" + reaction.id) return exchange_reactions - + def add_missing_exchanges(self, media): output = [] exchange_hash = self.exchange_hash() @@ -207,8 +786,231 @@ def add_exchanges_for_metabolites( self.model.add_reactions(drains) return drains - def reaction_scores(self): - return {} + ################################################################################# + # Functions related to editing the model + ################################################################################# + def get_attributes(self, key=None, default=None): + if not key: + return self.attributes + if key not in self.attributes: + self.attributes[key] = default + return self.attributes[key] + + def save_attributes(self, value=None, key=None): + if value: + if key: + self.attributes[key] = value + else: + self.attributes = value + if hasattr(self.model, "computed_attributes"): + logger.info("Setting FBAModel computed_attributes to mdlutl attributes") + self.attributes["gene_count"] = len(self.model.genes) + self.model.computed_attributes = self.attributes + + def add_ms_reaction(self, rxn_dict, compartment_trans=["c0", "e0"]): + modelseed = ModelSEEDBiochem.get() + output = [] + for rxnid, compartment in rxn_dict.items(): + fullid = rxnid + "_" + compartment + modelseed_reaction = modelseed.get_seed_reaction(rxnid) + reaction_stoich = modelseed_reaction.cstoichiometry + cobra_reaction = Reaction(fullid) + output.append(cobra_reaction) + cobra_reaction.name = modelseed_reaction.data["name"] + "_" + compartment + metabolites_to_add = {} + for metabolite, stoich in reaction_stoich.items(): + id = metabolite[0] + compound = modelseed.get_seed_compound(id).data + compartment_number = int(metabolite[1]) + if compartment_number > len(compartment_trans): + logger.critical( + "Compartment index " + str(compartment_number) + " out of range" + ) + compartment_string = compartment_trans[compartment_number] + met_output = self.find_met(id, compartment_string) + cobramet = None + if 
met_output: + cobramet = met_output[0] + else: + cobramet = Metabolite( + id + "_" + compartment_string, + name=compound["name"] + "_" + compartment_string, + compartment=compartment_string, + ) + metabolites_to_add[cobramet] = stoich + cobra_reaction.add_metabolites(metabolites_to_add) + cobra_reaction.reaction + self.model.add_reactions(output) + return output + + ################################################################################# + # Functions related to utility functions + ################################################################################# + def assign_reliability_scores_to_reactions(self): + """Assigns a reliability score to every model reaction which indicates how likely the reaction is to be accurate and to take place + + Returns + ------- + { reaction ID : { reaction direction : score } } + """ + if self.reliability_scores == None: + self.reliability_scores = {} + biochem = ModelSEEDBiochem.get() + for reaction in self.model.reactions: + #Pulling model reaction related data + transported_charge = 0 + for met in reaction.metabolites: + coef = reaction.metabolites[met] + if met.id.split("_")[-1][0:1] == "e": + transported_charge += coef * met.charge + #Pulling ModelSEED Biochemistry related data + msid = MSModelUtil.reaction_msid(reaction) + if msid and msid != "rxn00000": + #Penalizing for net transport of ions in the wrong direction + forwardscore = 0 + reversescore = 0 + if transported_charge > 0: + forwardscore += 50*transported_charge + if transported_charge < 0: + reversescore += -50*transported_charge + basescore = 0 + msrxn = biochem.reactions[msid] + #Penalizing for mass imbalance + if msrxn["status"][0:2] == "MI": + basescore = 1000 + #Penalizing for charge imbalance + if msrxn["status"][0:2] == "CI": + basescore = 800 + #Penalizing if no pathways + if msrxn["pathways"] == None: + basescore = 50 + #Penalizing if there is no deltaG + if "deltag" not in msrxn or msrxn["deltag"] == 10000000: + basescore = 200 + else: + #Penalizing in the direction of infeasiblility + if msrxn["deltag"] <= -5: + reversescore += 20 + if msrxn["deltag"] <= -10: + reversescore += 20 + if msrxn["deltag"] >= 5: + forwardscore += 20 + if msrxn["deltag"] >= 10: + forwardscore += 20 + #Penalizing reactions in direction of production of ATP + array = str(msrxn["stoichiometry"]).split(";") + for item in array: + subarray = item.split(":") + if len(subarray) > 1: + if subarray[1] == "cpd00002": + if float(subarray[0]) < 0: + reversescore += 100 + elif float(subarray[0]) > 0: + forwardscore += 100 + #Penalizing if a compound structure is unkown + if subarray[1] in biochem.compounds: + if "inchikey" not in biochem.compounds[subarray[1]] or biochem.compounds[subarray[1]]["inchikey"] == None: + basescore += 40 + if "formula" not in biochem.compounds[subarray[1]] or biochem.compounds[subarray[1]]["formula"] == None: + basescore += 60 + if "deltag" not in biochem.compounds[subarray[1]] or biochem.compounds[subarray[1]]["deltag"] == 10000000: + basescore += 20 + self.reliability_scores[reaction.id] = {} + self.reliability_scores[reaction.id][">"] = basescore+forwardscore + self.reliability_scores[reaction.id]["<"] = basescore+reversescore + elif reaction.id[0:3] == "EX_" or reaction.id[0:3] == "SK_" or reaction.id[0:3] == "DM_" or reaction.id[0:3] == "bio": + self.reliability_scores[reaction.id] = {} + self.reliability_scores[reaction.id][">"] = -10 + self.reliability_scores[reaction.id]["<"] = -10 + else: + self.reliability_scores[reaction.id] = {} + 
self.reliability_scores[reaction.id][">"] = 1000 + self.reliability_scores[reaction.id]["<"] = 1000 + return self.reliability_scores + + def is_core(self,rxn): + """Indicates if a specified reaction is a core reaction + + Parameters + ---------- + reaction: Raction|string + + Returns + ------- + bool + """ + if not isinstance(rxn, str): + rxn = rxn.id + if "core_reactions" in self.get_attributes(): + print("Using core reactions attribute!") + if rxn in self.get_attributes("core_reactions"): + return True + return False + elif rxn in core_rxns: + return True + return False + + def build_model_data_hash(self): + data = { + "Model": self.id, + "Genome": self.genome.info.metadata["Name"], + "Genes": self.genome.info.metadata["Number of Protein Encoding Genes"], + } + return data + + def compare_reactions(self, reaction_list, filename): + data = {} + for rxn in reaction_list: + for met in rxn.metabolites: + if met.id not in data: + data[met.id] = {} + for other_rxn in reaction_list: + data[met.id][other_rxn.id] = 0 + data[met.id][rxn.id] = rxn.metabolites[met] + df = pd.DataFrame(data) + df = df.transpose() + df.to_csv(filename) + + ################################################################################# + # Functions related to managing biomass reactions + ################################################################################# + def evaluate_biomass_reaction_mass(self, biomass_rxn_id, normalize=False): + biorxn = self.model.reactions.get_by_id(biomass_rxn_id) + # First computing energy biosynthesis coefficients + atp = None + atp_compounds = { + "cpd00002": -1, + "cpd00001": -1, + "cpd00008": 1, + "cpd00009": 1, + "cpd00067": 1, + } + mass_compounds = {"cpd11463": 1, "cpd11461": 1, "cpd11462": 1} + process_compounds = {"cpd17041": 1, "cpd17042": 1, "cpd17043": 1} + for met in biorxn.metabolites: + msid = self.metabolite_msid(met) + if msid == "cpd00008": + atp = abs(biorxn.metabolites[met]) + # Computing non ATP total mass + total = 0 + for met in biorxn.metabolites: + msid = self.metabolite_msid(met) + if msid == "cpd11416": + continue + coef = biorxn.metabolites[met] + if msid in mass_compounds: + total += coef + elif msid in process_compounds: + total += 0 + else: + mw = FBAHelper.metabolite_mw(met) + if msid in atp_compounds: + if coef < 0: + coef += atp + else: + coef += -1 * atp + total += mw * coef / 1000 + return {"ATP": atp, "Total": total} # Required this function to add gapfilled compounds to a KBase model for saving gapfilled model def convert_cobra_compound_to_kbcompound(self, cpd, kbmodel, add_to_model=1): @@ -320,83 +1122,229 @@ def convert_cobra_reaction_to_kbreaction( kbmodel["modelreactions"].append(rxn_data) return rxn_data - def add_gapfilling_solution_to_kbase_model( - self, - newmodel, - gapfilled_reactions, - gfid=None, - media_ref=None, - reaction_genes=None, - ): + ################################################################################# + # Functions related to gapfilling of models + ################################################################################# + def convert_solution_to_list(self,solution): + """Converting solution to list format, which is easier to work with + Parameters + ---------- + solution : dict + Specifies the reactions to be added to the model to implement the gapfilling solution """ - NOTE: to be moved to cobrakbase + output = [] + for label in ["new","reversed"]: + for rxn_id in solution[label]: + output.append([rxn_id, solution[label][rxn_id],label]) + return output + + def 
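
A small worked example of the gapfilling-solution format these helpers pass around, mirroring `convert_solution_to_list` above. The reaction IDs are hypothetical:

```python
# Hypothetical gapfilling solution in the dict form used throughout this
# class, and its list form as produced by convert_solution_to_list.
solution = {
    "new": {"rxn00148_c0": ">", "rxn05573_c0": "<"},
    "reversed": {"rxn00459_c0": "="},
}

solution_list = [
    [rxn_id, direction, label]
    for label in ("new", "reversed")
    for rxn_id, direction in solution[label].items()
]
print(solution_list)
# [['rxn00148_c0', '>', 'new'], ['rxn05573_c0', '<', 'new'],
#  ['rxn00459_c0', '=', 'reversed']]
```
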
find_item_in_solution(self,input_list,input,ignore_dir=False): + for item in input_list: + if input[0] == item[0] and input[1] == item[1]: + return True + elif ignore_dir and input[0] == item[0]: + return True + return False + + def test_solution(self,solution,targets,medias,thresholds=[0.1],remove_unneeded_reactions=False,do_not_remove_list=[]): + """Tests if every reaction in a given gapfilling solution is actually needed for growth. Note, this code assumes the gapfilling solution is already integrated. + + Parameters + ---------- + solution : {"new":{string reaction_id: string direction},"reversed":{string reaction_id: string direction}} + or + list> + Data for gapfilling solution to be tested + target : string, + media : MSMedia, + threshold : float, default 0.1 + + Returns + ------- + list> + List of unneeded reactions + + Raises + ------ """ - rxn_table = [] - gapfilling_obj = None - if gfid == None: - largest_index = 0 - for gapfilling in newmodel["gapfillings"]: - current_index = int(gapfilling["id"].split(".").pop()) - if largest_index == 0 or largest_index < current_index: - largest_index = current_index - largest_index += 1 - gfid = "gf." + str(largest_index) + #Saving the current objective + current_objective = self.model.objective + #Saving the current media + current_media = self.pkgmgr.getpkg("KBaseMediaPkg").current_media + #Computing the initial objective values + initial_objectives = [] + for (i,target) in enumerate(targets): + #Setting the media + self.pkgmgr.getpkg("KBaseMediaPkg").build_package(medias[i]) + #Setting the objective + self.model.objective = target + #Computing the objective value + objective = self.model.slim_optimize() + initial_objectives.append(objective) + logger.debug("Starting objective for " + medias[i].id + "/"+target+" = " + str(objective)) + #Iterating through solution reactions and flagging them if they are unneeded to achieve the specified minimum objective + unneeded = [] + #If object is a dictionary, convert to a list + if isinstance(solution,dict): + solution = self.convert_solution_to_list(solution) + #Processing solution in standardized format + for item in solution: + rxn_id = item[0] + other_original_bound = None + rxnobj = self.model.reactions.get_by_id(rxn_id) + #Testing all media and target and threshold combinations to see if the reaction is needed + needed = False + for (i,target) in enumerate(targets): + if len(targets) > 1:#If there's only one target, then these steps were done earlier already + #Setting the media + self.pkgmgr.getpkg("KBaseMediaPkg").build_package(medias[i]) + #Setting the objective + self.model.objective = target + #Knocking out the reaction to test for the impact on the objective + #This has to happen after media is applied in case the reaction is an exchange + if item[1] == ">": + original_bound = rxnobj.upper_bound + if rxnobj.lower_bound > 0: + other_original_bound = rxnobj.lower_bound + rxnobj.lower_bound = 0 + rxnobj.upper_bound = 0 + else: + original_bound = rxnobj.lower_bound + if rxnobj.upper_bound < 0: + other_original_bound = rxnobj.upper_bound + rxnobj.upper_bound = 0 + rxnobj.lower_bound = 0 + #Computing the objective value + objective = self.model.slim_optimize() + if objective < thresholds[i]: + needed = True + logger.info( + medias[i].id + "/" + target + ":" +rxn_id + + item[1] + + " needed:" + + str(objective) + + " with min obj:" + + str(thresholds[i]) + ) + #If the reaction isn't needed for any media and target combinations, add it to the unneeded list + if not needed: + 
unneeded.append([rxn_id, item[1], item[2],original_bound,other_original_bound]) + logger.info( + rxn_id + + item[1] + + " not needed:" + + str(objective) + ) + #VERY IMPORTANT: Leave the reaction knocked out for now so we screen for combinatorial effects + else: + #Restore the reaction if it is needed + if item[1] == ">": + rxnobj.upper_bound = original_bound + if other_original_bound != None: + rxnobj.lower_bound = other_original_bound + else: + rxnobj.lower_bound = original_bound + if other_original_bound != None: + rxnobj.upper_bound = other_original_bound + if not remove_unneeded_reactions: + #Restoring the bounds on the unneeded reactions + for item in unneeded: + rxnobj = self.model.reactions.get_by_id(item[0]) + if item[1] == ">": + rxnobj.upper_bound = item[3] + if item[4] != None: + rxnobj.lower_bound = item[4] + else: + rxnobj.lower_bound = item[3] + if item[4] != None: + rxnobj.upper_bound = item[4] else: - for gapfilling in newmodel["gapfillings"]: - if gapfilling["id"] == gfid: - gapfilling_obj = gapfilling - if gapfilling_obj == None: + #Do not restore bounds on unneeded reactions and remove reactions from model if their bounds are zero + removed_rxns = [] + for item in unneeded: + rxnobj = self.model.reactions.get_by_id(item[0]) + if self.find_item_in_solution(do_not_remove_list,item): + if item[1] == ">": + rxnobj.upper_bound = item[3] + if item[4] != None: + rxnobj.lower_bound = item[4] + else: + rxnobj.lower_bound = item[3] + if item[4] != None: + rxnobj.upper_bound = item[4] + elif rxnobj.lower_bound == 0 and rxnobj.upper_bound == 0 and not self.find_item_in_solution(do_not_remove_list,item,ignore_dir=True): + removed_rxns.append(rxnobj) + if len(removed_rxns) > 0: + self.model.remove_reactions(removed_rxns) + #Restoring the original objective + self.model.objective = current_objective + #Restoring the original media + if current_media: + self.pkgmgr.getpkg("KBaseMediaPkg").build_package(current_media) + #Returning the unneeded list + return unneeded + + def add_gapfilling(self, solution): + print("Adding gapfilling",str(solution)) + self.integrated_gapfillings.append(solution) + + def create_kb_gapfilling_data(self, kbmodel, atpmedia_ws="94026"): + gapfilling_hash = {} + if "gapfillings" not in kbmodel: + kbmodel["gapfillings"] = [] + for gapfilling in kbmodel["gapfillings"]: + gapfilling_hash[gapfilling["id"]] = gapfilling + rxn_hash = {} + for rxn in kbmodel["modelreactions"]: + rxn_hash[rxn["id"]] = rxn + for gf in self.integrated_gapfillings: + media_ref = "KBaseMedia/Empty" + gf["media"].id.replace("/", ".") + gfid = gf["media"].id + if self.atputl: + for item in self.atputl.atp_medias: + if item[0] == gf["media"]: + gfid = "ATP-" + gfid + media_ref = atpmedia_ws + "/" + gf["media"].id + ".atp" + break + if hasattr(gf["media"], "info"): + media_ref = gf["media"].info.workspace_id + "/" + gf["media"].info.id + suffix = 0 + while gfid in gapfilling_hash: + suffix += 1 + gfid += "." + str(suffix) + gapfilling_hash[gfid] = 1 gapfilling_obj = { - "gapfill_id": newmodel["id"] + "." 
+ gfid, + "gapfill_id": gfid, "id": gfid, "integrated": 1, "integrated_solution": "0", + "target": gf["target"], + "minobjective": gf["minobjective"], + "binary_check": gf["binary_check"], "media_ref": media_ref, } - newmodel["gapfillings"].append(gapfilling_obj) - cpd_hash = {} - for cpd in newmodel["modelcompounds"]: - cpd_hash[cpd["id"]] = cpd - for rxn in gapfilled_reactions["new"]: - reaction = self.model.reactions.get_by_id(rxn) - kbrxn = self.convert_cobra_reaction_to_kbreaction( - reaction, - newmodel, - cpd_hash, - gapfilled_reactions["new"][rxn], - 1, - reaction_genes, - ) - kbrxn["gapfill_data"][gfid] = dict() - kbrxn["gapfill_data"][gfid]["0"] = [gapfilled_reactions["new"][rxn], 1, []] - # rxn_table.append({ - # 'id':kbrxn["id"], - # 'name':kbrxn["name"], - # 'direction':format_direction(kbrxn["direction"]), - # 'gene':format_gpr(kbrxn), - # 'equation':format_equation(kbrxn,cpd_hash), - # 'newrxn':1 - # }) - for rxn in gapfilled_reactions["reversed"]: - for kbrxn in newmodel["modelreactions"]: - if kbrxn["id"] == rxn: - kbrxn["direction"] = "=" - # rxn_table.append({ - # 'id':kbrxn["id"], - # 'name':kbrxn["name"], - # 'direction':format_direction(kbrxn["direction"]), - # 'gene':format_gpr(kbrxn), - # 'equation':format_equation(kbrxn,cpd_hash), - # 'newrxn':0 - # }) - kbrxn["gapfill_data"][gfid] = dict() - kbrxn["gapfill_data"][gfid]["0"] = [ - gapfilled_reactions["reversed"][rxn], - 1, - [], - ] - return rxn_table + kbmodel["gapfillings"].append(gapfilling_obj) + for rxn in gf["new"]: + if rxn in rxn_hash: + rxnobj = rxn_hash[rxn] + if "gapfill_data" not in rxnobj: + rxnobj["gapfill_data"] = {} + if gfid not in rxnobj["gapfill_data"]: + rxnobj["gapfill_data"][gfid] = {"0": [gf["new"][rxn], 1, []]} + for rxn in gf["reversed"]: + if rxn in rxn_hash: + rxnobj = rxn_hash[rxn] + if "gapfill_data" not in rxnobj: + rxnobj["gapfill_data"] = {} + if gfid not in rxnobj["gapfill_data"]: + rxnobj["gapfill_data"][gfid] = { + "0": [gf["reversed"][rxn], 1, []] + } + ################################################################################# + # Functions related to applying, running, and expanding with test conditions + ################################################################################# def apply_test_condition(self, condition, model=None): """Applies constraints and objective of specified condition to model @@ -421,13 +1369,13 @@ def apply_test_condition(self, condition, model=None): else: pkgmgr = MSPackageManager.get_pkg_mgr(model) model.objective = condition["objective"] - if condition["is_max_threshold"]: - model.objective.direction = "max" - else: - model.objective.direction = "min" + #if condition["is_max_threshold"]: + model.objective.direction = "max" + #else: TODO - need to revisit this + # model.objective.direction = "min" pkgmgr.getpkg("KBaseMediaPkg").build_package(condition["media"]) - def test_single_condition(self, condition, apply_condition=True, model=None): + def test_single_condition(self, condition, apply_condition=True, model=None,report_atp_loop_reactions=False,analyze_failures=False,rxn_list=[]): """Runs a single test condition to determine if objective value on set media exceeds threshold Parameters @@ -454,33 +1402,44 @@ def test_single_condition(self, condition, apply_condition=True, model=None): new_objective = model.slim_optimize() value = new_objective if "change" in condition and condition["change"]: - if self.test_objective is not None: + if self.test_objective: value = new_objective - self.test_objective + logger.debug( + 
condition["media"].id + + " testing for change:" + + str(value) + + "=" + + str(new_objective) + + "-" + + str(self.test_objective) + ) self.score = value if model.solver.status != "optimal": - self.printlp("Infeasible.lp") - logger.critical("Infeasible problem - LP file printed to debug!") + self.printlp(condition["media"].id + "-Testing-Infeasible.lp") + logger.critical( + condition["media"].id + + "testing leads to infeasible problem. LP file printed to debug!" + ) return False if value >= condition["threshold"] and condition["is_max_threshold"]: - logger.debug( - "Failed high:" - + str(self.test_objective) - + ";" - + str(condition["threshold"]) - ) + logger.debug("Failed high:"+condition["media"].id+":"+str(new_objective)+";"+str(condition["threshold"])) + if analyze_failures and len(rxn_list) == 1: + #Constraining test objective at failed value + if value > 1000: + value = 1000 + self.model.reactions.get_by_id(condition["objective"]).lower_bound = value + solution = pfba(self.model) + self.analyze_minimal_reaction_set(solution,rxn_list[0][0].id) + self.model.reactions.get_by_id(condition["objective"]).lower_bound = 0 return False elif value <= condition["threshold"] and not condition["is_max_threshold"]: - logger.debug( - "Failed low:" - + str(self.test_objective) - + ";" - + str(condition["threshold"]) - ) + print("Failed low:"+condition["media"].id+":"+str(new_objective)+";"+str(condition["threshold"])) return False self.test_objective = new_objective + logger.debug("Passed:"+condition["media"].id+":"+str(new_objective)+";"+str(condition["threshold"])) return True - def test_condition_list(self, condition_list: list, model=None): + def test_condition_list(self, condition_list, model=None,positive_growth=[],rxn_list=[]): """Runs a set of test conditions to determine if objective values on set medias exceed thresholds Parameters @@ -501,13 +1460,65 @@ def test_condition_list(self, condition_list: list, model=None): if model == None: model = self.model for condition in condition_list: - if not self.test_single_condition(condition, True, model): + if not self.test_single_condition(condition,apply_condition=True,model=model,rxn_list=rxn_list): return False return True - def reaction_expansion_test(self, reaction_list: list, condition_list: list): - """Adds reactions in reaction list one by one and appplies tests, filtering reactions that fail + def linear_expansion_test(self, reaction_list, condition, currmodel,positive_growth=[]): + """Tests addition of reactions one at a time + + Parameters + ---------- + reaction_list : list<[obj reaction,{>|>}]> + List of reactions and directions to test for addition in the model (should already be in model) + + Returns + ------- + list<[obj reaction,{>|>}]> + List of reactions and directions filtered because they fail tests when in the model + Raises + ------ + """ + # First run the full test + if self.test_single_condition(condition, apply_condition=False, model=currmodel,positive_growth=positive_growth): + return [] + # First knockout all reactions in the input list and save original bounds + filtered_list = [] + original_bound = [] + for item in reaction_list: + if item[1] == ">": + original_bound.append(item[0].upper_bound) + item[0].upper_bound = 0 + else: + original_bound.append(item[0].lower_bound) + item[0].lower_bound = 0 + # Now restore reactions one at a time + count = 0 + for item in reaction_list: + if item[1] == ">": + item[0].upper_bound = original_bound[count] + if not self.test_single_condition(condition, apply_condition=False, 
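
For reference, a hedged sketch of the test-condition dict these methods consume, with hypothetical values. The keys mirror the checks in `apply_test_condition` and `test_single_condition`:

```python
# Hypothetical test condition in the form consumed by apply_test_condition,
# test_single_condition, and test_condition_list. "media" would normally be
# an MSMedia object; None stands in here as a placeholder.
condition = {
    "media": None,             # placeholder for an MSMedia object
    "objective": "bio1",       # reaction ID to optimize
    "is_max_threshold": True,  # the test fails if the objective reaches threshold
    "threshold": 1e-5,
}

# With an MSModelUtil instance, a battery of such conditions would be run as:
# passed = mdlutl.test_condition_list([condition])
```
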
model=currmodel): + # logger.debug(item[0].id+":"+item[1]) + item[0].upper_bound = 0 + if item not in filtered_list: + item.append(original_bound[count]) + item.append(self.score) + filtered_list.append(item) + else: + item[0].lower_bound = original_bound[count] + if not self.test_single_condition(condition, apply_condition=False, model=currmodel): + # logger.debug(item[0].id+":"+item[1]) + item[0].lower_bound = 0 + if item not in filtered_list: + item.append(original_bound[count]) + item.append(self.score) + filtered_list.append(item) + count += 1 + return filtered_list + + def binary_expansion_test(self, reaction_list, condition, currmodel, depth=0,positive_growth=[]): + """Conducts a binary search for bad reaction combinations Parameters ---------- reaction_list : list<[obj reaction,{>|>}]> @@ -523,54 +1534,428 @@ def reaction_expansion_test(self, reaction_list: list, condition_list: list): Raises ------ """ - tic = time.perf_counter() - - logger.info( - f"Expansion started! reaction list: {len(reaction_list)} conditions: {len(condition_list)}" + newdepth = depth + 1 + filtered_list = [] + # First run the full test + if self.test_single_condition(condition,apply_condition=False,model=currmodel,rxn_list=reaction_list): + #print("Reaction set passed"," ".join(map(str, reaction_list))) + return [] + # Check if input list contains only one reaction: + if len(reaction_list) == 1: + #print("Failed:"+reaction_list[0][1]+reaction_list[0][0].id) + if reaction_list[0][1] == ">": + reaction_list[0].append(reaction_list[0][0].upper_bound) + reaction_list[0][0].upper_bound = 0 + else: + reaction_list[0].append(reaction_list[0][0].lower_bound) + reaction_list[0][0].lower_bound = 0 + #Check if the reaction passes the positive growth test + success = True + if len(positive_growth) > 0: + #Testing positive growth conditions + for pos_condition in positive_growth: + if not self.test_single_condition(pos_condition,apply_condition=True,model=currmodel): + print("Does not pass positive growth tests:"+reaction_list[0][1]+reaction_list[0][0].id) + success = False + break + #Restoring current test condition + self.apply_test_condition(condition) + if success: + reaction_list[0].append(self.score) + filtered_list.append(reaction_list[0]) + else: + #Restoring reaction + if reaction_list[0][1] == ">": + reaction_list[0][0].upper_bound = reaction_list[0][2] + else: + reaction_list[0][0].lower_bound = reaction_list[0][2] + self.breaking_reaction = reaction_list[0][0] + return filtered_list + # Break reaction list into two + original_bound = [] + sub_lists = [[], []] + midway_point = int(len(reaction_list) / 2) + for i, item in enumerate(reaction_list): + if item[1] == ">": + original_bound.append(item[0].upper_bound) + else: + original_bound.append(item[0].lower_bound) + if i < midway_point: + sub_lists[0].append(item) + else: + sub_lists[1].append(item) + if item[1] == ">": + item[0].upper_bound = 0 + else: + item[0].lower_bound = 0 + # Submitting first half of reactions for testing + new_filter = self.binary_expansion_test( + sub_lists[0], condition, currmodel,depth=newdepth,positive_growth=positive_growth + ) + for item in new_filter: + filtered_list.append(item) + if self.breaking_reaction != None: + print("Ending early due to breaking reaction:"+self.breaking_reaction.id) + return filtered_list + # Submitting second half of reactions for testing - now only breaking reactions are removed from the first list + for i, item in enumerate(reaction_list): + if i >= midway_point: + if item[1] == ">": + 
item[0].upper_bound = original_bound[i] + else: + item[0].lower_bound = original_bound[i] + new_filter = self.binary_expansion_test( + sub_lists[1], condition, currmodel,depth=newdepth,positive_growth=positive_growth ) + for item in new_filter: + filtered_list.append(item) + return filtered_list + def check_if_solution_exists(self, reaction_list, condition, model): + original_bound = [] + for i, item in enumerate(reaction_list): + if item[1] == ">": + original_bound.append(item[0].upper_bound) + item[0].upper_bound = 0 + else: + original_bound.append(item[0].lower_bound) + item[0].lower_bound = 0 + result = self.test_single_condition(condition,model=model) + for i, item in enumerate(reaction_list): + if item[1] == ">": + item[0].upper_bound = original_bound[i] + else: + item[0].lower_bound = original_bound[i] + return result + + def reaction_expansion_test( + self, + reaction_list, + condition_list, + binary_search=True, + attribute_label="gf_filter", + positive_growth=[], + resort_by_score=True + ): + """Adds reactions in reaction list one by one and appplies tests, filtering reactions that fail + + Parameters + ---------- + reaction_list : list<[obj reaction,{>|>}]> + List of reactions and directions to test for addition in the model (should already be in model) + condition_list : list + Specifies set of conditions to be tested with media, objective, is_max_threshold, threshold. + + Returns + ------- + list<[obj reaction,{>|>}]> + List of reactions and directions filtered because they fail tests when in the model + + Raises + ------ + """ + logger.debug(f"Expansion started! Binary = {binary_search}") + self.breaking_reaction = None filtered_list = [] + if resort_by_score: + scores = self.assign_reliability_scores_to_reactions() + reaction_list = sorted(reaction_list, key=lambda x: scores[x[0].id][x[1]]) + for item in reaction_list: + print(item[0].id+":"+item[1]+":"+str(scores[item[0].id][item[1]])) for condition in condition_list: - logger.debug(f"testing condition {condition}") - currmodel = self.model + tic = time.perf_counter() + new_filtered = [] + if not self.check_if_solution_exists(reaction_list, condition, currmodel): + print("No solution exists that passes tests for condition "+condition["media"].id) + return None with currmodel: self.apply_test_condition(condition) - # First knockout all reactions in the input list and save original bounds - original_bound = [] - for item in reaction_list: - if item[1] == ">": - original_bound.append(item[0].upper_bound) - item[0].upper_bound = 0 - else: - original_bound.append(item[0].lower_bound) - item[0].lower_bound = 0 - # Now restore reactions one at a time - count = 0 - for item in reaction_list: - if item[1] == ">": - item[0].upper_bound = original_bound[count] - if not self.test_single_condition(condition, False, currmodel): - item[0].upper_bound = 0 + if binary_search: + done = False + while not done: + new_filtered = self.binary_expansion_test( + reaction_list, condition, currmodel,positive_growth=positive_growth + ) + for item in new_filtered: if item not in filtered_list: - item.append(original_bound[count]) - item.append(self.score) filtered_list.append(item) + if self.breaking_reaction == None: + done = True + else: + #Remove breaking reaction from reaction_list + print("Keeping breaking reaction:"+self.breaking_reaction.id) + for i in range(len(reaction_list)): + if reaction_list[i][0] == self.breaking_reaction: + del reaction_list[i] + break + if not self.check_if_solution_exists(reaction_list, condition, currmodel): + 
print("No solution exists after retaining breaking reaction:"+self.breaking_reaction.id) + return None + self.breaking_reaction = None + else: + new_filtered = self.linear_expansion_test( + reaction_list, condition, currmodel,positive_growth=positive_growth + ) + for item in new_filtered: + if item not in filtered_list: + filtered_list.append(item) + # Restoring knockout of newly filtered reactions, which expire after exiting the "with" block above + for item in new_filtered: + if item[1] == ">": + item[0].upper_bound = 0 + else: + item[0].lower_bound = 0 + toc = time.perf_counter() + logger.info( + "Expansion time:" + condition["media"].id + ":" + str((toc - tic)) + ) + logger.info( + "Filtered count:" + + str(len(filtered_list)) + + " out of " + + str(len(reaction_list)) + ) + # Adding filter results to attributes + gf_filter_att = self.get_attributes(attribute_label, {}) + if condition["media"].id not in gf_filter_att: + gf_filter_att[condition["media"].id] = {} + if condition["objective"] not in gf_filter_att[condition["media"].id]: + gf_filter_att[condition["media"].id][condition["objective"]] = {} + if ( + condition["threshold"] + not in gf_filter_att[condition["media"].id][condition["objective"]] + ): + gf_filter_att[condition["media"].id][condition["objective"]][ + condition["threshold"] + ] = {} + for item in new_filtered: + if ( + item[0].id + not in gf_filter_att[condition["media"].id][condition["objective"]][ + condition["threshold"] + ] + ): + gf_filter_att[condition["media"].id][condition["objective"]][ + condition["threshold"] + ][item[0].id] = {} + if ( + item[1] + not in gf_filter_att[condition["media"].id][condition["objective"]][ + condition["threshold"] + ][item[0].id] + ): + if len(item) < 3: + gf_filter_att[condition["media"].id][condition["objective"]][ + condition["threshold"] + ][item[0].id][item[1]] = None else: - item[0].lower_bound = original_bound[count] - if not self.test_single_condition(condition, False, currmodel): - item[0].lower_bound = 0 - if item not in filtered_list: - item.append(original_bound[count]) - item.append(self.score) - filtered_list.append(item) - count += 1 - toc = time.perf_counter() - print("Expansion time:", (toc - tic)) - print("Filtered count:", len(filtered_list), " out of ", len(reaction_list)) + gf_filter_att[condition["media"].id][condition["objective"]][ + condition["threshold"] + ][item[0].id][item[1]] = item[2] return filtered_list + ################################################################################# + # Functions for reaction set analysis + ################################################################################# + def analyze_minimal_reaction_set(self,solution,label,print_output=True): + """Systematically exploring alternative options for each reaction in an input minimal reaction set + + Parameters + ---------- + reaction_set : list + List of reactions to be evaluated for alternative options + print_output : bool + Prints output to stdout if true + + Returns + ------- + {obj reaction: list >} : list of reactions pointing to their alternative options + + Raises + ------ + """ + #Determining reaction set as the set of currently active reactions in the input solution + reaction_set = [] + output = {} + original_objective = self.model.objective + minimal_deviation_objective = self.model.problem.Objective(0, direction="min") + initial_zero_reactions = {} + obj_coef = dict() + scores = self.assign_reliability_scores_to_reactions() + for rxn in self.model.reactions: + if abs(solution.fluxes[rxn.id]) < 
0.000000001: + initial_zero_reactions[rxn.id] = {">":True,"<":True} + obj_coef[rxn.forward_variable] = 1 + obj_coef[rxn.reverse_variable] = 1 + elif solution.fluxes[rxn.id] > 0.000000001 and rxn.lower_bound <= 0: + output[rxn.id] = [">",[]] + reaction_set.append([rxn,">",solution.fluxes[rxn.id],scores[rxn.id][">"],self.is_core(rxn)]) + initial_zero_reactions[rxn.id] = {"<":True} + obj_coef[rxn.reverse_variable] = 1 + elif solution.fluxes[rxn.id] < -0.000000001 and rxn.upper_bound >= 0: + output[rxn.id] = ["<",[]] + reaction_set.append([rxn,"<",solution.fluxes[rxn.id],scores[rxn.id]["<"],self.is_core(rxn)]) + initial_zero_reactions[rxn.id] = {">":True} + obj_coef[rxn.forward_variable] = 1 + self.model.objective = minimal_deviation_objective + minimal_deviation_objective.set_linear_coefficients(obj_coef) + #Knocking reactions out one at a time and checking for alternative options + for item in reaction_set: + original_bound = None + if item[1] == ">": + original_bound = item[0].upper_bound + item[0].upper_bound = 0 + else: + original_bound = item[0].lower_bound + item[0].lower_bound = 0 + new_solution = self.model.optimize() + result = {"alternatives":[],"coupled":[],"failed":False,"flux":item[2],"score":item[3],"core":item[4]} + output[item[0].id][1].append(result) + if new_solution.status == "optimal": + for secitem in reaction_set: + if secitem != item: + if abs(new_solution.fluxes[secitem[0].id]) < 0.000000001: + result["coupled"].append(secitem) + for rxn in self.model.reactions: + if rxn.id in initial_zero_reactions and abs(new_solution.fluxes[rxn.id]) > 0.000000001: + if new_solution.fluxes[rxn.id] > 0.000000001 and ">" in initial_zero_reactions[rxn.id]: + result["alternatives"].append([rxn,">"]) + elif new_solution.fluxes[rxn.id] < -0.000000001 and "<" in initial_zero_reactions[rxn.id]: + result["alternatives"].append([rxn,"<"]) + else: + result["failed"] = True + if original_bound != None: + if item[1] == ">": + item[0].upper_bound = original_bound + else: + item[0].lower_bound = original_bound + + self.model.objective = original_objective + #Printing output if requested + if print_output: + records = [] + for rxnid in output: + item = output[rxnid] + record = {"id":rxnid,"direction":item[0],"flux":item[1][0]["flux"],"score":item[1][0]["score"],"core":item[1][0]["core"],"equation":self.model.reactions.get_by_id(rxnid).build_reaction_string(use_metabolite_names=True),"coupled":"","alternatives":"","failed":item[1][0]["failed"]} + for subitem in item[1][0]["alternatives"]: + if len(record["alternatives"]): + record["alternatives"] += ";" + record["alternatives"] += subitem[1]+subitem[0].id+":"+subitem[0].build_reaction_string(use_metabolite_names=True) + for subitem in item[1][0]["coupled"]: + if len(record["coupled"]): + record["coupled"] += ";" + record["coupled"] += subitem[1]+subitem[0].id+":"+subitem[0].build_reaction_string(use_metabolite_names=True) + records.append(record) + df = pd.DataFrame.from_records(records) + df.to_csv("nboutput/rxn_analysis/"+label+"-min_rxn_set_analysis.csv",index=False) + return output + + ################################################################################# + # Functions related to biomass sensitivity analysis + ################################################################################# + def find_unproducible_biomass_compounds(self, target_rxn="bio1", ko_list=None): + # Cloning the model because we don't want to modify the original model with this analysis + tempmodel = cobra.io.json.from_json(cobra.io.json.to_json(self.model)) + 
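# --- Editorial sketch (not part of the patch) ----------------------------------
# The expansion tests and analyze_minimal_reaction_set above all repeat the same
# idiom: save one bound of a reaction, zero it, run a test, then restore the
# saved bound. A minimal sketch of that idiom as a context manager; "knocked_out"
# is a hypothetical helper for illustration, not part of modelseedpy:
from contextlib import contextmanager

@contextmanager
def knocked_out(rxn, direction):
    # Zero the bound for the tested direction; restore it on exit even if the
    # test raises, mirroring the manual save/restore in linear_expansion_test.
    attr = "upper_bound" if direction == ">" else "lower_bound"
    saved = getattr(rxn, attr)
    setattr(rxn, attr, 0)
    try:
        yield rxn
    finally:
        setattr(rxn, attr, saved)

# usage: with knocked_out(model.reactions.get_by_id("rxn00001_c0"), ">"): ...
# --------------------------------------------------------------------------------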
# Getting target reaction and making sure it exists + if target_rxn not in tempmodel.reactions: + logger.critical(target_rxn + " not in model!") + return None + target_rxn_obj = tempmodel.reactions.get_by_id(target_rxn) + tempmodel.objective = target_rxn + original_objective = tempmodel.objective + pkgmgr = MSPackageManager.get_pkg_mgr(tempmodel) + rxn_list = [target_rxn, "rxn05294_c0", "rxn05295_c0", "rxn05296_c0"] + for rxn in rxn_list: + if rxn in tempmodel.reactions: + pkgmgr.getpkg("FlexibleBiomassPkg").build_package( + { + "bio_rxn_id": rxn, + "flex_coefficient": [0, 1], + "use_rna_class": None, + "use_dna_class": None, + "use_protein_class": None, + "use_energy_class": [0, 1], + "add_total_biomass_constraint": False, + } + ) + + # Creating min flex objective + min_flex_obj = tempmodel.problem.Objective(Zero, direction="min") + obj_coef = dict() + for reaction in tempmodel.reactions: + if reaction.id[0:5] == "FLEX_" or reaction.id[0:6] == "energy": + obj_coef[reaction.forward_variable] = 1 + obj_coef[reaction.reverse_variable] = 1 + # Temporarily setting flex objective so I can set coefficients + tempmodel.objective = min_flex_obj + min_flex_obj.set_linear_coefficients(obj_coef) + if not ko_list: + return self.run_biomass_dependency_test( + target_rxn_obj, tempmodel, original_objective, min_flex_obj, rxn_list + ) + else: + output = {} + for item in ko_list: + logger.debug("KO:" + item[0] + item[1]) + if item[0] not in output: + output[item[0]] = {} + if item[0] in tempmodel.reactions: + rxnobj = tempmodel.reactions.get_by_id(item[0]) + if item[1] == ">": + original_bound = rxnobj.upper_bound + rxnobj.upper_bound = 0 + output[item[0]][item[1]] = self.run_biomass_dependency_test( + target_rxn_obj, + tempmodel, + original_objective, + min_flex_obj, + rxn_list, + ) + rxnobj.upper_bound = original_bound + else: + original_bound = rxnobj.lower_bound + rxnobj.lower_bound = 0 + output[item[0]][item[1]] = self.run_biomass_dependency_test( + target_rxn_obj, + tempmodel, + original_objective, + min_flex_obj, + rxn_list, + ) + rxnobj.lower_bound = original_bound + else: + logger.info("Reaction "+item[0]+" not in model during sensitivity analysis!") + output[item[0]][item[1]] = [] + return output + + def run_biomass_dependency_test( + self, target_rxn, tempmodel, original_objective, min_flex_obj, rxn_list + ): + tempmodel.objective = original_objective + objective = tempmodel.slim_optimize() + if objective > 0: + target_rxn.lower_bound = 0.1 + tempmodel.objective = min_flex_obj + solution = tempmodel.optimize() + biocpds = [] + for reaction in tempmodel.reactions: + if reaction.id[0:5] == "FLEX_" and ( + reaction.forward_variable.primal > Zero + or reaction.reverse_variable.primal > Zero + ): + logger.debug("Depends on:" + reaction.id) + label = reaction.id[5:] + for item in rxn_list: + if label[0 : len(item)] == item: + biocpds.append(label[len(item) + 1 :]) + target_rxn.lower_bound = 0 + return biocpds + else: + logger.debug("Cannot grow") + return None + def add_atp_hydrolysis(self, compartment): # Searching for ATP hydrolysis compounds coefs = { diff --git a/modelseedpy/core/msprobability.py b/modelseedpy/core/msprobability.py new file mode 100644 index 00000000..a4f913a0 --- /dev/null +++ b/modelseedpy/core/msprobability.py @@ -0,0 +1,246 @@ +from cobrakbase.core.kbasefba.fbamodel_from_cobra import CobraModelConverter +from modelseedpy.fbapkg.mspackagemanager import MSPackageManager +from modelseedpy.community.mscommunity import MSCommunity +from cobrakbase.core.kbasefba.fbamodel import 
FBAModel
+from cobra.io import write_sbml_model, read_sbml_model
+from optlang import Objective
+from json import load, dump
+from os import path, mkdir
+from cobra import Model
+import re
+
+
+def add_biomass_objective(megaModel, captured_rxnIDs):
+    if "bio1" in captured_rxnIDs:
+        megaModel.objective = Objective(
+            megaModel.reactions.bio1.flux_expression, direction="max"
+        )
+    else:
+        # select the most conserved biomass composition
+        for rxn in megaModel.reactions:
+            if "biomass" in rxn.id and "EX_" not in rxn.id:
+                megaModel.objective = Objective(rxn.flux_expression, direction="max")
+                break
+    megaModel.solver.update()
+    return megaModel
+
+
+class MSProbability:
+
+    # TODO - add the parallelization code with an argument flag
+    @staticmethod
+    def megaModel(
+        clades_paths, kbase_api=None, reaction_counts_path=None, numTotal="numMembers", copy_genes=True
+    ):
+        # compute the reaction frequency of the models in a given clade
+        broken_models, megaModels = [], []
+        # models_paths = glob(f"{models_path}/*.xml")
+        for clade, paths in clades_paths.items():
+            print(clade+"1")
+            if not reaction_counts_path:
+                print(clade+"2")
+                if not path.exists("reaction_counts"):
+                    mkdir("reaction_counts")
+                reaction_counts = {}
+                for index, model_path in enumerate(paths):
+                    print(
+                        f"{model_path}\tindex {index}\t\t\t\t\t\t\t\t\t\t\t\t", end="\r"
+                    )
+                    try:
+                        model = (
+                            read_sbml_model(model_path)
+                            if not kbase_api
+                            else kbase_api.get_from_ws(model_path)
+                        )
+                    except Exception as e:
+                        print("broken", e, model_path)
+                        broken_models.append(model_path)
+                        continue
+                    # print(f"\n{len(model.reactions)} reactions", )
+                    for rxn in model.reactions:
+                        if rxn.id in reaction_counts:
+                            reaction_counts[rxn.id] += 1
+                        else:
+                            reaction_counts[rxn.id] = 1
+                # TODO storing a list of the rxn objects will save computational effort in the subsequent step
+                reaction_counts.update({numTotal: len(paths) - len(broken_models)})
+                reaction_counts.update(
+                    {
+                        rxnID: (count / reaction_counts[numTotal])
+                        for rxnID, count in reaction_counts.items()
+                        if rxnID != numTotal
+                    }
+                )
+                with open(f"reaction_counts/{clade}_reactions.json", "w") as jsonOut:
+                    dump(reaction_counts, jsonOut, indent=3)
+            else:
+                try:
+                    with open(f"{reaction_counts_path}/{clade}.json", "r") as jsonIn:
+                        reaction_counts = load(jsonIn)
+                except Exception:
+                    print(f"broken model: {clade}")
+                    continue
+
+            # constructing the probabilistic clade model
+            megaModel = FBAModel(
+                {
+                    "id": clade,
+                    "name": f"MegaModel for {clade} from {reaction_counts[numTotal]} members",
+                }
+            )
+            # megaModel = CobraModelConverter(Model(clade, name=f"MegaModel for {clade} from {reaction_counts[numTotal]} members")).build()
+            remaining_rxnIDs = set(list(reaction_counts.keys()))
+            captured_reactions, captured_rxnIDs = [], set()
+
+            print("\n", clade)  # , end="\t")
+            found_rxn_hash = {}
+            for model_path in paths:
+                print(f"{model_path}\t\t\t\t\t\t\t\t\t\t\t\t", end="\r")
+                try:
+                    model = (
+                        read_sbml_model(model_path)
+                        if not kbase_api
+                        else kbase_api.get_from_ws(model_path)
+                    )
+                except Exception as e:
+                    print("broken", e, model_path)
+                    broken_models.append(model_path)
+                    continue
+                for rxn in model.reactions:
+                    if rxn.id not in found_rxn_hash:
+                        found_rxn_hash[rxn.id] = {"genes": {}, "rxn": rxn}
+                        captured_reactions.append(rxn)
+                        # record the ID so add_biomass_objective can find "bio1"
+                        captured_rxnIDs.add(rxn.id)
+                    elif copy_genes:
+                        for gene in rxn.genes:
+                            # membership must be checked against the "genes" map
+                            if gene.id not in found_rxn_hash[rxn.id]["genes"]:
+                                found_rxn_hash[rxn.id]["genes"][gene.id] = 1
+                                if len(found_rxn_hash[rxn.id]["rxn"].gene_reaction_rule) > 0:
+                                    found_rxn_hash[rxn.id]["rxn"].gene_reaction_rule += f" or {gene.id}"
+                                else:
found_rxn_hash[rxn.id]["rxn"].gene_reaction_rule = gene.id + if captured_reactions == []: + print(f"\tNo models for {clade} are defined.") + continue + ## add reactions + megaModel.add_reactions(list(captured_reactions)) + for rxn in megaModel.reactions: + rxn.notes["probability"] = reaction_counts[rxn.id] + ## add objective + megaModel = add_biomass_objective(megaModel, captured_rxnIDs) + ## evaluate the model and export + missingRxns = ( + set([rxnID for rxnID in reaction_counts]) + - set([rxn.id for rxn in megaModel.reactions]) + - {numTotal} + ) + if missingRxns != set(): + print("\nmissing reactions: ", missingRxns) + write_sbml_model(megaModel, clade+".xml") + megaModels.append(megaModel) + print("\tfinished") + return megaModels if len(clades_paths) > 1 else megaModels[0] + + @staticmethod + def apply_threshold(model, threshold=0.5): + for rxn in model.reactions: + if rxn.notes["probability"] < threshold: + rxn.lower_bound = rxn.upper_bound = 0 + return model + + # "MS2 - Probabilistic modeling" would create a probabilstic model and optionally an ensemble model from the probabilistic model + + # TODO - develop a separate App from + + # TODO - Construct another code to aggregate functions from all genomes into a single model, where the genes themselves would be mapped with a probability + ## only count genomes with SSOs + ## this would accelerate the construction of making a megaModel + ## specify an ANI cut-off and a closeness to the top-hitting genome + ## yield two models: augmented MAG model with only conserved functions and the probabilistic model with all functions + ## create the KBase module + GitHub repository, after Chris settles on a name + + # TODO - integrate the ensembleFBA modules + repositories + + # TODO - update the CommunityFBA update to run probabilistic models + + @staticmethod + def prFBA( + model_s_, + environment=None, + abundances=None, + min_prob=0.01, + prob_exp=1, + ex_weight=100, + commkinetics=None, + kinetics_coef=1000, + printLP=False, + expression=None + ): + from modelseedpy.community.commhelper import build_from_species_models + from modelseedpy.core.msmodelutl import MSModelUtil + from modelseedpy.fbapkg.elementuptakepkg import ElementUptakePkg + from optlang.symbolics import Zero + + # commkinetics = commkinetics if commkinetics is not None else len(model_s_) > 1 + mdlUtil = MSModelUtil( + model_s_ + if len(model_s_) == 1 + else build_from_species_models( + model_s_, abundances=abundances, commkinetics=commkinetics + ) + ) + if environment is not None: + mdlUtil.add_medium(environment) + # constrain carbon consumption and community composition + elepkg = ElementUptakePkg(mdlUtil.model) + elepkg.build_package({"C": 100}) + ## the total flux through the members proportional to their relative abundances + if not commkinetics and len(model_s_) > 1: + pkgmgr = MSPackageManager.get_pkg_mgr(mdlUtil.model) + MSCommObj = MSCommunity(mdlUtil.model, model_s_) + pkgmgr.getpkg("CommKineticPkg").build_package(kinetics_coef, MSCommObj) + + # constrain the model to 95% of the optimum growth + maxBioSol = mdlUtil.model.slim_optimize() + mdlUtil.add_minimal_objective_cons(maxBioSol * 0.95) + + # weight internal reactions based on their probabilities + ## minimize: sum_r^R ((1-probabilities^prob_exp_r)*flux_r + min_prob) + sum_ex^EX(ex_weight*EX) + coef = {} + for rxn in mdlUtil.model.reactions: + if "rxn" == rxn.id[0:3]: + coef.update( + { + rxn.forward_variable: max( + min_prob, (1 - float(rxn.notes["probability"]) ** prob_exp) + ) + } + ) + coef.update( + { + 
rxn.reverse_variable: max( + min_prob, (1 - float(rxn.notes["probability"]) ** prob_exp) + ) + } + ) + elif "EX_" == rxn.id[0:3]: + coef.update({rxn.forward_variable: ex_weight}) + coef.update({rxn.reverse_variable: ex_weight}) + mdlUtil.add_objective(Zero, "min", coef) + + print([cons.name for cons in mdlUtil.model.constraints]) + + if printLP: + with open("prFBA.lp", "w") as out: + out.write(str(mdlUtil.model.solver)) + + # simulate the probabilistic model with the respective probabilities + return mdlUtil.model.optimize() + + @staticmethod + def iterative_simulation(time_iterative_data): + pass + + def expressionData(data): + # iterate over the reactions, genes, and keep the highest expression score + # turn off reactions that are below a threshold, ensure that the growth is unchanged, otherwise restore the reaction. + pass diff --git a/modelseedpy/core/mstemplate.py b/modelseedpy/core/mstemplate.py index b1bb1975..c798d1b5 100644 --- a/modelseedpy/core/mstemplate.py +++ b/modelseedpy/core/mstemplate.py @@ -3,20 +3,28 @@ import copy import math from enum import Enum +import pandas as pd +import numpy as np from cobra.core import Metabolite, Reaction from cobra.core.dictlist import DictList from cobra.util import format_long_string +from modelseedpy.core.fbahelper import FBAHelper from modelseedpy.core.msmodel import ( get_direction_from_constraints, get_reaction_constraints_from_direction, get_cmp_token, ) from cobra.core.dictlist import DictList +from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union + +# from gevent.libev.corecext import self # from cobrakbase.kbase_object_info import KBaseObjectInfo logger = logging.getLogger(__name__) +SBO_ANNOTATION = "sbo" + class AttrDict(dict): """ @@ -35,6 +43,13 @@ class TemplateReactionType(Enum): GAPFILLING = "gapfilling" +class TemplateBiomassCoefficientType(Enum): + MOLFRACTION = "MOLFRACTION" + MOLSPLIT = "MOLSPLIT" + MULTIPLIER = "MULTIPLIER" + EXACT = "EXACT" + + class MSTemplateMetabolite: def __init__( self, @@ -129,7 +144,7 @@ class MSTemplateSpecies(Metabolite): def __init__( self, comp_cpd_id: str, - charge: int, + charge: float, compartment: str, cpd_id, max_uptake=0, @@ -146,20 +161,34 @@ def __init__( self.cpd_id ) - def to_metabolite(self, index="0"): + def to_metabolite(self, index="0", force=False): """ Create cobra.core.Metabolite instance :param index: compartment index + :@param force: force index :return: cobra.core.Metabolite """ if index is None: index = "" + index = str(index) + + if self.compartment == "e" and index.isnumeric(): + if force: + logger.warning( + f"Forcing numeric index [{index}] to extra cellular compartment not advised" + ) + else: + index = "0" + cpd_id = f"{self.id}{index}" compartment = f"{self.compartment}{index}" - name = f"{self.name}" - if len(str(index)) > 0: - name = f"{self.name} [{compartment}]" + if self.compound == None: + logger.critical( + f"Compound objective associated with [{cpd_id}] is missing from template" + ) + name = f"{self.compound.name} [{compartment}]" metabolite = Metabolite(cpd_id, self.formula, name, self.charge, compartment) + metabolite.notes["modelseed_template_id"] = self.id return metabolite @property @@ -169,8 +198,8 @@ def compound(self): @property def name(self): if self._template_compound: - return self._template_compound.name - return "" + return f"{self._template_compound.name} [{self.compartment}]" + return f"{self.id} [{self.compartment}]" @name.setter def name(self, value): @@ -279,15 +308,17 @@ def compartment(self): def 
to_reaction(self, model=None, index="0"): if index is None: index = "" + index = str(index) rxn_id = f"{self.id}{index}" compartment = f"{self.compartment}{index}" name = f"{self.name}" metabolites = {} for m, v in self.metabolites.items(): - if model and m.id in model.metabolites: - metabolites[model.metabolites.get_by_id(m.id)] = v + _metabolite = m.to_metabolite(index) + if _metabolite.id in model.metabolites: + metabolites[model.metabolites.get_by_id(_metabolite.id)] = v else: - metabolites[m.to_metabolite(index)] = v + metabolites[_metabolite] = v if len(str(index)) > 0: name = f"{self.name} [{compartment}]" @@ -295,6 +326,7 @@ def to_reaction(self, model=None, index="0"): rxn_id, name, self.subsystem, self.lower_bound, self.upper_bound ) reaction.add_metabolites(metabolites) + reaction.annotation["seed.reaction"] = self.reference_id return reaction @staticmethod @@ -411,7 +443,7 @@ def get_data(self): map(lambda x: "~/complexes/id/" + x.id, self.complexes) ), # 'status': self.status, - "type": self.type, + "type": self.type if type(self.type) is str else self.type.value, } # def build_reaction_string(self, use_metabolite_names=False, use_compartment_names=None): @@ -434,6 +466,431 @@ def get_data(self): # id=self.id, stoichiometry=self.build_reaction_string()) +class MSTemplateBiomassComponent: + def __init__( + self, + metabolite, + comp_class: str, + coefficient: float, + coefficient_type: str, + linked_metabolites, + ): + """ + :param metabolite:MSTemplateMetabolite + :param comp_class:string + :param coefficient:float + :param coefficient_type:string + :param linked_metabolites:{MSTemplateMetabolite:float} + """ + self.id = metabolite.id + "_" + comp_class + self.metabolite = metabolite + self.comp_class = comp_class + self.coefficient = coefficient + self.coefficient_type = coefficient_type + self.linked_metabolites = linked_metabolites + + @staticmethod + def from_dict(d, template): + met_id = d["templatecompcompound_ref"].split("/").pop() + metabolite = template.compcompounds.get_by_id(met_id) + linked_metabolites = {} + for count, item in enumerate(d["linked_compound_refs"]): + l_met_id = item.split("/").pop() + l_metabolite = template.compcompounds.get_by_id(l_met_id) + linked_metabolites[l_metabolite] = d["link_coefficients"][count] + self = MSTemplateBiomassComponent( + metabolite, + d["class"], + d["coefficient"], + d["coefficient_type"], + linked_metabolites, + ) + return self + + def get_data(self): + data = { + "templatecompcompound_ref": "~/compcompounds/id/" + self.metabolite.id, + "class": self.comp_class, + "coefficient": self.coefficient, + "coefficient_type": self.coefficient_type, + "linked_compound_refs": [], + "link_coefficients": [], + } + for met in self.linked_metabolites: + data["linked_compound_refs"].append("~/compcompounds/id/" + met.id) + data["link_coefficients"].append(self.linked_metabolites[met]) + return data + + +class MSTemplateBiomass: + def __init__( + self, + bio_id, + name, + type, + dna, + rna, + protein, + lipid, + cellwall, + cofactor, + energy, + other, + ): + """ + + :param bio_id:string + :param name:string + :param type:string + :param dna:float + :param rna:float + :param protein:float + :param lipid:float + :param cellwall:float + :param cofactor:float + :param energy:float + :param other:float + """ + self.id = bio_id + self.name = name + self.type = type + self.dna = dna + self.rna = rna + self.protein = protein + self.lipid = lipid + self.cellwall = cellwall + self.cofactor = cofactor + self.energy = energy + self.other = other + 
self.templateBiomassComponents = DictList() + self._template = None + + @staticmethod + def from_table( + filename_or_df, + template, + bio_id, + name, + type, + dna, + rna, + protein, + lipid, + cellwall, + cofactor, + energy, + other, + ): + self = MSTemplateBiomass( + bio_id, + name, + type, + dna, + rna, + protein, + lipid, + cellwall, + cofactor, + energy, + other, + ) + if isinstance(filename_or_df, str): + filename_or_df = pd.read_table(filename_or_df) + for index, row in filename_or_df.iterrows(): + if "biomass_id" not in row: + row["biomass_id"] = "bio1" + if row["biomass_id"] == bio_id: + if "compartment" not in row: + row["compartment"] = "c" + metabolite = template.compcompounds.get_by_id( + f'{row["id"]}_{row["compartment"].lower()}' + ) + linked_mets = {} + if ( + isinstance(row["linked_compounds"], str) + and len(row["linked_compounds"]) > 0 + ): + array = row["linked_compounds"].split("|") + for item in array: + sub_array = item.split(":") + l_met = template.compcompounds.get_by_id( + f'{sub_array[0]}_{row["compartment"].lower()}' + ) + linked_mets[l_met] = float(sub_array[1]) + self.add_biomass_component( + metabolite, + row["class"].lower(), + float(row["coefficient"]), + row["coefficient_type"].upper(), + linked_mets, + ) + return self + + @staticmethod + def from_dict(d, template): + self = MSTemplateBiomass( + d["id"], + d["name"], + d["type"], + d["dna"], + d["rna"], + d["protein"], + d["lipid"], + d["cellwall"], + d["cofactor"], + d["energy"], + d["other"], + ) + for item in d["templateBiomassComponents"]: + biocomp = MSTemplateBiomassComponent.from_dict(item, template) + self.templateBiomassComponents.add(biocomp) + self._template = template + return self + + def add_biomass_component( + self, metabolite, comp_class, coefficient, coefficient_type, linked_mets={} + ): + biocomp = MSTemplateBiomassComponent( + metabolite, comp_class, coefficient, coefficient_type, linked_mets + ) + self.templateBiomassComponents.add(biocomp) + + def get_or_create_metabolite(self, model, baseid, compartment=None, index=None): + fullid = baseid + if compartment: + fullid += "_" + compartment + tempid = fullid + if index: + fullid += index + if fullid in model.metabolites: + return model.metabolites.get_by_id(fullid) + if tempid in self._template.compcompounds: + met = self._template.compcompounds.get_by_id(tempid).to_metabolite(index) + model.add_metabolites([met]) + return met + logger.error( + "Could not find biomass metabolite [%s] in model or template!", + fullid, + ) + + def get_or_create_reaction(self, model, baseid, compartment=None, index=None): + logger.debug(f"{baseid}, {compartment}, {index}") + fullid = baseid + if compartment: + fullid += "_" + compartment + tempid = fullid + if index: + fullid += index + if fullid in model.reactions: + return model.reactions.get_by_id(fullid) + if tempid in self._template.reactions: + rxn = self._template.reactions.get_by_id(tempid).to_reaction(model, index) + model.add_reactions([rxn]) + return rxn + newrxn = Reaction(fullid, fullid, "biomasses", 0, 1000) + model.add_reactions(newrxn) + return newrxn + + def build_biomass(self, model, index="0", classic=False, GC=0.5, add_to_model=True): + types = [ + "cofactor", + "lipid", + "cellwall", + "protein", + "dna", + "rna", + "energy", + "other", + ] + type_abundances = { + "cofactor": self.cofactor, + "lipid": self.lipid, + "cellwall": self.cellwall, + "protein": self.protein, + "dna": self.dna, + "rna": self.rna, + "energy": self.energy, + } + # Creating biomass reaction object + metabolites 
= {} + biorxn = Reaction(self.id, self.name, "biomasses", 0, 1000) + # Adding standard compounds for DNA, RNA, protein, and biomass + specific_reactions = {"dna": None, "rna": None, "protein": None} + exclusions = {"cpd17041_c": 1, "cpd17042_c": 1, "cpd17043_c": 1} + if not classic and self.dna > 0: + met = self.get_or_create_metabolite(model, "cpd11461", "c", index) + specific_reactions["dna"] = self.get_or_create_reaction( + model, "rxn05294", "c", index + ) + specific_reactions["dna"].name = "DNA synthesis" + if "rxn13783_c" + index in model.reactions: + specific_reactions[ + "dna" + ].gene_reaction_rule = model.reactions.get_by_id( + "rxn13783_c" + index + ).gene_reaction_rule + specific_reactions["dna"].notes[ + "modelseed_complex" + ] = model.reactions.get_by_id("rxn13783_c" + index).notes[ + "modelseed_complex" + ] + model.remove_reactions( + [model.reactions.get_by_id("rxn13783_c" + index)] + ) + specific_reactions["dna"].subtract_metabolites( + specific_reactions["dna"].metabolites + ) + specific_reactions["dna"].add_metabolites({met: 1}) + metabolites[met] = 1 + metabolites[met] = -1 * self.dna + if not classic and self.protein > 0: + met = self.get_or_create_metabolite(model, "cpd11463", "c", index) + specific_reactions["protein"] = self.get_or_create_reaction( + model, "rxn05296", "c", index + ) + specific_reactions["protein"].name = "Protein synthesis" + if "rxn13782_c" + index in model.reactions: + specific_reactions[ + "protein" + ].gene_reaction_rule = model.reactions.get_by_id( + "rxn13782_c" + index + ).gene_reaction_rule + specific_reactions["protein"].notes[ + "modelseed_complex" + ] = model.reactions.get_by_id("rxn13782_c" + index).notes[ + "modelseed_complex" + ] + model.remove_reactions( + [model.reactions.get_by_id("rxn13782_c" + index)] + ) + specific_reactions["protein"].subtract_metabolites( + specific_reactions["protein"].metabolites + ) + specific_reactions["protein"].add_metabolites({met: 1}) + metabolites[met] = -1 * self.protein + if not classic and self.rna > 0: + met = self.get_or_create_metabolite(model, "cpd11462", "c", index) + specific_reactions["rna"] = self.get_or_create_reaction( + model, "rxn05295", "c", index + ) + specific_reactions["rna"].name = "mRNA synthesis" + if "rxn13784_c" + index in model.reactions: + specific_reactions[ + "rna" + ].gene_reaction_rule = model.reactions.get_by_id( + "rxn13784_c" + index + ).gene_reaction_rule + specific_reactions["rna"].notes[ + "modelseed_complex" + ] = model.reactions.get_by_id("rxn13784_c" + index).notes[ + "modelseed_complex" + ] + model.remove_reactions( + [model.reactions.get_by_id("rxn13784_c" + index)] + ) + specific_reactions["rna"].subtract_metabolites( + specific_reactions["rna"].metabolites + ) + specific_reactions["rna"].add_metabolites({met: 1}) + metabolites[met] = -1 * self.rna + bio_type_hash = {} + for type in types: + for comp in self.templateBiomassComponents: + if comp.metabolite.id in exclusions and not classic: + pass + elif type == comp.comp_class: + met = self.get_or_create_metabolite( + model, comp.metabolite.id, None, index + ) + if type not in bio_type_hash: + bio_type_hash[type] = {"items": [], "total_mw": 0} + if FBAHelper.metabolite_mw(met): + bio_type_hash[type]["total_mw"] += ( + -1 * FBAHelper.metabolite_mw(met) * comp.coefficient / 1000 + ) + bio_type_hash[type]["items"].append(comp) + for type in bio_type_hash: + for comp in bio_type_hash[type]["items"]: + coef = None + if ( + comp.coefficient_type == "MOLFRACTION" + or comp.coefficient_type == "MOLSPLIT" + ): + coef = 
( + type_abundances[type] / bio_type_hash[type]["total_mw"] + ) * comp.coefficient + elif comp.coefficient_type == "MULTIPLIER": + coef = type_abundances[type] * comp.coefficient + elif comp.coefficient_type == "EXACT": + coef = comp.coefficient + elif comp.coefficient_type == "AT": + coef = ( + 2 + * comp.coefficient + * (1 - GC) + * (type_abundances[type] / bio_type_hash[type]["total_mw"]) + ) + elif comp.coefficient_type == "GC": + coef = ( + 2 + * comp.coefficient + * GC + * (type_abundances[type] / bio_type_hash[type]["total_mw"]) + ) + if coef: + met = model.metabolites.get_by_id(comp.metabolite.id + index) + if type not in ("dna", "protein", "rna") or classic: + if met in metabolites: + metabolites[met] += coef + else: + metabolites[met] = coef + elif not classic: + coef = coef / type_abundances[type] + specific_reactions[type].add_metabolites({met: coef}) + for l_met in comp.linked_metabolites: + met = self.get_or_create_metabolite( + model, l_met.id, None, index + ) + if type not in ("dna", "protein", "rna") or classic: + if met in metabolites: + metabolites[met] += ( + coef * comp.linked_metabolites[l_met] + ) + else: + metabolites[met] = coef * comp.linked_metabolites[l_met] + elif not classic: + specific_reactions[type].add_metabolites( + {met: coef * comp.linked_metabolites[l_met]} + ) + biorxn.annotation[SBO_ANNOTATION] = "SBO:0000629" + biorxn.add_metabolites(metabolites) + if add_to_model: + if biorxn.id in model.reactions: + model.remove_reactions([biorxn.id]) + model.add_reactions([biorxn]) + return biorxn + + def get_data(self): + data = { + "id": self.id, + "name": self.name, + "type": self.type, + "dna": self.dna, + "rna": self.rna, + "protein": self.protein, + "lipid": self.lipid, + "cellwall": self.cellwall, + "cofactor": self.cofactor, + "energy": self.energy, + "other": self.other, + "templateBiomassComponents": [], + } + for comp in self.templateBiomassComponents: + data["templateBiomassComponents"].append(comp.get_data()) + + return data + + class NewModelTemplateRole: def __init__(self, role_id, name, features=None, source="", aliases=None): """ @@ -655,6 +1112,64 @@ def __init__( self.complexes = DictList() self.pathways = DictList() self.subsystems = DictList() + self.drains = None + + ################# Replaces biomass reactions from an input TSV table ############################ + def overwrite_biomass_from_table( + self, + filename_or_df, + bio_id, + name, + type, + dna, + rna, + protein, + lipid, + cellwall, + cofactor, + energy, + other, + ): + if isinstance(filename_or_df, str): + filename_or_df = pd.read_table(filename_or_df) + newbio = MSTemplateBiomass.from_table( + filename_or_df, + self, + bio_id, + name, + type, + dna, + rna, + protein, + lipid, + cellwall, + cofactor, + energy, + other, + ) + if newbio.id in self.biomasses: + self.biomasses.remove(newbio.id) + self.biomasses.add(newbio) + + def add_drain(self, compound_id, lower_bound, upper_bound): + if compound_id not in self.compcompounds: + raise ValueError(f"{compound_id} not in template") + if lower_bound > upper_bound: + raise ValueError( + f"lower_bound: {lower_bound} must not be > than upper_bound: {upper_bound}" + ) + if self.drains is None: + self.drains = {} + self.drains[self.compcompounds.get_by_id(compound_id)] = ( + lower_bound, + upper_bound, + ) + + def add_sink(self, compound_id, default_upper_bound=1000): + self.add_drain(compound_id, 0, default_upper_bound) + + def add_demand(self, compound_id, default_lower_bound=-1000): + self.add_drain(compound_id, default_lower_bound, 0) 
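# --- Editorial sketch (not part of the patch) ----------------------------------
# Hedged usage of the new drain helpers above, assuming a built MSTemplate
# object named "template" whose compcompounds contain the IDs shown (IDs and
# bounds are illustrative):
template.add_sink("cpd00001_c")          # drain flux bounded to [0, 1000]
template.add_demand("cpd00002_c")        # drain flux bounded to [-1000, 0]
template.add_drain("cpd00009_c", -5, 5)  # custom bounds
# add_drain raises ValueError for unknown compound IDs or inverted bounds, and
# the registered drains are serialized as "drain_list" by get_data().
# --------------------------------------------------------------------------------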
def add_compartments(self, compartments: list): """ @@ -761,6 +1276,24 @@ def add_comp_compounds(self, comp_compounds: list): x._template_compound.species.add(x) self.compcompounds += comp_compounds + def add_biomasses(self, biomasses: list): + """ + Add biomasses to the template + :param biomasses: + :return: + """ + duplicates = list(filter(lambda x: x.id in self.biomasses, biomasses)) + if len(duplicates) > 0: + logger.error( + "unable to add biomasses [%s] already present in the template", + duplicates, + ) + return None + + for x in biomasses: + x._template = self + self.biomasses += biomasses + def add_reactions(self, reaction_list: list): """ @@ -789,7 +1322,9 @@ def add_reactions(self, reaction_list: list): if cpx.id not in self.complexes: self.add_complexes([cpx]) complex_replace.add(self.complexes.get_by_id(cpx.id)) + x._metabolites = metabolites_replace + x._update_awareness() x.complexes = complex_replace self.reactions += reaction_list @@ -858,7 +1393,7 @@ def get_data(self): } NewModelTemplate; """ - return { + d = { "__VERSION__": self.__VERSION__, "id": self.id, "name": self.name, @@ -871,11 +1406,16 @@ def get_data(self): "roles": list(map(lambda x: x.get_data(), self.roles)), "complexes": list(map(lambda x: x.get_data(), self.complexes)), "reactions": list(map(lambda x: x.get_data(), self.reactions)), - "biomasses": list(self.biomasses), + "biomasses": list(map(lambda x: x.get_data(), self.biomasses)), "pathways": [], "subsystems": [], } + if self.drains is not None: + d["drain_list"] = {c.id: t for c, t in self.drains.items()} + + return d + def _repr_html_(self): """ taken from cobra.core.Model :) @@ -918,6 +1458,63 @@ def _repr_html_(self): num_roles=len(self.roles), num_complexes=len(self.complexes), ) + + def remove_reactions( + self, + reactions: Union[str, Reaction, List[Union[str, Reaction]]], + remove_orphans: bool = False, + ) -> None: + """Remove reactions from the template. + + The change is reverted upon exit when using the model as a context. + + Parameters + ---------- + reactions : list or reaction or str + A list with reactions (`cobra.Reaction`), or their id's, to remove. + Reaction will be placed in a list. Str will be placed in a list and used to + find the reaction in the model. + remove_orphans : bool, optional + Remove orphaned genes and metabolites from the model as + well (default False). 
+ """ + if isinstance(reactions, str) or hasattr(reactions, "id"): + warn("need to pass in a list") + reactions = [reactions] + + for reaction in reactions: + # Make sure the reaction is in the model + try: + reaction = self.reactions[self.reactions.index(reaction)] + except ValueError: + warn(f"{reaction} not in {self}") + + else: + self.reactions.remove(reaction) + + """ for met in reaction._metabolites: + if reaction in met._reaction: + met._reaction.remove(reaction) + if context: + context(partial(met._reaction.add, reaction)) + if remove_orphans and len(met._reaction) == 0: + self.remove_metabolites(met) + + for gene in reaction._genes: + if reaction in gene._reaction: + gene._reaction.remove(reaction) + if context: + context(partial(gene._reaction.add, reaction)) + + if remove_orphans and len(gene._reaction) == 0: + self.genes.remove(gene) + if context: + context(partial(self.genes.add, gene)) + + # remove reference to the reaction in all groups + associated_groups = self.get_associated_groups(reaction) + for group in associated_groups: + group.remove_members(reaction) """ class MSTemplateBuilder: @@ -948,6 +1545,7 @@ def __init__( self.reactions = [] self.info = info self.biochemistry_ref = None + self.drains = {} @staticmethod def from_dict(d, info=None, args=None): @@ -969,6 +1567,7 @@ def from_dict(d, info=None, args=None): builder.reactions = d["reactions"] builder.biochemistry_ref = d["biochemistry_ref"] builder.biomasses = d["biomasses"] + return builder @staticmethod @@ -1074,7 +1673,12 @@ def build(self): ) ) template.biomasses += list( - map(lambda x: AttrDict(x), self.biomasses) - ) # TODO: biomass object + list( + map(lambda x: MSTemplateBiomass.from_dict(x, template), self.biomasses) + ) + ) + + for compound_id, (lb, ub) in self.drains.items(): + template.add_drain(compound_id, lb, ub) return template diff --git a/modelseedpy/core/optlanghelper.py b/modelseedpy/core/optlanghelper.py new file mode 100644 index 00000000..b616ab90 --- /dev/null +++ b/modelseedpy/core/optlanghelper.py @@ -0,0 +1,190 @@ +# -*- coding: utf-8 -*- +""" +Created on Thu Aug 18 10:26:32 2022 + +@author: Andrew Freiburger +""" +from collections import namedtuple +from optlang import Model +from typing import Iterable, Union +from pprint import pprint +import logging + +logger = logging.getLogger(__name__) + +Bounds = namedtuple("Bounds", ("lb", "ub"), defaults=(0, 1000)) +tupVariable = namedtuple( + "tupVariable", + ("name", "bounds", "type"), + defaults=("varName", Bounds(), "continuous"), +) +tupConstraint = namedtuple( + "tupConstraint", + ("name", "bounds", "expr"), + defaults=("consName", Bounds(0, 0), None), +) +tupObjective = namedtuple( + "tupObjective", + ("name", "expr", "direction"), + defaults=("objectiveName", None, "max"), +) + + +def isIterable(term): + try: + iter(term) + if type(term) is not str: + return True + return False + except: + return False + + +def isnumber(obj): + try: + float(obj) + return True + except: + return False + + +def define_term(value): + if isnumber(value): + return {"type": "Number", "value": value} + if isinstance(value, str): + return {"type": "Symbol", "name": value} + print(f"ERROR: The {value} of type {type(value)} is not known.") + + +def get_expression_template(expr): + # print(expr) + if isinstance(expr, list): + return {"type": "Add", "args": []} + return {"type": expr["operation"], "args": []} + + +class OptlangHelper: + + @staticmethod + def add_variables( + var_name: str, var_bounds: (list, tuple), var_type: str = "continuous" + ): + return { + 
"name": var_name.replace(" ", "_"), + "lb": var_bounds[0], + "ub": var_bounds[1], + "type": var_type, + } + + @staticmethod + def add_constraint(cons_name: str, cons_bounds: (list, tuple), cons_expr: dict): + return { + "name": cons_name.replace(" ", "_"), + "expression": OptlangHelper._define_expression(cons_expr), + "lb": cons_bounds[0], + "ub": cons_bounds[1], + "indicator_variable": None, + "active_when": 1, + } + + @staticmethod + def add_objective(obj_name: str, objective_expr: Union[dict, list], direction: str): + if isinstance(objective_expr, list): + obj_expr = { + "type": "Add", + "args": [ + OptlangHelper._define_expression(expr) for expr in objective_expr + ], + } + elif isinstance(objective_expr, dict): + obj_expr = { + "type": objective_expr["operation"], + "args": [define_term(term) for term in objective_expr["elements"]], + } + return { + "name": obj_name.replace(" ", "_"), + "expression": obj_expr, + "direction": direction, + } + + @staticmethod + def define_model(model_name, variables, constraints, objective, optlang=False): + model = {"name": model_name, "variables": [], "constraints": []} + # pprint(objective) + for var in variables: + if len(var) == 2: + var.append("continuous") + model["variables"].append( + OptlangHelper.add_variables(var[0], var[1], var[2]) + ) + for cons in constraints: + model["constraints"].append( + OptlangHelper.add_constraint(cons[0], cons[1], cons[2]) + ) + # if not isinstance(obj, str): # catches a strange error of the objective name as the objective itself + model["objective"] = OptlangHelper.add_objective( + objective[0], objective[1], objective[2] + ) + if optlang: + return Model.from_json(model) + return model + + @staticmethod + def _define_expression(expr: dict): + expression = get_expression_template(expr) + level1_coef = 0 + for ele in expr["elements"]: + if not isnumber(ele) and not isinstance(ele, str): + # print(expr, ele, end="\r") + arguments = [] + level2_coef = 0 + for ele2 in ele["elements"]: + if not isnumber(ele2) and not isinstance(ele2, str): + # print("recursive ele\t\t", type(ele2), ele2) + arguments.append(OptlangHelper._define_expression(ele2)) + elif isinstance(ele2, str): + arguments.append(define_term(ele2)) + else: + level2_coef += float(ele2) + expression["args"].append(get_expression_template(ele)) + if level2_coef != 0: + arguments.append(define_term(level2_coef)) + expression["args"][-1]["args"] = arguments + elif isinstance(ele, str): + expression["args"].append(define_term(ele)) + else: + level1_coef += float(ele) + if level1_coef != 0: + expression["args"].append(define_term(level1_coef)) + # pprint(expression) + return expression + + @staticmethod + def dot_product(zipped_to_sum, heuns_coefs=None): + # ensure that the lengths are compatible for heun's dot-products + if heuns_coefs is not None: + coefs = ( + heuns_coefs + if isinstance(heuns_coefs, (list, set)) + else heuns_coefs.tolist() + ) + zipped_length = len(zipped_to_sum) + coefs_length = len(coefs) + if zipped_length != coefs_length: + raise IndexError( + f"ERROR: The length of zipped elements {zipped_length}" + f" is unequal to that of coefficients {coefs_length}" + ) + + elements = [] + for index, (term1, term2) in enumerate(zipped_to_sum): + if heuns_coefs is not None: + elements.extend( + [ + {"operation": "Mul", "elements": [heuns_coefs[index], term1]}, + {"operation": "Mul", "elements": [heuns_coefs[index], term2]}, + ] + ) + else: + elements.append({"operation": "Mul", "elements": [term1, term2]}) + return elements diff --git 
a/modelseedpy/core/rast_client.py b/modelseedpy/core/rast_client.py
index 575cf0d4..fc575237 100644
--- a/modelseedpy/core/rast_client.py
+++ b/modelseedpy/core/rast_client.py
@@ -52,10 +52,8 @@ def __init__(self):
         )
         self.stages = [
             {"name": "annotate_proteins_kmer_v2", "kmer_v2_parameters": {}},
-            {
-                "name": "annotate_proteins_kmer_v1",
-                "kmer_v1_parameters": {"annotate_hypothetical_only": 1},
-            },
+            # {"name": "annotate_proteins_kmer_v1",
+            #  "kmer_v1_parameters": {"annotate_hypothetical_only": 1},},
             {
                 "name": "annotate_proteins_similarity",
                 "similarity_parameters": {"annotate_hypothetical_only": 1},
@@ -84,6 +82,17 @@ def annotate_genome_from_fasta(self, filepath, split="|"):
         return genome, res
 
+    def annotate_protein_sequence(self, protein_id: str, protein_seq: str):
+        p_features = [{"id": protein_id, "protein_translation": protein_seq}]
+        return self.f(p_features)
+
+    def annotate_protein_sequences(self, protein_seqs: dict):
+        # build one feature per (id, sequence) entry in the input dict
+        p_features = [
+            {"id": protein_id, "protein_translation": protein_seq}
+            for protein_id, protein_seq in protein_seqs.items()
+        ]
+        return self.f(p_features)
+
     def f1(self, protein_id, protein_seq):
         p_features = [{"id": protein_id, "protein_translation": protein_seq}]
         return self.f(p_features)
diff --git a/modelseedpy/core/rpcclient.py b/modelseedpy/core/rpcclient.py
old mode 100644
new mode 100755
diff --git a/modelseedpy/data/FBAReportTemplate.html b/modelseedpy/data/FBAReportTemplate.html
new file mode 100644
index 00000000..2ccad425
--- /dev/null
+++ b/modelseedpy/data/FBAReportTemplate.html
@@ -0,0 +1,213 @@
+ [213-line HTML report template; markup lost in extraction — only the page title "Community FBA" is recoverable]
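# --- Editorial sketch (not part of the patch) ----------------------------------
# Hedged usage of the annotation helpers added above, assuming a reachable RAST
# service and that the client class is exported as RastClient; the sequences
# are illustrative placeholders:
from modelseedpy.core.rast_client import RastClient

rast = RastClient()
single = rast.annotate_protein_sequence("prot1", "MKLV...")
batch = rast.annotate_protein_sequences({"prot1": "MKLV...", "prot2": "MSTN..."})
# Both wrap self.f(), which submits the feature list through the annotation
# stages configured in __init__ (kmer_v2, then similarity).
# --------------------------------------------------------------------------------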
diff --git a/modelseedpy/data/ModelReportTemplate.html b/modelseedpy/data/ModelReportTemplate.html
new file mode 100644
index 00000000..cab60a0b
--- /dev/null
+++ b/modelseedpy/data/ModelReportTemplate.html
@@ -0,0 +1,349 @@
+ [349-line HTML report template; markup lost in extraction — only the page title "ModelSEED Reconstruction" is recoverable]
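# --- Editorial sketch (not part of the patch) ----------------------------------
# The atp_medias.tsv table added below is keyed by exchange-reaction ID (the
# "seed" column) with one column per ATP-test medium; a nonzero cell appears to
# encode the uptake bound for that compound in that medium. A minimal loader
# sketch (path and variable names are illustrative):
import pandas as pd

medias = pd.read_table("modelseedpy/data/atp_medias.tsv", index_col="seed")
glc_o2 = {ex_id: val for ex_id, val in medias["Glc.O2"].items() if val != 0}
# e.g. {"EX_cpd00027_e0": 1.0, "EX_cpd00007_e0": 1000.0}
# --------------------------------------------------------------------------------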
+ + + + + diff --git a/modelseedpy/data/atp_medias.tsv b/modelseedpy/data/atp_medias.tsv new file mode 100644 index 00000000..53d15048 --- /dev/null +++ b/modelseedpy/data/atp_medias.tsv @@ -0,0 +1,34 @@ +seed Glc.O2 Ac.O2 Etho.O2 Pyr.O2 Glyc.O2 Fum.O2 Succ.O2 Akg.O2 LLac.O2 Dlac.O2 For.O2 Glc Ac Etho Pyr Glyc Fum Succ Akg Llac Dlac For mal-L For.NO2 For.NO3 For.NO Pyr.NO2 Pyr.NO3 Pyr.NO Ac.NO2 Ac.NO3 Ac.NO Glc.DMSO Glc.TMAO Pyr.DMSO Pyr.TMAO Pyr.SO4 Pyr.SO3 H2.CO2 H2.Ac For.SO4.H2 LLac.SO4.H2 For.SO4 LLac.SO4 H2.SO4 empty Light ANME Methane Methanol Methanol.H2 Methanamine.H2 Dimethylamine.H2 Trimethylamine.H2 +EX_cpd00027_e0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00024_e0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00106_e0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00036_e0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00137_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00130_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00159_e0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 +EX_cpd00221_e0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00020_e0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00100_e0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00363_e0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00029_e0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00047_e0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00204_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00011_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00007_e0 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd11640_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1000 1000 1000 1000 0 0 1000 0 0 0 0 0 1000 1000 1000 1000 +EX_cpd00418_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1000 0 0 1000 0 0 1000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00209_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1000 0 0 1000 0 0 1000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00075_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1000 0 0 1000 0 0 1000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00659_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00528_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd08021_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1000 0 1000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 +EX_cpd00811_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1000 0 1000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00048_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1000 0 0 0 1000 1000 1000 1000 1000 0 0 0 0 0 0 0 0 0 +EX_cpd00081_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd11632_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 +EX_cpd08701_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1000 0 0 0 0 0 0 +EX_cpd01024_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 +EX_cpd00116_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 +EX_cpd00187_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 +EX_cpd00425_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 +EX_cpd00441_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 diff --git a/modelseedpy/fbapkg/__init__.py b/modelseedpy/fbapkg/__init__.py index 6f67c85f..9767fa04 100644 --- a/modelseedpy/fbapkg/__init__.py +++ b/modelseedpy/fbapkg/__init__.py @@ -19,3 +19,5 @@ from modelseedpy.fbapkg.objconstpkg import ObjConstPkg from modelseedpy.fbapkg.changeoptpkg import ChangeOptPkg from modelseedpy.fbapkg.elementuptakepkg import ElementUptakePkg +from modelseedpy.fbapkg.expressionactivationpkg import ExpressionActivationPkg +from modelseedpy.fbapkg.reactionactivationpkg import ReactionActivationPkg diff --git a/modelseedpy/fbapkg/basefbapkg.py b/modelseedpy/fbapkg/basefbapkg.py index 4d0c613c..f5a216f0 100644 --- a/modelseedpy/fbapkg/basefbapkg.py +++ b/modelseedpy/fbapkg/basefbapkg.py @@ -3,13 +3,20 @@ from __future__ import absolute_import import logging -import re -from optlang.symbolics import Zero, add -import json as _json -from cobra.core import Gene, Metabolite, Model, Reaction +import re # !!! import is never used +from optlang.symbolics import Zero, add # !!! add is never used +import json as _json # !!! import is never used +from cobra.core import ( + Gene, + Metabolite, + Model, + Reaction, +) # !!! 
none of these imports are used from modelseedpy.fbapkg.mspackagemanager import MSPackageManager from modelseedpy.core.msmodelutl import MSModelUtil +from modelseedpy.core.exceptions import FeasibilityError +logger = logging.getLogger(__name__) # Adding a few exception classes to handle different types of errors class FeasibilityError(Exception): @@ -26,17 +33,26 @@ class BaseFBAPkg: def __init__( self, model, name, variable_types={}, constraint_types={}, reaction_types={} ): - self.model = model - self.modelutl = MSModelUtil(model) + if isinstance(model, MSModelUtil): + self.model = model.model + self.modelutl = model + else: + self.model = model + self.modelutl = MSModelUtil.get(model) + self.name = name + self.pkgmgr = MSPackageManager.get_pkg_mgr(model) if self.pkgmgr is None: self.pkgmgr = MSPackageManager.get_pkg_mgr(model, 1) self.pkgmgr.addpkgobj(self) - self.constraints = dict() - self.variables = dict() - self.parameters = dict() - self.new_reactions = dict() + + self.constraints, self.variables, self.parameters, self.new_reactions = ( + {}, + {}, + {}, + {}, + ) self.variable_types = variable_types self.constraint_types = constraint_types @@ -53,53 +69,72 @@ def validate_parameters(self, params, required, defaults): self.parameters.update(params) # replace defaults with params def clear(self): - objects = [] - for type in self.variables: - for object in self.variables[type]: - objects.append(self.variables[type][object]) - for type in self.constraints: - for object in self.constraints[type]: - objects.append(self.constraints[type][object]) - self.model.remove_cons_vars(objects) - self.variables = {} - self.constraints = {} - - def build_variable(self, type, lower_bound, upper_bound, vartype, object=None): + cobra_objs = [] + for obj_type in self.variables: + for cobra_obj in self.variables[obj_type]: + cobra_objs.append(self.variables[obj_type][cobra_obj]) + self.variables[obj_type] = {} + for obj_type in self.constraints: + for cobra_obj in self.constraints[obj_type]: + cobra_objs.append(self.constraints[obj_type][cobra_obj]) + self.constraints[obj_type] = {} + self.model.remove_cons_vars(cobra_objs) + + def build_variable( + self, obj_type, lower_bound, upper_bound, vartype, cobra_obj=None + ): name = None - if self.variable_types[type] == "none": - count = len(self.variables[type]) + if self.variable_types[obj_type] == "none": + count = len(self.variables[obj_type]) name = str(count + 1) - elif self.variable_types[type] == "string": - name = object + elif self.variable_types[obj_type] == "string": + name = cobra_obj else: - name = object.id - if name not in self.variables[type]: - self.variables[type][name] = self.model.problem.Variable( - name + "_" + type, lb=lower_bound, ub=upper_bound, type=vartype + name = cobra_obj.id + if name not in self.variables[obj_type]: + self.variables[obj_type][name] = self.model.problem.Variable( + name + "_" + obj_type, lb=lower_bound, ub=upper_bound, type=vartype ) - self.model.add_cons_vars(self.variables[type][name]) - return self.variables[type][name] + self.model.add_cons_vars(self.variables[obj_type][name]) + return self.variables[obj_type][name] - def build_constraint(self, type, lower_bound, upper_bound, coef={}, object=None): + def build_constraint( + self, obj_type, lower_bound, upper_bound, coef={}, cobra_obj=None + ): name = None - if self.constraint_types[type] == "none": - count = len(self.constraints[type]) + if self.constraint_types[obj_type] == "none": + count = len(self.constraints[obj_type]) name = str(count + 1) - elif 
self.constraint_types[type] == "string":
-            name = object
+        elif self.constraint_types[obj_type] == "string":
+            name = cobra_obj
         else:
-            name = object.id
-        if name in self.constraints[type]:
-            self.model.remove_cons_vars(self.constraints[type][name])
-        self.constraints[type][name] = self.model.problem.Constraint(
-            Zero, lb=lower_bound, ub=upper_bound, name=name + "_" + type
+            name = cobra_obj.id
+        if name in self.constraints[obj_type]:
+            self.model.remove_cons_vars(self.constraints[obj_type][name])
+        self.constraints[obj_type][name] = self.model.problem.Constraint(
+            Zero, lb=lower_bound, ub=upper_bound, name=name + "_" + obj_type
         )
-        self.model.add_cons_vars(self.constraints[type][name])
+        self.model.add_cons_vars(self.constraints[obj_type][name])
         self.model.solver.update()
         if len(coef) > 0:
-            self.constraints[type][name].set_linear_coefficients(coef)
+            self.constraints[obj_type][name].set_linear_coefficients(coef)
             self.model.solver.update()
-        return self.constraints[type][name]
+        return self.constraints[obj_type][name]
+
+    # Utility functions
+    def print_lp(self, filename=None):
+        if filename is None:
+            filename = self.lp_filename
+        if filename is not None:
+            with open(filename + ".lp", "w") as out:
+                complete_line = ""
+                for line in str(self.model.solver).splitlines():
+                    # Join wrapped continuation lines, flushing the previous
+                    # record whenever a new "name:" line starts
+                    if ":" in line:
+                        if complete_line != "":
+                            out.write(complete_line + "\n")
+                        complete_line = line
+                    else:
+                        complete_line += line
+                if complete_line != "":
+                    out.write(complete_line + "\n")

     def all_variables(self):
         return self.pkgmgr.all_variables()
@@ -118,3 +153,6 @@ def add_constraint_type(self, name, type):
         self.constraints[name] = dict()
         if name not in self.constraint_types:
             self.constraint_types[name] = type
+
+    def current_media(self):
+        return self.pkgmgr.getpkg("KBaseMediaPkg").current_media
diff --git a/modelseedpy/fbapkg/bilevelpkg.py b/modelseedpy/fbapkg/bilevelpkg.py
index dc2960bc..cb1cf331 100644
--- a/modelseedpy/fbapkg/bilevelpkg.py
+++ b/modelseedpy/fbapkg/bilevelpkg.py
@@ -3,8 +3,13 @@
 from __future__ import absolute_import

 import re
-from optlang.symbolics import Zero, add
+from optlang.symbolics import Zero, add  # !!! Neither import is used
-from cobra.core import Gene, Metabolite, Model, Reaction
+from cobra.core import (
+    Gene,
+    Metabolite,
+    Model,
+    Reaction,
+)  # !!! None of these imports are used
 from modelseedpy.fbapkg.basefbapkg import BaseFBAPkg

 # Base class for FBA packages
@@ -23,51 +28,52 @@ def build_package(self, filter=None, binary_variable_count=0):
             {}, [], {"binary_variable_count": binary_variable_count}
         )
         print("binary_variable_count:", binary_variable_count)
-        coefficients = {}
-        obj_coef = {}
-        obj = self.model.solver.objective
+        varhash, coefficients, obj_coef = {}, {}, {}
+        objective = self.model.solver.objective
+
         # Creating new objective coefficient and bound variables
-        bound_variables = {}
-        reactions = self.model.reactions
         if self.parameters["binary_variable_count"] > 0:
             for reaction in self.model.reactions:
                 var = self.build_variable("flxcmp", reaction, None)
-        # Retreiving model data with componenent flux variables
-        # Using the JSON calls because get_linear_coefficients is REALLY slow
+        # Retrieving model data with component flux variables
+        # Using the JSON calls because get_linear_coefficients is REALLY slow  #!!! get_linear_coefficients is still used?
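+        # (The JSON snapshot below is parsed once into `consthash`, giving
+        #  every constraint's linear terms in a single pass rather than one
+        #  get_linear_coefficients call per constraint.)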
mdldata = self.model.solver.to_json() consthash = {} for const in mdldata["constraints"]: consthash[const["name"]] = const - constraints = list(self.model.solver.constraints) variables = list(self.model.solver.variables) - objterms = obj.get_linear_coefficients(variables) + objterms = objective.get_linear_coefficients(variables) + # Adding binary variables and constraints which should not be included in dual formulation if self.parameters["binary_variable_count"] > 0: for reaction in self.model.reactions: - var = self.build_variable("bflxcmp", reaction, None) + self.build_variable("bflxcmp", reaction, None) + # Now implementing dual variables and constraints - varhash = {} for var in variables: varhash[var.name] = var - for const in constraints: + for const in list(self.model.solver.constraints): var = self.build_variable("dualconst", const, obj_coef) - if ( - var != None - and const.name in consthash - and "expression" in consthash[const.name] - and "args" in consthash[const.name]["expression"] + if all( + [ + var, + const.name in consthash, + "expression" in consthash[const.name], + "args" in consthash[const.name]["expression"], + ] ): for item in consthash[const.name]["expression"]["args"]: - if ( - "args" in item - and len(item["args"]) >= 2 - and item["args"][1]["name"] in varhash + if all( + [ + "args" in item, + len(item["args"]) >= 2, + item["args"][1]["name"] in varhash, + ] ): - if varhash[item["args"][1]["name"]] not in coefficients: - coefficients[varhash[item["args"][1]["name"]]] = {} - coefficients[varhash[item["args"][1]["name"]]][var] = item[ - "args" - ][0]["value"] + var_name = varhash[item["args"][1]["name"]] + if var_name not in coefficients: + coefficients[var_name] = {} + coefficients[var_name][var] = item["args"][0]["value"] for var in variables: if var.type == "continuous": dvar = self.build_variable("duallb", var, obj_coef) @@ -80,95 +86,95 @@ def build_package(self, filter=None, binary_variable_count=0): if var not in coefficients: coefficients[var] = {} coefficients[var][dvar] = 1 - self.build_constraint("dualvar", var, obj, objterms, coefficients) - self.build_constraint("objective", None, obj, objterms, obj_coef) + self.build_constraint("dualvar", var, objective, objterms, coefficients) + self.build_constraint("objective", None, objective, objterms, obj_coef) - def build_variable(self, type, object, obj_coef): - if type == "dualconst": + def build_variable(self, obj_type, cobra_obj, obj_coef): + if obj_type == "dualconst": lb = -1000000 - ub = 1000000 + ub = -lb coef = 0 - if object.lb == None: + if cobra_obj.lb == None: lb = 0 - coef = object.ub - if object.ub == None: + coef = cobra_obj.ub + if cobra_obj.ub == None: ub = 0 - coef = object.lb + coef = cobra_obj.lb var = BaseFBAPkg.build_variable( - self, type, lb, ub, "continuous", object.name + self, obj_type, lb, ub, "continuous", cobra_obj.name ) obj_coef[var] = coef return var if ( - type == "dualub" - ): # Add a constraint that makes this variable zero when binary variable is zero + obj_type == "dualub" + ): # constrain this variable to zero when the binary variable is zero var = BaseFBAPkg.build_variable( - self, type, 0, 1000000, "continuous", object.name + self, obj_type, 0, 1000000, "continuous", cobra_obj.name ) - if re.search("(.+)_(fflxcmp\d+)$", object.name) is not None: - m = re.search("(.+)_(fflxcmp\d+)$", object.name) - bvar = self.variables[m[2]][m[1]] + if re.search("(.+)_(fflxcmp\d+)$", cobra_obj.name) is not None: + match = re.search("(.+)_(fflxcmp\d+)$", cobra_obj.name) + bvar = 
self.variables[match[2]][match[1]] BaseFBAPkg.build_constraint( - self, "dualbin", None, 0, {var: 1, bvar: -1000000}, object.name + self, "dualbin", None, 0, {var: 1, bvar: -1000000}, cobra_obj.name ) - obj_coef[var] = object.ub + obj_coef[var] = cobra_obj.ub return var - if type == "duallb": + if obj_type == "duallb": var = BaseFBAPkg.build_variable( - self, type, -1000000, 0, "continuous", object.name + self, obj_type, -1000000, 0, "continuous", cobra_obj.name ) - # if re.search('(.+)_(fflxcmp\d+)$', object.name) is not None: + # if re.search('(.+)_(fflxcmp\d+)$', cobra_obj.name) is not None: # m = re.search('(.+)_(fflxcmp\d+)$', metabolite.id) # bvar = self.variables[m[2]][m[1]] - # BaseFBAPkg.build_constraint(self,object.name+"_lbdualbin",None,0,{var:-1,bvar:-1000000},object) - obj_coef[var] = object.lb + # BaseFBAPkg.build_constraint(self,cobra_obj.name+"_lbdualbin",None,0,{var:-1,bvar:-1000000},cobra_obj) + obj_coef[var] = cobra_obj.lb return var - if type == "flxcmp" and self.parameters["binary_variable_count"] > 0: + if obj_type == "flxcmp" and self.parameters["binary_variable_count"] > 0: denominator = 2 ** self.parameters["binary_variable_count"] - 1 coefs = [{}, {}] for i in range(0, self.parameters["binary_variable_count"]): value = 2**i - if object.lower_bound < 0: + if cobra_obj.lower_bound < 0: self.add_variable_type("rflxcmp" + str(i), "reaction") var = BaseFBAPkg.build_variable( self, "rflxcmp" + str(i), 0, - -1 * value * object.lower_bound / denominator, + -1 * value * cobra_obj.lower_bound / denominator, "continuous", - object, + cobra_obj, ) coefs[0][var] = -1 - if object.upper_bound > 0: + if cobra_obj.upper_bound > 0: self.add_variable_type("fflxcmp" + str(i), "reaction") var = BaseFBAPkg.build_variable( self, "fflxcmp" + str(i), 0, - value * object.upper_bound / denominator, + value * cobra_obj.upper_bound / denominator, "continuous", - object, + cobra_obj, ) coefs[1][var] = -1 - if object.lower_bound < 0: + if cobra_obj.lower_bound < 0: # flux - flux_comp_0 - flux_comp_n = 0 - restriction of reverse fluxes by component fluxes self.add_constraint_type("rflxcmpc", "reaction") - coefs[0][object.reverse_variable] = 1 - BaseFBAPkg.build_constraint(self, "rflxcmpc", 0, 0, coefs[0], object) - if object.upper_bound > 0: + coefs[0][cobra_obj.reverse_variable] = 1 + BaseFBAPkg.build_constraint(self, "rflxcmpc", 0, 0, coefs[0], cobra_obj) + if cobra_obj.upper_bound > 0: # flux - flux_comp_0 - flux_comp_n = 0 - restriction of forward fluxes by component fluxes self.add_constraint_type("fflxcmpc", "reaction") - coefs[1][object.forward_variable] = 1 - BaseFBAPkg.build_constraint(self, "fflxcmpc", 0, 0, coefs[1], object) + coefs[1][cobra_obj.forward_variable] = 1 + BaseFBAPkg.build_constraint(self, "fflxcmpc", 0, 0, coefs[1], cobra_obj) return None - if type == "bflxcmp" and self.parameters["binary_variable_count"] > 0: + if obj_type == "bflxcmp" and self.parameters["binary_variable_count"] > 0: for i in range(0, self.parameters["binary_variable_count"]): - if object.lower_bound < 0: + if cobra_obj.lower_bound < 0: self.add_variable_type("brflxcmp" + str(i), "reaction") var = BaseFBAPkg.build_variable( - self, "brflxcmp" + str(i), 0, 1, "binary", object + self, "brflxcmp" + str(i), 0, 1, "binary", cobra_obj ) - othervar = self.variables["rflxcmp" + str(i)][object.id] + othervar = self.variables["rflxcmp" + str(i)][cobra_obj.id] self.add_constraint_type("brflxcmpc" + str(i), "reaction") BaseFBAPkg.build_constraint( self, @@ -176,14 +182,14 @@ def build_variable(self, type, object, 
obj_coef): None, 0, {othervar: 1, var: -1000}, - object, + cobra_obj, ) - if object.upper_bound > 0: + if cobra_obj.upper_bound > 0: self.add_variable_type("bfflxcmp" + str(i), "reaction") var = BaseFBAPkg.build_variable( - self, "bfflxcmp" + str(i), 0, 1, "binary", object + self, "bfflxcmp" + str(i), 0, 1, "binary", cobra_obj ) - othervar = self.variables["fflxcmp" + str(i)][object.id] + othervar = self.variables["fflxcmp" + str(i)][cobra_obj.id] self.add_constraint_type("bfflxcmpc" + str(i), "reaction") BaseFBAPkg.build_constraint( self, @@ -191,30 +197,30 @@ def build_variable(self, type, object, obj_coef): None, 0, {othervar: 1, var: -1000}, - object, + cobra_obj, ) return None - def build_constraint(self, type, object, objective, objterms, coefficients): - if type == "dualvar": + def build_constraint(self, obj_type, cobra_obj, objective, objterms, coefficients): + if obj_type == "dualvar": coef = {} - lb = 0 - ub = 0 + lb = ub = 0 objsign = 1 if objective.direction == "min": objsign = -1 - if object in objterms: - lb = objterms[object] - ub = objterms[object] - if object in coefficients: - for var in coefficients[object]: - coef[var] = coefficients[object][var] - if object.lb == 0: + if cobra_obj in objterms: + lb = ub = objterms[cobra_obj] + if cobra_obj in coefficients: + for var in coefficients[cobra_obj]: + coef[var] = coefficients[cobra_obj][var] + if cobra_obj.lb == 0: ub = None - elif object.ub == 0: + elif cobra_obj.ub == 0: lb = None - return BaseFBAPkg.build_constraint(self, type, lb, ub, coef, object.name) - elif type == "objective": + return BaseFBAPkg.build_constraint( + self, obj_type, lb, ub, coef, cobra_obj.name + ) + elif obj_type == "objective": coef = {} objsign = 1 if objective.direction == "min": @@ -223,4 +229,6 @@ def build_constraint(self, type, object, objective, objterms, coefficients): coef[var] = objsign * objterms[var] for dvar in coefficients: coef[dvar] = -1 * coefficients[dvar] - return BaseFBAPkg.build_constraint(self, type, 0, 0, coef, "dualobjconst") + return BaseFBAPkg.build_constraint( + self, obj_type, 0, 0, coef, "dualobjconst" + ) diff --git a/modelseedpy/fbapkg/elementuptakepkg.py b/modelseedpy/fbapkg/elementuptakepkg.py index 66e01035..1f61f7a8 100644 --- a/modelseedpy/fbapkg/elementuptakepkg.py +++ b/modelseedpy/fbapkg/elementuptakepkg.py @@ -16,21 +16,37 @@ def __init__(self, model): {"elements": "string"}, ) - def build_package(self, element_limits): + def build_package( + self, element_limits, exception_compounds=[], exception_reactions=[] + ): + # Converting exception compounds list into exception reaction list + self.parameters = { + "element_limits": element_limits, + "exception_compounds": exception_compounds, + "exception_reactions": exception_reactions, + } + exchange_hash = self.modelutl.exchange_hash() + for met in exception_compounds: + if met in exchange_hash: + exception_reactions.append(exchange_hash[met]) + # Now building or rebuilding constraints for element in element_limits: if element not in self.variables["elements"]: self.build_variable(element, element_limits[element]) - self.build_constraint(element) + for element in element_limits: + # This call will first remove existing constraints then build the new constraint + self.build_constraint(element, exception_reactions) def build_variable(self, element, limit): return BaseFBAPkg.build_variable( self, "elements", 0, limit, "continuous", element ) - def build_constraint(self, element): + def build_constraint(self, element, exception_reactions): coef = 
{self.variables["elements"][element]: -1} - for reaction in self.model.reactions: - if reaction.id[0:3] == "EX_": + rxnlist = self.modelutl.exchange_list() + for reaction in rxnlist: + if reaction not in exception_reactions: total = 0 for metabolite in reaction.metabolites: elements = metabolite.elements diff --git a/modelseedpy/fbapkg/expressionactivationpkg.py b/modelseedpy/fbapkg/expressionactivationpkg.py new file mode 100644 index 00000000..04dda4f8 --- /dev/null +++ b/modelseedpy/fbapkg/expressionactivationpkg.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- + +from __future__ import absolute_import +import logging + +logger = logging.getLogger(__name__) +from modelseedpy.fbapkg.basefbapkg import BaseFBAPkg +from modelseedpy.core.fbahelper import FBAHelper + +# Base class for FBA packages +class ExpressionActivationPkg(BaseFBAPkg): + def __init__(self, model): + BaseFBAPkg.__init__( + self, + model, + "ExpressionActivation", + {}, + {} + ) + self.pkgmgr.addpkgs(["ReactionActivationPkg"]) + + def build_package(self,on_hash,off_hash,on_coeff=None,off_coeff=None,other_coef=0.1,max_value=0.001): + activation_filter = {} + for rxn in on_hash: + activation_filter[rxn] = 1 + self.pkgmgr.getpkg("ReactionActivationPkg").build_package(rxn_filter=activation_filter,max_value=max_value) + expression_objective = self.model.problem.Objective(0, direction="min") + obj_coef = dict() + for rxn in self.model.reactions: + if rxn.id in on_hash: + coef = on_coeff + if coef == None: + coef = on_hash[rxn.id] + obj_coef[self.pkgmgr.getpkg("ReactionActivationPkg").variables["fra"][rxn.id]] = -1*coef + obj_coef[self.pkgmgr.getpkg("ReactionActivationPkg").variables["rra"][rxn.id]] = -1*coef + elif rxn.id in off_hash: + coef = off_coeff + if coef == None: + coef = off_hash[rxn.id] + obj_coef[rxn.forward_variable] = coef + obj_coef[rxn.reverse_variable] = coef + elif rxn.id[0:3] == "bio" or rxn.id[0:3] == "EX_" or rxn.id[0:3] == "SK_" or rxn.id[0:3] == "DM_": + pass + else: + obj_coef[rxn.forward_variable] = other_coef + obj_coef[rxn.reverse_variable] = other_coef + self.model.objective = expression_objective + expression_objective.set_linear_coefficients(obj_coef) + self.parameters["gfobj"] = self.model.objective \ No newline at end of file diff --git a/modelseedpy/fbapkg/flexiblebiomasspkg.py b/modelseedpy/fbapkg/flexiblebiomasspkg.py index b3185a4d..223f778d 100644 --- a/modelseedpy/fbapkg/flexiblebiomasspkg.py +++ b/modelseedpy/fbapkg/flexiblebiomasspkg.py @@ -3,9 +3,12 @@ from __future__ import absolute_import import logging -from optlang.symbolics import Zero, add -from cobra import Model, Reaction, Metabolite + +logger = logging.getLogger(__name__) +from optlang.symbolics import Zero, add # !!! Neither import is ever used +from cobra import Model, Reaction, Metabolite # !!! 
Model and Metabolite are never used from modelseedpy.fbapkg.basefbapkg import BaseFBAPkg +from modelseedpy.core.msmodelutl import MSModelUtil from modelseedpy.core.fbahelper import FBAHelper classes = { @@ -58,11 +61,12 @@ def build_package(self, parameters): parameters, ["bio_rxn_id"], { - "flex_coefficient": 0.75, + "flex_coefficient": [-0.75, 0.75], "use_rna_class": [-0.75, 0.75], "use_dna_class": [-0.75, 0.75], "use_protein_class": [-0.75, 0.75], "use_energy_class": [-0.1, 0.1], + "add_total_biomass_constraint": True, }, ) if self.parameters["bio_rxn_id"] not in self.model.reactions: @@ -79,305 +83,362 @@ def build_package(self, parameters): "cpd00067": None, "cpd00002": None, } - for metabolite in self.model.metabolites: - for msid in refcpd: - if FBAHelper.modelseed_id_from_cobra_metabolite(metabolite) == msid: - refcpd[msid] = metabolite + # Finding all reference compounds in the model + msid_hash = self.modelutl.msid_hash() + for msid in refcpd: + if msid in msid_hash: + refcpd[msid] = msid_hash[msid][0] + met_class = {} + # Determining class for each metabolite in biomass reaction for metabolite in self.parameters["bio_rxn"].metabolites: - msid = FBAHelper.modelseed_id_from_cobra_metabolite(metabolite) - if msid != "cpd11416": - met_class = "none" - if msid != None: + met_class[metabolite] = None + msid = MSModelUtil.metabolite_msid(metabolite) + if ( + msid != "cpd11416" + and msid != "cpd11463" + and msid != "cpd11462" + and msid != "cpd11461" + and msid != None + ): + if msid in refcpd: + met_class[metabolite] = "refcpd" + else: for curr_class in classes: - if msid in classes[curr_class]: - met_class = curr_class + if ( + self.parameters["use_" + curr_class + "_class"] + and msid in classes[curr_class] + ): + met_class[metabolite] = curr_class class_coef[curr_class][msid] = metabolite + # Eliminating any classes that are incomplete + for curr_class in classes: + for msid in classes[curr_class]: + if msid not in class_coef[curr_class]: + self.parameters["use_" + curr_class + "_class"] = None + break + # Creating FLEX reactions and constraints for unclassified compounds + flexcpds = {} + for metabolite in self.parameters["bio_rxn"].metabolites: + if not met_class[metabolite]: + flexcpds[metabolite] = self.parameters["bio_rxn"].metabolites[ + metabolite + ] + elif ( + met_class[metabolite] != "refcpd" + and not self.parameters["use_" + met_class[metabolite] + "_class"] + ): + flexcpds[metabolite] = self.parameters["bio_rxn"].metabolites[ + metabolite + ] + self.modelutl.add_exchanges_for_metabolites( + flexcpds, + uptake=1000, + excretion=1000, + prefix="FLEX_" + self.parameters["bio_rxn"].id + "_", + prefix_name="Biomass flex for ", + ) + for metabolite in flexcpds: + self.build_constraint(metabolite, "flxcpd") + # Creating metabolite class constraints + for met_class in classes: + if self.parameters["use_" + met_class + "_class"]: + add = 0 + total_coef = 0 + object_stoichiometry = {} + for msid in class_coef[met_class]: + if ( + met_class == "rna" + and msid == "cpd00002" + and "cpd00008" in class_coef["energy"] + ): + object_stoichiometry[class_coef[met_class][msid]] = ( + self.parameters["bio_rxn"].metabolites[ + class_coef[met_class][msid] + ] + + self.parameters["bio_rxn"].metabolites[ + class_coef["energy"]["cpd00008"] + ] + ) + else: + object_stoichiometry[ + class_coef[met_class][msid] + ] = self.parameters["bio_rxn"].metabolites[ + class_coef[met_class][msid] + ] + total_coef += abs(object_stoichiometry[class_coef[met_class][msid]]) if ( - met_class == "none" - or 
self.class_complete(class_coef, met_class) == 0 - or self.parameters["use_" + met_class + "_class"] == None - ) and msid not in refcpd: - drain_reaction = FBAHelper.add_drain_from_metabolite_id( - self.model, metabolite.id, 1000, 1000, "FLEX_" - ) - if drain_reaction.id not in self.new_reactions: - self.new_reactions[drain_reaction.id] = drain_reaction - self.model.add_reactions([drain_reaction]) - self.build_constraint(metabolite, "flxcpd") - for met_class in class_coef: - add = 0 - total_coef = 0 - object_stoichiometry = {} - for msid in class_coef[met_class]: + (met_class == "rna" or met_class == "dna") + and refcpd["cpd00012"] != None + and refcpd["cpd00001"] != None + ): + add = 1 + object_stoichiometry[refcpd["cpd00012"]] = total_coef + object_stoichiometry[refcpd["cpd00001"]] = total_coef + if met_class == "protein" and refcpd["cpd00001"] != None: + add = 1 + object_stoichiometry[refcpd["cpd00001"]] = total_coef if ( - met_class == "rna" - and msid == "cpd00002" - and "cpd00008" in class_coef["energy"] + met_class == "energy" + and refcpd["cpd00001"] != None + and refcpd["cpd00002"] != None + and refcpd["cpd00067"] != None + and refcpd["cpd00009"] != None ): - object_stoichiometry[class_coef[met_class][msid]] = ( - self.parameters["bio_rxn"].metabolites[ - class_coef[met_class][msid] - ] - + self.parameters["bio_rxn"].metabolites[ - class_coef["energy"]["cpd00008"] - ] - ) - else: - object_stoichiometry[class_coef[met_class][msid]] = self.parameters[ - "bio_rxn" - ].metabolites[class_coef[met_class][msid]] - total_coef += abs(object_stoichiometry[class_coef[met_class][msid]]) - if ( - (met_class == "rna" or met_class == "dna") - and refcpd["cpd00012"] != None - and refcpd["cpd00001"] != None - ): - add = 1 - object_stoichiometry[refcpd["cpd00012"]] = total_coef - object_stoichiometry[refcpd["cpd00001"]] = total_coef - if met_class == "protein" and refcpd["cpd00001"] != None: - add = 1 - object_stoichiometry[refcpd["cpd00001"]] = total_coef - if ( - met_class == "energy" - and refcpd["cpd00001"] != None - and refcpd["cpd00002"] != None - and refcpd["cpd00067"] != None - and refcpd["cpd00009"] != None - ): - add = 1 - object_stoichiometry[refcpd["cpd00001"]] = -1 * total_coef - object_stoichiometry[refcpd["cpd00002"]] = -1 * total_coef - object_stoichiometry[refcpd["cpd00009"]] = total_coef - object_stoichiometry[refcpd["cpd00067"]] = total_coef - if add == 1: - if met_class + "_flex" not in self.new_reactions: - self.new_reactions[met_class + "_flex"] = Reaction( - id=met_class + "_flex", - name=met_class + "_flex", - lower_bound=-1000, - upper_bound=1000, - ) - self.new_reactions[met_class + "_flex"].add_metabolites( - object_stoichiometry + add = 1 + object_stoichiometry[refcpd["cpd00001"]] = -1 * total_coef + object_stoichiometry[refcpd["cpd00002"]] = -1 * total_coef + object_stoichiometry[refcpd["cpd00009"]] = total_coef + object_stoichiometry[refcpd["cpd00067"]] = total_coef + if add == 1: + if met_class + "_flex" not in self.new_reactions: + self.new_reactions[met_class + "_flex"] = Reaction( + id=met_class + "_flex", + name=met_class + "_flex", + lower_bound=-1000, + upper_bound=1000, + ) + self.new_reactions[met_class + "_flex"].add_metabolites( + object_stoichiometry + ) + self.new_reactions[met_class + "_flex"].annotation[ + "sbo" + ] = "SBO:0000627" + self.model.add_reactions( + [self.new_reactions[met_class + "_flex"]] + ) + self.build_constraint( + self.new_reactions[met_class + "_flex"], "flxcls" ) - self.new_reactions[met_class + "_flex"].annotation[ - "sbo" - ] = 
"SBO:0000627" - self.model.add_reactions([self.new_reactions[met_class + "_flex"]]) - self.build_constraint(self.new_reactions[met_class + "_flex"], "flxcls") - self.build_constraint(self.parameters["bio_rxn"], "flxbio") + if parameters["add_total_biomass_constraint"]: + self.build_constraint(self.parameters["bio_rxn"], "flxbio") - def build_variable(self, object, type): + def build_variable(self, object, type): # !!! can the function be removed? pass - def build_constraint(self, object, type): - element_mass = FBAHelper.elemental_mass() - if type == "flxbio": + def build_constraint(self, cobra_obj, obj_type): + if obj_type == "flxbio": # Sum(MW*(vdrn,for-vdrn,ref)) + Sum(massdiff*(vrxn,for-vrxn,ref)) = 0 coef = {} for metabolite in self.parameters["bio_rxn"].metabolites: - if "FLEX_" + metabolite.id in self.model.reactions: + if ( + "FLEX_" + self.parameters["bio_rxn"].id + "_" + metabolite.id + in self.model.reactions + ): mw = FBAHelper.metabolite_mw(metabolite) sign = -1 if self.parameters["bio_rxn"].metabolites[metabolite] > 0: sign = 1 coef[ self.model.reactions.get_by_id( - "FLEX_" + metabolite.id + "FLEX_" + + self.parameters["bio_rxn"].id + + "_" + + metabolite.id ).forward_variable ] = (sign * mw) coef[ self.model.reactions.get_by_id( - "FLEX_" + metabolite.id + "FLEX_" + + self.parameters["bio_rxn"].id + + "_" + + metabolite.id ).reverse_variable ] = (-1 * sign * mw) for met_class in classes: if met_class + "_flex" in self.model.reactions: massdiff = 0 rxn = self.model.reactions.get_by_id(met_class + "_flex") - for metabolite in rxn.metabolites: - mw = FBAHelper.metabolite_mw(metabolite) - massdiff += rxn.metabolites[metabolite] * mw + for met in rxn.metabolites: + mw = FBAHelper.metabolite_mw(met) + massdiff += rxn.metabolites[met] * mw if abs(massdiff) > 0.00001: coef[rxn.forward_variable] = massdiff - coef[rxn.reverse_variable] = -1 * massdiff - return BaseFBAPkg.build_constraint(self, type, 0, 0, coef, object) - elif type == "flxcpd": - # 0.75 * abs(bio_coef) * vbio - vdrn,for >= 0 - # 0.75 * abs(bio_coef) * vbio - vdrn,rev >= 0 - coef = self.parameters["flex_coefficient"] * abs( - self.parameters["bio_rxn"].metabolites[object] - ) - if coef > 0.75: - coef = 0.75 - BaseFBAPkg.build_constraint( - self, - "f" + type, - 0, - None, - { - self.parameters["bio_rxn"].forward_variable: coef, - self.model.reactions.get_by_id( - "FLEX_" + object.id - ).forward_variable: -1, - }, - object, - ) - return BaseFBAPkg.build_constraint( - self, - "r" + type, - 0, - None, - { - self.parameters["bio_rxn"].forward_variable: coef, - self.model.reactions.get_by_id( - "FLEX_" + object.id - ).reverse_variable: -1, - }, - object, - ) - elif type == "flxcls" and object.id[0:-5] != None: - # 0.75 * vbio - vrxn,for >= 0 - # 0.75 * vbio - vrxn,rev >= 0 - # First deal with the situation where the flux is locked into a particular value relative to biomass + coef[rxn.reverse_variable] = -massdiff + return BaseFBAPkg.build_constraint(self, obj_type, 0, 0, coef, cobra_obj) + elif obj_type == "flxcpd" or obj_type == "flxcls": + first_entry = None + second_entry = None + product = False + biovar = self.parameters["bio_rxn"].forward_variable + object = None const = None - if ( - self.parameters["use_" + object.id[0:-5] + "_class"][0] - == self.parameters["use_" + object.id[0:-5] + "_class"][1] + if obj_type == "flxcpd": + # 0.75 * abs(bio_coef) * vbio - vdrn,for >= 0 + # 0.75 * abs(bio_coef) * vbio - vdrn,rev >= 0 + first_entry = self.parameters["flex_coefficient"][0] * abs( + 
self.parameters["bio_rxn"].metabolites[cobra_obj] + ) + second_entry = self.parameters["flex_coefficient"][1] * abs( + self.parameters["bio_rxn"].metabolites[cobra_obj] + ) + if self.parameters["bio_rxn"].metabolites[cobra_obj] > 0: + product = True + object = self.model.reactions.get_by_id( + "FLEX_" + self.parameters["bio_rxn"].id + "_" + cobra_obj.id + ) + elif ( + cobra_obj.id[0:-5] == None + or not self.parameters["use_" + cobra_obj.id[0:-5] + "_class"] ): + return None + else: + # 0.75 * vbio - vrxn,for >= 0 + # 0.75 * vbio - vrxn,rev >= 0 + first_entry = self.parameters["use_" + cobra_obj.id[0:-5] + "_class"][0] + second_entry = self.parameters["use_" + cobra_obj.id[0:-5] + "_class"][ + 1 + ] + object = cobra_obj + if first_entry == second_entry: # If the value is positive, lock in the forward variable and set the reverse to zero - if self.parameters["use_" + object.id[0:-5] + "_class"][0] > 0: - const = BaseFBAPkg.build_constraint( - self, - "f" + type, - 0, - 0, - { - self.parameters[ - "bio_rxn" - ].forward_variable: self.parameters[ - "use_" + object.id[0:-5] + "_class" - ][ - 1 - ], - object.forward_variable: -1, - }, - object, - ) - object.lower_bound = 0 + if first_entry > 0: + if product: + const = self.build_constraint( + "f" + obj_type, + 0, + 0, + {biovar: second_entry, object.forward_variable: -1}, + cobra_obj, + ) + object.lower_bound = 0 + else: + const = self.build_constraint( + "f" + obj_type, + 0, + 0, + {biovar: second_entry, object.reverse_variable: -1}, + cobra_obj, + ) + object.upper_bound = 0 # If the value is negative, lock in the reverse variable and set the forward to zero - elif self.parameters["use_" + object.id[0:-5] + "_class"][0] < 0: - const = BaseFBAPkg.build_constraint( - self, - "r" + type, - 0, - 0, - { - self.parameters["bio_rxn"].forward_variable: -1 - * self.parameters["use_" + object.id[0:-5] + "_class"][0], - object.reverse_variable: -1, - }, - object, - ) - object.upper_bound = 0 + elif first_entry < 0: + if product: + const = self.build_constraint( + "r" + obj_type, + 0, + 0, + {biovar: -first_entry, object.reverse_variable: -1}, + cobra_obj, + ) + object.upper_bound = 0 + else: + const = self.build_constraint( + "r" + obj_type, + 0, + 0, + {biovar: -first_entry, object.forward_variable: -1}, + cobra_obj, + ) + object.lower_bound = 0 # If the value is zero, lock both variables to zero - if self.parameters["use_" + object.id[0:-5] + "_class"][0] == 0: + if first_entry == 0: object.lower_bound = 0 object.upper_bound = 0 - elif self.parameters["use_" + object.id[0:-5] + "_class"][1] >= 0: - if self.parameters["use_" + object.id[0:-5] + "_class"][0] >= 0: - const = BaseFBAPkg.build_constraint( - self, - "f" + type, - 0, - None, - { - self.parameters[ - "bio_rxn" - ].forward_variable: self.parameters[ - "use_" + object.id[0:-5] + "_class" - ][ - 1 - ], - object.forward_variable: -1, - }, - object, - ) - BaseFBAPkg.build_constraint( - self, - "r" + type, + elif second_entry >= 0: + if first_entry >= 0: + if product: + const = BaseFBAPkg.build_constraint( + self, + "f" + obj_type, + 0, + None, + {biovar: second_entry, object.forward_variable: -1}, + cobra_obj, + ) + object.lower_bound = 0 + if first_entry > 0: + BaseFBAPkg.build_constraint( + self, + "r" + obj_type, + 0, + None, + {biovar: -first_entry, object.forward_variable: 1}, + cobra_obj, + ) + else: + const = BaseFBAPkg.build_constraint( + self, + "f" + obj_type, + 0, + None, + {biovar: second_entry, object.reverse_variable: -1}, + cobra_obj, + ) + object.upper_bound = 0 + if first_entry > 
0: + BaseFBAPkg.build_constraint( + self, + "r" + obj_type, + 0, + None, + {biovar: -first_entry, object.reverse_variable: 1}, + cobra_obj, + ) + else: + if product: + const = self.build_constraint( + "f" + obj_type, + 0, + None, + {biovar: second_entry, object.forward_variable: -1}, + cobra_obj, + ) + self.build_constraint( + "r" + obj_type, + 0, + None, + {biovar: -first_entry, object.reverse_variable: -1}, + cobra_obj, + ) + else: + const = self.build_constraint( + "f" + obj_type, + 0, + None, + {biovar: second_entry, object.reverse_variable: -1}, + cobra_obj, + ) + self.build_constraint( + "r" + obj_type, + 0, + None, + {biovar: -first_entry, object.forward_variable: -1}, + cobra_obj, + ) + else: + if second_entry < 0: + if product: + const = self.build_constraint( + "f" + obj_type, + 0, + None, + {biovar: second_entry, object.reverse_variable: 1}, + cobra_obj, + ) + else: + const = self.build_constraint( + "f" + obj_type, + 0, + None, + {biovar: second_entry, object.forward_variable: 1}, + cobra_obj, + ) + if product: + self.build_constraint( + "r" + obj_type, 0, None, - { - self.parameters["bio_rxn"].forward_variable: -1 - * self.parameters["use_" + object.id[0:-5] + "_class"][0], - object.forward_variable: 1, - }, - object, + {biovar: -first_entry, object.reverse_variable: -1}, + cobra_obj, ) object.lower_bound = 0 else: - const = BaseFBAPkg.build_constraint( - self, - "f" + type, - 0, - None, - { - self.parameters[ - "bio_rxn" - ].forward_variable: self.parameters[ - "use_" + object.id[0:-5] + "_class" - ][ - 1 - ], - object.forward_variable: -1, - }, - object, - ) - BaseFBAPkg.build_constraint( - self, - "r" + type, + self.build_constraint( + "r" + obj_type, 0, None, - { - self.parameters["bio_rxn"].forward_variable: -1 - * self.parameters["use_" + object.id[0:-5] + "_class"][0], - object.reverse_variable: -1, - }, - object, + {biovar: -first_entry, object.forward_variable: -1}, + cobra_obj, ) - else: - const = BaseFBAPkg.build_constraint( - self, - "f" + type, - 0, - None, - { - self.parameters["bio_rxn"].forward_variable: self.parameters[ - "use_" + object.id[0:-5] + "_class" - ][1], - object.reverse_variable: 1, - }, - object, - ) - BaseFBAPkg.build_constraint( - self, - "r" + type, - 0, - None, - { - self.parameters["bio_rxn"].forward_variable: -1 - * self.parameters["use_" + object.id[0:-5] + "_class"][0], - object.reverse_variable: -1, - }, - object, - ) - object.upper_bound = 0 + object.upper_bound = 0 return const - - def class_complete(self, class_coef, met_class): - for msid in classes[met_class]: - if msid not in class_coef[met_class]: - return 0 - return 1 diff --git a/modelseedpy/fbapkg/fluxfittingpkg.py b/modelseedpy/fbapkg/fluxfittingpkg.py old mode 100644 new mode 100755 index f4f8f05d..810f2567 --- a/modelseedpy/fbapkg/fluxfittingpkg.py +++ b/modelseedpy/fbapkg/fluxfittingpkg.py @@ -3,7 +3,9 @@ from __future__ import absolute_import import logging -from optlang.symbolics import Zero, add + +logger = logging.getLogger(__name__) +from optlang.symbolics import Zero, add # !!! 
Zero is never used from modelseedpy.fbapkg.basefbapkg import BaseFBAPkg # Base class for FBA packages @@ -25,11 +27,10 @@ def build_package(self, parameters): "rescale_vfit_by_flux": True, }, ) - if self.parameters["totalflux"] == 0: + if self.parameters["totalflux"] == 1: self.pkgmgr.getpkg("RevBinPkg", 1).build_package( self.parameters["target_flux"] ) - else: self.pkgmgr.getpkg("TotalFluxPkg", 1).build_package( self.parameters["target_flux"] ) @@ -39,7 +40,7 @@ def build_package(self, parameters): rxnobj = self.model.reactions.get_by_id(rxnid) var = self.build_variable(rxnobj) objvars.append(var**2) - const = self.build_constraint(rxnobj) + self.build_constraint(rxnobj) if self.parameters["set_objective"] == 1: self.model.objective = self.model.problem.Objective( add(objvars), direction="min", sloppy=True @@ -47,24 +48,28 @@ def build_package(self, parameters): def build_variable(self, object): return BaseFBAPkg.build_variable( - self, "vfit", -1000, 1000, "continuous", object + self, "vfit", -100000, 100000, "continuous", object ) - def build_constraint(self, object): + def build_constraint(self, cobra_obj): # vfit(i) = flux(i) - v(i) - if object.id in self.parameters["target_flux"]: - flux = self.parameters["target_flux"][object.id] + if cobra_obj.id in self.parameters["target_flux"]: + flux = self.parameters["target_flux"][cobra_obj.id] vfitcoef = 1 # if self.parameters["rescale_vfit_by_flux"] == True: # if flux != None and abs(flux) > 0: # vfitcoef = vfitcoef*flux#Multiply coef by fit flux which rescales by flux # else: # vfitcoef = vfitcoef*self.parameters["default_rescaling"]#Multiply coef by fit flux which rescales by flux - coef = {self.variables["vfit"][object.id]: vfitcoef} + coef = {self.variables["vfit"][cobra_obj.id]: vfitcoef} if self.parameters["totalflux"] == 0: - coef[object.forward_variable] = 1 - coef[object.reverse_variable] = -1 + coef[cobra_obj.forward_variable] = 1 + coef[cobra_obj.reverse_variable] = -1 else: - coef[self.pkgmgr.getpkg("TotalFluxPkg").variables["tf"][object.id]] = 1 + coef[ + self.pkgmgr.getpkg("TotalFluxPkg").variables["tf"][cobra_obj.id] + ] = 1 # !!! 
the total flux package does not return anything flux = abs(flux) - return BaseFBAPkg.build_constraint(self, "vfitc", flux, flux, coef, object) + return BaseFBAPkg.build_constraint( + self, "vfitc", flux, flux, coef, cobra_obj + ) diff --git a/modelseedpy/fbapkg/gapfillingpkg.py b/modelseedpy/fbapkg/gapfillingpkg.py index 1ccc98f3..5dab108b 100644 --- a/modelseedpy/fbapkg/gapfillingpkg.py +++ b/modelseedpy/fbapkg/gapfillingpkg.py @@ -3,364 +3,44 @@ from __future__ import absolute_import import logging +import sys import re import json from optlang.symbolics import Zero, add from cobra import Model, Reaction, Metabolite +from cobra.io import ( + load_json_model, + save_json_model, + load_matlab_model, + save_matlab_model, + read_sbml_model, + write_sbml_model, +) from modelseedpy.fbapkg.basefbapkg import BaseFBAPkg from modelseedpy.core.fbahelper import FBAHelper logger = logging.getLogger(__name__) +logger.setLevel( + logging.INFO +) # When debugging - set this to INFO then change needed messages below from DEBUG to INFO -default_blacklist = [ - "rxn12985", - "rxn00238", - "rxn07058", - "rxn05305", - "rxn00154", - "rxn09037", - "rxn10643", - "rxn11317", - "rxn05254", - "rxn05257", - "rxn05258", - "rxn05259", - "rxn05264", - "rxn05268", - "rxn05269", - "rxn05270", - "rxn05271", - "rxn05272", - "rxn05273", - "rxn05274", - "rxn05275", - "rxn05276", - "rxn05277", - "rxn05278", - "rxn05279", - "rxn05280", - "rxn05281", - "rxn05282", - "rxn05283", - "rxn05284", - "rxn05285", - "rxn05286", - "rxn05963", - "rxn05964", - "rxn05971", - "rxn05989", - "rxn05990", - "rxn06041", - "rxn06042", - "rxn06043", - "rxn06044", - "rxn06045", - "rxn06046", - "rxn06079", - "rxn06080", - "rxn06081", - "rxn06086", - "rxn06087", - "rxn06088", - "rxn06089", - "rxn06090", - "rxn06091", - "rxn06092", - "rxn06138", - "rxn06139", - "rxn06140", - "rxn06141", - "rxn06145", - "rxn06217", - "rxn06218", - "rxn06219", - "rxn06220", - "rxn06221", - "rxn06222", - "rxn06223", - "rxn06235", - "rxn06362", - "rxn06368", - "rxn06378", - "rxn06474", - "rxn06475", - "rxn06502", - "rxn06562", - "rxn06569", - "rxn06604", - "rxn06702", - "rxn06706", - "rxn06715", - "rxn06803", - "rxn06811", - "rxn06812", - "rxn06850", - "rxn06901", - "rxn06971", - "rxn06999", - "rxn07123", - "rxn07172", - "rxn07254", - "rxn07255", - "rxn07269", - "rxn07451", - "rxn09037", - "rxn10018", - "rxn10077", - "rxn10096", - "rxn10097", - "rxn10098", - "rxn10099", - "rxn10101", - "rxn10102", - "rxn10103", - "rxn10104", - "rxn10105", - "rxn10106", - "rxn10107", - "rxn10109", - "rxn10111", - "rxn10403", - "rxn10410", - "rxn10416", - "rxn11313", - "rxn11316", - "rxn11318", - "rxn11353", - "rxn05224", - "rxn05795", - "rxn05796", - "rxn05797", - "rxn05798", - "rxn05799", - "rxn05801", - "rxn05802", - "rxn05803", - "rxn05804", - "rxn05805", - "rxn05806", - "rxn05808", - "rxn05812", - "rxn05815", - "rxn05832", - "rxn05836", - "rxn05851", - "rxn05857", - "rxn05869", - "rxn05870", - "rxn05884", - "rxn05888", - "rxn05896", - "rxn05898", - "rxn05900", - "rxn05903", - "rxn05904", - "rxn05905", - "rxn05911", - "rxn05921", - "rxn05925", - "rxn05936", - "rxn05947", - "rxn05956", - "rxn05959", - "rxn05960", - "rxn05980", - "rxn05991", - "rxn05992", - "rxn05999", - "rxn06001", - "rxn06014", - "rxn06017", - "rxn06021", - "rxn06026", - "rxn06027", - "rxn06034", - "rxn06048", - "rxn06052", - "rxn06053", - "rxn06054", - "rxn06057", - "rxn06059", - "rxn06061", - "rxn06102", - "rxn06103", - "rxn06127", - "rxn06128", - "rxn06129", - "rxn06130", - "rxn06131", - "rxn06132", - "rxn06137", - 
"rxn06146", - "rxn06161", - "rxn06167", - "rxn06172", - "rxn06174", - "rxn06175", - "rxn06187", - "rxn06189", - "rxn06203", - "rxn06204", - "rxn06246", - "rxn06261", - "rxn06265", - "rxn06266", - "rxn06286", - "rxn06291", - "rxn06294", - "rxn06310", - "rxn06320", - "rxn06327", - "rxn06334", - "rxn06337", - "rxn06339", - "rxn06342", - "rxn06343", - "rxn06350", - "rxn06352", - "rxn06358", - "rxn06361", - "rxn06369", - "rxn06380", - "rxn06395", - "rxn06415", - "rxn06419", - "rxn06420", - "rxn06421", - "rxn06423", - "rxn06450", - "rxn06457", - "rxn06463", - "rxn06464", - "rxn06466", - "rxn06471", - "rxn06482", - "rxn06483", - "rxn06486", - "rxn06492", - "rxn06497", - "rxn06498", - "rxn06501", - "rxn06505", - "rxn06506", - "rxn06521", - "rxn06534", - "rxn06580", - "rxn06585", - "rxn06593", - "rxn06609", - "rxn06613", - "rxn06654", - "rxn06667", - "rxn06676", - "rxn06693", - "rxn06730", - "rxn06746", - "rxn06762", - "rxn06779", - "rxn06790", - "rxn06791", - "rxn06792", - "rxn06793", - "rxn06794", - "rxn06795", - "rxn06796", - "rxn06797", - "rxn06821", - "rxn06826", - "rxn06827", - "rxn06829", - "rxn06839", - "rxn06841", - "rxn06842", - "rxn06851", - "rxn06866", - "rxn06867", - "rxn06873", - "rxn06885", - "rxn06891", - "rxn06892", - "rxn06896", - "rxn06938", - "rxn06939", - "rxn06944", - "rxn06951", - "rxn06952", - "rxn06955", - "rxn06957", - "rxn06960", - "rxn06964", - "rxn06965", - "rxn07086", - "rxn07097", - "rxn07103", - "rxn07104", - "rxn07105", - "rxn07106", - "rxn07107", - "rxn07109", - "rxn07119", - "rxn07179", - "rxn07186", - "rxn07187", - "rxn07188", - "rxn07195", - "rxn07196", - "rxn07197", - "rxn07198", - "rxn07201", - "rxn07205", - "rxn07206", - "rxn07210", - "rxn07244", - "rxn07245", - "rxn07253", - "rxn07275", - "rxn07299", - "rxn07302", - "rxn07651", - "rxn07723", - "rxn07736", - "rxn07878", - "rxn11417", - "rxn11582", - "rxn11593", - "rxn11597", - "rxn11615", - "rxn11617", - "rxn11619", - "rxn11620", - "rxn11624", - "rxn11626", - "rxn11638", - "rxn11648", - "rxn11651", - "rxn11665", - "rxn11666", - "rxn11667", - "rxn11698", - "rxn11983", - "rxn11986", - "rxn11994", - "rxn12006", - "rxn12007", - "rxn12014", - "rxn12017", - "rxn12022", - "rxn12160", - "rxn12161", - "rxn01267", - "rxn05294", - "rxn04656", -] +base_blacklist = {}#{"rxn00062":"="} +zero_threshold = 1e-8 class GapfillingPkg(BaseFBAPkg): """ """ def __init__(self, model): - BaseFBAPkg.__init__(self, model, "gapfilling", {}, {}) + BaseFBAPkg.__init__( + self, + model, + "gapfilling", + {"rmaxf": "reaction", "fmaxf": "reaction"}, + {"rmaxfc": "reaction", "fmaxfc": "reaction"}, + ) self.gapfilling_penalties = None + self.maxflux_variables = {} def build(self, template, minimum_objective=0.01): parameters = { @@ -394,7 +74,7 @@ def build_package(self, parameters): parameters, [], { - "auto_sink": ["cpd02701", "cpd11416", "cpd15302"], + "auto_sink": ["cpd01042","cpd02701", "cpd11416", "cpd15302", "cpd03091"], "extend_with_template": 1, "model_penalty": 1, "default_gapfill_models": [], @@ -405,12 +85,17 @@ def build_package(self, parameters): "gapfill_all_indecies_with_default_templates": 1, "gapfill_all_indecies_with_default_models": 1, "default_excretion": 100, - "default_uptake": -100, + "default_uptake": 0, "minimum_obj": 0.01, - "set_objective": 1, - "blacklist": default_blacklist, - }, + "minimize_exchanges": False, + "blacklist": [], + "base_media": None, + "objective":self.model.objective, + "base_media_target_element": "C", + "default_exchange_penalty":0.1 + } ) + # Adding model reactions to original reaction list 
self.parameters["original_reactions"] = [] for rxn in self.model.reactions: @@ -422,17 +107,29 @@ def build_package(self, parameters): self.parameters["original_reactions"].append([rxn, "<"]) if rxn.upper_bound > 0: self.parameters["original_reactions"].append([rxn, ">"]) + # Adding constraint for target reaction - self.parameters["origobj"] = self.model.objective - self.pkgmgr.getpkg("ObjConstPkg").build_package( - self.parameters["minimum_obj"], None - ) - + self.set_base_objective(self.parameters["objective"],self.parameters["minimum_obj"]) + + #Extending model + self.extend_model_for_gapfilling() + + #Computing gapfilling penalties + self.compute_gapfilling_penalties() + + # Creating the gapfilling objective function and saving it under self.parameters["gfobj"] + self.build_gapfilling_objective_function() + + def extend_model_for_gapfilling(self): + """Extends the model for gapfilling + Parameters + ---------- + None + """ # Determine all indecies that should be gapfilled indexhash = self.get_model_index_hash() - # Iterating over all indecies with more than 10 intracellular compounds: - self.gapfilling_penalties = dict() + self.base_gapfilling_penalties = dict() for index in indexhash: if indexhash[index] > 10: if index == "none": @@ -440,12 +137,12 @@ def build_package(self, parameters): new_penalties = self.extend_model_with_template_for_gapfilling( template, index ) - self.gapfilling_penalties.update(new_penalties) + self.base_gapfilling_penalties.update(new_penalties) for gfmdl in self.parameters["default_gapfill_models"]: new_penalties = self.extend_model_with_model_for_gapfilling( gfmdl, index ) - self.gapfilling_penalties.update(new_penalties) + self.base_gapfilling_penalties.update(new_penalties) if index in self.parameters["gapfill_templates_by_index"]: for template in self.parameters["gapfill_templates_by_index"][ index @@ -453,67 +150,158 @@ def build_package(self, parameters): new_penalties = self.extend_model_with_template_for_gapfilling( template, index ) - self.gapfilling_penalties.update(new_penalties) + self.base_gapfilling_penalties.update(new_penalties) if index in self.parameters["gapfill_models_by_index"]: for gfmdl in self.parameters["gapfill_models_by_index"]: new_penalties = self.extend_model_with_model_for_gapfilling( gfmdl, index ) - self.gapfilling_penalties.update(new_penalties) + self.base_gapfilling_penalties.update(new_penalties) if self.parameters["gapfill_all_indecies_with_default_templates"]: for template in self.parameters["default_gapfill_templates"]: new_penalties = self.extend_model_with_template_for_gapfilling( template, index ) - self.gapfilling_penalties.update(new_penalties) + self.base_gapfilling_penalties.update(new_penalties) if self.parameters["gapfill_all_indecies_with_default_models"]: for gfmdl in self.parameters["default_gapfill_models"]: new_penalties = self.extend_model_with_model_for_gapfilling( gfmdl, index ) - self.gapfilling_penalties.update(new_penalties) + self.base_gapfilling_penalties.update(new_penalties) + + def compute_gapfilling_penalties(self,exclusion_solution=None,reaction_scores=None): + """Builds gapfilling objective function for model + Parameters + ---------- + exclusion_solution : [string rxn_id,string direction] + Solution with reaction directions that should be removed from the gapfilling objective function + """ + self.gapfilling_penalties = self.base_gapfilling_penalties.copy() + #Removing exclusion solution reactions from penalties dictionary + if exclusion_solution: + for item in exclusion_solution: + if 
item[0] in self.gapfilling_penalties: + if item[1] == ">" and "forward" in self.gapfilling_penalties[item[0]]: + del self.gapfilling_penalties[item[0]]["forward"] + elif item[1] == "<" and "reverse" in self.gapfilling_penalties[item[0]]: + del self.gapfilling_penalties[item[0]]["reverse"] # Rescaling penalties by reaction scores and saving genes + if not reaction_scores: + reaction_scores = self.parameters["reaction_scores"] for reaction in self.gapfilling_penalties: rxnid = reaction.split("_")[0] - if rxnid in self.parameters["reaction_scores"]: - highest_score = 0 - for gene in self.parameters["reaction_scores"][rxnid]: - if highest_score < self.parameters["reaction_scores"][rxnid][gene]: - highest_score = self.parameters["reaction_scores"][rxnid][gene] - factor = 0.1 - if "reverse" in self.gapfilling_penalties[reaction]: - self.gapfilling_penalties[reaction]["reverse"] = ( - factor * self.gapfilling_penalties[reaction]["reverse"] + highest_score = 0 + if rxnid in reaction_scores: + for gene in reaction_scores[rxnid]: + score = None + if isinstance(reaction_scores[rxnid][gene], dict): + score = reaction_scores[rxnid][gene]["probability"] + else: + score = reaction_scores[rxnid][gene] + if highest_score < score: + highest_score = score + factor = 2-highest_score + if "reverse" in self.gapfilling_penalties[reaction]: + self.gapfilling_penalties[reaction]["reverse"] = ( + factor * self.gapfilling_penalties[reaction]["reverse"] + ) + if "forward" in self.gapfilling_penalties[reaction]: + self.gapfilling_penalties[reaction]["forward"] = ( + factor * self.gapfilling_penalties[reaction]["forward"] + ) + + def build_gapfilling_objective_function(self): + """Builds gapfilling objective function for model + """ + reaction_objective = self.model.problem.Objective(Zero, direction="min") + obj_coef = dict() + for reaction in self.model.reactions: + if reaction.id in self.gapfilling_penalties: + # Minimizing gapfilled reactions + if "reverse" in self.gapfilling_penalties[reaction.id]: + obj_coef[reaction.reverse_variable] = abs( + self.gapfilling_penalties[reaction.id]["reverse"] + ) + if "forward" in self.gapfilling_penalties[reaction.id]: + obj_coef[reaction.forward_variable] = abs( + self.gapfilling_penalties[reaction.id]["forward"] + ) + else: + obj_coef[reaction.forward_variable] = 0 + obj_coef[reaction.reverse_variable] = 0 + self.model.objective = reaction_objective + reaction_objective.set_linear_coefficients(obj_coef) + self.parameters["gfobj"] = self.model.objective + + def create_max_flux_variables(self): + """Creates max flux variables needed for the global gapfilling formulation + Parameters + ---------- + """ + for reaction in self.model.reactions: + if reaction.id in self.gapfilling_penalties: + if "reverse" in self.gapfilling_penalties[reaction.id]: + self.maxflux_variables[reaction.id][ + "reverse" + ] = self.build_variable( + "rmaxf", 0, 1000, "continuous", reaction + ) + self.build_constraint( + "rmaxfc", + 0, + None, + { + reaction.reverse_variable: -1, + self.maxflux_variables[reaction.id]["reverse"]: 1, + }, + reaction, ) - if "forward" in self.gapfilling_penalties[reaction]: - self.gapfilling_penalties[reaction]["forward"] = ( - factor * self.gapfilling_penalties[reaction]["forward"] + if "forward" in self.gapfilling_penalties[reaction.id]: + self.maxflux_variables[reaction.id][ + "forward" + ] = self.build_variable( + "fmaxf", 0, 1000, "continuous", reaction + ) + self.build_constraint( + "fmaxfc", + 0, + None, + { + reaction.forward_variable: -1, + 
self.maxflux_variables[reaction.id]["forward"]: 1, + }, + reaction, ) - self.model.solver.update() - if self.parameters["set_objective"] == 1: - reaction_objective = self.model.problem.Objective(Zero, direction="min") - obj_coef = dict() - for reaction in self.model.reactions: - if reaction.id in self.gapfilling_penalties: - # Minimizing gapfilled reactions - if "reverse" in self.gapfilling_penalties[reaction.id]: - obj_coef[reaction.reverse_variable] = abs( - self.gapfilling_penalties[reaction.id]["reverse"] - ) - # elif default_penalty != 0: - # obj_coef[reaction.reverse_variable] = 0 - if "forward" in self.gapfilling_penalties[reaction.id]: - obj_coef[reaction.forward_variable] = abs( - self.gapfilling_penalties[reaction.id]["forward"] - ) - # elif default_penalty != 0: - # obj_coef[reaction.forward_variable] = 0 - else: - obj_coef[reaction.forward_variable] = 0 - obj_coef[reaction.reverse_variable] = 0 - self.model.objective = reaction_objective - reaction_objective.set_linear_coefficients(obj_coef) + def set_base_objective(self,objective,minobjective): + """Sets the base objective for the model + Parameters + ---------- + objective : string | model.objective + ID of reaction to be maximized as the objective or model objective object + minobjective : float + Minimal objective value to be used + """ + #Setting the objective based on the objective argument + if isinstance(objective, str): + self.model.objective = self.model.reactions.get_by_id(objective).flux_expression + self.model.objective.direction = "max" + else: + self.model.objective = objective + #Setting original objective field + self.original_objective = self.model.objective + #Setting minimal objective constraint + self.pkgmgr.getpkg("ObjConstPkg").clear() + if minobjective: + if self.model.objective.direction == "max": + self.pkgmgr.getpkg("ObjConstPkg").build_package( + minobjective, None + ) + else: + self.pkgmgr.getpkg("ObjConstPkg").build_package( + None, minobjective + ) def extend_model_with_model_for_gapfilling(self, source_model, index): new_metabolites = {} @@ -550,6 +338,11 @@ def extend_model_with_model_for_gapfilling(self, source_model, index): if re.search("(.+)_([a-z])\d+$", modelreaction.id) != None: m = re.search("(.+)_([a-z])\d+$", modelreaction.id) if m[1] not in self.parameters["blacklist"]: + if m[1] in base_blacklist: + if base_blacklist[m[1]] == ">" or base_blacklist[m[1]] == "=": + cobra_reaction.upper_bound = 0 + if base_blacklist[m[1]] == "<" or base_blacklist[m[1]] == "=": + cobra_reaction.lower_bound = 0 cobra_reaction = modelreaction.copy() cobra_reaction.id = groups[1] + "_" + groups[2] + index if ( @@ -672,13 +465,24 @@ def extend_model_with_template_for_gapfilling(self, template, index): new_exchange, new_demand = self.extend_model_with_template_metabolites( template, index ) - + for template_reaction in template.reactions: if template_reaction.reference_id in self.parameters["blacklist"]: continue cobra_reaction = self.convert_template_reaction( template_reaction, index, template, 1 ) # TODO: move function out + if template_reaction.reference_id in base_blacklist: + if ( + base_blacklist[template_reaction.reference_id] == ">" + or base_blacklist[template_reaction.reference_id] == "=" + ): + cobra_reaction.upper_bound = 0 + if ( + base_blacklist[template_reaction.reference_id] == "<" + or base_blacklist[template_reaction.reference_id] == "=" + ): + cobra_reaction.lower_bound = 0 new_penalties[cobra_reaction.id] = dict() if ( cobra_reaction.id not in self.model.reactions @@ -718,7 +522,7 @@ def 
extend_model_with_template_for_gapfilling(self, template, index): self.parameters["default_excretion"], ) for ex in exchanges: - new_penalties[ex.id] = {"added": 1, "reverse": 1, "forward": 1} + new_penalties[ex.id] = {"added": 1, "reverse": self.parameters["default_exchange_penalty"], "forward": self.parameters["default_exchange_penalty"]} # Only run this on new demands so we don't readd for all exchanges exchanges = self.modelutl.add_exchanges_for_metabolites( @@ -728,7 +532,7 @@ def extend_model_with_template_for_gapfilling(self, template, index): "DM_", ) for ex in exchanges: - new_penalties[ex.id] = {"added": 1, "reverse": 1, "forward": 1} + new_penalties[ex.id] = {"added": 1, "reverse": self.parameters["default_exchange_penalty"], "forward": self.parameters["default_exchange_penalty"]} # Adding all new reactions to the model at once (much faster than one at a time) self.model.add_reactions(new_reactions.values()) @@ -802,6 +606,19 @@ def convert_template_reaction( return cobra_reaction + def set_media(self, media): + if self.parameters["base_media"]: + reaction_exceptions = [] + for mediacpd in media.mediacompounds: + if not self.parameters["base_media"].find_mediacpd(mediacpd.id): + ex_hash = mediacpd.get_mdl_exchange_hash(self.modelutl) + for mdlcpd in ex_hash: + reaction_exceptions.append(ex_hash[mdlcpd]) + self.modelutl.pkgmgr.getpkg("ElementUptakePkg").build_package( + {self.parameters["base_media_target_element"]:1}, exception_reactions=reaction_exceptions + ) + self.modelutl.pkgmgr.getpkg("KBaseMediaPkg").build_package(media, self.parameters["default_uptake"], self.parameters["default_excretion"]) + def binary_check_gapfilling_solution(self, solution=None, flux_values=None): if solution is None: solution = self.compute_gapfilled_solution() @@ -843,12 +660,12 @@ def knockout_gf_reactions_outside_solution(self, solution=None, flux_values=None if rxnobj.id in self.gapfilling_penalties: if ( "reverse" in self.gapfilling_penalties[rxnobj.id] - and flux_values[rxnobj.id]["reverse"] <= Zero + and flux_values[rxnobj.id]["reverse"] <= zero_threshold ): rxnobj.lower_bound = 0 if ( "forward" in self.gapfilling_penalties[rxnobj.id] - and flux_values[rxnobj.id]["forward"] <= Zero + and flux_values[rxnobj.id]["forward"] <= zero_threshold ): rxnobj.upper_bound = 0 rxnobj.update_variable_bounds() @@ -869,7 +686,7 @@ def run_test_conditions(self, condition_list, solution=None, max_iterations=10): with self.model: # Setting all gapfilled reactions not in the solution to zero self.knockout_gf_reactions_outside_solution(solution) - self.pkgmgr.getpkg("ObjConstPkg").constraints["objc"]["1"].lb = 0 + self.reset_objective_minimum(0,False) for condition in condition_list: condition["change"] = True filtered_list = self.modelutl.reaction_expansion_test( @@ -879,7 +696,7 @@ def run_test_conditions(self, condition_list, solution=None, max_iterations=10): condition["change"] = False if len(filtered_list) > 0: if max_iterations > 0: - print("Gapfilling test failed " + str(11 - max_iterations)) + logger.warning("Gapfilling test failed " + str(11 - max_iterations)) # Forcing filtered reactions to zero for item in filtered_list: if item[1] == ">": @@ -887,9 +704,7 @@ def run_test_conditions(self, condition_list, solution=None, max_iterations=10): else: self.model.reactions.get_by_id(item[0].id).lower_bound = 0 # Restoring lower bound on biomass constraint - self.pkgmgr.getpkg("ObjConstPkg").constraints["objc"][ - "1" - ].lb = self.parameters["minimum_obj"] + 
self.reset_objective_minimum(self.parameters["minimum_obj"]) # Reoptimizing self.model.optimize() return self.run_test_conditions( @@ -898,43 +713,110 @@ def run_test_conditions(self, condition_list, solution=None, max_iterations=10): return None return solution - def filter_database_based_on_tests(self, test_conditions): - filetered_list = [] - with self.model: - rxnlist = [] - for reaction in self.model.reactions: - if reaction.id in self.gapfilling_penalties: - if "reverse" in self.gapfilling_penalties[reaction.id]: - rxnlist.append([reaction, "<"]) - if "forward" in self.gapfilling_penalties[reaction.id]: - rxnlist.append([reaction, ">"]) - self.pkgmgr.getpkg("ObjConstPkg").constraints["objc"]["1"].lb = 0 - filtered_list = self.modelutl.reaction_expansion_test( - rxnlist, test_conditions - ) + def test_gapfill_database(self): + self.reset_objective_minimum(0,False) + self.model.objective = self.original_objective + self.test_solution = self.model.optimize() + logger.info( + "Objective with gapfill database:" + + str(self.test_solution.objective_value) + + "; min objective:" + + str(self.parameters["minimum_obj"]) + ) + self.reset_objective_minimum(self.parameters["minimum_obj"]) + self.model.objective = self.parameters["gfobj"] + if self.test_solution.objective_value < self.parameters["minimum_obj"] or self.test_solution.status == 'infeasible': + return False + return True + + def reset_objective_minimum(self, min_objective,reset_params=True): + if reset_params and min_objective != 0: + self.parameters["minimum_obj"] = min_objective + if "1" not in self.pkgmgr.getpkg("ObjConstPkg").constraints["objc"]: + self.pkgmgr.getpkg("ObjConstPkg").build_package(min_objective, None) + if min_objective == 0: + if self.parameters["minimum_obj"] > 0: + self.pkgmgr.getpkg("ObjConstPkg").constraints["objc"]["1"].lb = 0 + if self.parameters["minimum_obj"] < 0: + self.pkgmgr.getpkg("ObjConstPkg").constraints["objc"]["1"].ub = 0 + else: + if min_objective > 0: + self.pkgmgr.getpkg("ObjConstPkg").constraints["objc"]["1"].lb = min_objective + if min_objective < 0: + self.pkgmgr.getpkg("ObjConstPkg").constraints["objc"]["1"].ub = min_objective + + def filter_database_based_on_tests(self,test_conditions,growth_conditions=[],base_filter=None,base_target="rxn00062_c0",base_filter_only=False,all_noncore=True): + #Saving the current media + current_media = self.current_media() + #Clearing element uptake constraints + self.pkgmgr.getpkg("ElementUptakePkg").clear() + # Setting the minimal growth constraint to zero + self.reset_objective_minimum(0,False) + # Applying base filter + base_filter_list = [] + if base_filter != None: + for media_id in base_filter: + if base_target in base_filter[media_id]: + for threshold in base_filter[media_id][base_target]: + for rxn_id in base_filter[media_id][base_target][threshold]: + for direction in base_filter[media_id][base_target][threshold][rxn_id]: + if rxn_id in self.model.reactions: + rxnobj = self.model.reactions.get_by_id(rxn_id) + base_filter_list.append([rxnobj,direction]) + if direction == ">": + rxnobj.upper_bound = 0 + else: + rxnobj.lower_bound = 0 + # Filtering the database of any reactions that violate the specified tests + filtered_list = [] + if not base_filter_only: + with self.model: + rxnlist = [] + for reaction in self.model.reactions: + if reaction.id in self.gapfilling_penalties: + if "reverse" in self.gapfilling_penalties[reaction.id]: + rxnlist.append([reaction, "<"]) + if "forward" in self.gapfilling_penalties[reaction.id]: + rxnlist.append([reaction, 
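The sign convention inside `reset_objective_minimum` above is easy to miss: a positive target is a lower bound (a maximized objective), a negative target is an upper bound (a minimized objective that runs negative), and zero relaxes the active bound. A distilled, standalone illustration (a plain namespace stands in for the optlang `objc` constraint; not the ModelSEEDpy API):

```python
from types import SimpleNamespace

def apply_objective_floor(constraint, min_objective):
    # Mirrors the bound selection in reset_objective_minimum: positive
    # minima tighten .lb, negative minima tighten .ub, zero relaxes.
    if min_objective > 0:
        constraint.lb = min_objective
    elif min_objective < 0:
        constraint.ub = min_objective
    else:
        constraint.lb = 0

objc = SimpleNamespace(lb=0.01, ub=None)
apply_objective_floor(objc, 0.1)
assert objc.lb == 0.1
apply_objective_floor(objc, 0)
assert objc.lb == 0
```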
">"]) + elif all_noncore and not self.modelutl.is_core(reaction): + if reaction.lower_bound < 0: + rxnlist.append([reaction, "<"]) + if reaction.upper_bound > 0: + rxnlist.append([reaction, ">"]) + filtered_list = self.modelutl.reaction_expansion_test( + rxnlist, test_conditions + ) + #Adding base filter reactions to model + if base_filter != None: + gf_filter_att = self.modelutl.get_attributes("gf_filter", {}) + for media_id in base_filter: + if media_id not in gf_filter_att: + gf_filter_att[media_id] = {} + if base_target in base_filter[media_id]: + if base_target not in gf_filter_att[media_id]: + gf_filter_att[media_id][base_target] = {} + for threshold in base_filter[media_id][base_target]: + if threshold not in gf_filter_att[media_id][base_target]: + gf_filter_att[media_id][base_target][threshold] = {} + for rxn_id in base_filter[media_id][base_target][threshold]: + if rxn_id not in gf_filter_att[media_id][base_target][threshold]: + gf_filter_att[media_id][base_target][threshold][rxn_id] = {} + for direction in base_filter[media_id][base_target][threshold][rxn_id]: + if direction not in gf_filter_att[media_id][base_target][threshold][rxn_id]: + gf_filter_att[media_id][base_target][threshold][rxn_id][direction] = {} + gf_filter_att[media_id][base_target][threshold][rxn_id][direction][direction] = base_filter[media_id][base_target][threshold][rxn_id][direction] # Now constraining filtered reactions to zero for item in filtered_list: - logger.debug("Filtering:", item[0].id, item[1]) + logger.debug("Filtering:" + item[0].id + item[1]) if item[1] == ">": self.model.reactions.get_by_id(item[0].id).upper_bound = 0 else: self.model.reactions.get_by_id(item[0].id).lower_bound = 0 # Now testing if the gapfilling minimum objective can still be achieved - gfobj = self.model.objective - self.model.objective = self.parameters["origobj"] - solution = self.model.optimize() - # Restoring the minimum objective constraint - self.pkgmgr.getpkg("ObjConstPkg").constraints["objc"]["1"].lb = self.parameters[ - "minimum_obj" - ] - print( - "Objective after filtering:", - solution.objective_value, - "; min objective:", - self.parameters["minimum_obj"], - ) - if solution.objective_value < self.parameters["minimum_obj"]: + if not self.test_gapfill_database(): # Now we need to restore a minimal set of filtered reactions such that we permit the minimum objective to be reached + # Restoring the minimum objective constraint + self.reset_objective_minimum(self.parameters["minimum_obj"]) new_objective = self.model.problem.Objective(Zero, direction="min") filterobjcoef = dict() for item in filtered_list: @@ -945,7 +827,6 @@ def filter_database_based_on_tests(self, test_conditions): else: filterobjcoef[rxn.reverse_variable] = item[3] rxn.lower_bound = item[2] - self.model.objective = new_objective new_objective.set_linear_coefficients(filterobjcoef) solution = self.model.optimize() @@ -967,22 +848,24 @@ def filter_database_based_on_tests(self, test_conditions): else: count += -1 rxn.lower_bound = 0 - print("Reactions unfiltered:", count) + logger.debug("Reactions unfiltered:" + str(count)) # Checking for model reactions that can be removed to enable all tests to pass - self.pkgmgr.getpkg("ObjConstPkg").constraints["objc"]["1"].lb = 0 + self.reset_objective_minimum(0,False) filtered_list = self.modelutl.reaction_expansion_test( - self.parameters["original_reactions"], test_conditions + self.parameters["original_reactions"], test_conditions,positive_growth=growth_conditions ) for item in filtered_list: - 
logger.debug("Filtering:", item[0].id, item[1]) + logger.debug("Filtering:" + item[0].id + item[1]) if item[1] == ">": self.model.reactions.get_by_id(item[0].id).upper_bound = 0 else: self.model.reactions.get_by_id(item[0].id).lower_bound = 0 - self.pkgmgr.getpkg("ObjConstPkg").constraints["objc"][ - "1" - ].lb = self.parameters["minimum_obj"] - self.model.objective = gfobj + # Restoring gapfilling objective function and minimal objective constraint + self.reset_objective_minimum(self.parameters["minimum_obj"]) + self.model.objective = self.parameters["gfobj"] + if current_media: + self.set_media(current_media) + return True def compute_gapfilled_solution(self, flux_values=None): if flux_values is None: @@ -991,19 +874,23 @@ def compute_gapfilled_solution(self, flux_values=None): for reaction in self.model.reactions: if reaction.id in self.gapfilling_penalties: if ( - flux_values[reaction.id]["forward"] > Zero + flux_values[reaction.id]["forward"] > zero_threshold and "forward" in self.gapfilling_penalties[reaction.id] ): if "added" in self.gapfilling_penalties[reaction.id]: + logger.debug(f"New gapfilled reaction: {reaction.id} >") output["new"][reaction.id] = ">" else: + logger.debug(f"Reversed gapfilled reaction: {reaction.id} >") output["reversed"][reaction.id] = ">" elif ( - flux_values[reaction.id]["reverse"] > Zero + flux_values[reaction.id]["reverse"] > zero_threshold and "reverse" in self.gapfilling_penalties[reaction.id] ): if "added" in self.gapfilling_penalties[reaction.id]: + logger.debug(f"New gapfilled reaction: {reaction.id} <") output["new"][reaction.id] = "<" else: + logger.debug(f"Reversed gapfilled reaction: {reaction.id} <") output["reversed"][reaction.id] = "<" return output diff --git a/modelseedpy/fbapkg/kbasemediapkg.py b/modelseedpy/fbapkg/kbasemediapkg.py index 4dbf0779..92525b30 100644 --- a/modelseedpy/fbapkg/kbasemediapkg.py +++ b/modelseedpy/fbapkg/kbasemediapkg.py @@ -16,6 +16,7 @@ class KBaseMediaPkg(BaseFBAPkg): def __init__(self, model): BaseFBAPkg.__init__(self, model, "kbase media", {}, {}) + self.current_media = None def build_package( self, media_or_parameters, default_uptake=None, default_excretion=None @@ -40,14 +41,21 @@ def build_package( self.parameters["default_uptake"] = 0 if self.parameters["default_excretion"] is None: self.parameters["default_excretion"] = 100 - if self.parameters["media"] is None and self.parameters["default_uptake"] == 0: + self.current_media = self.parameters["media"] + if ( + self.parameters["media"] and self.parameters["media"].name == "Complete" + ) and self.parameters["default_uptake"] == 0: self.parameters["default_uptake"] = 100 # First initializing all exchanges to default uptake and excretion exchange_list = self.modelutl.exchange_list() for reaction in exchange_list: - reaction.lower_bound = -1 * self.parameters["default_uptake"] - reaction.upper_bound = self.parameters["default_excretion"] + if -1 * self.parameters["default_uptake"] > reaction.upper_bound: + reaction.upper_bound = self.parameters["default_excretion"] + reaction.lower_bound = -1 * self.parameters["default_uptake"] + else: + reaction.lower_bound = -1 * self.parameters["default_uptake"] + reaction.upper_bound = self.parameters["default_excretion"] # Now constraining exchanges for specific compounds specified in the media if self.parameters["media"]: diff --git a/modelseedpy/fbapkg/proteomefittingpkg.py b/modelseedpy/fbapkg/proteomefittingpkg.py index 469efc08..3aedacb5 100644 --- a/modelseedpy/fbapkg/proteomefittingpkg.py +++ 
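The reordered bound assignments in `KBaseMediaPkg.build_package` above exist because writing a lower bound that crosses the current upper bound (or vice versa) is rejected by recent cobrapy versions. The ordering logic in isolation, with a plain namespace standing in for a cobra exchange reaction:

```python
from types import SimpleNamespace

def set_exchange_bounds(reaction, default_uptake, default_excretion):
    # Write the ceiling first whenever the new floor would cross it,
    # mirroring the ordering added to KBaseMediaPkg.build_package.
    new_lb = -1 * default_uptake
    if new_lb > reaction.upper_bound:
        reaction.upper_bound = default_excretion
        reaction.lower_bound = new_lb
    else:
        reaction.lower_bound = new_lb
        reaction.upper_bound = default_excretion

# A forced-secretion exchange (-1000, -10) reset to (0, 100) would cross
# bounds if the lower bound were written first.
rxn = SimpleNamespace(lower_bound=-1000, upper_bound=-10)
set_exchange_bounds(rxn, default_uptake=0, default_excretion=100)
assert (rxn.lower_bound, rxn.upper_bound) == (0, 100)
```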
b/modelseedpy/fbapkg/proteomefittingpkg.py
@@ -7,7 +7,7 @@
 from optlang.symbolics import Zero, add
 from modelseedpy.fbapkg.basefbapkg import BaseFBAPkg
 from modelseedpy.core.fbahelper import FBAHelper
-from modelseedpy.multiomics.msexpression import MSExpression, GENOME, MODEL, COLUMN_NORM
+from modelseedpy.multiomics.msexpression import MSExpression

 # Options for default behavior
 LOWEST = 10
@@ -45,7 +45,7 @@ def build_package(self, parameters):
         )
         objvars = []
         # Converting genome proteome to reaction proteome if necessary
-        if self.parameters["proteome"].type == GENOME:
+        if self.parameters["proteome"].type == "genome":
             self.parameters["proteome"] = self.parameters[
                 "proteome"
             ].build_reaction_expression(
@@ -123,7 +123,7 @@ def build_constraint(self, object, type):
         # kvfit(i) = kapp(i)*ProtCoef*Prot(i) - v(i)
         # Pulling expression value for selected condition and reaction
         expval = self.parameters["proteome"].get_value(
-            object.id, self.parameters["condition"], COLUMN_NORM
+            object.id, self.parameters["condition"], "column_norm"
         )
         if expval is None and self.parameters["default_expression"] is not None:
             if self.parameters["default_expression"] == LOWEST:
diff --git a/modelseedpy/fbapkg/reactionactivationpkg.py b/modelseedpy/fbapkg/reactionactivationpkg.py
new file mode 100644
index 00000000..f43bac06
--- /dev/null
+++ b/modelseedpy/fbapkg/reactionactivationpkg.py
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import absolute_import
+import logging
+
+logger = logging.getLogger(__name__)
+from modelseedpy.fbapkg.basefbapkg import BaseFBAPkg
+from modelseedpy.core.fbahelper import FBAHelper
+
+# Package that adds per-reaction activation variables which, when maximized,
+# encourage flux through the selected reactions
+class ReactionActivationPkg(BaseFBAPkg):
+    def __init__(self, model):
+        BaseFBAPkg.__init__(
+            self,
+            model,
+            "ReactionActivation",
+            {"fra": "reaction", "rra": "reaction"},
+            {"fra": "reaction", "rra": "reaction"},
+        )
+
+    def build_package(self, rxn_filter=None, max_value=0.001):
+        self.pkgmgr.getpkg("RevBinPkg").build_package(filter=rxn_filter)
+        for rxn in self.model.reactions:
+            # Checking that reaction passes input filter if one is provided
+            if rxn_filter == None:
+                self.build_variable(rxn, max_value)
+                self.build_constraint(rxn)
+            elif rxn.id in rxn_filter:
+                self.build_variable(rxn, max_value)
+                self.build_constraint(rxn)
+
+    def build_variable(self, cobra_obj, max_value):
+        variable = BaseFBAPkg.build_variable(self, "fra", 0, max_value, "continuous", cobra_obj)
+        variable = BaseFBAPkg.build_variable(self, "rra", 0, max_value, "continuous", cobra_obj)
+        return variable
+
+    def build_constraint(self, cobra_obj):
+        constraint = None
+        if cobra_obj.id not in self.constraints["fra"]:
+            constraint = BaseFBAPkg.build_constraint(
+                self,
+                "fra",
+                None,
+                0,
+                {
+                    self.variables["fra"][cobra_obj.id]: 1,
+                    cobra_obj.forward_variable: -1,
+                },
+                cobra_obj,
+            )
+        if cobra_obj.id not in self.constraints["rra"]:
+            constraint = BaseFBAPkg.build_constraint(
+                self,
+                "rra",
+                None,
+                0,
+                {
+                    self.variables["rra"][cobra_obj.id]: 1,
+                    cobra_obj.reverse_variable: -1,
+                },
+                cobra_obj,
+            )
+        return constraint
\ No newline at end of file
diff --git a/modelseedpy/fbapkg/reactionusepkg.py b/modelseedpy/fbapkg/reactionusepkg.py
index c68c9a44..f3e17bc9 100644
--- a/modelseedpy/fbapkg/reactionusepkg.py
+++ b/modelseedpy/fbapkg/reactionusepkg.py
@@ -1,10 +1,12 @@
 # -*- coding: utf-8 -*-

 from __future__ import absolute_import
-
 import logging
-from optlang.symbolics import Zero, add
+
+logger = logging.getLogger(__name__)
+from optlang.symbolics import Zero, add  # !!!
add is never used from modelseedpy.fbapkg.basefbapkg import BaseFBAPkg +from modelseedpy.core.fbahelper import FBAHelper # Base class for FBA packages class ReactionUsePkg(BaseFBAPkg): @@ -22,62 +24,70 @@ def __init__(self, model): }, ) - def build_package(self, filter=None, reversibility=0): - for reaction in self.model.reactions: + def build_package(self, rxn_filter=None, reversibility=False): + for rxn in self.model.reactions: # Checking that reaction passes input filter if one is provided - if filter == None: - self.build_variable(reaction, "=") - self.build_constraint(reaction, reversibility) - elif reaction.id in filter: - self.build_variable(reaction, filter[reaction.id]) - self.build_constraint(reaction, reversibility) + if rxn_filter == None: + self.build_variable(rxn, "=") + self.build_constraint(rxn, reversibility) + elif rxn.id in rxn_filter: + self.build_variable(rxn, rxn_filter[rxn.id]) + self.build_constraint(rxn, reversibility) - def build_variable(self, object, direction): + def build_variable(self, cobra_obj, direction): variable = None if ( (direction == ">" or direction == "=") - and object.upper_bound > 0 - and object.id not in self.variables["fu"] + and cobra_obj.upper_bound > 0 + and cobra_obj.id not in self.variables["fu"] ): - variable = BaseFBAPkg.build_variable(self, "fu", 0, 1, "binary", object) + variable = BaseFBAPkg.build_variable(self, "fu", 0, 1, "binary", cobra_obj) if ( (direction == "<" or direction == "=") - and object.lower_bound < 0 - and object.id not in self.variables["ru"] + and cobra_obj.lower_bound < 0 + and cobra_obj.id not in self.variables["ru"] ): - variable = BaseFBAPkg.build_variable(self, "ru", 0, 1, "binary", object) + variable = BaseFBAPkg.build_variable(self, "ru", 0, 1, "binary", cobra_obj) return variable - def build_constraint(self, object, reversibility): + def build_constraint(self, cobra_obj, reversibility): constraint = None if ( - object.id not in self.constraints["fu"] - and object.id in self.variables["fu"] + cobra_obj.id not in self.constraints["fu"] + and cobra_obj.id in self.variables["fu"] ): constraint = BaseFBAPkg.build_constraint( self, "fu", 0, None, - {self.variables["fu"][object.id]: 1000, object.forward_variable: -1}, - object, + { + self.variables["fu"][cobra_obj.id]: 1000, + cobra_obj.forward_variable: -1, + }, + cobra_obj, ) if ( - object.id not in self.constraints["ru"] - and object.id in self.variables["ru"] + cobra_obj.id not in self.constraints["ru"] + and cobra_obj.id in self.variables["ru"] ): constraint = BaseFBAPkg.build_constraint( self, "ru", 0, None, - {self.variables["ru"][object.id]: 1000, object.reverse_variable: -1}, - object, + { + self.variables["ru"][cobra_obj.id]: 1000, + cobra_obj.reverse_variable: -1, + }, + cobra_obj, ) - if ( - reversibility == 1 - and object.id in self.variables["ru"] - and object.id in self.variables["fu"] + if all( + [ + reversibility, + cobra_obj.id in self.variables["ru"], + cobra_obj.id in self.variables["fu"], + ] ): constraint = BaseFBAPkg.build_constraint( self, @@ -85,24 +95,25 @@ def build_constraint(self, object, reversibility): None, 1, { - self.variables["ru"][object.id]: 1, - self.variables["fu"][object.id]: 1, + self.variables["ru"][cobra_obj.id]: 1, + self.variables["fu"][cobra_obj.id]: 1, }, - object, + cobra_obj, ) return constraint def build_exclusion_constraint(self, flux_values=None): - if flux_values == None: - flux_values = FBAHelper.compute_flux_values_from_variables(self.model) + flux_values = flux_values or 
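The `fu`/`ru` constraints in `ReactionUsePkg` above are standard big-M couplings: flux in a given direction can be nonzero only when its binary use variable is on, with M = 1000 matching the default flux bounds. A self-contained optlang sketch of the forward coupling (hypothetical variable names; requires a MILP-capable solver such as GLPK, optlang's default):

```python
from optlang import Constraint, Model, Objective, Variable

v_forward = Variable("v_forward", lb=0, ub=1000)
use_forward = Variable("use_forward", type="binary")

# 1000 * use_forward - v_forward >= 0, i.e. v_forward <= 1000 * use_forward
link = Constraint(1000 * use_forward - v_forward, lb=0)

m = Model(name="big_m_sketch")
m.add([link])
# A small penalty on the binary keeps it off unless flux demands it.
m.objective = Objective(v_forward - 0.01 * use_forward, direction="max")
m.optimize()
print(use_forward.primal, v_forward.primal)  # 1.0 1000.0
```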
FBAHelper.compute_flux_values_from_variables(
+            self.model
+        )
         count = len(self.constraints["exclusion"])
         solution_coef = {}
         solution_size = 0
-        for rxnid in flux_values:
-            if flux_values[rxnid] > Zero:
+        for rxnid, flux in flux_values.items():
+            if flux > Zero:
                 solution_size += 1
                 solution_coef[self.variables["fu"][rxnid]] = 1
-            elif flux_values[rxnid] < -1 * Zero:
+            elif flux < -1 * Zero:
                 solution_size += 1
                 solution_coef[self.variables["ru"][rxnid]] = 1
         if len(solution_coef) > 0:
diff --git a/modelseedpy/multiomics/msexpression.py b/modelseedpy/multiomics/msexpression.py
index 02453e34..c73a7ead 100644
--- a/modelseedpy/multiomics/msexpression.py
+++ b/modelseedpy/multiomics/msexpression.py
@@ -1,25 +1,18 @@
 # -*- coding: utf-8 -*-
 import logging
+import pandas as pd
 import re
 import copy
 from cobra.core.dictlist import DictList
-from cobra.core.gene import Gene, ast2str, eval_gpr, parse_gpr
+from cobra.core.gene import Gene, ast2str, eval_gpr, parse_gpr, GPR
 from ast import And, BitAnd, BitOr, BoolOp, Expression, Name, NodeTransformer, Or
 from modelseedpy.core.msgenome import MSGenome, MSFeature

-# Types of expression data
-GENOME = 10
-MODEL = 20
-
-# Types of normalization
-COLUMN_NORM = 10
-
 logger = logging.getLogger(__name__)

-
 def compute_gene_score(expr, values, default):
-    if isinstance(expr, Expression):
+    if isinstance(expr, (Expression, GPR)):
         return compute_gene_score(expr.body, values, default)
     elif isinstance(expr, Name):
         if expr.id in values:
@@ -29,16 +22,21 @@ def compute_gene_score(expr, values, default):
     elif isinstance(expr, BoolOp):
         op = expr.op
         if isinstance(op, Or):
-            total = 0
+            total = None
             for subexpr in expr.values:
-                total += compute_gene_score(subexpr, values, default)
+                value = compute_gene_score(subexpr, values, default)
+                if value != None:
+                    if total == None:
+                        total = 0
+                    total += value
             return total
         elif isinstance(op, And):
             least = None
             for subexpr in expr.values:
                 value = compute_gene_score(subexpr, values, default)
-                if least == None or value < least:
-                    least = value
+                if value != None:
+                    if least == None or value < least:
+                        least = value
             return least
         else:
             raise TypeError("unsupported operation " + op.__class__.__name__)
@@ -49,12 +47,22 @@ def compute_gene_score(expr, values, default):

 class MSCondition:
-    def __init__(self, id):
+    def __init__(self, id, parent):
         self.id = id
         self.column_sum = None
         self.feature_count = None
         self.lowest = None
-
+        self.parent = parent
+
+    def value_at_zscore(self, zscore, normalization=None):
+        array = []
+        for feature in self.parent.features:
+            value = feature.get_value(self, normalization)
+            if value != None:
+                array.append(value)
+        if len(array) == 0:
+            # No measured features; no meaningful threshold can be computed
+            return None
+        mean = sum(array) / len(array)
+        std_dev = (sum([(x - mean) ** 2 for x in array]) / len(array)) ** 0.5
+        return mean + (zscore * std_dev)

 class MSExpressionFeature:
     def __init__(self, feature, parent):
@@ -63,23 +71,32 @@ def __init__(self, feature, parent):
         self.values = {}
         self.parent = parent

-    def add_value(self, condition, value):
+    def add_value(self, condition, value, collision_policy="add"):  # could also choose "overwrite"
         if condition in self.values:
-            condition.feature_count += -1
-            condition.column_sum += -1 * value
+            if self.values[condition] != None:
+                condition.column_sum += -1 * self.values[condition]
+            if collision_policy == "add":
+                if self.values[condition] == None:
+                    if value != None:
+                        self.values[condition] = value
+                elif value != None:
+                    self.values[condition] += value
+            else:
+                # any other policy overwrites the stored value
+                self.values[condition] = value
             logger.warning(
-                "Overwriting value "
+                collision_policy + " value "
                 + str(self.values[condition])
-                + " with "
+                + " to "
                 + str(value)
                 + " in feature "
-                + self.feature.id
-            )
-        if condition.lowest is None or condition.lowest > value:
-            condition.lowest = value
-        condition.feature_count += 1
-        condition.column_sum += value
-        self.values[condition] = value
+                + self.feature.id)
+        else:
+            condition.feature_count += 1
+            self.values[condition] = value
+        if self.values[condition] != None:
+            condition.column_sum += self.values[condition]
+            if condition.lowest is None or condition.lowest > self.values[condition]:
+                condition.lowest = self.values[condition]

     def get_value(self, condition, normalization=None):
@@ -94,7 +111,7 @@ def get_value(self, condition, normalization=None):
                 "Condition " + condition.id + " has no value in " + self.feature.id
             )
             return None
-        if normalization == COLUMN_NORM:
+        if normalization == "column_norm" and self.values[condition] != None:
             return self.values[condition] / condition.column_sum
         return self.values[condition]
@@ -107,8 +124,8 @@ def __init__(self, type):
         self.conditions = DictList()

     @staticmethod
-    def from_gene_feature_file(filename, genome=None, create_missing_features=False):
-        expression = MSExpression(GENOME)
+    def from_gene_feature_file(filename, genome=None, create_missing_features=False, ignore_columns=[], description_column=None, sep="\t"):
+        expression = MSExpression("genome")
         if genome == None:
             expression.object = MSGenome()
             create_missing_features = True
@@ -119,31 +136,41 @@ def from_gene_feature_file(filename, genome=None, create_missing_features=False)
             data = file.read()
         lines = data.split("\n")
         conditions = None
+        description_index = None
+        cond_indices = []
         for line in lines:
             if conditions == None:
                 conditions = []
-                headers = line.split("\t")
+                headers = line.split(sep)
                 for i in range(1, len(headers)):
-                    if headers[i] not in expression.conditions:
-                        conditions.append(MSCondition(headers[i]))
-                        expression.conditions.append(conditions[i - 1])
-                    else:
-                        conditions.append(self.conditions.get_by_id(headers[i]))
-                    conditions[i - 1].column_sum = 0
-                    conditions[i - 1].feature_count = 0
+                    if headers[i] == description_column:
+                        description_index = i
+                        logger.info("Description column: " + str(description_index))
+                    elif headers[i] not in ignore_columns:
+                        conditions.append(headers[i])
+                        cond_indices.append(i)
+                        if headers[i] not in expression.conditions:
+                            expression.conditions.append(MSCondition(headers[i], expression))
+                        expression.conditions.get_by_id(headers[i]).column_sum = 0
+                        expression.conditions.get_by_id(headers[i]).feature_count = 0
             else:
-                array = line.split("\t")
-                protfeature = expression.add_feature(array[0], create_missing_features)
+                array = line.split(sep)
+                description = None
+                if description_index != None:
+                    description = array[description_index]
+                protfeature = expression.add_feature(array[0], create_missing_features, description=description)
                 if protfeature != None:
-                    for i in range(1, len(array)):
-                        protfeature.add_value(conditions[i - 1], float(array[i]))
+                    for cond_index in cond_indices:
+                        protfeature.add_value(expression.conditions.get_by_id(headers[cond_index]), float(array[cond_index]))
         return expression

-    def add_feature(self, id, create_gene_if_missing=False):
+    def add_feature(self, id, create_gene_if_missing=False, description=None):
         if id in self.features:
             return self.features.get_by_id(id)
         feature = None
-        if self.type == GENOME:
+        if self.type == "genome":
             if self.object.search_for_gene(id) == None:
                 if create_gene_if_missing:
                     self.object.features.append(MSFeature(id, ""))
@@ -173,29 +200,29 @@ def get_value(self, feature, condition, normalization=None):
         return feature.get_value(condition, normalization)

     def build_reaction_expression(self, model, default):
-        if self.type == MODEL:
+        if self.type == "model":
             logger.critical(
                 "Cannot build a reaction expression from a model-based expression object!"
             )
+            return None
         # Creating the expression and features
-        rxnexpression = MSExpression(MODEL)
+        rxnexpression = MSExpression("model")
         rxnexpression.object = model
         for rxn in model.reactions:
             if len(rxn.genes) > 0:
                 rxnexpression.add_feature(rxn.id)
         for condition in self.conditions:
-            rxnexpression.conditions.append(condition)
+            newcondition = MSCondition(condition.id, rxnexpression)
+            newcondition.column_sum = 0
+            newcondition.feature_count = 0
+            rxnexpression.conditions.append(newcondition)
         # Pulling the gene values from the current expression
         values = {}
-        logger.warning("TESTING!")
         for gene in model.genes:
             feature = self.object.search_for_gene(gene.id)
             if feature == None:
-                logger.warning(
-                    "Model gene " + gene.id + " not found in genome of expression"
+                logger.debug(
+                    "Model gene " + gene.id + " not found in genome or expression"
                 )
             elif feature.id not in self.features:
-                logger.warning(
+                logger.debug(
                     "Model gene " + gene.id + " in genome but not in expression"
                 )
             else:
@@ -208,8 +235,17 @@ def build_reaction_expression(self, model, default):
         # Computing the reaction level values
         for condition in rxnexpression.conditions:
             for feature in rxnexpression.features:
-                tree = parse_gpr(feature.feature.gene_reaction_rule)[0]
+                tree = GPR().from_string(str(feature.feature.gene_reaction_rule))
                 feature.add_value(
                     condition, compute_gene_score(tree, values[condition.id], default)
                 )
         return rxnexpression
+
+    def get_dataframe(self, normalization=None):
+        records = []
+        for feature in self.features:
+            record = {"ftr_id": feature.id}
+            for condition in self.conditions:
+                record[condition.id] = feature.get_value(condition, normalization)
+            records.append(record)
+        return pd.DataFrame.from_records(records)
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..0ed58542
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,27 @@
+[build-system]
+requires = [
+    'setuptools>=40.6.0',
+    'wheel'
+]
+build-backend = "setuptools.build_meta"
+
+[tool.black]
+line-length = 88
+target-version = ['py36']
+include = '\.pyi?$'
+exclude = '''
+(
+  /(
+      \.eggs         # exclude a few common directories in the
+    | \.git          # root of the project
+    | \.hg
+    | \.mypy_cache
+    | \.tox
+    | \.venv
+    | _build
+    | buck-out
+    | build
+    | dist
+  )/
+)
+'''
diff --git a/setup.py b/setup.py
index 775d60a4..fd9e77a7 100644
--- a/setup.py
+++ b/setup.py
@@ -10,8 +10,9 @@
 setup(
     name="ModelSEEDpy",
-    version="0.2.2",
+    version="0.3.3",
     description="Python package for building and analyzing models using ModelSEED",
+    long_description_content_type="text/x-rst",
     long_description=readme,
     author="Christopher Henry",
     author_email="chenry@anl.gov",
@@ -19,23 +20,33 @@
     license=license,
     packages=find_packages(exclude=("docs")),
     package_data={
-        "modelseedpy": ["config.cfg"],
+        "modelseedpy": ["config.cfg", "data/*"],
     },
+    classifiers=[
+        "Development Status :: 3 - Alpha",
+        "Topic :: Scientific/Engineering :: Bio-Informatics",
+        "Intended Audience :: Science/Research",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
+        "Natural Language :: English",
+    ],
    install_requires=[
         "networkx >= 2.4",
-        "cobra >= 0.17.1",
-        "scikit-learn 
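Two of the pieces rewritten above are worth seeing in use. First, the reworked `compute_gene_score` sums expression across OR branches (isozymes), takes the minimum across AND branches (complex subunits), and now skips genes without a measurement instead of folding a default into the aggregate. Expected behavior, assuming this branch of modelseedpy is installed (the gene ids are hypothetical):

```python
from cobra.core.gene import GPR
from modelseedpy.multiomics.msexpression import compute_gene_score

gpr = GPR().from_string("(b0001 and b0002) or b0003")
values = {"b0001": 5.0, "b0002": 7.0, "b0003": 2.0}
# AND -> min(5.0, 7.0) = 5.0; OR -> 5.0 + 2.0 = 7.0
print(compute_gene_score(gpr, values, None))  # 7.0
```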
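Second, the new `get_dataframe` helper flattens an expression object into a features-by-conditions table via `pandas.DataFrame.from_records`, exactly as in the method body above. A sketch of the shape it produces (invented feature and condition names):

```python
import pandas as pd

records = [
    {"ftr_id": "gene_a", "cond_1": 5.0, "cond_2": 1.5},
    {"ftr_id": "gene_b", "cond_1": 7.0, "cond_2": None},
]
df = pd.DataFrame.from_records(records)
print(df.set_index("ftr_id"))
```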
== 0.23.2", # too support KBase pickle models + "cobra >= 0.28.0", + "scikit-learn == 1.2.0", # version lock for pickle ML models "scipy >= 1.5.4", "chemicals >= 1.0.13", "chemw >= 0.3.2", - "matplotlib >= 3.0.0", - "pyeda", + "matplotlib >= 3.0.0" ], tests_require=[ "pytest", ], project_urls={ - "Documentation": "https://modelseedpy.readthedocs.io/en/stable/", + "Documentation": "https://modelseedpy.readthedocs.io/en/latest/", "Issues": "https://github.com/ModelSEED/ModelSEEDpy/issues", }, ) diff --git a/tests/core/test_msatpcorreption.py b/tests/core/test_msatpcorreption.py index 13acf3c3..a60d33ec 100644 --- a/tests/core/test_msatpcorreption.py +++ b/tests/core/test_msatpcorreption.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +import os import pytest import json import cobra @@ -8,13 +9,26 @@ @pytest.fixture def template(): - with open("./tests/test_data/template_core_bigg.json", "r") as fh: + with open( + os.path.join( + os.path.dirname(__file__), "..", "test_data", "template_core_bigg.json" + ), + "r", + ) as fh: return MSTemplateBuilder.from_dict(json.load(fh)).build() @pytest.fixture def template_genome_scale(): - with open("./tests/test_data/template_genome_scale_bigg.json", "r") as fh: + with open( + os.path.join( + os.path.dirname(__file__), + "..", + "test_data", + "template_genome_scale_bigg.json", + ), + "r", + ) as fh: return MSTemplateBuilder.from_dict(json.load(fh)).build() @@ -23,7 +37,12 @@ def get_model(): def _method(ko=None, added_compounds=None, added_reactions=None): if ko is None: ko = [] - with open("./tests/test_data/e_coli_core.json", "r") as fh: + with open( + os.path.join( + os.path.dirname(__file__), "..", "test_data", "e_coli_core.json" + ), + "r", + ) as fh: model_json = json.load(fh) model_json["compartments"] = { k + "0": v for (k, v) in model_json["compartments"].items() @@ -89,7 +108,7 @@ def media_acetate_aerobic(): "h2o": (-1000, 1000), } ) - media.id = "glc/o2" + media.id = "ac/o2" return media @@ -186,9 +205,14 @@ def test_infinite_atp_model_growth_boost( def test_ms_atp_correction1(get_model, template, media_all_aerobic): + atp_hydrolysis_id = "ATPM_c0" model = get_model(["GLCpts_c0", "NADH16_c0", "CYTBD_c0", "O2t_c0"]) atp_correction = MSATPCorrection( - model, template, media_all_aerobic, atp_hydrolysis_id="ATPM_c0" + model, + template, + media_all_aerobic, + atp_hydrolysis_id=atp_hydrolysis_id, + load_default_medias=False, ) atp_correction.evaluate_growth_media() assert len(atp_correction.noncore_reactions) == 1 # the biomass @@ -211,9 +235,14 @@ def test_ms_atp_correction1(get_model, template, media_all_aerobic): tests = atp_correction.build_tests() assert tests - assert len(tests) == 1 - assert tests[0]["threshold"] > 0 - assert tests[0]["objective"] == "ATPM_c0" + assert len(tests) == 2 # glucose and empty + for t in tests: + if t["media"].id == "empty": + assert t["threshold"] <= 1e-05 + else: + assert t["threshold"] > 1e-05 + assert t["objective"] == atp_hydrolysis_id + assert t["is_max_threshold"] is True def test_ms_atp_correction_and_gap_fill1( @@ -225,35 +254,45 @@ def test_ms_atp_correction_and_gap_fill1( ): from modelseedpy import MSGapfill + atp_hydrolysis_id = "ATPM_c0" + model = get_model_with_infinite_atp_loop(["GLCpts_c0", "GLUSy_c0", "GLUDy_c0"]) model.reactions.ATPM_c0.lower_bound = 0 model.reactions.ATPM_c0.upper_bound = 1000 - + model.objective = atp_hydrolysis_id atp_correction = MSATPCorrection( - model, template, [media_glucose_aerobic], atp_hydrolysis_id="ATPM_c0" + model, + template, + [media_glucose_aerobic], + 
atp_hydrolysis_id=atp_hydrolysis_id, + load_default_medias=False, ) tests = atp_correction.run_atp_correction() - # expected tests = [{'media': MSMedia object, 'is_max_threshold': True, 'threshold': 21.0, 'objective': 'ATPM_c0'}] assert tests - assert len(tests) == 1 - assert tests[0]["threshold"] > 0 - assert tests[0]["objective"] == "ATPM_c0" - + assert len(tests) == 2 + for t in tests: + if t["media"].id == "empty": + assert t["threshold"] <= 1e-05 + else: + assert t["threshold"] > 1e-05 + assert t["objective"] == atp_hydrolysis_id + assert t["is_max_threshold"] is True + + model.objective = "BIOMASS_Ecoli_core_w_GAM_c0" gap_fill = MSGapfill(model, [template_genome_scale], [], tests, {}, []) result = gap_fill.run_gapfilling( media_genome_scale_glucose_aerobic, "BIOMASS_Ecoli_core_w_GAM_c0", minimum_obj=0.1, ) - # either GLUSy_c0 or GLUDy_c0 should be gap filled for glutamate assert result assert len(result["new"]) == 1 assert "GLUSy_c0" in result["new"] or "GLUDy_c0" in result["new"] - model = gap_fill.integrate_gapfill_solution(result) + gap_fill.integrate_gapfill_solution(result) - assert model + # TODO: add some model testing assertion diff --git a/tests/core/test_msgapfill.py b/tests/core/test_msgapfill.py index 77238f59..622a0924 100644 --- a/tests/core/test_msgapfill.py +++ b/tests/core/test_msgapfill.py @@ -1,54 +1,5 @@ # -*- coding: utf-8 -*- -""" -from glob import glob -os.environ["HOME"] = 'C:\\Users\\Andrew Freiburger\\Dropbox\\My PC (DESKTOP-M302P50)\\Documents\\UVic Civil Engineering\\Internships\\Agronne\\cobrakbase' -import cobrakbase -token = 'xx' -kbase = cobrakbase.KBaseAPI(token) -import re - -# define the example individual model and associated API media package -model = kbase.get_from_ws('e_coli_core.kb', 95098) -model.solver = 'optlang-cplex' - -# import the modelseedpy packages -import modelseedpy -from modelseedpy.core.msgapfill import MSGapfill -gapfill = MSGapfill(model) - -def test_init(): - assert type(gapfill.model) is cobrakbase.core.kbasefba.fbamodel.FBAModel - assert type(gapfill.blacklist) is list - assert type(gapfill.solutions) is dict - -def test_run_gapfilling_and_integrate_gapfill_solution(): - solutions = gapfill.run_gapfilling() - - # test that the objective expression is correctly set - if solutions is not None: - assert type(solutions) is dict - - # verify the integrate_gapfill_solution function - model_2 = gapfill.integrate_gapfill_solution(solutions) - assert type(model_2) is cobrakbase.core.kbasefba.fbamodel.FBAModel - - for reaction in solutions['reversed']: - if solution["reversed"][reaction] == ">": - assert reaction.upper_bound == 100 - else: - assert reaction.lower_bound == -100 - - for reaction in solutions['new']: - if solution["new"][reaction] == ">": - assert reaction.upper_bound == 100 - assert reaction.lower_bound == 0 - else: - assert reaction.upper_bound == 0 - assert reaction.lower_bound == -100 - -def test_gapfill(): - pass -""" +import os import pytest import json import cobra @@ -58,7 +9,12 @@ def test_gapfill(): @pytest.fixture def template(): - with open("./tests/test_data/template_core_bigg.json", "r") as fh: + with open( + os.path.join( + os.path.dirname(__file__), "..", "test_data", "template_core_bigg.json" + ), + "r", + ) as fh: return MSTemplateBuilder.from_dict(json.load(fh)).build() @@ -67,7 +23,12 @@ def get_model(): def _method(ko=None): if ko is None: ko = [] - with open("./tests/test_data/e_coli_core.json", "r") as fh: + with open( + os.path.join( + os.path.dirname(__file__), "..", "test_data", 
"e_coli_core.json" + ), + "r", + ) as fh: model_json = json.load(fh) model_json["compartments"] = { k + "0": v for (k, v) in model_json["compartments"].items() diff --git a/tox.ini b/tox.ini new file mode 100644 index 00000000..d5ff7ef9 --- /dev/null +++ b/tox.ini @@ -0,0 +1,36 @@ +[tox] +envlist = py38,py39,py310 + +[gh-actions] +python = + 3.8: py38 + 3.9: py39 + 3.10: py310 + +[testenv] +setenv = ARCHIVEINTERFACE_CPCONFIG = {toxinidir}/server.conf +deps = + build + coverage + mock + pre-commit + pytest + pytest-cov + recommonmark + setuptools +commands = pytest --cov --cov-append --cov-report=term-missing +changedir = tests + +[testenv:report] +deps = coverage +skip_install = true +commands = + coverage report -m + coverage html +changedir = tests + +[testenv:clean] +deps = coverage +skip_install = true +commands = coverage erase +changedir = tests