diff --git a/MANIFEST.in b/MANIFEST.in index e0e7cc153..09ec61e51 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,6 @@ recursive-include simba * +prune simba/sandbox +recursive-exclude simba/sandbox * prune */__pycache__ recursive-exclude */__pycache__ * global-exclude __pycache__/* diff --git a/docs/nb/Untitled.ipynb b/docs/nb/Untitled.ipynb new file mode 100644 index 000000000..5feb95377 --- /dev/null +++ b/docs/nb/Untitled.ipynb @@ -0,0 +1,166 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "abb4b6b3", + "metadata": {}, + "source": [ + "# Shapley calculations: Example II (MULTI-CORE)" + ] + }, + { + "cell_type": "markdown", + "id": "0ee78a39", + "metadata": {}, + "source": [ + "In this example, we have previously created a classifier. We have the data used to create this classifier, and now we want to compute SHAP explainability scores for this classifier using multiprocessing (to speed things up)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b35abe6", + "metadata": {}, + "outputs": [], + "source": [ + "from simba.mixins.train_model_mixin import TrainModelMixin\n", + "from simba.mixins.config_reader import ConfigReader\n", + "from simba.utils.read_write import read_df, read_config_file\n", + "import glob" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55fd00d6", + "metadata": {}, + "outputs": [], + "source": [ + "# DEFINITIONS\n", + "CONFIG_PATH = r\"C:\\troubleshooting\\mitra\\project_folder\\project_config.ini\"\n", + "CLASSIFIER_PATH = r\"C:\\troubleshooting\\mitra\\models\\generated_models\\grooming.sav\"\n", + "CLASSIFIER_NAME = 'grooming'\n", + "COUNT_PRESENT = 250\n", + "COUNT_ABSENT = 250" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4c29a21", + "metadata": {}, + "outputs": [], + "source": [ + "# READ IN THE CONFIG AND THE CLASSIFIER\n", + "config = read_config_file(config_path=CONFIG_PATH)\n", + "config_object = ConfigReader(config_path=CONFIG_PATH)\n", + 
"clf = read_df(file_path=CLASSIFIER_PATH, file_type='pickle')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1381c40", + "metadata": {}, + "outputs": [], + "source": [ + "# READ IN THE DATA \n", + "\n", + "#Read in the path to all files inside the project_folder/csv/targets_inserted directory\n", + "file_paths = glob.glob(config_object.targets_folder + '/*' + config_object.file_type)\n", + "\n", + "#Reads in the data held in all files in ``file_paths`` defined above\n", + "data, _ = TrainModelMixin().read_all_files_in_folder_mp(file_paths=file_paths, file_type=config.get('General settings', 'workflow_file_type').strip())\n", + "\n", + "#We find all behavior annotations that are NOT the targets. I.e., if SHAP values for Attack is going to be calculated, bit we need to find which other annotations exist in the data e.g., Escape and Defensive.\n", + "non_target_annotations = TrainModelMixin().read_in_all_model_names_to_remove(config=config, model_cnt=config_object.clf_cnt, clf_name=CLASSIFIER_NAME)\n", + "\n", + "# We remove the body-part coordinate columns and the annotations which are not the target from the data \n", + "data = data.reset_index(drop=True).drop(non_target_annotations + config_object.bp_headers, axis=1)\n", + "\n", + "# We place the target data in its own variable\n", + "target_df = data.pop(CLASSIFIER_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "e978462c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing 20 SHAP values (MULTI-CORE BATCH SIZE: 10, FOLLOW PROGRESS IN OS TERMINAL)...\n", + "Concatenating multi-processed SHAP data (batch 1/2)\n", + "Concatenating multi-processed SHAP data (batch 2/2)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.\n", + "[Parallel(n_jobs=32)]: Done 136 tasks | elapsed: 0.0s\n", + 
"[Parallel(n_jobs=32)]: Done 386 tasks | elapsed: 0.0s\n", + "[Parallel(n_jobs=32)]: Done 500 out of 500 | elapsed: 0.0s finished\n", + "[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.\n", + "[Parallel(n_jobs=32)]: Done 136 tasks | elapsed: 0.0s\n", + "[Parallel(n_jobs=32)]: Done 386 tasks | elapsed: 0.0s\n", + "[Parallel(n_jobs=32)]: Done 500 out of 500 | elapsed: 0.0s finished\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SIMBA COMPLETE: SHAP calculations complete (elapsed time: 18.611s) \tcomplete\n", + "SIMBA WARNING: ShapWarning: SHAP visualizations/aggregate stats skipped (only viable for projects with two animals and default 7 or 8 body-parts per animal) ... \twarning\n" + ] + } + ], + "source": [ + "# We define a SHAP computer intance using the data created and defined in the prior two cells.\n", + "TrainModelMixin().create_shap_log_mp(ini_file_path=CONFIG_PATH,\n", + " rf_clf=clf,\n", + " x_df=data,\n", + " y_df=target_df,\n", + " x_names=data.columns,\n", + " clf_name=CLASSIFIER_NAME,\n", + " cnt_present=COUNT_PRESENT,\n", + " cnt_absent=COUNT_ABSENT,\n", + " save_path=config_object.logs_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11e58a5b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "simba", + "language": "python", + "name": "simba" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/nb/Untitled1.ipynb b/docs/nb/Untitled1.ipynb new file mode 100644 index 000000000..161efa4c1 --- /dev/null +++ b/docs/nb/Untitled1.ipynb @@ -0,0 +1,35 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": 
"486d83d7", + "metadata": {}, + "outputs": [], + "source": [ + "shap_example_2" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "simba", + "language": "python", + "name": "simba" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/nb/shap_example_1.ipynb b/docs/nb/shap_example_1.ipynb index 5aafd1710..2f73d9cdd 100644 --- a/docs/nb/shap_example_1.ipynb +++ b/docs/nb/shap_example_1.ipynb @@ -5,7 +5,7 @@ "id": "0a14bf66", "metadata": {}, "source": [ - "# Shapley calculations: Example I" + "# Shapley calculations: Example I (single core)" ] }, { @@ -18,19 +18,10 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "id": "51050795", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/simon/opt/anaconda3/envs/simba_dev/lib/python3.6/site-packages/sklearn/utils/deprecation.py:144: FutureWarning: The sklearn.metrics.classification module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.metrics. 
Anything that cannot be imported from sklearn.metrics is now part of the private API.\n", - " warnings.warn(message, FutureWarning)\n" - ] - } - ], + "outputs": [], "source": [ "from simba.mixins.train_model_mixin import TrainModelMixin\n", "from simba.mixins.config_reader import ConfigReader\n", @@ -40,22 +31,22 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "id": "2e0d959e", "metadata": {}, "outputs": [], "source": [ "# DEFINITIONS\n", - "CONFIG_PATH = '/Users/simon/Desktop/envs/troubleshooting/Nastacia_unsupervised/project_folder/project_config.ini'\n", - "CLASSIFIER_PATH = '/Users/simon/Desktop/envs/troubleshooting/Nastacia_unsupervised/models/generated_models/Attack.sav'\n", - "CLASSIFIER_NAME = 'Attack'\n", + "CONFIG_PATH = r\"C:\\troubleshooting\\mitra\\project_folder\\project_config.ini\"\n", + "CLASSIFIER_PATH = r\"C:\\troubleshooting\\mitra\\models\\generated_models\\grooming.sav\"\n", + "CLASSIFIER_NAME = 'grooming'\n", "COUNT_PRESENT = 10\n", "COUNT_ABSENT = 10" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "id": "ebde38eb", "metadata": {}, "outputs": [], @@ -68,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "id": "fa5bdb37", "metadata": {}, "outputs": [ @@ -76,7 +67,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Dataset size: 27.391392MB / 0.027391GB\n" + "Dataset size: 155.852436MB / 0.155852GB\n" ] } ], @@ -87,7 +78,7 @@ "file_paths = glob.glob(config_object.targets_folder + '/*' + config_object.file_type)\n", "\n", "#Reads in the data held in all files in ``file_paths`` defined above\n", - "data = TrainModelMixin().read_all_files_in_folder_mp(file_paths=file_paths, file_type=config.get('General settings', 'workflow_file_type').strip()).reset_index(drop=True)\n", + "data, _ = TrainModelMixin().read_all_files_in_folder_mp(file_paths=file_paths, file_type=config.get('General settings', 'workflow_file_type').strip())\n", "\n", "#We find all 
behavior annotations that are NOT the targets. I.e., if SHAP values for Attack is going to be calculated, bit we need to find which other annotations exist in the data e.g., Escape and Defensive.\n", "non_target_annotations = TrainModelMixin().read_in_all_model_names_to_remove(config=config, model_cnt=config_object.clf_cnt, clf_name=CLASSIFIER_NAME)\n", @@ -374,9 +365,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python [conda env:simba_dev] *", + "display_name": "simba_310", "language": "python", - "name": "conda-env-simba_dev-py" + "name": "simba_310" }, "language_info": { "codemirror_mode": { @@ -388,7 +379,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.13" + "version": "3.10.15" } }, "nbformat": 4, diff --git a/docs/nb/shap_example_2.ipynb b/docs/nb/shap_example_2.ipynb new file mode 100644 index 000000000..f7eeed327 --- /dev/null +++ b/docs/nb/shap_example_2.ipynb @@ -0,0 +1,236 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "842dae4e", + "metadata": {}, + "source": [ + "# Shapley calculations: Example II (multiple cores)" + ] + }, + { + "cell_type": "markdown", + "id": "62ae8909", + "metadata": {}, + "source": [ + "In this example, we have previously created a classifier. We have the data used to create this classifier, and now we want to compute SHAP explainability scores for this classifier using multiple CPU cores (to speed things up a bit). Time should scale linearly with the number of cores available. Because the model has to be pushed to each core, it's advisable to use as slim of a model as possible. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "16951076", + "metadata": {}, + "outputs": [], + "source": [ + "from simba.mixins.train_model_mixin import TrainModelMixin\n", + "from simba.mixins.config_reader import ConfigReader\n", + "from simba.utils.read_write import read_df, read_config_file\n", + "import glob" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3bd92c3d", + "metadata": {}, + "outputs": [], + "source": [ + "# DEFINITIONS\n", + "CONFIG_PATH = r\"C:\\troubleshooting\\mitra\\project_folder\\project_config.ini\"\n", + "CLASSIFIER_PATH = r\"C:\\troubleshooting\\mitra\\models\\generated_models\\grooming.sav\"\n", + "CLASSIFIER_NAME = 'grooming'\n", + "COUNT_PRESENT = 250\n", + "COUNT_ABSENT = 250" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c206a9a2", + "metadata": {}, + "outputs": [], + "source": [ + "# READ IN THE CONFIG AND THE CLASSIFIER\n", + "config = read_config_file(config_path=CONFIG_PATH)\n", + "config_object = ConfigReader(config_path=CONFIG_PATH)\n", + "clf = read_df(file_path=CLASSIFIER_PATH, file_type='pickle')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b2d453a8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset size: 544.10988MB / 0.54411GB\n" + ] + } + ], + "source": [ + "# READ IN THE DATA \n", + "\n", + "#Read in the path to all files inside the project_folder/csv/targets_inserted directory\n", + "file_paths = glob.glob(config_object.targets_folder + '/*' + config_object.file_type)\n", + "\n", + "#Reads in the data held in all files in ``file_paths`` defined above\n", + "data, _ = TrainModelMixin().read_all_files_in_folder_mp(file_paths=file_paths, file_type=config.get('General settings', 'workflow_file_type').strip())\n", + "\n", + "#We find all behavior annotations that are NOT the targets. 
I.e., if SHAP values for Attack is going to be calculated, bit we need to find which other annotations exist in the data e.g., Escape and Defensive.\n", + "non_target_annotations = TrainModelMixin().read_in_all_model_names_to_remove(config=config, model_cnt=config_object.clf_cnt, clf_name=CLASSIFIER_NAME)\n", + "\n", + "# We remove the body-part coordinate columns and the annotations which are not the target from the data \n", + "data = data.drop(non_target_annotations + config_object.bp_headers, axis=1)\n", + "\n", + "# We place the target data in its own variable\n", + "target_df = data.pop(CLASSIFIER_NAME)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "edbe5dfd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing 500 SHAP values (MULTI-CORE BATCH SIZE: 100, FOLLOW PROGRESS IN OS TERMINAL)...\n", + "Concatenating multi-processed SHAP data (batch 1/5)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.\n", + "[Parallel(n_jobs=32)]: Done 136 tasks | elapsed: 0.0s\n", + "[Parallel(n_jobs=32)]: Done 386 tasks | elapsed: 0.0s\n", + "[Parallel(n_jobs=32)]: Done 500 out of 500 | elapsed: 0.0s finished\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Concatenating multi-processed SHAP data (batch 2/5)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.\n", + "[Parallel(n_jobs=32)]: Done 136 tasks | elapsed: 0.0s\n", + "[Parallel(n_jobs=32)]: Done 386 tasks | elapsed: 0.0s\n", + "[Parallel(n_jobs=32)]: Done 500 out of 500 | elapsed: 0.0s finished\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Concatenating multi-processed SHAP data (batch 3/5)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.\n", + "[Parallel(n_jobs=32)]: Done 136 tasks | elapsed: 0.0s\n", + "[Parallel(n_jobs=32)]: Done 386 tasks | elapsed: 0.0s\n", + "[Parallel(n_jobs=32)]: Done 500 out of 500 | elapsed: 0.0s finished\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Concatenating multi-processed SHAP data (batch 4/5)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.\n", + "[Parallel(n_jobs=32)]: Done 136 tasks | elapsed: 0.0s\n", + "[Parallel(n_jobs=32)]: Done 386 tasks | elapsed: 0.0s\n", + "[Parallel(n_jobs=32)]: Done 500 out of 500 | elapsed: 0.0s finished\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Concatenating multi-processed SHAP data (batch 5/5)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.\n", + "[Parallel(n_jobs=32)]: Done 136 tasks | elapsed: 0.0s\n", + "[Parallel(n_jobs=32)]: Done 386 tasks | elapsed: 0.0s\n", + "[Parallel(n_jobs=32)]: Done 500 out of 500 | elapsed: 0.0s finished\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SIMBA COMPLETE: SHAP calculations complete (elapsed time: 231.2415s) \tcomplete\n", + "SIMBA WARNING: ShapWarning: SHAP visualizations/aggregate stats skipped (only viable for projects with two animals and default 7 or 8 body-parts per animal) ... 
\twarning\n" + ] + } + ], + "source": [ + "TrainModelMixin().create_shap_log_mp(ini_file_path=CONFIG_PATH,\n", + "                                     rf_clf=clf,\n", + "                                     x_df=data,\n", + "                                     y_df=target_df,\n", + "                                     x_names=data.columns,\n", + "                                     clf_name=CLASSIFIER_NAME,\n", + "                                     cnt_present=COUNT_PRESENT,\n", + "                                     cnt_absent=COUNT_ABSENT,\n", + "                                     save_path=config_object.logs_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed794ac4", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "simba", + "language": "python", + "name": "simba" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/nb/shap_log_3.ipynb b/docs/nb/shap_log_3.ipynb new file mode 100644 index 000000000..32cb8e9e1 --- /dev/null +++ b/docs/nb/shap_log_3.ipynb @@ -0,0 +1,158 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "47e027ba-17ec-4b1a-93a8-927dfd62e4f9", + "metadata": {}, + "source": [ + "# Shapley calculations: Example III (GPU)" + ] + }, + { + "cell_type": "markdown", + "id": "4b0bd65d-632c-48db-b1fb-fb67283cb0c9", + "metadata": {}, + "source": [ + ">NOTE I: The SHAP library has to be built from git rather than pip: ``pip install git+https://github.com/slundberg/shap.git``\n", + "\n", + ">NOTE II: The scikit model can not be built using max_depth > 31 for it to work with this code. You can set this in the SimBA config under [create ensemble settings][rf_max_depth].\n", + "\n", + "In this example, we have previously created a classifier. 
We have the data used to create this classifier, and now we want to compute SHAP explainability scores\n", + "for this classifier using GPU (to speed things up MASSIVELY).\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "d37811de-7208-4753-af63-fab9e986ecb3", + "metadata": {}, + "outputs": [], + "source": [ + "from simba.data_processors.cuda.create_shap_log import create_shap_log\n", + "from simba.mixins.train_model_mixin import TrainModelMixin\n", + "from simba.mixins.config_reader import ConfigReader\n", + "from simba.utils.read_write import read_df, read_config_file\n", + "import glob" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8ee1e26e-fb8b-40a7-81a8-be5bfb80662d", + "metadata": {}, + "outputs": [], + "source": [ + "# DEFINITIONS\n", + "CONFIG_PATH = r\"/mnt/c/troubleshooting/mitra/project_folder/project_config.ini\"\n", + "CLASSIFIER_PATH = r\"/mnt/c/troubleshooting/mitra/models/generated_models/grooming.sav\"\n", + "CLASSIFIER_NAME = 'grooming'\n", + "SAVE_DIR = r'/mnt/c/troubleshooting/mitra/models/generated_models'\n", + "COUNT_PRESENT = 50\n", + "COUNT_ABSENT = 50" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9d5920c2-f7d3-4835-88b3-edc9459c3090", + "metadata": {}, + "outputs": [], + "source": [ + "# READ IN THE CONFIG AND THE CLASSIFIER\n", + "config = read_config_file(config_path=CONFIG_PATH)\n", + "config_object = ConfigReader(config_path=CONFIG_PATH, create_logger=False)\n", + "clf = read_df(file_path=CLASSIFIER_PATH, file_type='pickle')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e9ad6e27-a1c9-4a47-84cc-e6c7bef64517", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading complete 842_MA42_gq_Saline_0621 (elapsed time: 1.6263s)...\n", + "Reading complete 842_MA42_gq_CNO_0624 (elapsed time: 1.6594s)...\n", + "Reading complete 501_MA142_Gi_CNO_0514 (elapsed time: 1.6711s)...\n", + "Reading complete 
592_MA147_Gq_CNO_0517 (elapsed time: 1.6725s)...\n", + "Dataset size: 311.05188MB / 0.311052GB\n" + ] + } + ], + "source": [ + "# READ IN THE DATA\n", + "\n", + "#Read in the path to all files inside the project_folder/csv/targets_inserted directory\n", + "file_paths = glob.glob(config_object.targets_folder + '/*' + config_object.file_type)\n", + "#Reads in the data held in all files in ``file_paths`` defined above\n", + "data, _ = TrainModelMixin().read_all_files_in_folder_mp(file_paths=file_paths, file_type=config.get('General settings', 'workflow_file_type').strip())\n", + "#We find all behavior annotations that are NOT the targets. I.e., if SHAP values for Attack is going to be calculated, bit we need to find which other annotations exist in the data e.g., Escape and Defensive.\n", + "non_target_annotations = TrainModelMixin().read_in_all_model_names_to_remove(config=config, model_cnt=config_object.clf_cnt, clf_name=CLASSIFIER_NAME)\n", + "# We remove the body-part coordinate columns and the annotations which are not the target from the data\n", + "data = data.drop(non_target_annotations + config_object.bp_headers, axis=1)\n", + "# We place the target data in its own variable\n", + "target_df = data.pop(CLASSIFIER_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c66b8ea0-4699-41f8-b825-7f672b7387d2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing SHAP values (GPU)...\n" + ] + } + ], + "source": [ + "#TO RETURN THE DATA\n", + "\n", + "shap_values, raw_values, expected_value = create_shap_log(rf_clf=clf,\n", + " x=data,\n", + " y=target_df,\n", + " cnt_present=COUNT_PRESENT,\n", + " cnt_absent=COUNT_ABSENT,\n", + " x_names=list(data.columns),\n", + " clf_name='grooming',\n", + " save_dir=None,\n", + " verbose=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73f0edc7-25a9-4009-9d33-eb53e39ae949", + "metadata": {}, + "outputs": [], + "source": 
[] + } + ], + "metadata": { + "kernelspec": { + "display_name": "simba_310", + "language": "python", + "name": "simba_310" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/notebooks.rst b/docs/notebooks.rst index 34a398894..a1142af13 100644 --- a/docs/notebooks.rst +++ b/docs/notebooks.rst @@ -8,6 +8,8 @@ General processing nb/CLI Example 1 nb/shap_example_1 + nb/shap_example_2 + nb/shap_log_3 nb/outlier_correction nb/third_party_append nb/advanced_smoothing_interpolation diff --git a/misc/Detailed_conditional_aggregate_statistics_20241011123409.csv b/misc/Detailed_conditional_aggregate_statistics_20241011123409.csv new file mode 100644 index 000000000..ecb972d60 --- /dev/null +++ b/misc/Detailed_conditional_aggregate_statistics_20241011123409.csv @@ -0,0 +1,3 @@ +VIDEO,Polygon_2 Animal_1 nose in zone,Polygon_2 Animal_1 facing,START FRAME,END FRAME,START TIME,END TIME,BOUT TIME +F1 HAB,True,False,0,2,0.0,0.1,0.1 +F1 HAB,True,False,10,13,0.3333333333333333,0.4666666666666667,0.13333333333333333 diff --git a/requirements.txt b/requirements.txt index 3c4745e44..a75a95829 100644 --- a/requirements.txt +++ b/requirements.txt @@ -56,7 +56,8 @@ statsmodels==0.12.0;python_version=="3.6" statsmodels==0.14.2;python_version>="3.9" cefpython3 == 66.0 pyarrow == 6.0.1 -shap == 0.35.0 +shap == 0.35.0;python_version=="3.6" +shap == 0.42.0;python_version>="3.9" tables==3.6.1;python_version=="3.6" tables==3.9.2;python_version>="3.9" openpyxl==3.1.2;python_version=="3.6" diff --git a/setup.py b/setup.py index d512ed209..1f87fb088 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ # Setup configuration setuptools.setup( name="Simba-UW-tf-dev", - version="2.2.2", + version="2.2.4", author="Simon Nilsson, Jia 
Jie Choong, Sophia Hwang", author_email="sronilsson@gmail.com", description="Toolkit for computer classification and analysis of behaviors in experimental animals", @@ -39,7 +39,10 @@ "tests.*", "tests", "__pycache__", - "pose_configurations_archive"]), + "pose_configurations_archive", + "sandbox", + "sandbox.*", + ]), include_package_data=True, classifiers=( "Programming Language :: Python :: 3", diff --git a/simba/SimBA.py b/simba/SimBA.py index e262f5545..6be73878a 100644 --- a/simba/SimBA.py +++ b/simba/SimBA.py @@ -942,6 +942,7 @@ def __init__(self): sys.stdout = StdRedirector(self.txt) if OS.PYTHON_VER.value != "3.6": + self.txt['width'], self.txt['height'] = 200, 38 PythonVersionWarning(msg=f"SimBA is not extensively tested beyond python 3.6. You are using python {OS.PYTHON_VER.value}. If you encounter errors in python>3.6, please report them on GitHub or Gitter (links in the help toolbar) and we will work together to fix the issues!", source=self.__class__.__name__) if not check_ffmpeg_available(): diff --git a/simba/bounding_box_tools/yolo/geometries_to_annotations.py b/simba/bounding_box_tools/yolo/geometries_to_annotations.py index 38e572224..9d8a792f4 100644 --- a/simba/bounding_box_tools/yolo/geometries_to_annotations.py +++ b/simba/bounding_box_tools/yolo/geometries_to_annotations.py @@ -10,10 +10,9 @@ from skimage.draw import polygon from simba.mixins.geometry_mixin import GeometryMixin -from simba.utils.checks import check_instance, check_int, check_valid_array +from simba.utils.checks import check_instance, check_int, check_valid_array, check_if_dir_exists from simba.utils.enums import Formats -from simba.utils.read_write import (get_video_meta_data, read_df, - read_frm_of_video) +from simba.utils.read_write import (get_video_meta_data, read_df, read_frm_of_video, find_files_of_filetypes_in_directory) def geometry_to_rle(geometry: Union[np.ndarray, Polygon], img_size: Tuple[int, int]): @@ -28,7 +27,7 @@ def geometry_to_rle(geometry: 
Union[np.ndarray, Polygon], img_size: Tuple[int, i if isinstance(geometry, (Polygon,)): geometry = geometry.exterior.coords else: - check_valid_array(data=geometry, source=geometry_to_rle.__name__, accepted_ndims=(2,), accepted_dtypes=Formats.NUMERIC_DTYPES.value) + check_valid_array(data=geometry, source=geometry_to_rle.__name__, accepted_ndims=[(2,)], accepted_dtypes=Formats.NUMERIC_DTYPES.value) binary_mask = np.zeros(img_size, dtype=np.uint8) rr, cc = polygon(geometry[:, 0].flatten(), geometry[:, 1].flatten(), img_size) binary_mask[rr, cc] = 1 @@ -161,23 +160,33 @@ def geometries_to_yolo(geometries: Dict[Union[str, int], np.ndarray], for k, v in results.items(): name = k.split(sep='.', maxsplit=2)[0] file_name = os.path.join(save_labels_dir, f'{name}.txt') - with open(file_name, mode='wt', encoding='utf-8') as myfile: - myfile.write('\n'.join(v)) + with open(file_name, mode='wt', encoding='utf-8') as f: + f.write('\n'.join(v)) - - -#def geometries_to_yolo_obb(geometries: Dict[Union[str, int], np.ndarray]): +# def labelme_to_dlc(labelme_dir: Union[str, os.PathLike]): +# check_if_dir_exists(in_dir=labelme_dir, source=labelme_to_dlc.__name__) +# labelme_files = find_files_of_filetypes_in_directory(directory=labelme_dir, extensions=['.json'], raise_error=True) +# +# for file_cnt, file_path in enumerate(labelme_files[0:2]): +# with open(file_path, mode='r', encoding='utf-8') as f: +# labelme_file = json.load(f) +# print(labelme_file) +# +# #def geometries_to_yolo_obb(geometries: Dict[Union[str, int], np.ndarray]): # # # -data_path = r"C:\troubleshooting\mitra\project_folder\csv\outlier_corrected_movement_location\FL_gq_CNO_0625.csv" -animal_data = read_df(file_path=data_path, file_type='csv', usecols=['Nose_x', 'Nose_y', 'Tail_base_x', 'Tail_base_y', 'Left_side_x', 'Left_side_y', 'Right_side_x', 'Right_side_y']).values.reshape(-1, 4, 2).astype(np.int32) -animal_polygons = GeometryMixin().bodyparts_to_polygon(data=animal_data) -poygons = 
GeometryMixin().multiframe_minimum_rotated_rectangle(shapes=animal_polygons) -animal_polygons = GeometryMixin().geometries_to_exterior_keypoints(geometries=poygons) -# animal_polygons = GeometryMixin.keypoints_to_axis_aligned_bounding_box(keypoints=animal_polygons) -animal_polygons = {0: animal_polygons} -geometries_to_yolo(geometries=animal_polygons, video_path=r'C:\troubleshooting\mitra\project_folder\videos\FL_gq_CNO_0625.mp4', save_dir=r"C:\troubleshooting\coco_data", sample=500, obb=True) +# # +# # +# # +# data_path = r"C:\troubleshooting\mitra\project_folder\csv\outlier_corrected_movement_location\FL_gq_CNO_0625.csv" +# animal_data = read_df(file_path=data_path, file_type='csv', usecols=['Nose_x', 'Nose_y', 'Tail_base_x', 'Tail_base_y', 'Left_side_x', 'Left_side_y', 'Right_side_x', 'Right_side_y']).values.reshape(-1, 4, 2).astype(np.int32) +# animal_polygons = GeometryMixin().bodyparts_to_polygon(data=animal_data) +# poygons = GeometryMixin().multiframe_minimum_rotated_rectangle(shapes=animal_polygons) +# animal_polygons = GeometryMixin().geometries_to_exterior_keypoints(geometries=poygons) +# # animal_polygons = GeometryMixin.keypoints_to_axis_aligned_bounding_box(keypoints=animal_polygons) +# animal_polygons = {0: animal_polygons} +# geometries_to_yolo(geometries=animal_polygons, video_path=r'C:\troubleshooting\mitra\project_folder\videos\FL_gq_CNO_0625.mp4', save_dir=r"C:\troubleshooting\coco_data", sample=500, obb=True) diff --git a/simba/data_processors/boolean_conditional_calculator.py b/simba/data_processors/boolean_conditional_calculator.py index 064e35bfe..01c8b52b1 100644 --- a/simba/data_processors/boolean_conditional_calculator.py +++ b/simba/data_processors/boolean_conditional_calculator.py @@ -2,17 +2,15 @@ from copy import deepcopy from typing import Dict, Optional, Union +import numpy as np import pandas as pd from simba.mixins.config_reader import ConfigReader -from simba.utils.checks import ( - 
check_all_file_names_are_represented_in_video_log, - check_if_df_field_is_boolean, check_if_filepath_list_is_empty, - check_instance) -from simba.utils.errors import InvalidInputError, MissingColumnsError +from simba.utils.checks import (check_all_file_names_are_represented_in_video_log, check_if_df_field_is_boolean, check_instance) +from simba.utils.errors import MissingColumnsError from simba.utils.printing import stdout_success -from simba.utils.read_write import (get_fn_ext, read_data_paths, read_df, - read_video_info, str_2_bool) +from simba.utils.read_write import (get_fn_ext, read_data_paths, read_df, read_video_info, str_2_bool) +from simba.utils.data import detect_bouts class BooleanConditionalCalculator(ConfigReader): @@ -22,15 +20,16 @@ class BooleanConditionalCalculator(ConfigReader): For example, computedescriptive statistics for when Animal 1 is inside the shape Rectangle_1 while at the same time directing towards shape Polygon_1, while at the same time Animal 2 is outside shape Rectangle_1 and directing towards Polygon_1. - :parameter str config_path: path to SimBA project config file in Configparser format. - :parameter Dict[str, bool] rules: Rules with field names as keys and bools as values. + :param Union[str, os.PathLike] config_path: path to SimBA project config file in Configparser format. + :param Dict[str, Union[bool, str]] rules: Rules with field names as keys and bools (or string representations of bools) as values. + :param Optional[Union[str, os.PathLike, None]] data_paths: Optional data paths to be processsed. If None, all CSVs inside the `projecet_folder/csv/outlier_corrected_movement_location` are analysed. .. note: - `Example expected output table `__. + `Example expected aggregate output table `__. + `Example expected detailed output table `__. 
- Examples - ----- - >>> rules = {'Rectangle_1 Simon in zone': 'TRUE', 'Polygon_1 JJ in zone': 'TRUE'} + :examples: + >>> rules = {'Rectangle_1 Simon in zone': 'TRUE', 'Polygon_1 JJ in zone': 'TRUE'} # OR {'Rectangle_1 Simon in zone': True, 'Polygon_1 JJ in zone': True} >>> conditional_bool_rule_calculator = BooleanConditionalCalculator(rules=rules, config_path='/Users/simon/Desktop/envs/troubleshooting/two_animals_16bp_032023/project_folder/project_config.ini') >>> conditional_bool_rule_calculator.run() >>> conditional_bool_rule_calculator.save() @@ -38,15 +37,18 @@ class BooleanConditionalCalculator(ConfigReader): def __init__(self, config_path: Union[str, os.PathLike], - rules: Dict[str, bool], + rules: Dict[str, Union[bool, str]], data_paths: Optional[Union[str, os.PathLike, None]] = None): ConfigReader.__init__(self, config_path=config_path) check_instance(source=self.__class__.__name__, instance=rules, accepted_types=(dict,)) self.save_path = os.path.join(self.logs_path, f"Conditional_aggregate_statistics_{self.datetime}.csv") + self.detailed_save_path = os.path.join(self.logs_path, f"Detailed_conditional_aggregate_statistics_{self.datetime}.csv") self.data_paths = read_data_paths(path=data_paths, default=self.outlier_corrected_paths, default_name=self.feature_file_paths, file_type=self.file_type) self.rules = rules self.output_df = pd.DataFrame(columns=["VIDEO"] + list(self.rules.keys()) + ["TIME (s)", "FRAMES (count)"]) + self.bout_df_cols = ["VIDEO"] + list(self.rules.keys()) + ["START FRAME", "END FRAME", "START TIME", "END TIME" ,"BOUT TIME"] + self.bout_dfs = [] def _check_integrity_of_rule_columns(self): for behavior in self.rules.keys(): @@ -72,25 +74,32 @@ def run(self): self.sliced_df = self.sliced_df[self.sliced_df[k] == 0] values_str.append(v) time_s = round(len(self.sliced_df) / self.fps, 4) - self.output_df.loc[len(self.output_df)] = ( - [self.video_name] - + list(self.rules.values()) - + [time_s] - + [len(self.sliced_df)] - ) + if 
len(self.sliced_df) > 0: + bout_df = pd.DataFrame(data=np.zeros((len(self.df))), columns=['behavior']) + bout_df.iloc[self.sliced_df.index] = 1 + bout_df = detect_bouts(data_df=bout_df, target_lst=['behavior'], fps=self.fps) + bout_df[list(self.rules.keys())] = list(self.rules.values()) + bout_df['VIDEO'] = self.video_name + bout_df = bout_df.rename(columns={'Start_time': 'START TIME', 'End Time': 'END TIME', 'Start_frame': 'START FRAME', 'End_frame': 'END FRAME', 'Bout_time': 'BOUT TIME'}) + self.bout_dfs.append(bout_df[self.bout_df_cols]) + self.output_df.loc[len(self.output_df)] = ([self.video_name] + list(self.rules.values()) + [time_s] + [len(self.sliced_df)]) + def save(self): self.output_df.to_csv(self.save_path, index=False) self.timer.stop_timer() - stdout_success( - msg=f"Boolean conditional data saved at at {self.save_path}!", - elapsed_time=self.timer.elapsed_time_str, - source=self.__class__.__name__, - ) - -# rules = {'Right Animal_1 Front Paw R in zone': 'TRUE', 'Left Animal_1 Hind Paw R in zone': 'TRUE'} -# runner = BooleanConditionalCalculator(rules=rules, config_path='/Users/simon/Desktop/envs/simba/troubleshooting/open_field_below/project_folder/project_config.ini') -# -# + stdout_success(msg=f"Boolean conditional data saved at {self.save_path}!", elapsed_time=self.timer.elapsed_time_str, source=self.__class__.__name__) + if len(self.bout_dfs) > 0: + self.bout_dfs = pd.concat(self.bout_dfs, axis=0).reset_index(drop=True) + self.bout_dfs.to_csv(self.detailed_save_path, index=False) + stdout_success(msg=f"Detailed boolean conditional data saved at {self.detailed_save_path}!", elapsed_time=self.timer.elapsed_time_str, source=self.__class__.__name__) + + +# rules = {'Polygon_2 Animal_1 nose in zone': True, 'Polygon_2 Animal_1 facing': False} +# runner = BooleanConditionalCalculator(rules=rules, config_path=r"C:\troubleshooting\spontenous_alternation\project_folder\project_config.ini") # runner.run() # runner.save() +# # +# + + diff --git 
a/simba/data_processors/cuda/create_shap_log.py b/simba/data_processors/cuda/create_shap_log.py new file mode 100644 index 000000000..eed22b939 --- /dev/null +++ b/simba/data_processors/cuda/create_shap_log.py @@ -0,0 +1,157 @@ +__author__ = "Simon Nilsson" +__email__ = "sronilsson@gmail.com" + +import os +from typing import List, Optional, Tuple, Union + +import numpy as np +import pandas as pd +import shap +from sklearn.ensemble import RandomForestClassifier + +from simba.mixins.train_model_mixin import TrainModelMixin +from simba.utils.checks import (check_if_dir_exists, check_instance, check_int, + check_nvidea_gpu_available, check_str, + check_valid_array, check_valid_dataframe, + check_valid_lst) +from simba.utils.enums import Formats +from simba.utils.errors import FFMPEGCodecGPUError +from simba.utils.printing import SimbaTimer, stdout_success +from simba.utils.read_write import read_df, write_df +from simba.utils.warnings import NotEnoughDataWarning + + +def create_shap_log(rf_clf: Union[str, os.PathLike, RandomForestClassifier], + x: Union[pd.DataFrame, np.ndarray], + y: Union[pd.DataFrame, pd.Series, np.ndarray], + cnt_present: int, + cnt_absent: int, + x_names: Optional[List[str]] = None, + clf_name: Optional[str] = None, + save_dir: Optional[Union[str, os.PathLike]] = None, + verbose: Optional[bool] = True) -> Union[None, Tuple[pd.DataFrame, pd.DataFrame, int]]: + """ + Computes SHAP (SHapley Additive exPlanations) values using a GPU for a RandomForestClassifier, + based on specified counts of positive and negative samples, and optionally saves the results. + + .. image:: _static/img/create_shap_log_cuda.png + :width: 500 + :align: center + + :param Union[str, os.PathLike, RandomForestClassifier] rf_clf: Trained RandomForestClassifier model or path to the saved model. Can be a string, os.PathLike object, or an instance of RandomForestClassifier. + :param Union[pd.DataFrame, np.ndarray] x: Input features used for SHAP value computation. 
Can be a pandas DataFrame or numpy ndarray. + :param Union[pd.DataFrame, pd.Series, np.ndarray] y: Target labels corresponding to the input features. Can be a pandas DataFrame, pandas Series, or numpy ndarray with 0 and 1 values. + :param int cnt_present: Number of positive samples (label=1) to include in the SHAP value computation. + :param int cnt_absent: Number of negative samples (label=0) to include in the SHAP value computation. + :param Optional[List[str]] x_names: Optional list of feature names corresponding to the columns in `x`. If `x` is a DataFrame, this is extracted automatically. + :param Optional[str] clf_name: Optional name for the classifier, used in naming output files. If not provided, it is extracted from the `y` labels if possible. + :param Optional[Union[str, os.PathLike]] save_dir: Optional directory path where the SHAP values and corresponding raw features are saved as CSV files. + :param Optional[bool] verbose: Optional boolean flag indicating whether to print progress messages. Defaults to True. + :return Union[None, Tuple[pd.DataFrame, pd.DataFrame, int]]: If `save_dir` is None, returns a tuple containing: + - V: DataFrame with SHAP values, expected value, sum of SHAP values, prediction probability, and target labels. + - R: DataFrame containing the raw feature values for the selected samples. + - expected_value: The expected value from the SHAP explainer. + If `save_dir` is provided, the function returns None and saves the output to CSV files in the specified directory. 
+ + :example: + >>> x = np.random.random((1000, 501)).astype(np.float32) + >>> y = np.random.randint(0, 2, size=(len(x), 1)).astype(np.int32) + >>> clf_names = [str(x) for x in range(501)] + >>> results = create_shap_log(rf_clf=MODEL_PATH, x=x, y=y, cnt_present=int(i/2), cnt_absent=int(i/2), clf_name='TEST', x_names=clf_names, verbose=False) + """ + + timer = SimbaTimer(start=True) + if verbose: + print('Computing SHAP values (GPU)...') + if not check_nvidea_gpu_available(): + raise FFMPEGCodecGPUError(msg="No GPU found (as evaluated by nvidea-smi returning None)", + source=create_shap_log.__name__) + check_instance(source=f'{create_shap_log.__name__} rf_clf', instance=rf_clf, + accepted_types=(str, RandomForestClassifier)) + if isinstance(rf_clf, (str, os.PathLike)): + rf_clf = TrainModelMixin().read_pickle(file_path=rf_clf) + check_instance(source=f'{create_shap_log.__name__} x', instance=x, accepted_types=(pd.DataFrame, np.ndarray)) + if isinstance(x, np.ndarray): + check_valid_lst(data=x_names, source=f'{create_shap_log.__name__} x_names', valid_dtypes=(str,), + exact_len=x.shape[1]) + check_valid_array(data=x, source=f'{create_shap_log.__name__} x', accepted_ndims=[2, ], + accepted_dtypes=Formats.NUMERIC_DTYPES.value) + else: + check_valid_dataframe(df=x, source=f'{create_shap_log.__name__} x', + valid_dtypes=Formats.NUMERIC_DTYPES.value) + x_names = list(x.columns) + x = x.values + check_instance(source=f'{create_shap_log.__name__} y', instance=y, + accepted_types=(pd.DataFrame, np.ndarray, pd.Series)) + if isinstance(y, np.ndarray): + check_str(name=f'{create_shap_log.__name__} clf_name', value=clf_name) + y = y.flatten() + elif isinstance(y, pd.Series): + clf_name = y.name + y = y.values.flatten() + else: + check_valid_dataframe(df=y, source=f'{create_shap_log.__name__} y', + valid_dtypes=Formats.NUMERIC_DTYPES.value, max_axis_1=1) + clf_name = list(y.columns)[0] + y = y.values.flatten() + save_shap_path, save_raw_path = None, None + if save_dir is not 
None: + check_if_dir_exists(in_dir=save_dir) + save_shap_path = os.path.join(save_dir, f"SHAP_values_{clf_name}.csv") + save_raw_path = os.path.join(save_dir, f"RAW_SHAP_feature_values_{clf_name}.csv") + check_valid_array(data=y, source=f'{create_shap_log.__name__} y', accepted_values=[0, 1]) + check_int(name=f'{create_shap_log.__name__} cnt_present', value=cnt_present, min_value=1) + check_int(name=f'{create_shap_log.__name__} cnt_absent', value=cnt_absent, min_value=1) + target_cnt = np.sum(y) + absent_cnt = y.shape[0] - target_cnt + + if cnt_present > target_cnt: + NotEnoughDataWarning( + msg=f"Data contains {target_cnt} behavior-present annotations. This is less the number of frames you specified to calculate shap values for ({cnt_present}). SimBA will calculate shap scores for the {target_cnt} behavior-present frames available", + source=create_shap_log.__name__) + cnt_present = target_cnt + if absent_cnt < cnt_absent: + NotEnoughDataWarning( + msg=f"Data contains {absent_cnt} behavior-absent annotations. This is less the number of frames you specified to calculate shap values for ({cnt_absent}). 
SimBA will calculate shap scores for the {absent_cnt} behavior-absent frames available", + source=create_shap_log.__name__) + cnt_absent = absent_cnt + + target_idx = np.argwhere(y == 1).flatten() + absent_idx = np.argwhere(y == 0).flatten() + target_idx = np.sort(np.random.choice(target_idx, cnt_present)) + absent_idx = np.sort(np.random.choice(absent_idx, cnt_absent)) + target_x = x[target_idx] + absent_x = x[absent_idx] + X = np.vstack([target_x, absent_x]).astype(np.float32) + Y = np.hstack([np.ones(target_x.shape[0]), np.zeros(absent_x.shape[0])]).astype(np.int32) + explainer = shap.explainers.GPUTree(model=rf_clf, data=None, model_output='raw', + feature_names='tree_path_dependent') + shap_values = explainer.shap_values(X, check_additivity=True) + V = pd.DataFrame(shap_values[1], columns=x_names).astype(np.float32) + sum = V.sum(axis=1) + expected_value = explainer.expected_value[1] + p = TrainModelMixin().clf_predict_proba(clf=rf_clf, x_df=X) + + V['EXPECTED_VALUE'] = expected_value.round(4) + V['SUM'] = sum + V['EXPECTED_VALUE'] + V['PREDICTION_PROBABILITY'] = p.round(4) + V['SUM'] = V['SUM'].round(4) + V[clf_name] = Y + x_idx = np.hstack([target_idx, absent_idx]) + R = pd.DataFrame(x[x_idx, :], columns=x_names) + timer.stop_timer() + if save_dir is None: + if verbose: + stdout_success(msg=f'Shap values compute complete (GPU) for {len(V)} observations.', elapsed_time=timer.elapsed_time_str) + return (V, R, expected_value) + else: + write_df(df=V, file_type='csv', save_path=save_shap_path) + write_df(df=R, file_type='csv', save_path=save_raw_path) + if verbose: + stdout_success(msg=f'Shap values compute complete (GPU) for {len(V)} observations, and saved in {save_dir}', elapsed_time=timer.elapsed_time_str) + + + + + + diff --git a/simba/mixins/__init__.py b/simba/mixins/__init__.py index 7db1888cf..d0284eebd 100644 --- a/simba/mixins/__init__.py +++ b/simba/mixins/__init__.py @@ -1,7 +1,8 @@ -try: - from cuml.ensemble import RandomForestClassifier as cuRF 
-except ImportError: - cuRF = None +# try: +# from cuml.ensemble import RandomForestClassifier as cuRF +# except ImportError: +# cuRF = None +cuRF = None __all__ = ['cuRF'] diff --git a/simba/mixins/circular_statistics.py b/simba/mixins/circular_statistics.py index fd12fdd42..96cf5487f 100644 --- a/simba/mixins/circular_statistics.py +++ b/simba/mixins/circular_statistics.py @@ -486,7 +486,7 @@ def direction_two_bps(anterior_loc: np.ndarray, posterior_loc: np.ndarray) -> np @njit("(float32[:],)") def rayleigh(data: np.ndarray) -> Tuple[float, float]: - """ + r""" Jitted compute of Rayleigh Z (test of non-uniformity) of single sample of circular data in degrees. .. note:: @@ -1042,7 +1042,6 @@ def sliding_circular_range(data: np.ndarray, time_windows: np.ndarray, fps: int results = np.full((data.shape[0], time_windows.shape[0]), 0.0) for time_window_cnt in range(time_windows.shape[0]): win_size = int(time_windows[time_window_cnt] * fps) - print('s') for left, right in zip(range(0, (data.shape[0] - win_size) + 1), range(win_size-1, data.shape[0])): sample = np.sort(np.deg2rad(data[left : right + 1])) angular_diffs = np.diff(sample) diff --git a/simba/mixins/feature_extraction_mixin.py b/simba/mixins/feature_extraction_mixin.py index ac7780588..a55d8c9e9 100644 --- a/simba/mixins/feature_extraction_mixin.py +++ b/simba/mixins/feature_extraction_mixin.py @@ -6,6 +6,7 @@ import glob import math import os +import re from typing import List, Optional import numpy as np @@ -519,6 +520,8 @@ def check_directionality_viable(self): :return np.ndarray ear_left_coord: If viable, then 2D array with coordinates of the left ear in all frames. Else, empty array. :return np.ndarray ear_right_coord: If viable, then 2D array with coordinates of the right ear in all frames. Else, empty array. 
""" + DELIMITERS = "_ " + direction_viable = True nose_cords, ear_left_cords, ear_right_cords = [], [], [] @@ -526,17 +529,11 @@ def check_directionality_viable(self): for bp_cord in ["X_bps", "Y_bps"]: bp_list = self.animal_bp_dict[animal_name][bp_cord] for bp_name in bp_list: - bp_name_components = bp_name.split("_") - bp_name_components = [x.lower() for x in bp_name_components] - if "nose" in bp_name_components: + if "nose" in bp_name: nose_cords.append(bp_name) - elif ("ear" in bp_name_components) and ( - "left" in bp_name_components - ): + elif ("ear" in bp_name) and ("left" in bp_name): ear_left_cords.append(bp_name) - elif ("ear" in bp_name_components) and ( - "right" in bp_name_components - ): + elif ("ear" in bp_name) and ("right" in bp_name): ear_right_cords.append(bp_name) else: pass @@ -546,18 +543,9 @@ def check_directionality_viable(self): direction_viable = False if direction_viable: - nose_cords = [ - nose_cords[i * 2 : (i + 1) * 2] - for i in range((len(nose_cords) + 2 - 1) // 2) - ] - ear_left_cords = [ - ear_left_cords[i * 2 : (i + 1) * 2] - for i in range((len(ear_left_cords) + 2 - 1) // 2) - ] - ear_right_cords = [ - ear_right_cords[i * 2 : (i + 1) * 2] - for i in range((len(ear_right_cords) + 2 - 1) // 2) - ] + nose_cords = [nose_cords[i * 2 : (i + 1) * 2] for i in range((len(nose_cords) + 2 - 1) // 2)] + ear_left_cords = [ear_left_cords[i * 2 : (i + 1) * 2] for i in range((len(ear_left_cords) + 2 - 1) // 2)] + ear_right_cords = [ear_right_cords[i * 2 : (i + 1) * 2] for i in range((len(ear_right_cords) + 2 - 1) // 2)] return direction_viable, nose_cords, ear_left_cords, ear_right_cords diff --git a/simba/mixins/image_mixin.py b/simba/mixins/image_mixin.py index cdc89dccf..7d52ac0ef 100644 --- a/simba/mixins/image_mixin.py +++ b/simba/mixins/image_mixin.py @@ -1249,9 +1249,13 @@ def pad_img_stack(image_dict: Dict[int, np.ndarray], pad_value: Optional[int] = max_width = max(image.shape[1] for image in image_dict.values()) padded_images = {} for 
key, image in image_dict.items(): + check_if_valid_img(data=image, source=ImageMixin.pad_img_stack.__name__, raise_error=True) pad_height = max_height - image.shape[0] pad_width = max_width - image.shape[1] - padded_image = np.pad(image, ((0, pad_height), (0, pad_width), (0, 0)), mode="constant", constant_values=pad_value) + if image.ndim == 3: + padded_image = np.pad(image, ((0, pad_height), (0, pad_width), (0, 0)), mode="constant", constant_values=pad_value) + else: + padded_image = np.pad(image, ((0, pad_height), (0, pad_width)), mode="constant", constant_values=pad_value) padded_images[key] = padded_image return padded_images diff --git a/simba/mixins/train_model_mixin.py b/simba/mixins/train_model_mixin.py index f62e29a00..60ae511c9 100644 --- a/simba/mixins/train_model_mixin.py +++ b/simba/mixins/train_model_mixin.py @@ -32,8 +32,7 @@ from sklearn.inspection import partial_dependence, permutation_importance from sklearn.metrics import classification_report, precision_recall_curve from sklearn.model_selection import ShuffleSplit, learning_curve -from sklearn.preprocessing import (MinMaxScaler, QuantileTransformer, - StandardScaler) +from sklearn.preprocessing import (MinMaxScaler, QuantileTransformer, StandardScaler) from sklearn.tree import export_graphviz from sklearn.utils import parallel_backend from tabulate import tabulate @@ -58,14 +57,11 @@ from simba.plotting.shap_agg_stats_visualizer import \ ShapAggregateStatisticsVisualizer from simba.ui.tkinter_functions import TwoOptionQuestionPopUp -from simba.utils.checks import (check_file_exist_and_readable, check_float, - check_if_dir_exists, check_if_valid_input, - check_instance, check_int, check_str, - check_that_column_exist) +from simba.utils.checks import (check_file_exist_and_readable, check_float, check_if_dir_exists, check_if_valid_input, check_instance, check_int, check_str, check_that_column_exist, check_valid_dataframe, check_valid_lst) from simba.utils.data import (detect_bouts, 
detect_bouts_multiclass, get_library_version) from simba.utils.enums import (OS, ConfigKey, Defaults, Dtypes, Methods, - MLParamKeys, Options) + MLParamKeys, Options, Formats) from simba.utils.errors import (ClassifierInferenceError, ColumnNotFoundError, CorruptedFileError, DataHeaderError, FaultyTrainingSetError, @@ -91,13 +87,12 @@ class TrainModelMixin(object): def __init__(self): pass - def read_all_files_in_folder( - self, - file_paths: List[str], - file_type: str, - classifier_names: Optional[List[str]] = None, - raise_bool_clf_error: bool = True, - ) -> (pd.DataFrame, List[int]): + def read_all_files_in_folder(self, + file_paths: List[str], + file_type: str, + classifier_names: Optional[List[str]] = None, + raise_bool_clf_error: bool = True) -> (pd.DataFrame, List[int]): + """ Read in all data files in a folder to a single pd.DataFrame for downstream ML algo. Asserts that all classifiers have annotation fields present in concatenated dataframe. @@ -133,18 +128,9 @@ def read_all_files_in_folder( if classifier_names != None: for clf_name in classifier_names: if not clf_name in df.columns: - raise MissingColumnsError( - msg=f"Data for video {vid_name} does not contain any annotations for behavior {clf_name}. Delete classifier {clf_name} from the SimBA project, or add annotations for behavior {clf_name} to the video {vid_name}", - source=self.__class__.__name__, - ) - elif ( - len(set(df[clf_name].unique()) - {0, 1}) > 0 - and raise_bool_clf_error - ): - raise InvalidInputError( - msg=f"The annotation column for a classifier should contain only 0 or 1 values. However, in file {file} the {clf_name} field contains additional value(s): {list(set(df[clf_name].unique()) - {0, 1})}.", - source=self.__class__.__name__, - ) + raise MissingColumnsError(msg=f"Data for video {vid_name} does not contain any annotations for behavior {clf_name}. 
Delete classifier {clf_name} from the SimBA project, or add annotations for behavior {clf_name} to the video {vid_name}", source=self.__class__.__name__,) + elif (len(set(df[clf_name].unique()) - {0, 1}) > 0 and raise_bool_clf_error): + raise InvalidInputError(msg=f"The annotation column for a classifier should contain only 0 or 1 values. However, in file {file} the {clf_name} field contains additional value(s): {list(set(df[clf_name].unique()) - {0, 1})}.", source=self.__class__.__name__) else: df_concat = pd.concat([df_concat, df], axis=0) else: @@ -154,10 +140,7 @@ def read_all_files_in_folder( except KeyError: pass if len(df_concat) == 0: - raise NoDataError( - msg="SimBA found 0 annotated frames in the project_folder/csv/targets_inserted directory", - source=self.__class__.__name__, - ) + raise NoDataError(msg="SimBA found 0 annotated frames in the project_folder/csv/targets_inserted directory", source=self.__class__.__name__) df_concat = df_concat.loc[ :, ~df_concat.columns.str.contains("^Unnamed") ].fillna(0) @@ -797,7 +780,7 @@ def split_and_group_df(df: pd.DataFrame, splits: int, include_split_order: bool return data_arr, obs_per_split def create_shap_log(self, - ini_file_path: str, + ini_file_path: Union[str, os.PathLike], rf_clf: RandomForestClassifier, x_df: pd.DataFrame, y_df: pd.Series, @@ -839,104 +822,60 @@ def create_shap_log(self, :param int cnt_present: Number of behavior-present frames to calculate SHAP values for. :param int cnt_absent: Number of behavior-absent frames to calculate SHAP values for. :param str save_path: Directory where to save output in csv file format. - :param Optional[int] save_file_no: If integer, represents the count of the classifier within a grid search. If none, the classifier is not - part of a grid search. - + :param Optional[int] save_file_no: If integer, represents the count of the classifier within a grid search. If none, the classifier is not part of a grid search. 
""" print("Calculating SHAP values (SINGLE CORE)...") + check_file_exist_and_readable(file_path=ini_file_path) + check_instance(source='create_shap_log', instance=rf_clf, accepted_types=(RandomForestClassifier,)) + check_valid_lst(data=list(x_names), valid_dtypes=(str,)) + check_valid_dataframe(df=x_df, valid_dtypes=Formats.NUMERIC_DTYPES.value, required_fields=list(x_names)) + check_str(name='clf_name', value=clf_name) + check_int(name='shap cnt_present', value=cnt_present, min_value=1) + check_int(name='shap cnt_absent', value=cnt_absent, min_value=1) shap_timer = SimbaTimer(start=True) data_df = pd.concat([x_df, y_df], axis=1) if (save_file_no == None) and (save_path is not None): - self.out_df_shap_path = os.path.join( - save_path, f"SHAP_values_{clf_name}.csv" - ) - self.out_df_raw_path = os.path.join( - save_path, f"RAW_SHAP_feature_values_{clf_name}.csv" - ) + self.out_df_shap_path = os.path.join(save_path, f"SHAP_values_{clf_name}.csv") + self.out_df_raw_path = os.path.join(save_path, f"RAW_SHAP_feature_values_{clf_name}.csv") elif (save_file_no is not None) and (save_path is not None): - self.out_df_shap_path = os.path.join( - save_path, f"SHAP_values_{str(save_file_no)}_{clf_name}.csv" - ) - self.out_df_raw_path = os.path.join( - save_path, f"RAW_SHAP_feature_values_{str(save_file_no)}_{clf_name}.csv" - ) + self.out_df_shap_path = os.path.join(save_path, f"SHAP_values_{str(save_file_no)}_{clf_name}.csv") + self.out_df_raw_path = os.path.join(save_path, f"RAW_SHAP_feature_values_{str(save_file_no)}_{clf_name}.csv") - target_df, nontarget_df = ( - data_df[data_df[y_df.name] == 1], - data_df[data_df[y_df.name] == 0], - ) + target_df, nontarget_df = (data_df[data_df[y_df.name] == 1], data_df[data_df[y_df.name] == 0]) if len(target_df) < cnt_present: - NotEnoughDataWarning( - msg=f"Train data contains {len(target_df)} behavior-present annotations. This is less the number of frames you specified to calculate shap values for ({str(cnt_present)}). 
SimBA will calculate shap scores for the {len(target_df)} behavior-present frames available", - source=self.__class__.__name__, - ) + NotEnoughDataWarning(msg=f"Train data contains {len(target_df)} behavior-present annotations. This is less the number of frames you specified to calculate shap values for ({str(cnt_present)}). SimBA will calculate shap scores for the {len(target_df)} behavior-present frames available", source=self.__class__.__name__) cnt_present = len(target_df) if len(nontarget_df) < cnt_absent: - NotEnoughDataWarning( - msg=f"Train data contains {len(nontarget_df)} behavior-absent annotations. This is less the number of frames you specified to calculate shap values for ({str(cnt_absent)}). SimBA will calculate shap scores for the {len(nontarget_df)} behavior-absent frames available", - source=self.__class__.__name__, - ) + NotEnoughDataWarning(msg=f"Train data contains {len(nontarget_df)} behavior-absent annotations. This is less the number of frames you specified to calculate shap values for ({str(cnt_absent)}). 
SimBA will calculate shap scores for the {len(nontarget_df)} behavior-absent frames available", source=self.__class__.__name__) cnt_absent = len(nontarget_df) non_target_for_shap = nontarget_df.sample(cnt_absent, replace=False) targets_for_shap = target_df.sample(cnt_present, replace=False) shap_df = pd.concat([targets_for_shap, non_target_for_shap], axis=0) y_df = shap_df.pop(clf_name).values - explainer = shap.TreeExplainer( - rf_clf, - data=None, - model_output="raw", - feature_perturbation="tree_path_dependent", - ) + explainer = shap.TreeExplainer(rf_clf, data=None, model_output="raw", feature_perturbation="tree_path_dependent") expected_value = explainer.expected_value[1] out_df_raw = pd.DataFrame(columns=x_names) shap_headers = list(x_names) - shap_headers.extend( - ("Expected_value", "Sum", "Prediction_probability", clf_name) - ) + shap_headers.extend(("Expected_value", "Sum", "Prediction_probability", clf_name)) out_df_shap = pd.DataFrame(columns=shap_headers) for cnt, frame in enumerate(range(len(shap_df))): shap_frm_timer = SimbaTimer(start=True) frame_data = shap_df.iloc[[frame]] - frame_shap = explainer.shap_values(frame_data, check_additivity=False)[1][ - 0 - ].tolist() - frame_shap.extend( - ( - expected_value, - sum(frame_shap), - rf_clf.predict_proba(frame_data)[0][1], - y_df[cnt], - ) - ) + frame_shap = explainer.shap_values(frame_data, check_additivity=False)[1][0].tolist() + frame_shap.extend((expected_value, sum(frame_shap), rf_clf.predict_proba(frame_data)[0][1], y_df[cnt])) out_df_raw.loc[len(out_df_raw)] = list(shap_df.iloc[frame]) out_df_shap.loc[len(out_df_shap)] = frame_shap - if ( - (cnt % save_it == 0) - or (cnt == len(shap_df) - 1) - and (cnt != 0) - and (save_path is not None) - ): + if ((cnt % save_it == 0) or (cnt == len(shap_df) - 1) and (cnt != 0) and (save_path is not None)): print(f"Saving SHAP data after {cnt} iterations...") out_df_shap.to_csv(self.out_df_shap_path) out_df_raw.to_csv(self.out_df_raw_path) 
shap_frm_timer.stop_timer() print(f"SHAP frame: {cnt + 1} / {len(shap_df)}, elapsed time: {shap_frm_timer.elapsed_time_str}...") - shap_timer.stop_timer() - stdout_success( - msg="SHAP calculations complete", - elapsed_time=shap_timer.elapsed_time_str, - source=self.__class__.__name__, - ) + stdout_success(msg=f"SHAP calculations complete! Results saved at {self.out_df_shap_path} and {self.out_df_raw_path}", elapsed_time=shap_timer.elapsed_time_str, source=self.__class__.__name__) if save_path is not None: - _ = ShapAggregateStatisticsVisualizer( - config_path=ini_file_path, - classifier_name=clf_name, - shap_df=out_df_shap, - shap_baseline_value=int(expected_value * 100), - save_path=save_path, - ) + _ = ShapAggregateStatisticsVisualizer(config_path=ini_file_path, classifier_name=clf_name, shap_df=out_df_shap, shap_baseline_value=int(expected_value * 100), save_path=save_path) else: return (out_df_shap, out_df_raw, int(expected_value * 100)) @@ -1560,10 +1499,8 @@ def read_all_files_in_folder_mp( return df_concat, frame_numbers_lst except BrokenProcessPool or AttributeError: - MultiProcessingFailedWarning( - msg="Multi-processing file read failed, reverting to single core (increased run-time)." 
- ) - return TrainModelMixin.read_all_files_in_folder( + MultiProcessingFailedWarning(msg="Multi-processing file read failed, reverting to single core (increased run-time).") + return TrainModelMixin().read_all_files_in_folder( file_paths=file_paths, file_type=file_type, classifier_names=classifier_names, @@ -1571,12 +1508,10 @@ def read_all_files_in_folder_mp( ) @staticmethod - def _read_data_file_helper_futures( - file_path: str, - file_type: str, - clf_names: Optional[List[str]] = None, - raise_bool_clf_error: bool = True, - ): + def _read_data_file_helper_futures(file_path: str, + file_type: str, + clf_names: Optional[List[str]] = None, + raise_bool_clf_error: bool = True): """ Private function called by :meth:`simba.train_model_functions.read_all_files_in_folder_mp_futures` """ @@ -1595,13 +1530,12 @@ def _read_data_file_helper_futures( timer.stop_timer() return df, vid_name, timer.elapsed_time_str, frm_numbers - def read_all_files_in_folder_mp_futures( - self, - annotations_file_paths: List[str], - file_type: Literal["csv", "parquet", "pickle"], - classifier_names: Optional[List[str]] = None, - raise_bool_clf_error: bool = True, - ) -> (pd.DataFrame, List[int]): + def read_all_files_in_folder_mp_futures(self, + annotations_file_paths: List[str], + file_type: Literal["csv", "parquet", "pickle"], + classifier_names: Optional[List[str]] = None, + raise_bool_clf_error: bool = True) -> (pd.DataFrame, List[int]): + """ Multiprocessing helper function to read in all data files in a folder to a single pd.DataFrame for downstream ML through ``concurrent.Futures``. Asserts that all classifiers @@ -1620,22 +1554,17 @@ def read_all_files_in_folder_mp_futures( :return pd.DataFrame: Concatenated dataframe of all data in ``file_paths``. 
""" + + THREADSAFE_CORE_COUNT = 16 try: if (platform.system() == "Darwin") and (multiprocessing.get_start_method() != "spawn"): multiprocessing.set_start_method("spawn", force=True) cpu_cnt, _ = find_core_cnt() - if (cpu_cnt > Defaults.THREADSAFE_CORE_COUNT.value) and (platform.system() == OS.WINDOWS.value): - cpu_cnt = Defaults.THREADSAFE_CORE_COUNT.value + if (cpu_cnt > THREADSAFE_CORE_COUNT) and (platform.system() == OS.WINDOWS.value): + cpu_cnt = THREADSAFE_CORE_COUNT df_lst, frm_number_list = [], [] with concurrent.futures.ProcessPoolExecutor(max_workers=cpu_cnt) as executor: - results = [executor.submit(self._read_data_file_helper_futures, - data, - file_type, - classifier_names, - raise_bool_clf_error, - ) - for data in annotations_file_paths - ] + results = [executor.submit(self._read_data_file_helper_futures, data, file_type, classifier_names, raise_bool_clf_error) for data in annotations_file_paths] for result in concurrent.futures.as_completed(results): df_lst.append(result.result()[0]) frm_number_list.extend((result.result()[-1])) @@ -1699,41 +1628,36 @@ def check_raw_dataset_integrity( ) @staticmethod - def _create_shap_mp_helper( - data: pd.DataFrame, - explainer: shap.TreeExplainer, - clf_name: str, - rf_clf: RandomForestClassifier, - expected_value: float, - ): + def _create_shap_mp_helper(data: pd.DataFrame, + explainer: shap.TreeExplainer, + clf_name: str, + rf_clf: RandomForestClassifier, + expected_value: float): target = data.pop(clf_name).values.reshape(-1, 1) + print('ppp') frame_batch_shap = explainer.shap_values(data.values, check_additivity=False)[1] shap_sum = np.sum(frame_batch_shap, axis=1).reshape(-1, 1) + print('s') proba = rf_clf.predict_proba(data)[:, 1].reshape(-1, 1) - frame_batch_shap = np.hstack( - ( - frame_batch_shap, - np.full((frame_batch_shap.shape[0]), expected_value).reshape(-1, 1), - shap_sum, - proba, - target, - ) - ) + print(proba) + frame_batch_shap = np.hstack((frame_batch_shap, + 
np.full((frame_batch_shap.shape[0]), expected_value).reshape(-1, 1), + shap_sum, + proba, + target)) + print('ss') + return frame_batch_shap, data.values @staticmethod - def _create_shap_mp_helper( - data: pd.DataFrame, explainer: shap.TreeExplainer, clf_name: str - ): - + def _create_shap_mp_helper(data: pd.DataFrame, explainer: shap.TreeExplainer, clf_name: str): target = data.pop(clf_name).values.reshape(-1, 1) group_cnt = data.pop("group").values[0] shap_vals = np.full((len(data), len(data.columns)), np.nan) for cnt, i in enumerate(list(data.index)): - shap_vals[cnt] = explainer.shap_values( - data.loc[i].values, check_additivity=False - )[1] + shap_instance = explainer.shap_values(data.loc[i].values, check_additivity=False)[1] + shap_vals[cnt] = explainer.shap_values(data.loc[i].values, check_additivity=False)[1] print(f"SHAP complete core frame: {i} (CORE BATCH: {group_cnt})") return shap_vals, data.values, target @@ -1778,12 +1702,8 @@ def create_shap_log_mp( shap_timer = SimbaTimer(start=True) data_df = pd.concat([x_df, y_df], axis=1) if (save_file_no == None) and (save_path is not None): - self.out_df_shap_path = os.path.join( - save_path, f"SHAP_values_{clf_name}.csv" - ) - self.out_df_raw_path = os.path.join( - save_path, f"RAW_SHAP_feature_values_{clf_name}.csv" - ) + self.out_df_shap_path = os.path.join(save_path, f"SHAP_values_{clf_name}.csv") + self.out_df_raw_path = os.path.join(save_path, f"RAW_SHAP_feature_values_{clf_name}.csv") elif (save_file_no is not None) and (save_path is not None): self.out_df_shap_path = os.path.join( save_path, f"SHAP_values_{str(save_file_no)}_{clf_name}.csv" @@ -1796,9 +1716,7 @@ def create_shap_log_mp( data_df[data_df[y_df.name] == 0], ) if len(target_df) < cnt_present: - NotEnoughDataWarning( - msg=f"Train data contains {len(target_df)} behavior-present annotations. This is less the number of frames you specified to calculate shap values for ({str(cnt_present)}). 
SimBA will calculate shap scores for the {len(target_df)} behavior-present frames available", - source=self.__class__.__name__, + NotEnoughDataWarning(msg=f"Train data contains {len(target_df)} behavior-present annotations. This is less the number of frames you specified to calculate shap values for ({str(cnt_present)}). SimBA will calculate shap scores for the {len(target_df)} behavior-present frames available", source=self.__class__.__name__, ) cnt_present = len(target_df) if len(nontarget_df) < cnt_absent: @@ -1809,12 +1727,7 @@ def create_shap_log_mp( cnt_absent = len(nontarget_df) non_target_for_shap = nontarget_df.sample(cnt_absent, replace=False) targets_for_shap = target_df.sample(cnt_present, replace=False) - explainer = shap.TreeExplainer( - rf_clf, - data=None, - model_output="raw", - feature_perturbation="tree_path_dependent", - ) + explainer = shap.TreeExplainer( rf_clf, data=None, model_output="raw", feature_perturbation="tree_path_dependent") expected_value = explainer.expected_value[1] cores, _ = find_core_cnt() shap_data_df = pd.concat([targets_for_shap, non_target_for_shap], axis=0) @@ -1822,85 +1735,50 @@ def create_shap_log_mp( batch_size = 1 if len(shap_data_df) > 100: batch_size = 100 - print( - f"Computing {len(shap_data_df)} SHAP values (MULTI-CORE BATCH SIZE: {batch_size}, FOLLOW PROGRESS IN OS TERMINAL)...") - shap_data, _ = self.split_and_group_df(df=shap_data_df, splits=int(len(shap_data_df) / batch_size)) + print(f"Computing {len(shap_data_df)} SHAP values (MULTI-CORE BATCH SIZE: {batch_size}, FOLLOW PROGRESS IN OS TERMINAL)...") + shap_data, _ = self.split_and_group_df(df=shap_data_df.reset_index(drop=True), splits=int(len(shap_data_df) / batch_size)) shap_results, shap_raw = [], [] - try: - with multiprocessing.Pool(cores, maxtasksperchild=10) as pool: - constants = functools.partial( - self._create_shap_mp_helper, explainer=explainer, clf_name=clf_name - ) - for cnt, result in enumerate( - pool.imap_unordered(constants, shap_data, 
chunksize=1) - ): - print( - f"Concatenating multi-processed SHAP data (batch {cnt + 1}/{len(shap_data)})" - ) - proba = rf_clf.predict_proba(result[1])[:, 1].reshape(-1, 1) - shap_sum = np.sum(result[0], axis=1).reshape(-1, 1) - batch_shap_results = np.hstack( - ( - result[0], - np.full((result[0].shape[0]), expected_value).reshape( - -1, 1 - ), - shap_sum, - proba, - result[2], - ) - ) - shap_results.append(batch_shap_results) - shap_raw.append(result[1]) - pool.terminate() - pool.join() - shap_save_df = pd.DataFrame( - data=np.row_stack(shap_results), - columns=list(x_names) - + ["Expected_value", "Sum", "Prediction_probability", clf_name], - ) - raw_save_df = pd.DataFrame( - data=np.row_stack(shap_raw), columns=list(x_names) - ) - - shap_timer.stop_timer() - stdout_success( - msg="SHAP calculations complete", - elapsed_time=shap_timer.elapsed_time_str, - source=self.__class__.__name__, - ) - if save_path: - shap_save_df.to_csv(self.out_df_shap_path) - raw_save_df.to_csv(self.out_df_raw_path) - _ = ShapAggregateStatisticsVisualizer( - config_path=ini_file_path, - classifier_name=clf_name, - shap_df=shap_save_df, - shap_baseline_value=int(expected_value * 100), - save_path=save_path, - ) - else: - return (shap_save_df, raw_save_df, int(expected_value * 100)) - - except Exception as e: - print(e.args) - ShapWarning( - msg="Multiprocessing SHAP values failed. Revert to single core. This will negatively affect run-time. 
", - source=self.__class__.__name__, - ) - self.create_shap_log( - ini_file_path=ini_file_path, - rf_clf=rf_clf, - x_df=x_df, - y_df=y_df, - x_names=x_names, - clf_name=clf_name, - cnt_present=cnt_present, - cnt_absent=cnt_absent, - save_path=save_path, - save_it=len(x_df), - save_file_no=save_file_no, - ) + # try: + with multiprocessing.Pool(cores, maxtasksperchild=Defaults.LARGE_MAX_TASK_PER_CHILD.value) as pool: + constants = functools.partial(self._create_shap_mp_helper, explainer=explainer, clf_name=clf_name) + for cnt, result in enumerate(pool.imap_unordered(constants, shap_data, chunksize=1)): + print(f"Concatenating multi-processed SHAP data (batch {cnt + 1}/{len(shap_data)})") + proba = rf_clf.predict_proba(result[1])[:, 1].reshape(-1, 1) + shap_sum = np.sum(result[0], axis=1).reshape(-1, 1) + batch_shap_results = np.hstack((result[0], np.full((result[0].shape[0]), expected_value).reshape(-1, 1), shap_sum, proba, result[2])) + shap_results.append(batch_shap_results) + shap_raw.append(result[1]) + pool.terminate() + pool.join() + shap_save_df = pd.DataFrame(data=np.row_stack(shap_results), columns=list(x_names) + ["Expected_value", "Sum", "Prediction_probability", clf_name]) + raw_save_df = pd.DataFrame(data=np.row_stack(shap_raw), columns=list(x_names)) + shap_timer.stop_timer() + stdout_success(msg="SHAP calculations complete", elapsed_time=shap_timer.elapsed_time_str, source=self.__class__.__name__) + if save_path: + shap_save_df.to_csv(self.out_df_shap_path) + raw_save_df.to_csv(self.out_df_raw_path) + _ = ShapAggregateStatisticsVisualizer(config_path=ini_file_path, + classifier_name=clf_name, + shap_df=shap_save_df, + shap_baseline_value=int(expected_value * 100), + save_path=save_path) + else: + return (shap_save_df, raw_save_df, int(expected_value * 100)) + + # except Exception as e: + # print(e.args) + # ShapWarning(msg="Multiprocessing SHAP values failed. Revert to single core. This will negatively affect run-time. 
", source=self.__class__.__name__) + # self.create_shap_log(ini_file_path=ini_file_path, + # rf_clf=rf_clf, + # x_df=x_df, + # y_df=y_df, + # x_names=x_names, + # clf_name=clf_name, + # cnt_present=cnt_present, + # cnt_absent=cnt_absent, + # save_path=save_path, + # save_it=len(x_df), + # save_file_no=save_file_no) def check_df_dataset_integrity( self, df: pd.DataFrame, file_name: str, logs_path: Union[str, os.PathLike] @@ -1926,10 +1804,8 @@ def check_df_dataset_integrity( pass def read_model_settings_from_config(self, config: configparser.ConfigParser): - self.model_dir_out = os.path.join( - read_config_entry(config, ConfigKey.SML_SETTINGS.value, ConfigKey.MODEL_DIR.value, - data_type=Dtypes.STR.value), "generated_models") + read_config_entry(config, ConfigKey.SML_SETTINGS.value, ConfigKey.MODEL_DIR.value, data_type=Dtypes.STR.value), "generated_models") if not os.path.exists(self.model_dir_out): os.makedirs(self.model_dir_out) self.eval_out_path = os.path.join(self.model_dir_out, "model_evaluations") @@ -2491,7 +2367,7 @@ def define_scaler( if scaler_name not in Options.SCALER_OPTIONS.value: raise InvalidInputError( msg=f"Scaler {scaler_name} not supported. 
Options: {Options.SCALER_OPTIONS.value}", - source=self.__class__.__name__, + source=TrainModelMixin.define_scaler.__name__, ) if scaler_name == Options.MIN_MAX_SCALER.value: return MinMaxScaler() diff --git a/simba/model/train_rf.py b/simba/model/train_rf.py index 05e6ddb7e..ab377a5a4 100644 --- a/simba/model/train_rf.py +++ b/simba/model/train_rf.py @@ -60,26 +60,15 @@ def perform_sampling(self): """ if self.split_type == Methods.SPLIT_TYPE_FRAMES.value: - self.x_train, self.x_test, self.y_train, self.y_test = train_test_split( - self.x_df, self.y_df, test_size=self.tt_size - ) + self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.x_df, self.y_df, test_size=self.tt_size) elif self.split_type == Methods.SPLIT_TYPE_BOUTS.value: - self.x_train, self.x_test, self.y_train, self.y_test = ( - self.bout_train_test_splitter( - x_df=self.x_df, y_df=self.y_df, test_size=self.tt_size - ) - ) + self.x_train, self.x_test, self.y_train, self.y_test = ( self.bout_train_test_splitter(x_df=self.x_df, y_df=self.y_df, test_size=self.tt_size)) if self.under_sample_setting == Methods.RANDOM_UNDERSAMPLE.value.lower(): - self.x_train, self.y_train = self.random_undersampler( - self.x_train, self.y_train, float(self.under_sample_ratio) - ) + self.x_train, self.y_train = self.random_undersampler(self.x_train, self.y_train, float(self.under_sample_ratio)) if self.over_sample_setting == Methods.SMOTEENN.value.lower(): - self.x_train, self.y_train = self.smoteen_oversampler( - self.x_train, self.y_train, float(self.over_sample_ratio) - ) + self.x_train, self.y_train = self.smoteen_oversampler(self.x_train, self.y_train, float(self.over_sample_ratio)) elif self.over_sample_setting == Methods.SMOTE.value.lower(): - self.x_train, self.y_train = self.smote_oversampler( - self.x_train, self.y_train, float(self.over_sample_ratio) + self.x_train, self.y_train = self.smote_oversampler(self.x_train, self.y_train, float(self.over_sample_ratio) ) if 
self.save_train_test_frm_info: @@ -385,6 +374,14 @@ def save(self) -> None: stdout_success(msg=f"Classifier {self.clf_name} saved in models/generated_models directory", elapsed_time=self.timer.elapsed_time_str, source=self.__class__.__name__) stdout_success(msg=f"Evaluation files are in models/generated_models/model_evaluations folders", source=self.__class__.__name__) + + +test = TrainRandomForestClassifier(config_path=r"C:\troubleshooting\mitra\project_folder\project_config.ini") +test.run() +test.save() + + + # # test = TrainRandomForestClassifier(config_path=r"C:\troubleshooting\mitra\project_folder\project_config.ini") # test.run() diff --git a/simba/plotting/ROI_feature_visualizer.py b/simba/plotting/ROI_feature_visualizer.py index 627bc74de..3e7d5235d 100644 --- a/simba/plotting/ROI_feature_visualizer.py +++ b/simba/plotting/ROI_feature_visualizer.py @@ -31,15 +31,13 @@ POSE = "pose_estimation" ANIMAL_NAMES = "animal_names" -STYLE_KEYS = [ - ROI_CENTERS, - ROI_EAR_TAGS, - DIRECTIONALITY, - BORDER_COLOR, - POSE, - DIRECTIONALITY_STYLE, - ANIMAL_NAMES, -] +STYLE_KEYS = [ROI_CENTERS, + ROI_EAR_TAGS, + DIRECTIONALITY, + BORDER_COLOR, + POSE, + DIRECTIONALITY_STYLE, + ANIMAL_NAMES] class ROIfeatureVisualizer(ConfigReader): @@ -122,7 +120,7 @@ def __calc_text_locs(self): for shape in self.shape_names: txt_strs.append(animal_name + ' ' + shape + ' center distance') longest_text_str = max(txt_strs, key=len) - self.font_size, self.x_scaler, self.y_scaler = PlottingMixin().get_optimal_font_scales(text=longest_text_str, accepted_px_width=(self.video_meta_data['width'] / 3), accepted_px_height=(self.video_meta_data['height'] / 15), text_thickness=2) + self.font_size, self.x_scaler, self.y_scaler = PlottingMixin().get_optimal_font_scales(text=longest_text_str, accepted_px_width=int(self.video_meta_data['width'] / 3), accepted_px_height=int(self.video_meta_data['height'] / 15), text_thickness=2) self.circle_size = 
PlottingMixin().get_optimal_circle_size(frame_size=(int(self.video_meta_data['width']), int(self.video_meta_data['height'])), circle_frame_ratio=100) for animal_cnt, animal_data in self.bp_lk.items(): animal, animal_bp, _ = animal_data diff --git a/simba/plotting/plot_clf_results_mp.py b/simba/plotting/plot_clf_results_mp.py index b85a19d40..e90b2294e 100644 --- a/simba/plotting/plot_clf_results_mp.py +++ b/simba/plotting/plot_clf_results_mp.py @@ -302,6 +302,20 @@ def run(self): if self.frame_setting: stdout_success(f"Frames for {len(self.files_found)} videos saved in sub-folders within {self.sklearn_plot_dir} directory", elapsed_time=self.timer.elapsed_time_str, source=self.__class__.__name__) + +# if __name__ == "__main__": +# clf_plotter = PlotSklearnResultsMultiProcess(config_path=r"C:\troubleshooting\mitra\project_folder\project_config.ini", +# video_setting=True, +# frame_setting=False, +# rotate=False, +# video_file_path='FR_gq_CNO_0621.mp4', +# cores=-1, +# text_settings=False) +# clf_plotter.run() + + + + #text_settings = {'circle_scale': 5, 'font_size': 0.528, 'spacing_scale': 28, 'text_thickness': 2} # clf_plotter = PlotSklearnResultsMultiProcess(config_path='/Users/simon/Desktop/envs/simba/troubleshooting/beepboop174/project_folder/project_config.ini', # video_setting=True, diff --git a/simba/plotting/pose_plotter_mp.py b/simba/plotting/pose_plotter_mp.py index aafe48050..0b9d18707 100644 --- a/simba/plotting/pose_plotter_mp.py +++ b/simba/plotting/pose_plotter_mp.py @@ -3,7 +3,7 @@ import os import platform from pathlib import Path -from typing import Dict, Optional, Tuple, Union +from typing import Dict, Optional, Union import cv2 import pandas as pd @@ -173,6 +173,17 @@ def run(self): stdout_success(f"Pose visualizations for {len(list(self.data.keys()))} video(s) created in {self.out_dir} directory", elapsed_time=self.config.timer.elapsed_time_str, source=self.__class__.__name__) +# if __name__ == "__main__": +# test = 
PosePlotterMultiProcess(data_path=r"D:\troubleshooting\pose_estimation\project_folder\csv\pose_estimation\3A_Mouse_5-choice_MustTouchTrainingNEWFINAL_a8_grayscale_clipped.csv", +# out_dir=None, +# circle_size=None, +# core_cnt=-1, +# palettes=None) +# test.run() + + + + # test = PosePlotterMultiProcess(data_path='/Users/simon/Desktop/envs/simba/troubleshooting/two_black_animals_14bp/project_folder/csv/outlier_corrected_movement_location/Together_1.csv', # out_dir=None, # circle_size=None, diff --git a/simba/roi_tools/ROI_analyzer.py b/simba/roi_tools/ROI_analyzer.py index 748fee2a0..95ad52364 100644 --- a/simba/roi_tools/ROI_analyzer.py +++ b/simba/roi_tools/ROI_analyzer.py @@ -58,8 +58,6 @@ def __init__(self, raise ROICoordinatesNotFoundError(expected_file_path=self.roi_coordinates_path) self.read_roi_data() FeatureExtractionMixin.__init__(self) - if detailed_bout_data and (not os.path.exists(self.detailed_roi_data_dir)): - os.makedirs(self.detailed_roi_data_dir) self.data_paths = read_data_paths(path=data_path, default=self.outlier_corrected_paths, default_name=self.outlier_corrected_dir, @@ -71,9 +69,7 @@ def __init__(self, raise CountError(msg=f"All body-part entries have to be unique. 
Got {body_parts}", source=self.__class__.__name__) self.bp_dict, self.bp_lk = {}, {} for bp in body_parts: - animal = self.find_animal_name_from_body_part_name( - bp_name=bp, bp_dict=self.animal_bp_dict - ) + animal = self.find_animal_name_from_body_part_name(bp_name=bp, bp_dict=self.animal_bp_dict) self.bp_dict[animal] = [f'{bp}_{"x"}', f'{bp}_{"y"}', f'{bp}_{"p"}'] self.bp_lk[animal] = bp self.roi_headers = [v for k, v in self.bp_dict.items()] @@ -218,18 +214,15 @@ def run(self): self.detailed_df = pd.concat(self.roi_bout_results, axis=0) self.detailed_df = self.detailed_df.rename(columns={"Event": "SHAPE NAME", "Start_time": "START TIME", "End Time": "END TIME", "Start_frame": "START FRAME", "End_frame": "END FRAME", "Bout_time": "DURATION (S)"}) self.detailed_df["BODY-PART"] = self.detailed_df["ANIMAL"].map(self.bp_lk) - self.detailed_df = self.detailed_df[["VIDEO", "ANIMAL", "BODY-PART", "SHAPE NAME", "START TIME", "END TIME", "START FRAME", "END FRAME", "DURATION (S)"]] + self.detailed_df = self.detailed_df[["VIDEO", "ANIMAL", "BODY-PART", "SHAPE NAME", "START TIME", "END TIME", "START FRAME", "END FRAME", "DURATION (S)"]].reset_index(drop=True) def save(self): self.entry_results["BODY-PART"] = self.entry_results["ANIMAL"].map(self.bp_lk) self.time_results["BODY-PART"] = self.time_results["ANIMAL"].map(self.bp_lk) self.entry_results = self.entry_results[["VIDEO", "ANIMAL", "BODY-PART", "SHAPE", "ENTRY COUNT"]] self.time_results = self.time_results[["VIDEO", "ANIMAL", "BODY-PART", "SHAPE", "TIME (S)"]] - self.entry_results.to_csv(os.path.join(self.logs_path, f'{"ROI_entry_data"}_{self.datetime}.csv') - ) - self.time_results.to_csv( - os.path.join(self.logs_path, f'{"ROI_time_data"}_{self.datetime}.csv') - ) + self.entry_results.to_csv(os.path.join(self.logs_path, f'{"ROI_entry_data"}_{self.datetime}.csv')) + self.time_results.to_csv(os.path.join(self.logs_path, f'{"ROI_time_data"}_{self.datetime}.csv')) if self.detailed_bout_data and self.detailed_df is not 
None: detailed_path = os.path.join(self.logs_path, f'{"Detailed_ROI_data"}_{self.datetime}.csv') self.detailed_df.to_csv(detailed_path) diff --git a/simba/roi_tools/ROI_directing_analyzer.py b/simba/roi_tools/ROI_directing_analyzer.py index 7d4ab92e7..0c8a137da 100644 --- a/simba/roi_tools/ROI_directing_analyzer.py +++ b/simba/roi_tools/ROI_directing_analyzer.py @@ -35,30 +35,17 @@ class DirectingROIAnalyzer(ConfigReader, FeatureExtractionMixin): >>> test.save() """ - def __init__( - self, - config_path: Union[str, os.PathLike], - data_path: Optional[Union[str, os.PathLike]] = None, - ): + def __init__(self, config_path: Union[str, os.PathLike], + data_path: Optional[Union[str, os.PathLike]] = None): check_file_exist_and_readable(file_path=config_path) ConfigReader.__init__(self, config_path=config_path) FeatureExtractionMixin.__init__(self, config_path=config_path) - self.data_paths = read_data_paths( - path=data_path, - default=self.outlier_corrected_paths, - default_name=self.outlier_corrected_dir, - file_type=self.file_type, - ) + self.data_paths = read_data_paths(path=data_path, default=self.outlier_corrected_paths, default_name=self.outlier_corrected_dir, file_type=self.file_type) if not os.path.isfile(self.roi_coordinates_path): - raise ROICoordinatesNotFoundError( - expected_file_path=self.roi_coordinates_path - ) + raise ROICoordinatesNotFoundError(expected_file_path=self.roi_coordinates_path) if not self.check_directionality_viable()[0]: - raise InvalidInputError( - msg="Cannot compute directionality towards ROIs. The ear and nose data is tracked in the project", - source=self.__class__.__name__, - ) + raise InvalidInputError(msg="Cannot compute directionality towards ROIs. 
The ear and nose data is tracked in the project", source=self.__class__.__name__) self.read_roi_data() self.direct_bp_dict = self.check_directionality_cords() diff --git a/simba/roi_tools/ROI_feature_analyzer.py b/simba/roi_tools/ROI_feature_analyzer.py index ba1bdc728..67d7175a1 100644 --- a/simba/roi_tools/ROI_feature_analyzer.py +++ b/simba/roi_tools/ROI_feature_analyzer.py @@ -38,103 +38,61 @@ class ROIFeatureCreator(ConfigReader, FeatureExtractionMixin): :example: - >>> roi_featurizer = ROIFeatureCreator(config_path='/Users/simon/Desktop/envs/simba/troubleshooting/two_black_animals_14bp/project_folder/project_config.ini', - >>> body_parts=['Nose_1', 'Nose_2']) + >>> roi_featurizer = ROIFeatureCreator(config_path='/Users/simon/Desktop/envs/simba/troubleshooting/two_black_animals_14bp/project_folder/project_config.ini', body_parts=['Nose_1', 'Nose_2']) >>> roi_featurizer.run() >>> roi_featurizer.save() """ - def __init__( - self, - config_path: Union[str, os.PathLike], - body_parts: List[str], - data_path: Optional[Union[str, os.PathLike]] = None, - append_data: Optional[bool] = False, - ): + def __init__(self, + config_path: Union[str, os.PathLike], + body_parts: List[str], + data_path: Optional[Union[str, os.PathLike]] = None, + append_data: Optional[bool] = False): - check_valid_lst( - data=body_parts, - source=f"{self.__class__.__name__} body-parts", - valid_dtypes=(str,), - min_len=1, - ) + check_valid_lst(data=body_parts, source=f"{self.__class__.__name__} body-parts", valid_dtypes=(str,), min_len=1) if len(set(body_parts)) != len(body_parts): - raise CountError( - msg=f"All body-part entries have to be unique. Got {body_parts}", - source=self.__class__.__name__, - ) - log_event( - logger_name=str(__class__.__name__), - log_type=TagNames.CLASS_INIT.value, - msg=self.create_log_msg_from_init_args(locals=locals()), - ) + raise CountError(msg=f"All body-part entries have to be unique. 
Got {body_parts}", source=self.__class__.__name__) + log_event(logger_name=str(__class__.__name__), log_type=TagNames.CLASS_INIT.value, msg=self.create_log_msg_from_init_args(locals=locals())) ConfigReader.__init__(self, config_path=config_path) FeatureExtractionMixin.__init__(self, config_path=config_path) self.read_roi_data() if not os.path.isfile(self.roi_coordinates_path): - raise ROICoordinatesNotFoundError( - expected_file_path=self.roi_coordinates_path - ) + raise ROICoordinatesNotFoundError(expected_file_path=self.roi_coordinates_path) + for bp in body_parts: if bp not in self.body_parts_lst: - raise BodypartColumnNotFoundError( - msg=f"The body-part {bp} is not a valid body-part in the SimBA project. Options: {self.body_parts_lst}", - source=self.__class__.__name__, - ) + raise BodypartColumnNotFoundError(msg=f"The body-part {bp} is not a valid body-part in the SimBA project. Options: {self.body_parts_lst}", source=self.__class__.__name__) self.bp_lk = {} for cnt, bp in enumerate(body_parts): - animal = self.find_animal_name_from_body_part_name( - bp_name=bp, bp_dict=self.animal_bp_dict - ) - self.bp_lk[cnt] = [ - animal, - bp, - [f'{bp}_{"x"}', f'{bp}_{"y"}', f'{bp}_{"p"}'], - ] + animal = self.find_animal_name_from_body_part_name(bp_name=bp, bp_dict=self.animal_bp_dict) + self.bp_lk[cnt] = [animal, bp, [f'{bp}_{"x"}', f'{bp}_{"y"}', f'{bp}_{"p"}']] + self.roi_directing_viable = self.check_directionality_viable()[0] - log_event( - logger_name=str(__class__.__name__), - log_type=TagNames.CLASS_INIT.value, - msg=self.create_log_msg_from_init_args(locals=locals()), - ) - self.data_paths = read_data_paths( - path=data_path, - default=self.outlier_corrected_paths, - default_name=self.outlier_corrected_dir, - file_type=self.file_type, - ) + + log_event(logger_name=str(__class__.__name__), log_type=TagNames.CLASS_INIT.value, msg=self.create_log_msg_from_init_args(locals=locals())) + self.data_paths = read_data_paths( path=data_path, 
default=self.outlier_corrected_paths, default_name=self.outlier_corrected_dir, file_type=self.file_type) if self.roi_directing_viable: print("Directionality calculations are VIABLE.") - self.directing_analyzer = DirectingROIAnalyzer( - config_path=config_path, data_path=self.data_paths - ) + self.directing_analyzer = DirectingROIAnalyzer(config_path=config_path, data_path=self.data_paths) self.directing_analyzer.run() self.dr = self.directing_analyzer.results_df else: + print("Directionality calculations are NOT VIABLE.") self.directing_analyzer = None self.dr = None if len(self.outlier_corrected_paths) == 0: - raise NoFilesFoundError( - msg=f"No data found in the {self.outlier_corrected_dir} directory", - source=self.__class__.__name__, - ) + raise NoFilesFoundError(msg=f"No data found in the {self.outlier_corrected_dir} directory", source=self.__class__.__name__) if len(self.feature_file_paths) == 0: raise NoFilesFoundError( msg=f"No data found in the {self.features_dir} directory", source=self.__class__.__name__, ) self.append_data = append_data - print( - f"Processing {len(self.outlier_corrected_paths)} video(s) for ROI features..." 
- ) + print(f"Processing {len(self.outlier_corrected_paths)} video(s) for ROI features...") def run(self): - check_all_file_names_are_represented_in_video_log( - video_info_df=self.video_info_df, data_paths=self.outlier_corrected_paths - ) - self.summary = pd.DataFrame( - columns=["VIDEO", "ANIMAL", "SHAPE NAME", "MEASUREMENT", "VALUE"] - ) + check_all_file_names_are_represented_in_video_log(video_info_df=self.video_info_df, data_paths=self.outlier_corrected_paths) + self.summary = pd.DataFrame(columns=["VIDEO", "ANIMAL", "SHAPE NAME", "MEASUREMENT", "VALUE"]) if self.append_data: _o_paths = set([get_fn_ext(x)[1] for x in self.outlier_corrected_paths]) _f_paths = set([get_fn_ext(x)[1] for x in self.feature_file_paths]) @@ -285,16 +243,22 @@ def run(self): ) def save(self): - save_path = os.path.join( - self.logs_path, f"ROI_features_summary_{self.datetime}.csv" - ) + save_path = os.path.join(self.logs_path, f"ROI_features_summary_{self.datetime}.csv") self.summary.to_csv(save_path) print(f"ROI feature summary data saved at {save_path}") self.timer.stop_timer() - stdout_success( - msg=f"{len(self.outlier_corrected_paths)} new files with ROI features saved in {self.features_dir}", - elapsed_time=self.timer.elapsed_time_str, - ) + stdout_success(msg=f"{len(self.outlier_corrected_paths)} new files with ROI features saved in {self.features_dir}", elapsed_time=self.timer.elapsed_time_str,) + + + +# roi_featurizer = ROIFeatureCreator(config_path=r"C:\troubleshooting\spontenous_alternation\project_folder\project_config.ini", +# body_parts=['nose'], +# data_path=r"C:\troubleshooting\spontenous_alternation\project_folder\csv\outlier_corrected_movement_location\F1 HAB.csv", +# append_data=True) +# roi_featurizer.run() +#roi_featurizer.save() + + # diff --git a/simba/third_party_label_appenders/converters.py b/simba/third_party_label_appenders/converters.py new file mode 100644 index 000000000..88c4039d9 --- /dev/null +++ b/simba/third_party_label_appenders/converters.py @@ 
-0,0 +1,443 @@ +import json +import os +import itertools +import pandas as pd +import io +from PIL import Image +import base64 +from datetime import datetime +from typing import Dict, Optional, Tuple, Union, Any + +import cv2 +import numpy as np +from pycocotools import mask +from shapely.geometry import Polygon +from skimage.draw import polygon + +from simba.mixins.geometry_mixin import GeometryMixin +from simba.mixins.image_mixin import ImageMixin +from simba.utils.checks import check_instance, check_int, check_valid_array, check_if_dir_exists, check_if_keys_exist_in_dict, check_file_exist_and_readable, check_if_valid_img +from simba.utils.enums import Formats +from simba.utils.read_write import get_video_meta_data, read_df, read_frm_of_video, find_files_of_filetypes_in_directory, get_fn_ext +from simba.utils.errors import NoFilesFoundError, InvalidInputError +from simba.utils.printing import SimbaTimer, stdout_success + + +def geometry_to_rle(geometry: Union[np.ndarray, Polygon], img_size: Tuple[int, int]): + """ + Converts a geometry (polygon or NumPy array) into a Run-Length Encoding (RLE) mask, suitable for object detection or segmentation tasks. + + :param geometry: The geometry to be converted into an RLE. It can be either a shapely Polygon or a (n, 2) np.ndarray with vertices. + :param img_size: A tuple `(height, width)` representing the size of the image in which the geometry is to be encoded. This defines the dimensions of the output binary mask. 
+ :return: + """ + check_instance(source=geometry_to_rle.__name__, instance=geometry, accepted_types=(Polygon, np.ndarray)) + if isinstance(geometry, (Polygon,)): + geometry = geometry.exterior.coords + else: + check_valid_array(data=geometry, source=geometry_to_rle.__name__, accepted_ndims=[(2,)], accepted_dtypes=Formats.NUMERIC_DTYPES.value) + binary_mask = np.zeros(img_size, dtype=np.uint8) + rr, cc = polygon(geometry[:, 0].flatten(), geometry[:, 1].flatten(), img_size) + binary_mask[rr, cc] = 1 + rle = mask.encode(np.asfortranarray(binary_mask)) + rle['counts'] = rle['counts'].decode('utf-8') + return rle + +def geometries_to_coco(geometries: Dict[str, np.ndarray], + video_path: Union[str, os.PathLike], + save_dir: Union[str, os.PathLike], + version: Optional[int] = 1, + description: Optional[str] = None, + licences: Optional[str] = None): + """ + :example: + >>> data_path = r"C:\troubleshooting\mitra\project_folder\csv\outlier_corrected_movement_location\FRR_gq_Saline_0624.csv" + >>> animal_data = read_df(file_path=data_path, file_type='csv', usecols=['Nose_x', 'Nose_y', 'Tail_base_x', 'Tail_base_y', 'Left_side_x', 'Left_side_y', 'Right_side_x', 'Right_side_y']).values.reshape(-1, 4, 2)[0:20].astype(np.int32) + >>> animal_polygons = GeometryMixin().bodyparts_to_polygon(data=animal_data) + >>> animal_polygons = GeometryMixin().multiframe_minimum_rotated_rectangle(shapes=animal_polygons) + >>> animal_polygons = GeometryMixin().geometries_to_exterior_keypoints(geometries=animal_polygons) + >>> animal_polygons = GeometryMixin.keypoints_to_axis_aligned_bounding_box(keypoints=animal_polygons) + >>> animal_polygons = {0: animal_polygons} + >>> geometries_to_coco(geometries=animal_polygons, video_path=r'C:\troubleshooting\mitra\project_folder\videos\FRR_gq_Saline_0624.mp4', save_dir=r"C:\troubleshooting\coco_data") + """ + + categories = [] + for cnt, i in enumerate(geometries.keys()): categories.append({'id': i, 'name': i, 'supercategory': i}) + results = {'info': 
{'year': datetime.now().year, 'version': version, 'description': description}, 'licences': licences, 'categories': categories} + video_data = get_video_meta_data(video_path) + w, h = video_data['width'], video_data['height'] + images = [] + annotations = [] + img_names = [] + if not os.path.isdir(save_dir): os.makedirs(save_dir) + save_img_dir = os.path.join(save_dir, 'img') + if not os.path.isdir(save_img_dir): os.makedirs(save_img_dir) + for category_cnt, (category_id, category_data) in enumerate(geometries.items()): + for img_cnt in range(category_data.shape[0]): + img_geometry = category_data[img_cnt] + img_name = f'{video_data["video_name"]}_{img_cnt}.png' + if img_name not in img_names: + images.append({'id': img_cnt, 'width': w, 'height': h, 'file_name': img_name}) + img = read_frm_of_video(video_path=video_path, frame_index=img_cnt) + img_save_path = os.path.join(save_img_dir, img_name) + cv2.imwrite(img_save_path, img) + img_names.append(img_name) + annotation_id = category_cnt * img_cnt + 1 + d = GeometryMixin().get_shape_lengths_widths(shapes=Polygon(img_geometry)) + a_h, a_w, a_a = d['max_length'], d['max_width'], d['max_area'] + bbox = [int(category_data[img_cnt][0][0]), int(category_data[img_cnt][0][1]), int(a_w), int(a_h)] + rle = geometry_to_rle(geometry=img_geometry, img_size=(h, w)) + annotation = {'id': annotation_id, 'image_id': img_cnt, 'category_id': category_id, 'bbox': bbox, 'area': a_a, 'iscrowd': 0, 'segmentation': rle} + annotations.append(annotation) + results['images'] = images + results['annotations'] = annotations + with open(os.path.join(save_dir, f"annotations.json"), "w") as final: + json.dump(results, final) + + +def geometries_to_yolo(geometries: Dict[Union[str, int], np.ndarray], + video_path: Union[str, os.PathLike], + save_dir: Union[str, os.PathLike], + verbose: Optional[bool] = True, + sample: Optional[int] = None, + obb: Optional[bool] = False) -> None: + """ + Converts geometrical shapes (like polygons) into YOLO format 
annotations and saves them along with corresponding video frames as images. + + :param Dict[Union[str, int], np.ndarray geometries: A dictionary where the keys represent category IDs (either string or int), and the values are NumPy arrays of shape `(n_frames, n_points, 2)`. Each entry in the array represents the geometry of an object in a particular frame (e.g., keypoints or polygons). + :param Union[str, os.PathLike] video_path: Path to the video file from which frames are extracted. The video is used to extract images corresponding to the geometrical annotations. + :param Union[str, os.PathLike] save_dir: The directory where the output images and YOLO annotation files will be saved. Images will be stored in a subfolder `images/` and annotations in `labels/`. + :param verbose: If `True`, prints progress while processing each frame. This can be useful for monitoring long-running tasks. Default is `True`. + :param sample: If provided, only a random sample of the geometries will be used for annotation. This value represents the number of frames to sample. If `None`, all frames will be processed. Default is `None`. + :param obb: If `True`, uses oriented bounding boxes (OBB) by extracting the four corner points of the geometries. Otherwise, axis-aligned bounding boxes (AABB) are used. Default is `False`. 
+ :return None: + + :example: + >>> data_path = r"C:\troubleshooting\mitra\project_folder\csv\outlier_corrected_movement_location\501_MA142_Gi_CNO_0514.csv" + >>> animal_data = read_df(file_path=data_path, file_type='csv', usecols=['Nose_x', 'Nose_y', 'Tail_base_x', 'Tail_base_y', 'Left_side_x', 'Left_side_y', 'Right_side_x', 'Right_side_y']).values.reshape(-1, 4, 2).astype(np.int32) + >>> animal_polygons = GeometryMixin().bodyparts_to_polygon(data=animal_data) + >>> poygons = GeometryMixin().multiframe_minimum_rotated_rectangle(shapes=animal_polygons) + >>> animal_polygons = GeometryMixin().geometries_to_exterior_keypoints(geometries=poygons) + >>> animal_polygons = {0: animal_polygons} + >>> geometries_to_yolo(geometries=animal_polygons, video_path=r'C:\troubleshooting\mitra\project_folder\videos\501_MA142_Gi_CNO_0514.mp4', save_dir=r"C:\troubleshooting\coco_data", sample=500, obb=True) + """ + + video_data = get_video_meta_data(video_path) + categories = list(geometries.keys()) + w, h = video_data['width'], video_data['height'] + if not os.path.isdir(save_dir): os.makedirs(save_dir) + save_img_dir = os.path.join(save_dir, 'images') + save_labels_dir = os.path.join(save_dir, 'labels') + if not os.path.isdir(save_img_dir): os.makedirs(save_img_dir) + if not os.path.isdir(save_labels_dir): os.makedirs(save_labels_dir) + results, samples = {}, None + if sample is not None: + check_int(name='sample', value=sample, min_value=1, max_value=geometries[categories[0]].shape[0]) + samples = np.random.choice(np.arange(0, geometries[categories[0]].shape[0]-1), sample) + for category_cnt, (category_id, category_data) in enumerate(geometries.items()): + for img_cnt in range(category_data.shape[0]): + if sample is not None and img_cnt not in samples: + continue + else: + if verbose: + print(f'Writing category {category_cnt}, Image: {img_cnt}.') + img_geometry = category_data[img_cnt] + img_name = f'{video_data["video_name"]}_{img_cnt}.png' + if not obb: + shape_stats = 
GeometryMixin.get_shape_statistics(shapes=Polygon(img_geometry)) + x_center = shape_stats['centers'][0][0] / w + y_center = shape_stats['centers'][0][1] / h + width = shape_stats['widths'][0] / w + height = shape_stats['lengths'][0] / h + img_results = ' '.join([str(category_id), str(x_center), str(y_center), str(width), str(height)]) + else: + img_geometry = img_geometry[1:] + x1, y1 = img_geometry[0][0] / w, img_geometry[0][1] / h + x2, y2 = img_geometry[1][0] / w, img_geometry[1][1] / h + x3, y3 = img_geometry[2][0] / w, img_geometry[2][1] / h + x4, y4 = img_geometry[3][0] / w, img_geometry[3][1] / h + img_results = ' '.join([str(category_id), str(x1), str(y1), str(x2), str(y2), str(x3), str(y3), str(x4), str(y4)]) + if img_name not in results.keys(): + img = read_frm_of_video(video_path=video_path, frame_index=img_cnt) + img_save_path = os.path.join(save_img_dir, img_name) + cv2.imwrite(img_save_path, img) + results[img_name] = [img_results] + else: + results[img_name].append(img_results) + + for k, v in results.items(): + name = k.split(sep='.', maxsplit=2)[0] + file_name = os.path.join(save_labels_dir, f'{name}.txt') + with open(file_name, mode='wt', encoding='utf-8') as f: + f.write('\n'.join(v)) + + +def _b64_to_arr(img_b64) -> np.ndarray: + """ + Helper to convert byte string (e.g., from labelme) to image in numpy format + """ + f = io.BytesIO() + f.write(base64.b64decode(img_b64)) + img_arr = np.array(Image.open(f)) + return img_arr + + + +def _arr_to_b64(x: np.ndarray) -> str: + """ + Helper to convert image to byte string + """ + _, buffer = cv2.imencode('.jpg', x) + return base64.b64encode(buffer).decode("utf-8") + + +def labelme_to_dlc(labelme_dir: Union[str, os.PathLike], + scorer: Optional[str] = 'SN', + save_dir: Optional[Union[str, os.PathLike]] = None) -> None: + """ + Convert labels from labelme format to DLC format. + + :param Union[str, os.PathLike] labelme_dir: Directory with labelme json files. 
+ :param Optional[str] scorer: Name of the scorer (anticipated by DLC as header) + :param Optional[Union[str, os.PathLike]] save_dir: Directory where to save the DLC annotations. If None, then same directory as labelme_dir with `_dlc_annotations` suffix. + :return: None + + :example: + >>> labelme_dir = r'D:\ts_annotations' + >>> labelme_to_dlc(labelme_dir=labelme_dir) + """ + + check_if_dir_exists(in_dir=labelme_dir) + annotation_paths = find_files_of_filetypes_in_directory(directory=labelme_dir, extensions=['.json'], raise_error=True) + results_dict = {} + images = {} + for annot_path in annotation_paths: + with open(annot_path) as f: + annot_data = json.load(f) + check_if_keys_exist_in_dict(data=annot_data, key=['shapes', 'imageData', 'imagePath'], name=annot_path) + img_name = os.path.basename(annot_data['imagePath']) + images[img_name] = _b64_to_arr(annot_data['imageData']) + for bp_data in annot_data['shapes']: + check_if_keys_exist_in_dict(data=bp_data, key=['label', 'points'], name=annot_path) + point_x, point_y = bp_data['points'][0][0], bp_data['points'][0][1] + lbl = bp_data['label'] + id = os.path.join('labeled-data', os.path.basename(labelme_dir), img_name) + if id not in results_dict.keys(): + results_dict[id] = {f'{lbl}': {'x': point_x, 'y': point_y}} + else: + results_dict[id].update({f'{lbl}': {'x': point_x, 'y': point_y}}) + + if save_dir is None: + save_dir = os.path.join(os.path.dirname(labelme_dir), os.path.basename(labelme_dir) + '_dlc_annotations') + if not os.path.isdir(save_dir): os.makedirs(save_dir) + + bp_names = set() + for img, bp in results_dict.items(): bp_names.update(set(bp.keys())) + col_names = list(itertools.product(*[[scorer], bp_names, ['x', 'y']])) + columns = pd.MultiIndex.from_tuples(col_names) + results = pd.DataFrame(columns=columns) + results.columns.names = ['scorer', 'bodyparts', 'coords'] + for img, bp_data in results_dict.items(): + for bp_name, bp_cords in bp_data.items(): + results.at[img, (scorer, bp_name, 'x')] = 
bp_cords['x'] + results.at[img, (scorer, bp_name, 'y')] = bp_cords['y'] + + for img_name, img in images.items(): + img_save_path = os.path.join(save_dir, img_name) + cv2.imwrite(img_save_path, img) + save_path = os.path.join(save_dir, f'CollectedData_{scorer}.csv') + results.to_csv(save_path) + + +def dlc_to_labelme(dlc_dir: Union[str, os.PathLike], + save_dir: Union[str, os.PathLike], + labelme_version: Optional[str] = '5.3.1', + flags: Optional[Dict[Any, Any]] = None, + verbose: Optional[bool] = True) -> None: + + """ + Convert a folder of DLC annotations into labelme json format. + + :param dlc_dir: Folder with DLC annotations. I.e., directory inside + :param save_dir: Directory to where to save the labelme json files. + :param labelme_version: Version number encoded in the json files. + :param flags: Flags included in the json files. + :param verbose: If True, prints progress/ + :return: None + + :example: + >>> dlc_to_labelme(dlc_dir=r"D:\TS_DLC\labeled-data\ts_annotations", save_dir=r"C:\troubleshooting\coco_data\labels\test") + """ + + timer = SimbaTimer(start=True) + check_if_dir_exists(dlc_dir, source=f'{dlc_to_labelme.__name__}') + collected_data_path = find_files_of_filetypes_in_directory(directory=dlc_dir, extensions=['.csv']) + collected_data_path = [x for x in collected_data_path if 'CollectedData' in x] + if len(collected_data_path) > 1: + raise NoFilesFoundError(msg=f'Two CSV annotation files found in {dlc_dir}', source=dlc_to_labelme.__name__) + elif len(collected_data_path) == 0: + raise NoFilesFoundError(msg=f'No CSV annotation files found in {dlc_dir} with anticipated CollectedData sub-string', source=dlc_to_labelme.__name__) + version = labelme_version + annotation_data = pd.read_csv(collected_data_path[0], header=[0, 1, 2]) + body_parts = set() + if flags is None: + flags = {} + body_part_headers = ['image'] + for i in annotation_data.columns[1:]: + if 'unnamed:' not in i[1].lower(): + body_parts.add(i[1]) + for i in body_parts: + 
body_part_headers.append(f'{i}_x'); body_part_headers.append(f'{i}_y') + annotation_data = annotation_data.iloc[:, 2:] + annotation_data.columns = body_part_headers + for cnt, (idx, idx_data) in enumerate(annotation_data.iterrows()): + if verbose: + print(f'Processing image {cnt+1}/{len(annotation_data)}...') + imgPath = idx_data['image'] + img_path = os.path.join(dlc_dir, imgPath) + img = cv2.imread(img_path) + check_file_exist_and_readable(img_path) + idx_data = idx_data.to_dict() + shapes = [] + for bp_name in body_parts: + img_shapes = {'label': bp_name, + 'points': [idx_data[f'{bp_name}_x'], idx_data[f'{bp_name}_y']], + 'group_id': None, + 'description': "", + 'shape_type': 'point', + 'flags': {}} + shapes.append(img_shapes) + out = {"version": version, + 'flags': flags, + 'shapes': shapes, + 'imagePath': imgPath, + 'imageData': _arr_to_b64(img), + 'imageHeight': img.shape[0], + 'imageWidth': img.shape[1]} + save_path = os.path.join(save_dir, get_fn_ext(filepath=imgPath)[1] + '.json') + with open(save_path, "w") as f: + json.dump(out, f) + timer.stop_timer() + if verbose: + stdout_success(f'Labelme data for {len(annotation_data)} image(s) saved in {save_dir} directory', elapsed_time=timer.elapsed_time_str) + + +def _b64_dict_to_imgs(x: Dict[str, str]): + """ + :example: + >>> df = labelme_to_df(labelme_dir=r'C:\troubleshooting\coco_data\labels\test_2') + >>> x = df.set_index('image_name')['image'].to_dict() + >>> _b64_dict_to_imgs(x) + """ + results = {} + for k, v in x.items(): + results[k] = _b64_to_arr(v) + return results + + +def normalize_img_dict(img_dict: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: + img_ndims = set() + for img in img_dict.values(): + check_if_valid_img(data=img, source=normalize_img_dict.__name__, raise_error=True) + img_ndims.add(img.ndim) + if len(img_ndims) > 1: + raise InvalidInputError(msg=f'Images in dictonary have to all be either color OR greyscale. 
Got {img_ndims} dimensions.', source=normalize_img_dict.__name__) + + results = {} + if list(img_ndims)[0] == 2: + all_pixels = np.concatenate([img.ravel() for img in img_dict.values()]) + mean = np.mean(all_pixels) + std = np.std(all_pixels) + for img_name, img in img_dict.items(): + v = (img - mean) / std + v_rescaled = np.clip((v * 64) + 128, 0, 255) + results[img_name] = v_rescaled.astype(np.uint8) + else: + r, g, b = [], [], [] + for img in img_dict.values(): + r.append(np.mean(img[:, :, 0])) + g.append(np.mean(img[:, :, 1])) + b.append(np.mean(img[:, :, 2])) + r_mean, r_std = np.mean(r), np.std(r) + g_mean, g_std = np.mean(g), np.std(g) + b_mean, b_std = np.mean(b), np.std(b) + for img_name, img in img_dict.items(): + r = (img[:, :, 0] - r_mean) / r_std + g = (img[:, :, 1] - g_mean) / g_std + b = (img[:, :, 2] - b_mean) / b_std + r = np.clip((r * 64) + 128, 0, 255) # Scale and shift + g = np.clip((g * 64) + 128, 0, 255) # Scale and shift + b = np.clip((b * 64) + 128, 0, 255) # Scale and shift + results[img_name] = np.stack([r, g, b], axis=-1).astype(np.uint8) + + return results + +def labelme_to_df(labelme_dir: Union[str, os.PathLike], + greyscale: Optional[bool] = False, + pad: Optional[bool] = False, + normalize: Optional[bool] = False) -> pd.DataFrame: + + """ + >>> labelme_to_df(labelme_dir=r'C:\troubleshooting\coco_data\labels\test_2') + """ + check_if_dir_exists(in_dir=labelme_dir) + annotation_paths = find_files_of_filetypes_in_directory(directory=labelme_dir, extensions=['.json'], raise_error=True) + images = {} + annotations = [] + for annot_path in annotation_paths: + with open(annot_path) as f: annot_data = json.load(f) + check_if_keys_exist_in_dict(data=annot_data, key=['shapes', 'imageData'], name=annot_path) + img_name = os.path.basename(annot_data['imagePath']) + images[img_name] = _b64_to_arr(annot_data['imageData']) + if greyscale: + print(greyscale) + if len(images[img_name].shape) != 2: + images[img_name] = (0.07 * images[img_name][:, :, 2] 
+ 0.72 * images[img_name][:, :, 1] + 0.21 * images[img_name][:, :, 0]).astype(np.uint8) + img_data = {} + for bp_data in annot_data['shapes']: + check_if_keys_exist_in_dict(data=bp_data, key=['label', 'points'], name=annot_path) + point_x, point_y = bp_data['points'][0], bp_data['points'][1] + lbl = bp_data['label'] + img_data[f'{lbl}_x'], img_data[f'{lbl}_y'] = point_x, point_y + img_data['image_name'] = img_name + annotations.append(pd.DataFrame.from_dict(img_data, orient='index').T) + if pad: + images = ImageMixin.pad_img_stack(image_dict=images) + if normalize: + images = normalize_img_dict(img_dict=images) + img_lst = [] + for k, v in images.items(): + img_lst.append(_arr_to_b64(v)) + out = pd.concat(annotations).reset_index(drop=True) + out['image'] = img_lst + return out + + +#df = labelme_to_df(labelme_dir=r'C:\troubleshooting\coco_data\labels\test_read', greyscale=False, pad=True, normalize=False) + + + + +#dlc_to_labelme(dlc_dir=r"D:\TS_DLC\labeled-data\ts_annotations", save_dir=r"C:\troubleshooting\coco_data\labels\test") + +# + + + + + +# x = df.set_index('image_name')['image'].to_dict() +# _b64_dict_to_imgs(x) + + + +# dlc_to_labelme(dlc_dir=r"D:\TS_DLC\labeled-data\ts_annotations", save_dir=r"C:\troubleshooting\coco_data\labels\test") + + +# +# def dlc_to_coco(): +# pass +# #TODO +# + + diff --git a/simba/ui/pop_ups/boolean_conditional_slicer_pup_up.py b/simba/ui/pop_ups/boolean_conditional_slicer_pup_up.py index cfd0469ef..7a8ef7bcd 100644 --- a/simba/ui/pop_ups/boolean_conditional_slicer_pup_up.py +++ b/simba/ui/pop_ups/boolean_conditional_slicer_pup_up.py @@ -23,7 +23,7 @@ def __init__(self, config_path: str): self.rule_cnt_dropdown.grid(row=0, column=0, sticky="NW") self.create_run_frm(run_function=self.run) - check_if_filepath_list_is_empty( filepaths=self.feature_file_paths, error_msg=f"No data found in {self.features_dir}") + check_if_filepath_list_is_empty(filepaths=self.feature_file_paths, error_msg=f"No data found in {self.features_dir}") 
data_df = read_df(file_path=self.feature_file_paths[0], file_type=self.file_type) self.bool_cols = data_df.columns[data_df.apply(self._is_bool)] if len(self.bool_cols) < 2: @@ -92,24 +92,14 @@ def run(self): selections = {} for rule_id, rule_data in self.rules.items(): unique_rule_behaviors.append(rule_data["behavior_drpdwn"].getChoices()) - selections[rule_data["behavior_drpdwn"].getChoices()] = rule_data[ - "status_drpdwn" - ].getChoices() - duplicates = list( - set( - [x for x in unique_rule_behaviors if unique_rule_behaviors.count(x) > 1] - ) - ) + selections[rule_data["behavior_drpdwn"].getChoices()] = rule_data["status_drpdwn"].getChoices() + duplicates = list(set([x for x in unique_rule_behaviors if unique_rule_behaviors.count(x) > 1])) if len(duplicates) > 0: - raise DuplicationError( - msg=f"Each row should be a unique behavior. However, behaviors {duplicates} are selected in more than 1 rows." - ) - boolean_calculator = BooleanConditionalCalculator( - config_path=self.config_path, rules=selections - ) + raise DuplicationError(msg=f"Each row should be a unique behavior. 
However, behaviors {duplicates} are selected in more than 1 rows.") + boolean_calculator = BooleanConditionalCalculator(config_path=self.config_path, rules=selections) boolean_calculator.run() boolean_calculator.save() -# roi_featurizer = BooleanConditionalSlicerPopUp(config_path='/Users/simon/Desktop/envs/troubleshooting/two_animals_16bp_032023/project_folder/project_config.ini') +# roi_featurizer = BooleanConditionalSlicerPopUp(config_path=r"C:\troubleshooting\two_black_animals_14bp\project_folder\project_config.ini") # roi_featurizer = BooleanConditionalSlicerPopUp(config_path='/Users/simon/Desktop/envs/troubleshooting/two_black_animals_14bp/project_folder/project_config.ini') diff --git a/simba/utils/checks.py b/simba/utils/checks.py index cab05995a..6b82d82f8 100644 --- a/simba/utils/checks.py +++ b/simba/utils/checks.py @@ -791,7 +791,7 @@ def check_that_dir_has_list_of_filenames( def check_valid_array(data: np.ndarray, source: Optional[str] = "", - accepted_ndims: Optional[List[Tuple[int]]] = None, + accepted_ndims: Optional[Tuple[int]] = None, accepted_sizes: Optional[List[int]] = None, accepted_axis_0_shape: Optional[List[int]] = None, accepted_axis_1_shape: Optional[List[int]] = None, diff --git a/simba/utils/data.py b/simba/utils/data.py index 654941da9..17cb33190 100644 --- a/simba/utils/data.py +++ b/simba/utils/data.py @@ -48,7 +48,7 @@ read_roi_data, write_df) -def detect_bouts(data_df: pd.DataFrame, target_lst: List[str], fps: int) -> pd.DataFrame: +def detect_bouts(data_df: pd.DataFrame, target_lst: List[str], fps: Union[int, float]) -> pd.DataFrame: """ Detect behavior "bouts" (e.g., continous sequence of classified behavior-present frames) for specified classifiers. @@ -57,7 +57,7 @@ def detect_bouts(data_df: pd.DataFrame, target_lst: List[str], fps: int) -> pd.D :param pd.DataFrame data_df: Dataframe with fields representing classifications in boolean type. :param List[str] target_lst: Classifier names. 
E.g., ['Attack', 'Sniffing', 'Grooming'] or ROIs - :param int fps: The fps of the input video. + :param Union[int, float] fps: The fps of the input video. :return: Dataframe where bouts are represented by rows and fields are represented by 'Event type ', 'Start time', 'End time', 'Start frame', 'End frame', 'Bout time' :rtype: pd.DataFrame diff --git a/simba/utils/read_write.py b/simba/utils/read_write.py index dd72f4ae3..0f29b13b2 100644 --- a/simba/utils/read_write.py +++ b/simba/utils/read_write.py @@ -617,11 +617,13 @@ def read_video_info(vid_info_df: pd.DataFrame, return None else: try: - px_per_mm = float(video_settings["pixels/mm"]) - fps = float(video_settings["fps"]) + px_per_mm = float(video_settings["pixels/mm"].iloc[0]) + #px_per_mm = float(video_settings["pixels/mm"]) + fps = float(video_settings["fps"].iloc[0]) + #fps = float(video_settings["fps"]) return video_settings, px_per_mm, fps - except TypeError: - raise ParametersFileError(msg=f"Make sure the videos that are going to be analyzed are represented with APPROPRIATE VALUES inside the project_folder/logs/video_info.csv file in your SimBA project. Could not interpret the fps, pixels per millimeter and/or fps as numerical values for video {video_name}", source=read_video_info.__name__) + except TypeError as e: + raise ParametersFileError(msg=f"Make sure the videos that are going to be analyzed are represented with APPROPRIATE VALUES inside the project_folder/logs/video_info.csv file in your SimBA project. 
Could not interpret the fps, pixels per millimeter and/or fps as numerical values for video {video_name}: {e.args}", source=read_video_info.__name__) def find_all_videos_in_directory(directory: Union[str, os.PathLike], @@ -1035,8 +1037,11 @@ def str_2_bool(input_str: str) -> bool: >>> str_2_bool(input_str='yes') >>> True """ - check_str(name='input_str', value=input_str) - return input_str.lower() in ("yes", "true", "1") + if isinstance(input_str, bool): + return input_str + else: + check_str(name='input_str', value=input_str) + return input_str.lower() in ("yes", "true", "1") def tabulate_clf_info(clf_path: Union[str, os.PathLike]) -> None: @@ -1236,7 +1241,8 @@ def get_memory_usage_of_df(df: pd.DataFrame) -> Dict[str, float]: >>> df = pd.DataFrame(np.random.randint(0,100,size=(100, 4)), columns=list('ABCD')) >>> {'bytes': 3328, 'megabytes': 0.003328, 'gigabytes': 3e-06} """ - + if not isinstance(df, pd.DataFrame): + raise InvalidInputError(msg='df has to be a pandas dataframe', source=get_memory_usage_of_df.__name__) results = {} results["bytes"] = df.memory_usage(index=True).sum() results["megabytes"] = round(results["bytes"] / 1000000, 6) @@ -2515,3 +2521,19 @@ def labelme_to_dlc(labelme_dir: Union[str, os.PathLike], cv2.imwrite(img_save_path, img) save_path = os.path.join(save_dir, f'CollectedData_{scorer}.csv') results.to_csv(save_path) + +def get_memory_usage_array(x: np.ndarray) -> Dict[str, float]: + """ + Calculates the memory usage of a NumPy array in bytes, megabytes, and gigabytes. + + :param x: A NumPy array for which memory usage will be calculated. It should be a valid NumPy array with a defined size and dtype. + :return: A dictionary with memory usage information, containing the following keys: - "bytes": Memory usage in bytes. - "megabytes": Memory usage in megabytes. - "gigabytes": Memory usage in gigabytes. 
def get_memory_usage_array(x: np.ndarray) -> Dict[str, float]:
    """
    Calculate the memory usage of a NumPy array in bytes, megabytes, and gigabytes.

    :param np.ndarray x: A NumPy array for which memory usage will be calculated. It should be a valid NumPy array with a defined size and dtype.
    :return: A dictionary with memory usage information, containing the following keys: "bytes" (exact footprint), "megabytes" and "gigabytes" (decimal units, rounded to 6 decimals).
    :rtype: Dict[str, float]

    :example:
    >>> get_memory_usage_array(x=np.zeros((1000, 1000), dtype=np.float64))
    >>> {'bytes': 8000000, 'megabytes': 8.0, 'gigabytes': 0.008}
    """

    check_valid_array(data=x, source=get_memory_usage_array.__name__)
    results = {}
    # Exact footprint: element count * bytes per element (equivalent to ``x.nbytes``).
    # NOTE(review): the previous version truncated to whole mebibytes first and then
    # back-derived bytes as ``mb * 1000`` — any array under ~1 MiB reported 0 bytes,
    # and binary (1024**2) and decimal (1000) units were mixed.
    results["bytes"] = int(x.size) * int(x.itemsize)
    # Decimal units rounded the same way as get_memory_usage_of_df() for consistency.
    results["megabytes"] = round(results["bytes"] / 1_000_000, 6)
    results["gigabytes"] = round(results["bytes"] / 1_000_000_000, 6)
    return results