From 4f12defb7eb6f412fcd35b66236fd5883e09fc3c Mon Sep 17 00:00:00 2001 From: Lisa Hopcroft <54442530+LisaHopcroft@users.noreply.github.com> Date: Wed, 9 Mar 2022 15:17:43 +0000 Subject: [PATCH 01/11] feat: new notebook to calculate summary statistics for Z scores (and ratios) across the different entities --- notebooks/calculate_summary_statistics.ipynb | 624 ++++++++++++++++++ .../calculate_summary_statistics.py | 117 ++++ 2 files changed, 741 insertions(+) create mode 100644 notebooks/calculate_summary_statistics.ipynb create mode 100644 notebooks/diffable_python/calculate_summary_statistics.py diff --git a/notebooks/calculate_summary_statistics.ipynb b/notebooks/calculate_summary_statistics.ipynb new file mode 100644 index 000000000..d13ef013a --- /dev/null +++ b/notebooks/calculate_summary_statistics.ipynb @@ -0,0 +1,624 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from lib.outliers import Runner\n", + "from datetime import date\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from_date = date(year=2021,month=4,day=1)\n", + "to_date = date(year=2021,month=8,day=1)\n", + "r = Runner(from_date,to_date,5,[\"practice\",\"ccg\",\"pcn\",\"stp\"],False)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=725825577420-unm2gnkiprugilg743tkbig250f4sfsj.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fbigquery&state=7VcN0yeYTzq2E9Z3jdPuuj7sEjFtTb&prompt=consent&access_type=offline\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Enter the authorization code: 
4/1AX4XfWiHK9sZGQcEnE9I3G-pDKHnzzLRbZ2FYsyEvv0x8Omm_TOcfbh3Z3A\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading: 100%|██████████| 1/1 [00:00<00:00, 6.93rows/s]\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e6192a9e28d549babea7ec4d09af9479", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, description='SUBMITTING | ', max=6499.0, style=ProgressStyle(descripti…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "77ffb0a12f3547acb5dd1ad59ea70c89", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, description='PROCESSING | ', max=6499.0, style=ProgressStyle(descripti…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8bc9cd88d202472ea47dca9b54750c3a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, description='COLLECTING | ', max=6499.0, style=ProgressStyle(descripti…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "ename": "TypeError", + "evalue": "add_item() argument after ** must be a mapping, not BrokenProcessPool", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m 
\u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/home/app/notebook/lib/outliers.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 788\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mf\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_run_entity_report\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 789\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_results\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 790\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_item\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 791\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 792\u001b[0m \u001b[0;31m# write out toc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mTypeError\u001b[0m: add_item() argument after ** must be a mapping, not BrokenProcessPool" + ] + } + ], + "source": [ + "r.run()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "### Extracting all the stored z scores etc across organisations\n", + "### so that summary statistics can be calculated\n", + "\n", + "e_data = pd.concat(\n", + " (d.assign(entity=e) for e, d in r.build.results.items())\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Entity counts\n", + "\n", + 
"Counts of each kind of entity (i.e., organisation)." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
n
practice6499
pcn1257
ccg106
stp42
\n", + "
" + ], + "text/plain": [ + " n\n", + "practice 6499\n", + "pcn 1257\n", + "ccg 106\n", + "stp 42" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "### Summarising the number of each kind of entity (organisation)\n", + "\n", + "e_counts = ( e_data.reset_index()[[\"practice\",\"entity\"]]\n", + " .drop_duplicates()['entity']\n", + " .value_counts()\n", + " .to_frame()\n", + " .rename( columns={'entity':'n'} ) )\n", + "\n", + "e_counts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Chemical counts\n", + "\n", + "Counts of the number of chemicals for which we have data (Z scores etc)\n", + "within each type of organisation." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chemicals
pcn1294
practice1274
ccg706
stp364
\n", + "
" + ], + "text/plain": [ + " chemicals\n", + "pcn 1294\n", + "practice 1274\n", + "ccg 706\n", + "stp 364" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "### Summarising the number of unique chemicals analysed within\n", + "### each type of organisation\n", + "\n", + "c_counts = ( e_data.reset_index()[[\"chemical\",\"entity\"]]\n", + " .drop_duplicates()['entity']\n", + " .value_counts()\n", + " .to_frame()\n", + " .rename( columns={'entity':'chemicals'} ) )\n", + "\n", + "c_counts" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "### Combining the entity and chemical counts\n", + "\n", + "all_counts = e_counts.join( c_counts )" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "### Calculating summary statistics for the ratio and the Z score\n", + "### within each entity type\n", + "\n", + "all_summary = e_data.groupby( \"entity\" )[[\"ratio\",\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n", + "all_summary = all_summary.rename( columns={\"50%\":\"median\"}, inplace=False )\n", + "\n", + "### Defining which metrics will be displayed below\n", + "metrics_to_show = [ \"n\", \"chemicals\", \"median\",\"max\",\"min\",\"IQR\" ]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary statistics for the z score in each organisation type" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nchemicalsmedianmaxminIQR
entity
stp423643.826.33-6.338.90
ccg1067063.5710.20-10.2010.14
pcn125712942.58543.19-141.339.86
practice649912740.005512.02-711.879.72
\n", + "
" + ], + "text/plain": [ + " n chemicals median max min IQR\n", + "entity \n", + "stp 42 364 3.82 6.33 -6.33 8.90\n", + "ccg 106 706 3.57 10.20 -10.20 10.14\n", + "pcn 1257 1294 2.58 543.19 -141.33 9.86\n", + "practice 6499 1274 0.00 5512.02 -711.87 9.72" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "### Extracting the summary statistics for the z scores\n", + "z_tmp = all_summary[all_summary.index.isin([\"z_score\"], level=1)]\n", + "\n", + "### Calculating IQR, removing the row index and rounding to 2dp\n", + "z_summary = ( z_tmp\n", + " .assign( IQR = z_tmp[\"75%\"]-z_tmp[\"25%\"] )\n", + " .droplevel(level=1)\n", + " .round(2) )\n", + "\n", + "z_summary.join( all_counts )[metrics_to_show]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary statistics for the ratio in each organisation type" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "lines_to_next_cell": 0 + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nchemicalsmedianmaxminIQR
entity
stp423640.091.00.00.64
ccg1067060.121.00.00.61
pcn125712940.131.00.00.49
practice649912740.141.00.00.44
\n", + "
" + ], + "text/plain": [ + " n chemicals median max min IQR\n", + "entity \n", + "stp 42 364 0.09 1.0 0.0 0.64\n", + "ccg 106 706 0.12 1.0 0.0 0.61\n", + "pcn 1257 1294 0.13 1.0 0.0 0.49\n", + "practice 6499 1274 0.14 1.0 0.0 0.44" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "### Extracting the summary statistics for the z scores\n", + "ratio_tmp = all_summary[all_summary.index.isin([\"ratio\"], level=1)]\n", + "\n", + "### Calculating IQR, removing the row index and rounding to 2dp\n", + "ratio_summary = ( ratio_tmp\n", + " .assign( IQR = ratio_tmp[\"75%\"]-ratio_tmp[\"25%\"] )\n", + " .droplevel(level=1)\n", + " .round(2) )\n", + "\n", + "ratio_summary.join( all_counts )[metrics_to_show]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "all", + "encoding": "# -*- coding: utf-8 -*-", + "notebook_metadata_filter": "all,-language_info", + "text_representation": { + "extension": ".py", + "format_name": "light", + "format_version": "1.5", + "jupytext_version": "1.3.4" + } + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.1" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/diffable_python/calculate_summary_statistics.py b/notebooks/diffable_python/calculate_summary_statistics.py new file mode 100644 index 000000000..71287c4ea --- /dev/null +++ b/notebooks/diffable_python/calculate_summary_statistics.py @@ -0,0 +1,117 @@ +# -*- coding: utf-8 -*- +# --- +# jupyter: +# jupytext: +# cell_metadata_filter: all +# notebook_metadata_filter: all,-language_info +# 
text_representation: +# extension: .py +# format_name: light +# format_version: '1.5' +# jupytext_version: 1.3.4 +# kernelspec: +# display_name: Python 3 +# language: python +# name: python3 +# --- + +from lib.outliers import Runner +from datetime import date +import pandas as pd + +from_date = date(year=2021,month=4,day=1) +to_date = date(year=2021,month=8,day=1) +r = Runner(from_date,to_date,5,["practice","ccg","pcn","stp"],False) + +r.run() + +# + +### Extracting all the stored z scores etc across organisations +### so that summary statistics can be calculated + +e_data = pd.concat( + (d.assign(entity=e) for e, d in r.build.results.items()) +) +# - + +# ## Entity counts +# +# Counts of each kind of entity (i.e., organisation). + +# + +### Summarising the number of each kind of entity (organisation) + +e_counts = ( e_data.reset_index()[["practice","entity"]] + .drop_duplicates()['entity'] + .value_counts() + .to_frame() + .rename( columns={'entity':'n'} ) ) + +e_counts +# - + +# ## Chemical counts +# +# Counts of the number of chemicals for which we have data (Z scores etc) +# within each type of organisation. 
+ +# + +### Summarising the number of unique chemicals analysed within +### each type of organisation + +c_counts = ( e_data.reset_index()[["chemical","entity"]] + .drop_duplicates()['entity'] + .value_counts() + .to_frame() + .rename( columns={'entity':'chemicals'} ) ) + +c_counts + +# + +### Combining the entity and chemical counts + +all_counts = e_counts.join( c_counts ) + + +# + +### Calculating summary statistics for the ratio and the Z score +### within each entity type + +all_summary = e_data.groupby( "entity" )[["ratio","z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0) +all_summary = all_summary.rename( columns={"50%":"median"}, inplace=False ) + +### Defining which metrics will be displayed below +metrics_to_show = [ "n", "chemicals", "median","max","min","IQR" ] +# - + + +# ## Summary statistics for the z score in each organisation type + +# + +### Extracting the summary statistics for the z scores +z_tmp = all_summary[all_summary.index.isin(["z_score"], level=1)] + +### Calculating IQR, removing the row index and rounding to 2dp +z_summary = ( z_tmp + .assign( IQR = z_tmp["75%"]-z_tmp["25%"] ) + .droplevel(level=1) + .round(2) ) + +z_summary.join( all_counts )[metrics_to_show] +# - + +# ## Summary statistics for the ratio in each organisation type + +# + +### Extracting the summary statistics for the z scores +ratio_tmp = all_summary[all_summary.index.isin(["ratio"], level=1)] + +### Calculating IQR, removing the row index and rounding to 2dp +ratio_summary = ( ratio_tmp + .assign( IQR = ratio_tmp["75%"]-ratio_tmp["25%"] ) + .droplevel(level=1) + .round(2) ) + +ratio_summary.join( all_counts )[metrics_to_show] +# - + From d0d319d1a47410c438f6b663bb6f5aeb40f0d61a Mon Sep 17 00:00:00 2001 From: Lisa Hopcroft <54442530+LisaHopcroft@users.noreply.github.com> Date: Wed, 9 Mar 2022 15:26:52 +0000 Subject: [PATCH 02/11] feat: removing BigQuery key --- notebooks/calculate_summary_statistics.ipynb | 101 +------------------ 1 file 
changed, 2 insertions(+), 99 deletions(-) diff --git a/notebooks/calculate_summary_statistics.ipynb b/notebooks/calculate_summary_statistics.ipynb index d13ef013a..c1a306529 100644 --- a/notebooks/calculate_summary_statistics.ipynb +++ b/notebooks/calculate_summary_statistics.ipynb @@ -24,106 +24,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=725825577420-unm2gnkiprugilg743tkbig250f4sfsj.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fbigquery&state=7VcN0yeYTzq2E9Z3jdPuuj7sEjFtTb&prompt=consent&access_type=offline\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "Enter the authorization code: 4/1AX4XfWiHK9sZGQcEnE9I3G-pDKHnzzLRbZ2FYsyEvv0x8Omm_TOcfbh3Z3A\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading: 100%|██████████| 1/1 [00:00<00:00, 6.93rows/s]\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e6192a9e28d549babea7ec4d09af9479", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, description='SUBMITTING | ', max=6499.0, style=ProgressStyle(descripti…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "77ffb0a12f3547acb5dd1ad59ea70c89", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, description='PROCESSING | ', max=6499.0, style=ProgressStyle(descripti…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - 
"output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "8bc9cd88d202472ea47dca9b54750c3a", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, description='COLLECTING | ', max=6499.0, style=ProgressStyle(descripti…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "ename": "TypeError", - "evalue": "add_item() argument after ** must be a mapping, not BrokenProcessPool", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m/home/app/notebook/lib/outliers.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 788\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mf\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_run_entity_report\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 789\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_results\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 790\u001b[0;31m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_item\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 791\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 792\u001b[0m \u001b[0;31m# write out toc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mTypeError\u001b[0m: add_item() argument after ** must be a mapping, not BrokenProcessPool" - ] - } - ], + "outputs": [], "source": [ "r.run()" ] From 1ba0a43c5055790ea00dc11ce155711f6973d242 Mon Sep 17 00:00:00 2001 From: Lisa Hopcroft <54442530+LisaHopcroft@users.noreply.github.com> Date: Wed, 9 Mar 2022 16:36:57 +0000 Subject: [PATCH 03/11] feat: fix to only extract results, rather than generate new results --- notebooks/calculate_summary_statistics.ipynb | 3 ++- notebooks/diffable_python/calculate_summary_statistics.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/notebooks/calculate_summary_statistics.ipynb b/notebooks/calculate_summary_statistics.ipynb index c1a306529..4af94531b 100644 --- a/notebooks/calculate_summary_statistics.ipynb +++ b/notebooks/calculate_summary_statistics.ipynb @@ -28,7 +28,8 @@ "metadata": {}, "outputs": [], "source": [ - "r.run()" + "r.build.run()\n", + "r.build.fetch_results()" ] }, { diff --git a/notebooks/diffable_python/calculate_summary_statistics.py b/notebooks/diffable_python/calculate_summary_statistics.py index 71287c4ea..b455eb46e 100644 --- a/notebooks/diffable_python/calculate_summary_statistics.py +++ b/notebooks/diffable_python/calculate_summary_statistics.py @@ -23,7 +23,8 @@ to_date = date(year=2021,month=8,day=1) r = Runner(from_date,to_date,5,["practice","ccg","pcn","stp"],False) -r.run() +r.build.run() +r.build.fetch_results() # + ### Extracting all the stored z scores etc across organisations From 
a756eef77a5d24307e9de5df19b7d56cfc34a9ff Mon Sep 17 00:00:00 2001 From: Lisa Hopcroft <54442530+LisaHopcroft@users.noreply.github.com> Date: Thu, 10 Mar 2022 09:37:44 +0000 Subject: [PATCH 04/11] feat: separating results for higher-than-most and lower-than-most --- notebooks/calculate_summary_statistics.ipynb | 355 ++++++++++++++---- .../calculate_summary_statistics.py | 69 ++-- 2 files changed, 327 insertions(+), 97 deletions(-) diff --git a/notebooks/calculate_summary_statistics.ipynb b/notebooks/calculate_summary_statistics.ipynb index 4af94531b..5a5d93d8a 100644 --- a/notebooks/calculate_summary_statistics.ipynb +++ b/notebooks/calculate_summary_statistics.ipynb @@ -24,9 +24,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=725825577420-unm2gnkiprugilg743tkbig250f4sfsj.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fbigquery&state=B0aBfO2cFTgPTpWIrbEXhCszrTmcNv&prompt=consent&access_type=offline\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Enter the authorization code: 4/1AX4XfWjWfGmWhSn3IUgFJA9Y1gOE418Hgdc8PD98NKa2Y2AW1-2axRotGlg\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading: 100%|██████████| 1/1 [00:00<00:00, 4.85rows/s]\n" + ] + } + ], "source": [ "r.build.run()\n", "r.build.fetch_results()" @@ -34,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -57,7 +79,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -113,7 +135,7 @@ "stp 42" ] }, - "execution_count": 5, + "execution_count": 15, "metadata": {}, 
"output_type": "execute_result" } @@ -136,13 +158,13 @@ "source": [ "## Chemical counts\n", "\n", - "Counts of the number of chemicals for which we have data (Z scores etc)\n", - "within each type of organisation." + "Counts of the number of unique outlying chemicals (i.e., those identified in the top/bottom\n", + "5 z scores) amongst all organisations of the given type." ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -198,14 +220,14 @@ "stp 364" ] }, - "execution_count": 6, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "### Summarising the number of unique chemicals analysed within\n", - "### each type of organisation\n", + "### Summarising the number of unique chemicals identified in the\n", + "### top/bottom five outliers amongst all organisations of the given type\n", "\n", "c_counts = ( e_data.reset_index()[[\"chemical\",\"entity\"]]\n", " .drop_duplicates()['entity']\n", @@ -218,7 +240,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 17, "metadata": { "lines_to_next_cell": 2 }, @@ -231,32 +253,60 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 18, "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ - "### Calculating summary statistics for the ratio and the Z score\n", - "### within each entity type\n", + "### Defining which metrics will be displayed in the summary tables\n", + "metrics_to_show = [ \"n\", \"chemicals\", \"median\",\"max\",\"min\",\"IQR\" ]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "### Calculating summary statistics for the Z scores for those chemicals\n", + "### identified in the TOP 5 in at least one organisation of the entity type.\n", + "### There are the chemicals displayed in the 'Higher than most' table.\n", "\n", - "all_summary = e_data.groupby( \"entity\" 
)[[\"ratio\",\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n", - "all_summary = all_summary.rename( columns={\"50%\":\"median\"}, inplace=False )\n", + "overused_summary = e_data.query('z_score>0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n", + "overused_summary = overused_summary.rename( columns={\"50%\":\"median\"}, inplace=False )" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "### Calculating summary statistics for the Z scores for those chemicals\n", + "### identified in the BOTTOM 5 in at least one organisation of the entity type.\n", + "### There are the chemicals displayed in the 'Lower than most' table.\n", "\n", - "### Defining which metrics will be displayed below\n", - "metrics_to_show = [ \"n\", \"chemicals\", \"median\",\"max\",\"min\",\"IQR\" ]" + "underused_summary = e_data.query('z_score<0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n", + "underused_summary = underused_summary.rename( columns={\"50%\":\"median\"}, inplace=False )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Summary statistics for the z score in each organisation type" + "## Summary statistics for outlying Z scores in each organisation type\n", + "\n", + "### Higher than most chemicals\n", + "\n", + "The table below summarises the Z scores for the high outlying (i.e., top 5) chemicals\n", + "in each type of organisation. These are chemicals are seen to be used more often\n", + "in a particular organisation than its peers." 
] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -302,79 +352,83 @@ " stp\n", " 42\n", " 364\n", - " 3.82\n", + " 6.14\n", " 6.33\n", - " -6.33\n", - " 8.90\n", + " 3.68\n", + " 0.78\n", " \n", " \n", " ccg\n", " 106\n", " 706\n", - " 3.57\n", + " 7.36\n", " 10.20\n", - " -10.20\n", - " 10.14\n", + " 3.56\n", + " 3.29\n", " \n", " \n", " pcn\n", " 1257\n", " 1294\n", - " 2.58\n", + " 7.26\n", " 543.19\n", - " -141.33\n", - " 9.86\n", + " 2.58\n", + " 5.11\n", " \n", " \n", " practice\n", " 6499\n", " 1274\n", - " 0.00\n", + " 7.28\n", " 5512.02\n", - " -711.87\n", - " 9.72\n", + " 1.13\n", + " 5.96\n", " \n", " \n", "\n", "" ], "text/plain": [ - " n chemicals median max min IQR\n", - "entity \n", - "stp 42 364 3.82 6.33 -6.33 8.90\n", - "ccg 106 706 3.57 10.20 -10.20 10.14\n", - "pcn 1257 1294 2.58 543.19 -141.33 9.86\n", - "practice 6499 1274 0.00 5512.02 -711.87 9.72" + " n chemicals median max min IQR\n", + "entity \n", + "stp 42 364 6.14 6.33 3.68 0.78\n", + "ccg 106 706 7.36 10.20 3.56 3.29\n", + "pcn 1257 1294 7.26 543.19 2.58 5.11\n", + "practice 6499 1274 7.28 5512.02 1.13 5.96" ] }, - "execution_count": 9, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "### Extracting the summary statistics for the z scores\n", - "z_tmp = all_summary[all_summary.index.isin([\"z_score\"], level=1)]\n", + "overused_tmp = overused_summary[overused_summary.index.isin([\"z_score\"], level=1)]\n", "\n", "### Calculating IQR, removing the row index and rounding to 2dp\n", - "z_summary = ( z_tmp\n", - " .assign( IQR = z_tmp[\"75%\"]-z_tmp[\"25%\"] )\n", + "overused_toprint = ( overused_tmp\n", + " .assign( IQR = overused_tmp[\"75%\"]-overused_tmp[\"25%\"] )\n", " .droplevel(level=1)\n", " .round(2) )\n", "\n", - "z_summary.join( all_counts )[metrics_to_show]" + "overused_toprint.join( all_counts )[metrics_to_show]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - 
"## Summary statistics for the ratio in each organisation type" + "### Lower than most chemicals\n", + "\n", + "The table below summarises the Z scores for the low outlying (i.e., bottom 5) chemicals\n", + "in each type of organisation. These are chemicals are seen to be used less often\n", + "in a particular organisation than its peers." ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 22, "metadata": { "lines_to_next_cell": 0 }, @@ -422,75 +476,224 @@ " stp\n", " 42\n", " 364\n", - " 0.09\n", - " 1.0\n", - " 0.0\n", - " 0.64\n", + " -2.77\n", + " -1.78\n", + " -6.33\n", + " 1.15\n", " \n", " \n", " ccg\n", " 106\n", " 706\n", - " 0.12\n", - " 1.0\n", - " 0.0\n", - " 0.61\n", + " -2.78\n", + " -1.47\n", + " -10.20\n", + " 1.16\n", " \n", " \n", " pcn\n", " 1257\n", " 1294\n", - " 0.13\n", - " 1.0\n", - " 0.0\n", - " 0.49\n", + " -2.61\n", + " -1.49\n", + " -141.33\n", + " 1.03\n", " \n", " \n", " practice\n", " 6499\n", " 1274\n", - " 0.14\n", - " 1.0\n", - " 0.0\n", - " 0.44\n", + " -2.44\n", + " -1.13\n", + " -711.87\n", + " 1.07\n", " \n", " \n", "\n", "" ], "text/plain": [ - " n chemicals median max min IQR\n", - "entity \n", - "stp 42 364 0.09 1.0 0.0 0.64\n", - "ccg 106 706 0.12 1.0 0.0 0.61\n", - "pcn 1257 1294 0.13 1.0 0.0 0.49\n", - "practice 6499 1274 0.14 1.0 0.0 0.44" + " n chemicals median max min IQR\n", + "entity \n", + "stp 42 364 -2.77 -1.78 -6.33 1.15\n", + "ccg 106 706 -2.78 -1.47 -10.20 1.16\n", + "pcn 1257 1294 -2.61 -1.49 -141.33 1.03\n", + "practice 6499 1274 -2.44 -1.13 -711.87 1.07" ] }, - "execution_count": 10, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "### Extracting the summary statistics for the z scores\n", - "ratio_tmp = all_summary[all_summary.index.isin([\"ratio\"], level=1)]\n", + "underused_tmp = underused_summary[underused_summary.index.isin([\"z_score\"], level=1)]\n", "\n", "### Calculating IQR, removing the row index and rounding to 2dp\n", - 
"ratio_summary = ( ratio_tmp\n", - " .assign( IQR = ratio_tmp[\"75%\"]-ratio_tmp[\"25%\"] )\n", + "underused_toprint = ( underused_tmp\n", + " .assign( IQR = underused_tmp[\"75%\"]-underused_tmp[\"25%\"] )\n", " .droplevel(level=1)\n", " .round(2) )\n", "\n", - "ratio_summary.join( all_counts )[metrics_to_show]" + "underused_toprint.join( all_counts )[metrics_to_show]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Summary\n", + "\n", + "Below is a summary table that combines the 'Higher than most' and 'Lower than most'\n", + "results displayed above." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Higher than mostLower than most
nchemicalsmedianmaxminIQRmedianmaxminIQR
entity
stp423646.146.333.680.78-2.77-1.78-6.331.15
ccg1067067.3610.203.563.29-2.78-1.47-10.201.16
pcn125712947.26543.192.585.11-2.61-1.49-141.331.03
practice649912747.285512.021.135.96-2.44-1.13-711.871.07
\n", + "
" + ], + "text/plain": [ + " Higher than most \\\n", + " n chemicals median max min IQR \n", + "entity \n", + "stp 42 364 6.14 6.33 3.68 0.78 \n", + "ccg 106 706 7.36 10.20 3.56 3.29 \n", + "pcn 1257 1294 7.26 543.19 2.58 5.11 \n", + "practice 6499 1274 7.28 5512.02 1.13 5.96 \n", + "\n", + " Lower than most \n", + " median max min IQR \n", + "entity \n", + "stp -2.77 -1.78 -6.33 1.15 \n", + "ccg -2.78 -1.47 -10.20 1.16 \n", + "pcn -2.61 -1.49 -141.33 1.03 \n", + "practice -2.44 -1.13 -711.87 1.07 " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([overused_toprint.join( all_counts )[metrics_to_show],\n", + " underused_toprint[metrics_to_show[2:]]],\n", + " keys=[\"Higher than most\", \"Lower than most\"],axis=1)" + ] } ], "metadata": { diff --git a/notebooks/diffable_python/calculate_summary_statistics.py b/notebooks/diffable_python/calculate_summary_statistics.py index b455eb46e..652720e3d 100644 --- a/notebooks/diffable_python/calculate_summary_statistics.py +++ b/notebooks/diffable_python/calculate_summary_statistics.py @@ -53,12 +53,12 @@ # ## Chemical counts # -# Counts of the number of chemicals for which we have data (Z scores etc) -# within each type of organisation. +# Counts of the number of unique outlying chemicals (i.e., those identified in the top/bottom +# 5 z scores) amongst all organisations of the given type. 
# + -### Summarising the number of unique chemicals analysed within -### each type of organisation +### Summarising the number of unique chemicals identified in the +### top/bottom five outliers amongst all organisations of the given type c_counts = ( e_data.reset_index()[["chemical","entity"]] .drop_duplicates()['entity'] @@ -75,44 +75,71 @@ # + -### Calculating summary statistics for the ratio and the Z score -### within each entity type +### Defining which metrics will be displayed in the summary tables +metrics_to_show = [ "n", "chemicals", "median","max","min","IQR" ] -all_summary = e_data.groupby( "entity" )[["ratio","z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0) -all_summary = all_summary.rename( columns={"50%":"median"}, inplace=False ) -### Defining which metrics will be displayed below -metrics_to_show = [ "n", "chemicals", "median","max","min","IQR" ] -# - +# + +### Calculating summary statistics for the Z scores for those chemicals +### identified in the TOP 5 in at least one organisation of the entity type. +### There are the chemicals displayed in the 'Higher than most' table. + +overused_summary = e_data.query('z_score>0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0) +overused_summary = overused_summary.rename( columns={"50%":"median"}, inplace=False ) + +# + +### Calculating summary statistics for the Z scores for those chemicals +### identified in the BOTTOM 5 in at least one organisation of the entity type. +### There are the chemicals displayed in the 'Lower than most' table. 
+underused_summary = e_data.query('z_score<0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0) +underused_summary = underused_summary.rename( columns={"50%":"median"}, inplace=False ) +# - -# ## Summary statistics for the z score in each organisation type +# ## Summary statistics for outlying Z scores in each organisation type +# +# ### Higher than most chemicals +# +# The table below summarises the Z scores for the high outlying (i.e., top 5) chemicals +# in each type of organisation. These are chemicals are seen to be used more often +# in a particular organisation than its peers. # + ### Extracting the summary statistics for the z scores -z_tmp = all_summary[all_summary.index.isin(["z_score"], level=1)] +overused_tmp = overused_summary[overused_summary.index.isin(["z_score"], level=1)] ### Calculating IQR, removing the row index and rounding to 2dp -z_summary = ( z_tmp - .assign( IQR = z_tmp["75%"]-z_tmp["25%"] ) +overused_toprint = ( overused_tmp + .assign( IQR = overused_tmp["75%"]-overused_tmp["25%"] ) .droplevel(level=1) .round(2) ) -z_summary.join( all_counts )[metrics_to_show] +overused_toprint.join( all_counts )[metrics_to_show] # - -# ## Summary statistics for the ratio in each organisation type +# ### Lower than most chemicals +# +# The table below summarises the Z scores for the low outlying (i.e., bottom 5) chemicals +# in each type of organisation. These are chemicals are seen to be used less often +# in a particular organisation than its peers. 
# + ### Extracting the summary statistics for the z scores -ratio_tmp = all_summary[all_summary.index.isin(["ratio"], level=1)] +underused_tmp = underused_summary[underused_summary.index.isin(["z_score"], level=1)] ### Calculating IQR, removing the row index and rounding to 2dp -ratio_summary = ( ratio_tmp - .assign( IQR = ratio_tmp["75%"]-ratio_tmp["25%"] ) +underused_toprint = ( underused_tmp + .assign( IQR = underused_tmp["75%"]-underused_tmp["25%"] ) .droplevel(level=1) .round(2) ) -ratio_summary.join( all_counts )[metrics_to_show] +underused_toprint.join( all_counts )[metrics_to_show] # - +# ### Summary +# +# Below is a summary table that combines the 'Higher than most' and 'Lower than most' +# results displayed above. +pd.concat([overused_toprint.join( all_counts )[metrics_to_show], + underused_toprint[metrics_to_show[2:]]], + keys=["Higher than most", "Lower than most"],axis=1) From 6fd6f91c6fc398ee1e73d9c861c325a7487320da Mon Sep 17 00:00:00 2001 From: Lisa Hopcroft <54442530+LisaHopcroft@users.noreply.github.com> Date: Wed, 13 Apr 2022 13:57:56 +0100 Subject: [PATCH 05/11] feat: update to notebook to include the top/bottom ten results --- ...alculate_summary_statistics_outlying.ipynb | 736 ++++++++++++++++++ .../calculate_summary_statistics_outlying.py | 147 ++++ 2 files changed, 883 insertions(+) create mode 100644 notebooks/calculate_summary_statistics_outlying.ipynb create mode 100644 notebooks/diffable_python/calculate_summary_statistics_outlying.py diff --git a/notebooks/calculate_summary_statistics_outlying.ipynb b/notebooks/calculate_summary_statistics_outlying.ipynb new file mode 100644 index 000000000..24f2beb90 --- /dev/null +++ b/notebooks/calculate_summary_statistics_outlying.ipynb @@ -0,0 +1,736 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from lib.outliers import Runner\n", + "from datetime import date\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + 
"execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from_date = date(year=2021,month=6,day=1)\n", + "to_date = date(year=2021,month=12,day=1)\n", + "r = Runner(from_date,to_date,10,[\"practice\",\"ccg\",\"pcn\",\"stp\"],False)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading: 100%|██████████| 1/1 [00:00<00:00, 6.53rows/s]\n", + "Downloading: 100%|██████████| 129532/129532 [00:11<00:00, 11161.65rows/s]\n", + "Downloading: 100%|██████████| 302743/302743 [00:11<00:00, 26079.48rows/s]\n", + "Downloading: 100%|██████████| 1346/1346 [00:40<00:00, 33.64rows/s]\n", + "Downloading: 100%|██████████| 2121/2121 [00:00<00:00, 4642.75rows/s]\n", + "Downloading: 100%|██████████| 10905/10905 [00:00<00:00, 14726.74rows/s]\n", + "Downloading: 100%|██████████| 1138/1138 [00:00<00:00, 1338.28rows/s]\n", + "Downloading: 100%|██████████| 25140/25140 [00:02<00:00, 10127.85rows/s]\n", + "Downloading: 100%|██████████| 89049/89049 [00:05<00:00, 17194.56rows/s]\n", + "Downloading: 100%|██████████| 1416/1416 [00:08<00:00, 171.31rows/s]\n", + "Downloading: 100%|██████████| 842/842 [00:00<00:00, 3119.24rows/s]\n", + "Downloading: 100%|██████████| 3992/3992 [00:00<00:00, 11083.61rows/s]\n", + "Downloading: 100%|██████████| 680/680 [00:00<00:00, 1650.28rows/s]\n" + ] + } + ], + "source": [ + "r.build.run()\n", + "r.build.fetch_results()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "### Extracting all the stored z scores etc across organisations\n", + "### so that summary statistics can be calculated\n", + "\n", + "e_data = pd.concat(\n", + " (d.assign(entity=e) for e, d in r.build.results.items())\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Entity counts\n", + "\n", + "Counts of each kind of entity (i.e., organisation)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
n
practice6476
pcn1257
ccg106
stp42
\n", + "
" + ], + "text/plain": [ + " n\n", + "practice 6476\n", + "pcn 1257\n", + "ccg 106\n", + "stp 42" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "### Summarising the number of each kind of entity (organisation)\n", + "\n", + "e_counts = ( e_data.reset_index()[[\"practice\",\"entity\"]]\n", + " .drop_duplicates()['entity']\n", + " .value_counts()\n", + " .to_frame()\n", + " .rename( columns={'entity':'n'} ) )\n", + "\n", + "e_counts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Chemical counts\n", + "\n", + "Counts of the number of unique outlying chemicals (i.e., those identified in the top/bottom\n", + "5 z scores) amongst all organisations of the given type." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chemicals
pcn1416
practice1346
ccg1138
stp680
\n", + "
" + ], + "text/plain": [ + " chemicals\n", + "pcn 1416\n", + "practice 1346\n", + "ccg 1138\n", + "stp 680" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "### Summarising the number of unique chemicals identified in the\n", + "### top/bottom five outliers amongst all organisations of the given type\n", + "\n", + "c_counts = ( e_data.reset_index()[[\"chemical\",\"entity\"]]\n", + " .drop_duplicates()['entity']\n", + " .value_counts()\n", + " .to_frame()\n", + " .rename( columns={'entity':'chemicals'} ) )\n", + "\n", + "c_counts" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "### Combining the entity and chemical counts\n", + "\n", + "all_counts = e_counts.join( c_counts )" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "### Defining which metrics will be displayed in the summary tables\n", + "metrics_to_show = [ \"n\", \"chemicals\", \"median\",\"max\",\"min\",\"IQR\" ]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "### Calculating summary statistics for the Z scores for those chemicals\n", + "### identified in the TOP 5 in at least one organisation of the entity type.\n", + "### There are the chemicals displayed in the 'Higher than most' table.\n", + "\n", + "overused_summary = e_data.query('z_score>0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n", + "overused_summary = overused_summary.rename( columns={\"50%\":\"median\"}, inplace=False )" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "### Calculating summary statistics for the Z scores for those chemicals\n", + "### identified in the BOTTOM 5 in at least one organisation 
of the entity type.\n", + "### There are the chemicals displayed in the 'Lower than most' table.\n", + "\n", + "underused_summary = e_data.query('z_score<0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n", + "underused_summary = underused_summary.rename( columns={\"50%\":\"median\"}, inplace=False )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary statistics for outlying Z scores in each organisation type\n", + "\n", + "### Higher than most chemicals\n", + "\n", + "The table below summarises the Z scores for the high outlying (i.e., top 5) chemicals\n", + "in each type of organisation. These are chemicals are seen to be used more often\n", + "in a particular organisation than its peers." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nchemicalsmedianmaxminIQR
entity
stp426805.426.332.681.64
ccg10611385.7910.202.763.18
pcn125714165.282528.092.263.39
practice647613465.236825.501.213.84
\n", + "
" + ], + "text/plain": [ + " n chemicals median max min IQR\n", + "entity \n", + "stp 42 680 5.42 6.33 2.68 1.64\n", + "ccg 106 1138 5.79 10.20 2.76 3.18\n", + "pcn 1257 1416 5.28 2528.09 2.26 3.39\n", + "practice 6476 1346 5.23 6825.50 1.21 3.84" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "### Extracting the summary statistics for the z scores\n", + "overused_tmp = overused_summary[overused_summary.index.isin([\"z_score\"], level=1)]\n", + "\n", + "### Calculating IQR, removing the row index and rounding to 2dp\n", + "overused_toprint = ( overused_tmp\n", + " .assign( IQR = overused_tmp[\"75%\"]-overused_tmp[\"25%\"] )\n", + " .droplevel(level=1)\n", + " .round(2) )\n", + "\n", + "overused_toprint.join( all_counts )[metrics_to_show]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Lower than most chemicals\n", + "\n", + "The table below summarises the Z scores for the low outlying (i.e., bottom 5) chemicals\n", + "in each type of organisation. These are chemicals are seen to be used less often\n", + "in a particular organisation than its peers." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "lines_to_next_cell": 0 + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nchemicalsmedianmaxminIQR
entity
stp42680-2.35-1.47-6.330.79
ccg1061138-2.30-1.33-10.200.82
pcn12571416-2.18-1.30-159.770.77
practice64761346-2.08-0.05-307.230.81
\n", + "
" + ], + "text/plain": [ + " n chemicals median max min IQR\n", + "entity \n", + "stp 42 680 -2.35 -1.47 -6.33 0.79\n", + "ccg 106 1138 -2.30 -1.33 -10.20 0.82\n", + "pcn 1257 1416 -2.18 -1.30 -159.77 0.77\n", + "practice 6476 1346 -2.08 -0.05 -307.23 0.81" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "### Extracting the summary statistics for the z scores\n", + "underused_tmp = underused_summary[underused_summary.index.isin([\"z_score\"], level=1)]\n", + "\n", + "### Calculating IQR, removing the row index and rounding to 2dp\n", + "underused_toprint = ( underused_tmp\n", + " .assign( IQR = underused_tmp[\"75%\"]-underused_tmp[\"25%\"] )\n", + " .droplevel(level=1)\n", + " .round(2) )\n", + "\n", + "underused_toprint.join( all_counts )[metrics_to_show]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Summary\n", + "\n", + "Below is a summary table that combines the 'Higher than most' and 'Lower than most'\n", + "results displayed above." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Higher than mostLower than most
nchemicalsmedianmaxminIQRmedianmaxminIQR
entity
stp426805.426.332.681.64-2.35-1.47-6.330.79
ccg10611385.7910.202.763.18-2.30-1.33-10.200.82
pcn125714165.282528.092.263.39-2.18-1.30-159.770.77
practice647613465.236825.501.213.84-2.08-0.05-307.230.81
\n", + "
" + ], + "text/plain": [ + " Higher than most \\\n", + " n chemicals median max min IQR \n", + "entity \n", + "stp 42 680 5.42 6.33 2.68 1.64 \n", + "ccg 106 1138 5.79 10.20 2.76 3.18 \n", + "pcn 1257 1416 5.28 2528.09 2.26 3.39 \n", + "practice 6476 1346 5.23 6825.50 1.21 3.84 \n", + "\n", + " Lower than most \n", + " median max min IQR \n", + "entity \n", + "stp -2.35 -1.47 -6.33 0.79 \n", + "ccg -2.30 -1.33 -10.20 0.82 \n", + "pcn -2.18 -1.30 -159.77 0.77 \n", + "practice -2.08 -0.05 -307.23 0.81 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([overused_toprint.join( all_counts )[metrics_to_show],\n", + " underused_toprint[metrics_to_show[2:]]],\n", + " keys=[\"Higher than most\", \"Lower than most\"],axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "all", + "encoding": "# -*- coding: utf-8 -*-", + "notebook_metadata_filter": "all,-language_info", + "text_representation": { + "extension": ".py", + "format_name": "light", + "format_version": "1.5", + "jupytext_version": "1.3.4" + } + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.1" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/diffable_python/calculate_summary_statistics_outlying.py b/notebooks/diffable_python/calculate_summary_statistics_outlying.py new file mode 100644 index 000000000..6fb6d9d3b --- /dev/null +++ b/notebooks/diffable_python/calculate_summary_statistics_outlying.py @@ -0,0 +1,147 @@ +# -*- coding: utf-8 -*- +# --- +# jupyter: +# jupytext: +# 
cell_metadata_filter: all +# notebook_metadata_filter: all,-language_info +# text_representation: +# extension: .py +# format_name: light +# format_version: '1.5' +# jupytext_version: 1.3.4 +# kernelspec: +# display_name: Python 3 +# language: python +# name: python3 +# --- + +from lib.outliers import Runner +from datetime import date +import pandas as pd + +from_date = date(year=2021,month=6,day=1) +to_date = date(year=2021,month=12,day=1) +r = Runner(from_date,to_date,10,["practice","ccg","pcn","stp"],False) + +r.build.run() +r.build.fetch_results() + +# + +### Extracting all the stored z scores etc across organisations +### so that summary statistics can be calculated + +e_data = pd.concat( + (d.assign(entity=e) for e, d in r.build.results.items()) +) +# - + +# ## Entity counts +# +# Counts of each kind of entity (i.e., organisation). + +# + +### Summarising the number of each kind of entity (organisation) + +e_counts = ( e_data.reset_index()[["practice","entity"]] + .drop_duplicates()['entity'] + .value_counts() + .to_frame() + .rename( columns={'entity':'n'} ) ) + +e_counts +# - + +# ## Chemical counts +# +# Counts of the number of unique outlying chemicals (i.e., those identified in the top/bottom +# 10 z scores) amongst all organisations of the given type.
+ +# + +### Summarising the number of unique chemicals identified in the +### top/bottom ten outliers amongst all organisations of the given type + +c_counts = ( e_data.reset_index()[["chemical","entity"]] + .drop_duplicates()['entity'] + .value_counts() + .to_frame() + .rename( columns={'entity':'chemicals'} ) ) + +c_counts + +# + +### Combining the entity and chemical counts + +all_counts = e_counts.join( c_counts ) + + +# + +### Defining which metrics will be displayed in the summary tables +metrics_to_show = [ "n", "chemicals", "median","max","min","IQR" ] + + +# + +### Calculating summary statistics for the Z scores for those chemicals +### identified in the TOP 10 in at least one organisation of the entity type. +### These are the chemicals displayed in the 'Higher than most' table. + +overused_summary = e_data.query('z_score>0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0) +overused_summary = overused_summary.rename( columns={"50%":"median"}, inplace=False ) + +# + +### Calculating summary statistics for the Z scores for those chemicals +### identified in the BOTTOM 10 in at least one organisation of the entity type. +### These are the chemicals displayed in the 'Lower than most' table. + +underused_summary = e_data.query('z_score<0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0) +underused_summary = underused_summary.rename( columns={"50%":"median"}, inplace=False ) +# - + +# ## Summary statistics for outlying Z scores in each organisation type +# +# ### Higher than most chemicals +# +# The table below summarises the Z scores for the high outlying (i.e., top 10) chemicals +# in each type of organisation. These chemicals are seen to be used more often +# in a particular organisation than its peers.
+ +# + +### Extracting the summary statistics for the z scores +overused_tmp = overused_summary[overused_summary.index.isin(["z_score"], level=1)] + +### Calculating IQR, removing the row index and rounding to 2dp +overused_toprint = ( overused_tmp + .assign( IQR = overused_tmp["75%"]-overused_tmp["25%"] ) + .droplevel(level=1) + .round(2) ) + +overused_toprint.join( all_counts )[metrics_to_show] +# - + +# ### Lower than most chemicals +# +# The table below summarises the Z scores for the low outlying (i.e., bottom 10) chemicals +# in each type of organisation. These chemicals are seen to be used less often +# in a particular organisation than its peers. + +# + +### Extracting the summary statistics for the z scores +underused_tmp = underused_summary[underused_summary.index.isin(["z_score"], level=1)] + +### Calculating IQR, removing the row index and rounding to 2dp +underused_toprint = ( underused_tmp + .assign( IQR = underused_tmp["75%"]-underused_tmp["25%"] ) + .droplevel(level=1) + .round(2) ) + +underused_toprint.join( all_counts )[metrics_to_show] +# - +# ### Summary +# +# Below is a summary table that combines the 'Higher than most' and 'Lower than most' +# results displayed above.
+ +pd.concat([overused_toprint.join( all_counts )[metrics_to_show], + underused_toprint[metrics_to_show[2:]]], + keys=["Higher than most", "Lower than most"],axis=1) + + From d1f8f222aaf50b8900eb9aa1e5e166194f35ed74 Mon Sep 17 00:00:00 2001 From: Lisa Hopcroft <54442530+LisaHopcroft@users.noreply.github.com> Date: Thu, 14 Apr 2022 11:05:28 +0100 Subject: [PATCH 06/11] tidy: removing old notebook --- notebooks/calculate_summary_statistics.ipynb | 731 ------------------ .../calculate_summary_statistics.py | 145 ---- 2 files changed, 876 deletions(-) delete mode 100644 notebooks/calculate_summary_statistics.ipynb delete mode 100644 notebooks/diffable_python/calculate_summary_statistics.py diff --git a/notebooks/calculate_summary_statistics.ipynb b/notebooks/calculate_summary_statistics.ipynb deleted file mode 100644 index 5a5d93d8a..000000000 --- a/notebooks/calculate_summary_statistics.ipynb +++ /dev/null @@ -1,731 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from lib.outliers import Runner\n", - "from datetime import date\n", - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from_date = date(year=2021,month=4,day=1)\n", - "to_date = date(year=2021,month=8,day=1)\n", - "r = Runner(from_date,to_date,5,[\"practice\",\"ccg\",\"pcn\",\"stp\"],False)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=725825577420-unm2gnkiprugilg743tkbig250f4sfsj.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fbigquery&state=B0aBfO2cFTgPTpWIrbEXhCszrTmcNv&prompt=consent&access_type=offline\n" - ] - }, - { - "name": 
"stdin", - "output_type": "stream", - "text": [ - "Enter the authorization code: 4/1AX4XfWjWfGmWhSn3IUgFJA9Y1gOE418Hgdc8PD98NKa2Y2AW1-2axRotGlg\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading: 100%|██████████| 1/1 [00:00<00:00, 4.85rows/s]\n" - ] - } - ], - "source": [ - "r.build.run()\n", - "r.build.fetch_results()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "### Extracting all the stored z scores etc across organisations\n", - "### so that summary statistics can be calculated\n", - "\n", - "e_data = pd.concat(\n", - " (d.assign(entity=e) for e, d in r.build.results.items())\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Entity counts\n", - "\n", - "Counts of each kind of entity (i.e., organisation)." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
n
practice6499
pcn1257
ccg106
stp42
\n", - "
" - ], - "text/plain": [ - " n\n", - "practice 6499\n", - "pcn 1257\n", - "ccg 106\n", - "stp 42" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "### Summarising the number of each kind of entity (organisation)\n", - "\n", - "e_counts = ( e_data.reset_index()[[\"practice\",\"entity\"]]\n", - " .drop_duplicates()['entity']\n", - " .value_counts()\n", - " .to_frame()\n", - " .rename( columns={'entity':'n'} ) )\n", - "\n", - "e_counts" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Chemical counts\n", - "\n", - "Counts of the number of unique outlying chemicals (i.e., those identified in the top/bottom\n", - "5 z scores) amongst all organisations of the given type." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
chemicals
pcn1294
practice1274
ccg706
stp364
\n", - "
" - ], - "text/plain": [ - " chemicals\n", - "pcn 1294\n", - "practice 1274\n", - "ccg 706\n", - "stp 364" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "### Summarising the number of unique chemicals identified in the\n", - "### top/bottom five outliers amongst all organisations of the given type\n", - "\n", - "c_counts = ( e_data.reset_index()[[\"chemical\",\"entity\"]]\n", - " .drop_duplicates()['entity']\n", - " .value_counts()\n", - " .to_frame()\n", - " .rename( columns={'entity':'chemicals'} ) )\n", - "\n", - "c_counts" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "### Combining the entity and chemical counts\n", - "\n", - "all_counts = e_counts.join( c_counts )" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "### Defining which metrics will be displayed in the summary tables\n", - "metrics_to_show = [ \"n\", \"chemicals\", \"median\",\"max\",\"min\",\"IQR\" ]" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "### Calculating summary statistics for the Z scores for those chemicals\n", - "### identified in the TOP 5 in at least one organisation of the entity type.\n", - "### There are the chemicals displayed in the 'Higher than most' table.\n", - "\n", - "overused_summary = e_data.query('z_score>0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n", - "overused_summary = overused_summary.rename( columns={\"50%\":\"median\"}, inplace=False )" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "### Calculating summary statistics for the Z scores for those chemicals\n", - "### identified in the BOTTOM 5 in at least one organisation 
of the entity type.\n", - "### There are the chemicals displayed in the 'Lower than most' table.\n", - "\n", - "underused_summary = e_data.query('z_score<0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n", - "underused_summary = underused_summary.rename( columns={\"50%\":\"median\"}, inplace=False )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Summary statistics for outlying Z scores in each organisation type\n", - "\n", - "### Higher than most chemicals\n", - "\n", - "The table below summarises the Z scores for the high outlying (i.e., top 5) chemicals\n", - "in each type of organisation. These are chemicals are seen to be used more often\n", - "in a particular organisation than its peers." - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nchemicalsmedianmaxminIQR
entity
stp423646.146.333.680.78
ccg1067067.3610.203.563.29
pcn125712947.26543.192.585.11
practice649912747.285512.021.135.96
\n", - "
" - ], - "text/plain": [ - " n chemicals median max min IQR\n", - "entity \n", - "stp 42 364 6.14 6.33 3.68 0.78\n", - "ccg 106 706 7.36 10.20 3.56 3.29\n", - "pcn 1257 1294 7.26 543.19 2.58 5.11\n", - "practice 6499 1274 7.28 5512.02 1.13 5.96" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "### Extracting the summary statistics for the z scores\n", - "overused_tmp = overused_summary[overused_summary.index.isin([\"z_score\"], level=1)]\n", - "\n", - "### Calculating IQR, removing the row index and rounding to 2dp\n", - "overused_toprint = ( overused_tmp\n", - " .assign( IQR = overused_tmp[\"75%\"]-overused_tmp[\"25%\"] )\n", - " .droplevel(level=1)\n", - " .round(2) )\n", - "\n", - "overused_toprint.join( all_counts )[metrics_to_show]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Lower than most chemicals\n", - "\n", - "The table below summarises the Z scores for the low outlying (i.e., bottom 5) chemicals\n", - "in each type of organisation. These are chemicals are seen to be used less often\n", - "in a particular organisation than its peers." - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "lines_to_next_cell": 0 - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nchemicalsmedianmaxminIQR
entity
stp42364-2.77-1.78-6.331.15
ccg106706-2.78-1.47-10.201.16
pcn12571294-2.61-1.49-141.331.03
practice64991274-2.44-1.13-711.871.07
\n", - "
" - ], - "text/plain": [ - " n chemicals median max min IQR\n", - "entity \n", - "stp 42 364 -2.77 -1.78 -6.33 1.15\n", - "ccg 106 706 -2.78 -1.47 -10.20 1.16\n", - "pcn 1257 1294 -2.61 -1.49 -141.33 1.03\n", - "practice 6499 1274 -2.44 -1.13 -711.87 1.07" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "### Extracting the summary statistics for the z scores\n", - "underused_tmp = underused_summary[underused_summary.index.isin([\"z_score\"], level=1)]\n", - "\n", - "### Calculating IQR, removing the row index and rounding to 2dp\n", - "underused_toprint = ( underused_tmp\n", - " .assign( IQR = underused_tmp[\"75%\"]-underused_tmp[\"25%\"] )\n", - " .droplevel(level=1)\n", - " .round(2) )\n", - "\n", - "underused_toprint.join( all_counts )[metrics_to_show]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Summary\n", - "\n", - "Below is a summary table that combines the 'Higher than most' and 'Lower than most'\n", - "results displayed above." - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Higher than mostLower than most
nchemicalsmedianmaxminIQRmedianmaxminIQR
entity
stp423646.146.333.680.78-2.77-1.78-6.331.15
ccg1067067.3610.203.563.29-2.78-1.47-10.201.16
pcn125712947.26543.192.585.11-2.61-1.49-141.331.03
practice649912747.285512.021.135.96-2.44-1.13-711.871.07
\n", - "
" - ], - "text/plain": [ - " Higher than most \\\n", - " n chemicals median max min IQR \n", - "entity \n", - "stp 42 364 6.14 6.33 3.68 0.78 \n", - "ccg 106 706 7.36 10.20 3.56 3.29 \n", - "pcn 1257 1294 7.26 543.19 2.58 5.11 \n", - "practice 6499 1274 7.28 5512.02 1.13 5.96 \n", - "\n", - " Lower than most \n", - " median max min IQR \n", - "entity \n", - "stp -2.77 -1.78 -6.33 1.15 \n", - "ccg -2.78 -1.47 -10.20 1.16 \n", - "pcn -2.61 -1.49 -141.33 1.03 \n", - "practice -2.44 -1.13 -711.87 1.07 " - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pd.concat([overused_toprint.join( all_counts )[metrics_to_show],\n", - " underused_toprint[metrics_to_show[2:]]],\n", - " keys=[\"Higher than most\", \"Lower than most\"],axis=1)" - ] - } - ], - "metadata": { - "jupytext": { - "cell_metadata_filter": "all", - "encoding": "# -*- coding: utf-8 -*-", - "notebook_metadata_filter": "all,-language_info", - "text_representation": { - "extension": ".py", - "format_name": "light", - "format_version": "1.5", - "jupytext_version": "1.3.4" - } - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/diffable_python/calculate_summary_statistics.py b/notebooks/diffable_python/calculate_summary_statistics.py deleted file mode 100644 index 652720e3d..000000000 --- a/notebooks/diffable_python/calculate_summary_statistics.py +++ /dev/null @@ -1,145 +0,0 @@ -# -*- coding: utf-8 -*- -# --- -# jupyter: -# jupytext: -# cell_metadata_filter: all -# notebook_metadata_filter: all,-language_info -# text_representation: -# extension: .py -# format_name: light -# 
format_version: '1.5' -# jupytext_version: 1.3.4 -# kernelspec: -# display_name: Python 3 -# language: python -# name: python3 -# --- - -from lib.outliers import Runner -from datetime import date -import pandas as pd - -from_date = date(year=2021,month=4,day=1) -to_date = date(year=2021,month=8,day=1) -r = Runner(from_date,to_date,5,["practice","ccg","pcn","stp"],False) - -r.build.run() -r.build.fetch_results() - -# + -### Extracting all the stored z scores etc across organisations -### so that summary statistics can be calculated - -e_data = pd.concat( - (d.assign(entity=e) for e, d in r.build.results.items()) -) -# - - -# ## Entity counts -# -# Counts of each kind of entity (i.e., organisation). - -# + -### Summarising the number of each kind of entity (organisation) - -e_counts = ( e_data.reset_index()[["practice","entity"]] - .drop_duplicates()['entity'] - .value_counts() - .to_frame() - .rename( columns={'entity':'n'} ) ) - -e_counts -# - - -# ## Chemical counts -# -# Counts of the number of unique outlying chemicals (i.e., those identified in the top/bottom -# 5 z scores) amongst all organisations of the given type. - -# + -### Summarising the number of unique chemicals identified in the -### top/bottom five outliers amongst all organisations of the given type - -c_counts = ( e_data.reset_index()[["chemical","entity"]] - .drop_duplicates()['entity'] - .value_counts() - .to_frame() - .rename( columns={'entity':'chemicals'} ) ) - -c_counts - -# + -### Combining the entity and chemical counts - -all_counts = e_counts.join( c_counts ) - - -# + -### Defining which metrics will be displayed in the summary tables -metrics_to_show = [ "n", "chemicals", "median","max","min","IQR" ] - - -# + -### Calculating summary statistics for the Z scores for those chemicals -### identified in the TOP 5 in at least one organisation of the entity type. -### There are the chemicals displayed in the 'Higher than most' table. 
- -overused_summary = e_data.query('z_score>0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0) -overused_summary = overused_summary.rename( columns={"50%":"median"}, inplace=False ) - -# + -### Calculating summary statistics for the Z scores for those chemicals -### identified in the BOTTOM 5 in at least one organisation of the entity type. -### There are the chemicals displayed in the 'Lower than most' table. - -underused_summary = e_data.query('z_score<0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0) -underused_summary = underused_summary.rename( columns={"50%":"median"}, inplace=False ) -# - - -# ## Summary statistics for outlying Z scores in each organisation type -# -# ### Higher than most chemicals -# -# The table below summarises the Z scores for the high outlying (i.e., top 5) chemicals -# in each type of organisation. These are chemicals are seen to be used more often -# in a particular organisation than its peers. - -# + -### Extracting the summary statistics for the z scores -overused_tmp = overused_summary[overused_summary.index.isin(["z_score"], level=1)] - -### Calculating IQR, removing the row index and rounding to 2dp -overused_toprint = ( overused_tmp - .assign( IQR = overused_tmp["75%"]-overused_tmp["25%"] ) - .droplevel(level=1) - .round(2) ) - -overused_toprint.join( all_counts )[metrics_to_show] -# - - -# ### Lower than most chemicals -# -# The table below summarises the Z scores for the low outlying (i.e., bottom 5) chemicals -# in each type of organisation. These are chemicals are seen to be used less often -# in a particular organisation than its peers. 
- -# + -### Extracting the summary statistics for the z scores -underused_tmp = underused_summary[underused_summary.index.isin(["z_score"], level=1)] - -### Calculating IQR, removing the row index and rounding to 2dp -underused_toprint = ( underused_tmp - .assign( IQR = underused_tmp["75%"]-underused_tmp["25%"] ) - .droplevel(level=1) - .round(2) ) - -underused_toprint.join( all_counts )[metrics_to_show] -# - -# ### Summary -# -# Below is a summary table that combines the 'Higher than most' and 'Lower than most' -# results displayed above. - -pd.concat([overused_toprint.join( all_counts )[metrics_to_show], - underused_toprint[metrics_to_show[2:]]], - keys=["Higher than most", "Lower than most"],axis=1) From 0aac79520f757e23eb567e7fa3519bda3c1e0dd2 Mon Sep 17 00:00:00 2001 From: Lisa Hopcroft <54442530+LisaHopcroft@users.noreply.github.com> Date: Thu, 14 Apr 2022 11:55:14 +0100 Subject: [PATCH 07/11] feat: updating queries on e_data object to extract the outlying chemical values for describe() --- ...alculate_summary_statistics_outlying.ipynb | 56 ++++++++----------- .../calculate_summary_statistics_outlying.py | 4 +- 2 files changed, 24 insertions(+), 36 deletions(-) diff --git a/notebooks/calculate_summary_statistics_outlying.ipynb b/notebooks/calculate_summary_statistics_outlying.ipynb index 24f2beb90..57224c678 100644 --- a/notebooks/calculate_summary_statistics_outlying.ipynb +++ b/notebooks/calculate_summary_statistics_outlying.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -24,26 +24,14 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Downloading: 100%|██████████| 1/1 [00:00<00:00, 6.53rows/s]\n", - "Downloading: 
100%|██████████| 129532/129532 [00:11<00:00, 11161.65rows/s]\n", - "Downloading: 100%|██████████| 302743/302743 [00:11<00:00, 26079.48rows/s]\n", - "Downloading: 100%|██████████| 1346/1346 [00:40<00:00, 33.64rows/s]\n", - "Downloading: 100%|██████████| 2121/2121 [00:00<00:00, 4642.75rows/s]\n", - "Downloading: 100%|██████████| 10905/10905 [00:00<00:00, 14726.74rows/s]\n", - "Downloading: 100%|██████████| 1138/1138 [00:00<00:00, 1338.28rows/s]\n", - "Downloading: 100%|██████████| 25140/25140 [00:02<00:00, 10127.85rows/s]\n", - "Downloading: 100%|██████████| 89049/89049 [00:05<00:00, 17194.56rows/s]\n", - "Downloading: 100%|██████████| 1416/1416 [00:08<00:00, 171.31rows/s]\n", - "Downloading: 100%|██████████| 842/842 [00:00<00:00, 3119.24rows/s]\n", - "Downloading: 100%|██████████| 3992/3992 [00:00<00:00, 11083.61rows/s]\n", - "Downloading: 100%|██████████| 680/680 [00:00<00:00, 1650.28rows/s]\n" + "Downloading: 100%|██████████| 1/1 [00:00<00:00, 5.83rows/s]\n" ] } ], @@ -54,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -77,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -133,7 +121,7 @@ "stp 42" ] }, - "execution_count": 7, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -162,7 +150,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -218,7 +206,7 @@ "stp 680" ] }, - "execution_count": 8, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -238,7 +226,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": { "lines_to_next_cell": 2 }, @@ -251,7 +239,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": { "lines_to_next_cell": 2 }, @@ -263,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, 
"metadata": {}, "outputs": [], "source": [ @@ -271,7 +259,7 @@ "### identified in the TOP 5 in at least one organisation of the entity type.\n", "### There are the chemicals displayed in the 'Higher than most' table.\n", "\n", - "overused_summary = e_data.query('z_score>0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n", + "overused_summary = e_data.query('rank_high<=10').query('z_score>=0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n", "overused_summary = overused_summary.rename( columns={\"50%\":\"median\"}, inplace=False )" ] }, @@ -285,7 +273,7 @@ "### identified in the BOTTOM 5 in at least one organisation of the entity type.\n", "### There are the chemicals displayed in the 'Lower than most' table.\n", "\n", - "underused_summary = e_data.query('z_score<0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n", + "underused_summary = e_data.query('rank_low<=10').query('z_score<=0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n", "underused_summary = underused_summary.rename( columns={\"50%\":\"median\"}, inplace=False )" ] }, @@ -379,7 +367,7 @@ " 1346\n", " 5.23\n", " 6825.50\n", - " 1.21\n", + " 0.00\n", " 3.84\n", " \n", " \n", @@ -392,7 +380,7 @@ "stp 42 680 5.42 6.33 2.68 1.64\n", "ccg 106 1138 5.79 10.20 2.76 3.18\n", "pcn 1257 1416 5.28 2528.09 2.26 3.39\n", - "practice 6476 1346 5.23 6825.50 1.21 3.84" + "practice 6476 1346 5.23 6825.50 0.00 3.84" ] }, "execution_count": 13, @@ -502,7 +490,7 @@ " 6476\n", " 1346\n", " -2.08\n", - " -0.05\n", + " -0.99\n", " -307.23\n", " 0.81\n", " \n", @@ -516,7 +504,7 @@ "stp 42 680 -2.35 -1.47 -6.33 0.79\n", "ccg 106 1138 -2.30 -1.33 -10.20 0.82\n", "pcn 1257 1416 -2.18 -1.30 -159.77 0.77\n", - "practice 6476 1346 -2.08 -0.05 -307.23 0.81" + "practice 6476 1346 -2.08 -0.99 
-307.23 0.81" ] }, "execution_count": 14, @@ -653,10 +641,10 @@ " 1346\n", " 5.23\n", " 6825.50\n", - " 1.21\n", + " 0.00\n", " 3.84\n", " -2.08\n", - " -0.05\n", + " -0.99\n", " -307.23\n", " 0.81\n", " \n", @@ -671,7 +659,7 @@ "stp 42 680 5.42 6.33 2.68 1.64 \n", "ccg 106 1138 5.79 10.20 2.76 3.18 \n", "pcn 1257 1416 5.28 2528.09 2.26 3.39 \n", - "practice 6476 1346 5.23 6825.50 1.21 3.84 \n", + "practice 6476 1346 5.23 6825.50 0.00 3.84 \n", "\n", " Lower than most \n", " median max min IQR \n", @@ -679,7 +667,7 @@ "stp -2.35 -1.47 -6.33 0.79 \n", "ccg -2.30 -1.33 -10.20 0.82 \n", "pcn -2.18 -1.30 -159.77 0.77 \n", - "practice -2.08 -0.05 -307.23 0.81 " + "practice -2.08 -0.99 -307.23 0.81 " ] }, "execution_count": 15, diff --git a/notebooks/diffable_python/calculate_summary_statistics_outlying.py b/notebooks/diffable_python/calculate_summary_statistics_outlying.py index 6fb6d9d3b..c483c1de5 100644 --- a/notebooks/diffable_python/calculate_summary_statistics_outlying.py +++ b/notebooks/diffable_python/calculate_summary_statistics_outlying.py @@ -84,7 +84,7 @@ ### identified in the TOP 5 in at least one organisation of the entity type. ### There are the chemicals displayed in the 'Higher than most' table. -overused_summary = e_data.query('z_score>0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0) +overused_summary = e_data.query('rank_high<=10').query('z_score>=0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0) overused_summary = overused_summary.rename( columns={"50%":"median"}, inplace=False ) # + @@ -92,7 +92,7 @@ ### identified in the BOTTOM 5 in at least one organisation of the entity type. ### There are the chemicals displayed in the 'Lower than most' table. 
-underused_summary = e_data.query('z_score<0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0) +underused_summary = e_data.query('rank_low<=10').query('z_score<=0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0) underused_summary = underused_summary.rename( columns={"50%":"median"}, inplace=False ) # - From 7cae036cced259392b9c6555a4febcbaef7ec83c Mon Sep 17 00:00:00 2001 From: Lisa Hopcroft <54442530+LisaHopcroft@users.noreply.github.com> Date: Thu, 14 Apr 2022 12:02:03 +0100 Subject: [PATCH 08/11] feat: update to notebook --- ...alculate_summary_statistics_outlying.ipynb | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/notebooks/calculate_summary_statistics_outlying.ipynb b/notebooks/calculate_summary_statistics_outlying.ipynb index 57224c678..5abe1334e 100644 --- a/notebooks/calculate_summary_statistics_outlying.ipynb +++ b/notebooks/calculate_summary_statistics_outlying.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -24,14 +24,14 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Downloading: 100%|██████████| 1/1 [00:00<00:00, 5.83rows/s]\n" + "Downloading: 100%|██████████| 1/1 [00:00<00:00, 7.05rows/s]\n" ] } ], @@ -42,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -65,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -121,7 +121,7 @@ "stp 42" ] }, - "execution_count": 5, + "execution_count": 20, "metadata": {}, 
"output_type": "execute_result" } @@ -150,7 +150,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -206,7 +206,7 @@ "stp 680" ] }, - "execution_count": 6, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -226,7 +226,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 22, "metadata": { "lines_to_next_cell": 2 }, @@ -239,7 +239,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 23, "metadata": { "lines_to_next_cell": 2 }, @@ -251,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -265,7 +265,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -292,7 +292,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -383,7 +383,7 @@ "practice 6476 1346 5.23 6825.50 0.00 3.84" ] }, - "execution_count": 13, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -414,7 +414,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 27, "metadata": { "lines_to_next_cell": 0 }, @@ -507,7 +507,7 @@ "practice 6476 1346 -2.08 -0.99 -307.23 0.81" ] }, - "execution_count": 14, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -537,7 +537,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -670,7 +670,7 @@ "practice -2.08 -0.99 -307.23 0.81 " ] }, - "execution_count": 15, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } From da7ce8b35370ec566141e53e3e95d2a653efff33 Mon Sep 17 00:00:00 2001 From: Lisa Hopcroft <54442530+LisaHopcroft@users.noreply.github.com> Date: Thu, 14 Apr 2022 12:20:35 +0100 Subject: [PATCH 09/11] feat: removing z scores of 0 from summary statistics --- 
...alculate_summary_statistics_outlying.ipynb | 155 ++++++++++++++++-- .../calculate_summary_statistics_outlying.py | 6 +- 2 files changed, 145 insertions(+), 16 deletions(-) diff --git a/notebooks/calculate_summary_statistics_outlying.ipynb b/notebooks/calculate_summary_statistics_outlying.ipynb index 5abe1334e..72fed35ec 100644 --- a/notebooks/calculate_summary_statistics_outlying.ipynb +++ b/notebooks/calculate_summary_statistics_outlying.ipynb @@ -251,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -259,13 +259,13 @@ "### identified in the TOP 5 in at least one organisation of the entity type.\n", "### There are the chemicals displayed in the 'Higher than most' table.\n", "\n", - "overused_summary = e_data.query('rank_high<=10').query('z_score>=0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n", + "overused_summary = e_data.query('rank_high<=10').query('z_score>0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n", "overused_summary = overused_summary.rename( columns={\"50%\":\"median\"}, inplace=False )" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -273,7 +273,7 @@ "### identified in the BOTTOM 5 in at least one organisation of the entity type.\n", "### There are the chemicals displayed in the 'Lower than most' table.\n", "\n", - "underused_summary = e_data.query('rank_low<=10').query('z_score<=0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n", + "underused_summary = e_data.query('rank_low<=10').query('z_score<0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n", "underused_summary = underused_summary.rename( columns={\"50%\":\"median\"}, inplace=False )" ] 
}, @@ -292,7 +292,134 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countmeanstdmin25%median75%max
entity
stpz_score420.0-2.6503870.932614-6.326451-2.872841-2.352648-2.081660-1.473260
ccgz_score1060.0-2.6670101.228738-10.198503-2.813851-2.302282-1.990392-1.325057
pcnz_score12570.0-2.6099232.474167-159.768459-2.673923-2.183928-1.900201-1.296016
practicez_score64760.0-2.4953173.925005-307.234735-2.569772-2.076544-1.756406-0.987765
\n", + "
" + ], + "text/plain": [ + " count mean std min 25% median \\\n", + "entity \n", + "stp z_score 420.0 -2.650387 0.932614 -6.326451 -2.872841 -2.352648 \n", + "ccg z_score 1060.0 -2.667010 1.228738 -10.198503 -2.813851 -2.302282 \n", + "pcn z_score 12570.0 -2.609923 2.474167 -159.768459 -2.673923 -2.183928 \n", + "practice z_score 64760.0 -2.495317 3.925005 -307.234735 -2.569772 -2.076544 \n", + "\n", + " 75% max \n", + "entity \n", + "stp z_score -2.081660 -1.473260 \n", + "ccg z_score -1.990392 -1.325057 \n", + "pcn z_score -1.900201 -1.296016 \n", + "practice z_score -1.756406 -0.987765 " + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "underused_summary" + ] + }, + { + "cell_type": "code", + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -367,7 +494,7 @@ " 1346\n", " 5.23\n", " 6825.50\n", - " 0.00\n", + " 1.21\n", " 3.84\n", " \n", " \n", @@ -380,10 +507,10 @@ "stp 42 680 5.42 6.33 2.68 1.64\n", "ccg 106 1138 5.79 10.20 2.76 3.18\n", "pcn 1257 1416 5.28 2528.09 2.26 3.39\n", - "practice 6476 1346 5.23 6825.50 0.00 3.84" + "practice 6476 1346 5.23 6825.50 1.21 3.84" ] }, - "execution_count": 26, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -414,7 +541,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 33, "metadata": { "lines_to_next_cell": 0 }, @@ -507,7 +634,7 @@ "practice 6476 1346 -2.08 -0.99 -307.23 0.81" ] }, - "execution_count": 27, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -537,7 +664,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -641,7 +768,7 @@ " 1346\n", " 5.23\n", " 6825.50\n", - " 0.00\n", + " 1.21\n", " 3.84\n", " -2.08\n", " -0.99\n", @@ -659,7 +786,7 @@ "stp 42 680 5.42 6.33 2.68 1.64 \n", "ccg 106 1138 5.79 10.20 2.76 3.18 \n", "pcn 1257 1416 5.28 2528.09 2.26 3.39 \n", - "practice 6476 1346 5.23 6825.50 0.00 
3.84 \n", + "practice 6476 1346 5.23 6825.50 1.21 3.84 \n", "\n", " Lower than most \n", " median max min IQR \n", @@ -670,7 +797,7 @@ "practice -2.08 -0.99 -307.23 0.81 " ] }, - "execution_count": 28, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } diff --git a/notebooks/diffable_python/calculate_summary_statistics_outlying.py b/notebooks/diffable_python/calculate_summary_statistics_outlying.py index c483c1de5..fbf4e84ad 100644 --- a/notebooks/diffable_python/calculate_summary_statistics_outlying.py +++ b/notebooks/diffable_python/calculate_summary_statistics_outlying.py @@ -84,7 +84,7 @@ ### identified in the TOP 5 in at least one organisation of the entity type. ### There are the chemicals displayed in the 'Higher than most' table. -overused_summary = e_data.query('rank_high<=10').query('z_score>=0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0) +overused_summary = e_data.query('rank_high<=10').query('z_score>0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0) overused_summary = overused_summary.rename( columns={"50%":"median"}, inplace=False ) # + @@ -92,7 +92,7 @@ ### identified in the BOTTOM 5 in at least one organisation of the entity type. ### There are the chemicals displayed in the 'Lower than most' table. -underused_summary = e_data.query('rank_low<=10').query('z_score<=0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0) +underused_summary = e_data.query('rank_low<=10').query('z_score<0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0) underused_summary = underused_summary.rename( columns={"50%":"median"}, inplace=False ) # - @@ -104,6 +104,8 @@ # in each type of organisation. These are chemicals are seen to be used more often # in a particular organisation than its peers. 
+underused_summary + # + ### Extracting the summary statistics for the z scores overused_tmp = overused_summary[overused_summary.index.isin(["z_score"], level=1)] From 64c81ba4dff4e69cb0095fe3461446da9dba41a3 Mon Sep 17 00:00:00 2001 From: Lisa Hopcroft <54442530+LisaHopcroft@users.noreply.github.com> Date: Thu, 14 Apr 2022 15:29:36 +0100 Subject: [PATCH 10/11] tidy: removing unnecessary cell --- ...alculate_summary_statistics_outlying.ipynb | 165 ++---------------- .../calculate_summary_statistics_outlying.py | 2 - 2 files changed, 19 insertions(+), 148 deletions(-) diff --git a/notebooks/calculate_summary_statistics_outlying.ipynb b/notebooks/calculate_summary_statistics_outlying.ipynb index 72fed35ec..4c7e0e5ae 100644 --- a/notebooks/calculate_summary_statistics_outlying.ipynb +++ b/notebooks/calculate_summary_statistics_outlying.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 16, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -24,14 +24,14 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Downloading: 100%|██████████| 1/1 [00:00<00:00, 7.05rows/s]\n" + "Downloading: 100%|██████████| 1/1 [00:00<00:00, 3.01rows/s]\n" ] } ], @@ -42,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -65,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -121,7 +121,7 @@ "stp 42" ] }, - "execution_count": 20, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -150,7 +150,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -206,7 +206,7 @@ "stp 680" ] 
}, - "execution_count": 21, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -226,7 +226,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 7, "metadata": { "lines_to_next_cell": 2 }, @@ -239,7 +239,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 8, "metadata": { "lines_to_next_cell": 2 }, @@ -251,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -265,7 +265,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -292,134 +292,7 @@ }, { "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
countmeanstdmin25%median75%max
entity
stpz_score420.0-2.6503870.932614-6.326451-2.872841-2.352648-2.081660-1.473260
ccgz_score1060.0-2.6670101.228738-10.198503-2.813851-2.302282-1.990392-1.325057
pcnz_score12570.0-2.6099232.474167-159.768459-2.673923-2.183928-1.900201-1.296016
practicez_score64760.0-2.4953173.925005-307.234735-2.569772-2.076544-1.756406-0.987765
\n", - "
" - ], - "text/plain": [ - " count mean std min 25% median \\\n", - "entity \n", - "stp z_score 420.0 -2.650387 0.932614 -6.326451 -2.872841 -2.352648 \n", - "ccg z_score 1060.0 -2.667010 1.228738 -10.198503 -2.813851 -2.302282 \n", - "pcn z_score 12570.0 -2.609923 2.474167 -159.768459 -2.673923 -2.183928 \n", - "practice z_score 64760.0 -2.495317 3.925005 -307.234735 -2.569772 -2.076544 \n", - "\n", - " 75% max \n", - "entity \n", - "stp z_score -2.081660 -1.473260 \n", - "ccg z_score -1.990392 -1.325057 \n", - "pcn z_score -1.900201 -1.296016 \n", - "practice z_score -1.756406 -0.987765 " - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "underused_summary" - ] - }, - { - "cell_type": "code", - "execution_count": 32, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -510,7 +383,7 @@ "practice 6476 1346 5.23 6825.50 1.21 3.84" ] }, - "execution_count": 32, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -541,7 +414,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 12, "metadata": { "lines_to_next_cell": 0 }, @@ -634,7 +507,7 @@ "practice 6476 1346 -2.08 -0.99 -307.23 0.81" ] }, - "execution_count": 33, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -664,7 +537,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -797,7 +670,7 @@ "practice -2.08 -0.99 -307.23 0.81 " ] }, - "execution_count": 34, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } diff --git a/notebooks/diffable_python/calculate_summary_statistics_outlying.py b/notebooks/diffable_python/calculate_summary_statistics_outlying.py index fbf4e84ad..3d689bd72 100644 --- a/notebooks/diffable_python/calculate_summary_statistics_outlying.py +++ b/notebooks/diffable_python/calculate_summary_statistics_outlying.py @@ -104,8 +104,6 @@ # in each type of organisation. 
These are chemicals are seen to be used more often # in a particular organisation than its peers. -underused_summary - # + ### Extracting the summary statistics for the z scores overused_tmp = overused_summary[overused_summary.index.isin(["z_score"], level=1)] From eeee9e9c54d587d7623d2eb542f2c91df7aa69a5 Mon Sep 17 00:00:00 2001 From: Lisa Hopcroft <54442530+LisaHopcroft@users.noreply.github.com> Date: Mon, 22 Aug 2022 11:32:24 +0100 Subject: [PATCH 11/11] fix: amending text and comments to read top/bottom 10, not top/bottom 5 --- ...alculate_summary_statistics_outlying.ipynb | 498 +----------------- .../calculate_summary_statistics_outlying.py | 13 +- 2 files changed, 37 insertions(+), 474 deletions(-) diff --git a/notebooks/calculate_summary_statistics_outlying.ipynb b/notebooks/calculate_summary_statistics_outlying.ipynb index 4c7e0e5ae..06048f41a 100644 --- a/notebooks/calculate_summary_statistics_outlying.ipynb +++ b/notebooks/calculate_summary_statistics_outlying.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -24,17 +24,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading: 100%|██████████| 1/1 [00:00<00:00, 3.01rows/s]\n" - ] - } - ], + "outputs": [], "source": [ "r.build.run()\n", "r.build.fetch_results()" @@ -42,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -65,67 +57,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
n
practice6476
pcn1257
ccg106
stp42
\n", - "
" - ], - "text/plain": [ - " n\n", - "practice 6476\n", - "pcn 1257\n", - "ccg 106\n", - "stp 42" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "### Summarising the number of each kind of entity (organisation)\n", "\n", @@ -145,72 +79,14 @@ "## Chemical counts\n", "\n", "Counts of the number of unique outlying chemicals (i.e., those identified in the top/bottom\n", - "5 z scores) amongst all organisations of the given type." + "10 z scores) amongst all organisations of the given type." ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
chemicals
pcn1416
practice1346
ccg1138
stp680
\n", - "
" - ], - "text/plain": [ - " chemicals\n", - "pcn 1416\n", - "practice 1346\n", - "ccg 1138\n", - "stp 680" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "### Summarising the number of unique chemicals identified in the\n", "### top/bottom five outliers amongst all organisations of the given type\n", @@ -226,7 +102,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "lines_to_next_cell": 2 }, @@ -239,7 +115,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "lines_to_next_cell": 2 }, @@ -251,12 +127,12 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "### Calculating summary statistics for the Z scores for those chemicals\n", - "### identified in the TOP 5 in at least one organisation of the entity type.\n", + "### identified in the TOP 10 in at least one organisation of the entity type.\n", "### There are the chemicals displayed in the 'Higher than most' table.\n", "\n", "overused_summary = e_data.query('rank_high<=10').query('z_score>0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n", @@ -265,12 +141,12 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "### Calculating summary statistics for the Z scores for those chemicals\n", - "### identified in the BOTTOM 5 in at least one organisation of the entity type.\n", + "### identified in the BOTTOM 10 in at least one organisation of the entity type.\n", "### There are the chemicals displayed in the 'Lower than most' table.\n", "\n", "underused_summary = e_data.query('rank_low<=10').query('z_score<0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n", @@ -285,109 +161,16 @@ "\n", "### Higher 
than most chemicals\n", "\n", - "The table below summarises the Z scores for the high outlying (i.e., top 5) chemicals\n", + "The table below summarises the Z scores for the high outlying (i.e., top 10) chemicals\n", "in each type of organisation. These are chemicals are seen to be used more often\n", "in a particular organisation than its peers." ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nchemicalsmedianmaxminIQR
entity
stp426805.426.332.681.64
ccg10611385.7910.202.763.18
pcn125714165.282528.092.263.39
practice647613465.236825.501.213.84
\n", - "
" - ], - "text/plain": [ - " n chemicals median max min IQR\n", - "entity \n", - "stp 42 680 5.42 6.33 2.68 1.64\n", - "ccg 106 1138 5.79 10.20 2.76 3.18\n", - "pcn 1257 1416 5.28 2528.09 2.26 3.39\n", - "practice 6476 1346 5.23 6825.50 1.21 3.84" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "### Extracting the summary statistics for the z scores\n", "overused_tmp = overused_summary[overused_summary.index.isin([\"z_score\"], level=1)]\n", @@ -407,111 +190,18 @@ "source": [ "### Lower than most chemicals\n", "\n", - "The table below summarises the Z scores for the low outlying (i.e., bottom 5) chemicals\n", + "The table below summarises the Z scores for the low outlying (i.e., bottom 10) chemicals\n", "in each type of organisation. These are chemicals are seen to be used less often\n", "in a particular organisation than its peers." ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": { "lines_to_next_cell": 0 }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nchemicalsmedianmaxminIQR
entity
stp42680-2.35-1.47-6.330.79
ccg1061138-2.30-1.33-10.200.82
pcn12571416-2.18-1.30-159.770.77
practice64761346-2.08-0.99-307.230.81
\n", - "
" - ], - "text/plain": [ - " n chemicals median max min IQR\n", - "entity \n", - "stp 42 680 -2.35 -1.47 -6.33 0.79\n", - "ccg 106 1138 -2.30 -1.33 -10.20 0.82\n", - "pcn 1257 1416 -2.18 -1.30 -159.77 0.77\n", - "practice 6476 1346 -2.08 -0.99 -307.23 0.81" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "### Extracting the summary statistics for the z scores\n", "underused_tmp = underused_summary[underused_summary.index.isin([\"z_score\"], level=1)]\n", @@ -537,144 +227,9 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Higher than mostLower than most
nchemicalsmedianmaxminIQRmedianmaxminIQR
entity
stp426805.426.332.681.64-2.35-1.47-6.330.79
ccg10611385.7910.202.763.18-2.30-1.33-10.200.82
pcn125714165.282528.092.263.39-2.18-1.30-159.770.77
practice647613465.236825.501.213.84-2.08-0.99-307.230.81
\n", - "
" - ], - "text/plain": [ - " Higher than most \\\n", - " n chemicals median max min IQR \n", - "entity \n", - "stp 42 680 5.42 6.33 2.68 1.64 \n", - "ccg 106 1138 5.79 10.20 2.76 3.18 \n", - "pcn 1257 1416 5.28 2528.09 2.26 3.39 \n", - "practice 6476 1346 5.23 6825.50 1.21 3.84 \n", - "\n", - " Lower than most \n", - " median max min IQR \n", - "entity \n", - "stp -2.35 -1.47 -6.33 0.79 \n", - "ccg -2.30 -1.33 -10.20 0.82 \n", - "pcn -2.18 -1.30 -159.77 0.77 \n", - "practice -2.08 -0.99 -307.23 0.81 " - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "pd.concat([overused_toprint.join( all_counts )[metrics_to_show],\n", " underused_toprint[metrics_to_show[2:]]],\n", @@ -717,6 +272,11 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.1" + }, + "vscode": { + "interpreter": { + "hash": "de1343822d6e7d7aeea8796be9d48304b0fa3610166e8740495ec86b33c71a9e" + } } }, "nbformat": 4, diff --git a/notebooks/diffable_python/calculate_summary_statistics_outlying.py b/notebooks/diffable_python/calculate_summary_statistics_outlying.py index 3d689bd72..7bac1e5b4 100644 --- a/notebooks/diffable_python/calculate_summary_statistics_outlying.py +++ b/notebooks/diffable_python/calculate_summary_statistics_outlying.py @@ -13,6 +13,9 @@ # display_name: Python 3 # language: python # name: python3 +# vscode: +# interpreter: +# hash: de1343822d6e7d7aeea8796be9d48304b0fa3610166e8740495ec86b33c71a9e # --- from lib.outliers import Runner @@ -54,7 +57,7 @@ # ## Chemical counts # # Counts of the number of unique outlying chemicals (i.e., those identified in the top/bottom -# 5 z scores) amongst all organisations of the given type. +# 10 z scores) amongst all organisations of the given type. 
# + ### Summarising the number of unique chemicals identified in the @@ -81,7 +84,7 @@ # + ### Calculating summary statistics for the Z scores for those chemicals -### identified in the TOP 5 in at least one organisation of the entity type. +### identified in the TOP 10 in at least one organisation of the entity type. ### There are the chemicals displayed in the 'Higher than most' table. overused_summary = e_data.query('rank_high<=10').query('z_score>0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0) @@ -89,7 +92,7 @@ # + ### Calculating summary statistics for the Z scores for those chemicals -### identified in the BOTTOM 5 in at least one organisation of the entity type. +### identified in the BOTTOM 10 in at least one organisation of the entity type. ### There are the chemicals displayed in the 'Lower than most' table. underused_summary = e_data.query('rank_low<=10').query('z_score<0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0) @@ -100,7 +103,7 @@ # # ### Higher than most chemicals # -# The table below summarises the Z scores for the high outlying (i.e., top 5) chemicals +# The table below summarises the Z scores for the high outlying (i.e., top 10) chemicals # in each type of organisation. These are chemicals are seen to be used more often # in a particular organisation than its peers. @@ -119,7 +122,7 @@ # ### Lower than most chemicals # -# The table below summarises the Z scores for the low outlying (i.e., bottom 5) chemicals +# The table below summarises the Z scores for the low outlying (i.e., bottom 10) chemicals # in each type of organisation. These are chemicals are seen to be used less often # in a particular organisation than its peers.