From 4f12defb7eb6f412fcd35b66236fd5883e09fc3c Mon Sep 17 00:00:00 2001
From: Lisa Hopcroft <54442530+LisaHopcroft@users.noreply.github.com>
Date: Wed, 9 Mar 2022 15:17:43 +0000
Subject: [PATCH 01/11] feat: new notebook to calculate summary statistics for
 Z scores (and ratios) across the different entities

---
 notebooks/calculate_summary_statistics.ipynb  | 624 ++++++++++++++++++
 .../calculate_summary_statistics.py           | 117 ++++
 2 files changed, 741 insertions(+)
 create mode 100644 notebooks/calculate_summary_statistics.ipynb
 create mode 100644 notebooks/diffable_python/calculate_summary_statistics.py

diff --git a/notebooks/calculate_summary_statistics.ipynb b/notebooks/calculate_summary_statistics.ipynb
new file mode 100644
index 000000000..d13ef013a
--- /dev/null
+++ b/notebooks/calculate_summary_statistics.ipynb
@@ -0,0 +1,624 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from lib.outliers import Runner\n",
+    "from datetime import date\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from_date = date(year=2021,month=4,day=1)\n",
+    "to_date = date(year=2021,month=8,day=1)\n",
+    "r = Runner(from_date,to_date,5,[\"practice\",\"ccg\",\"pcn\",\"stp\"],False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=725825577420-unm2gnkiprugilg743tkbig250f4sfsj.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fbigquery&state=7VcN0yeYTzq2E9Z3jdPuuj7sEjFtTb&prompt=consent&access_type=offline\n"
+     ]
+    },
+    {
+     "name": "stdin",
+     "output_type": "stream",
+     "text": [
+      "Enter the authorization code:  4/1AX4XfWiHK9sZGQcEnE9I3G-pDKHnzzLRbZ2FYsyEvv0x8Omm_TOcfbh3Z3A\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Downloading: 100%|██████████| 1/1 [00:00<00:00,  6.93rows/s]\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e6192a9e28d549babea7ec4d09af9479",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(FloatProgress(value=0.0, description='SUBMITTING | ', max=6499.0, style=ProgressStyle(descripti…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "77ffb0a12f3547acb5dd1ad59ea70c89",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(FloatProgress(value=0.0, description='PROCESSING | ', max=6499.0, style=ProgressStyle(descripti…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8bc9cd88d202472ea47dca9b54750c3a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(FloatProgress(value=0.0, description='COLLECTING | ', max=6499.0, style=ProgressStyle(descripti…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    },
+    {
+     "ename": "TypeError",
+     "evalue": "add_item() argument after ** must be a mapping, not BrokenProcessPool",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-3-65f6f5f86dd5>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;32m/home/app/notebook/lib/outliers.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    788\u001b[0m             \u001b[0;32mfor\u001b[0m \u001b[0mf\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_run_entity_report\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    789\u001b[0m                 \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_results\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 790\u001b[0;31m                 \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_item\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    791\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    792\u001b[0m         \u001b[0;31m# write out toc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mTypeError\u001b[0m: add_item() argument after ** must be a mapping, not BrokenProcessPool"
+     ]
+    }
+   ],
+   "source": [
+    "r.run()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "### Extracting all the stored z scores etc across organisations\n",
+    "### so that summary statistics can be calculated\n",
+    "\n",
+    "e_data = pd.concat(\n",
+    "    (d.assign(entity=e) for e, d in r.build.results.items())\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Entity counts\n",
+    "\n",
+    "Counts of each kind of entity (i.e., organisation)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>n</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>practice</th>\n",
+       "      <td>6499</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>pcn</th>\n",
+       "      <td>1257</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ccg</th>\n",
+       "      <td>106</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>stp</th>\n",
+       "      <td>42</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "             n\n",
+       "practice  6499\n",
+       "pcn       1257\n",
+       "ccg        106\n",
+       "stp         42"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "### Summarising the number of each kind of entity (organisation)\n",
+    "\n",
+    "e_counts = ( e_data.reset_index()[[\"practice\",\"entity\"]]\n",
+    "            .drop_duplicates()['entity']\n",
+    "            .value_counts()\n",
+    "            .to_frame()\n",
+    "            .rename( columns={'entity':'n'} ) )\n",
+    "\n",
+    "e_counts"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Chemical counts\n",
+    "\n",
+    "Counts of the number of chemicals for which we have data (Z scores etc)\n",
+    "within each type of organisation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>chemicals</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>pcn</th>\n",
+       "      <td>1294</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>practice</th>\n",
+       "      <td>1274</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ccg</th>\n",
+       "      <td>706</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>stp</th>\n",
+       "      <td>364</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "          chemicals\n",
+       "pcn            1294\n",
+       "practice       1274\n",
+       "ccg             706\n",
+       "stp             364"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "### Summarising the number of unique chemicals analysed within\n",
+    "### each type of organisation\n",
+    "\n",
+    "c_counts = ( e_data.reset_index()[[\"chemical\",\"entity\"]]\n",
+    "            .drop_duplicates()['entity']\n",
+    "            .value_counts()\n",
+    "            .to_frame()\n",
+    "            .rename( columns={'entity':'chemicals'} ) )\n",
+    "\n",
+    "c_counts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
+   "outputs": [],
+   "source": [
+    "### Combining the entity and chemical counts\n",
+    "\n",
+    "all_counts = e_counts.join( c_counts )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
+   "outputs": [],
+   "source": [
+    "### Calculating summary statistics for the ratio and the Z score\n",
+    "### within each entity type\n",
+    "\n",
+    "all_summary = e_data.groupby( \"entity\" )[[\"ratio\",\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n",
+    "all_summary = all_summary.rename( columns={\"50%\":\"median\"}, inplace=False )\n",
+    "\n",
+    "### Defining which metrics will be displayed below\n",
+    "metrics_to_show = [ \"n\", \"chemicals\", \"median\",\"max\",\"min\",\"IQR\" ]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Summary statistics for the z score in each organisation type"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>n</th>\n",
+       "      <th>chemicals</th>\n",
+       "      <th>median</th>\n",
+       "      <th>max</th>\n",
+       "      <th>min</th>\n",
+       "      <th>IQR</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>entity</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>stp</th>\n",
+       "      <td>42</td>\n",
+       "      <td>364</td>\n",
+       "      <td>3.82</td>\n",
+       "      <td>6.33</td>\n",
+       "      <td>-6.33</td>\n",
+       "      <td>8.90</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ccg</th>\n",
+       "      <td>106</td>\n",
+       "      <td>706</td>\n",
+       "      <td>3.57</td>\n",
+       "      <td>10.20</td>\n",
+       "      <td>-10.20</td>\n",
+       "      <td>10.14</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>pcn</th>\n",
+       "      <td>1257</td>\n",
+       "      <td>1294</td>\n",
+       "      <td>2.58</td>\n",
+       "      <td>543.19</td>\n",
+       "      <td>-141.33</td>\n",
+       "      <td>9.86</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>practice</th>\n",
+       "      <td>6499</td>\n",
+       "      <td>1274</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>5512.02</td>\n",
+       "      <td>-711.87</td>\n",
+       "      <td>9.72</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "             n  chemicals  median      max     min    IQR\n",
+       "entity                                                   \n",
+       "stp         42        364    3.82     6.33   -6.33   8.90\n",
+       "ccg        106        706    3.57    10.20  -10.20  10.14\n",
+       "pcn       1257       1294    2.58   543.19 -141.33   9.86\n",
+       "practice  6499       1274    0.00  5512.02 -711.87   9.72"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "### Extracting the summary statistics for the z scores\n",
+    "z_tmp = all_summary[all_summary.index.isin([\"z_score\"], level=1)]\n",
+    "\n",
+    "### Calculating IQR, removing the row index and rounding to 2dp\n",
+    "z_summary = ( z_tmp\n",
+    "         .assign( IQR = z_tmp[\"75%\"]-z_tmp[\"25%\"] )\n",
+    "         .droplevel(level=1)\n",
+    "         .round(2) )\n",
+    "\n",
+    "z_summary.join( all_counts )[metrics_to_show]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Summary statistics for the ratio in each organisation type"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "lines_to_next_cell": 0
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>n</th>\n",
+       "      <th>chemicals</th>\n",
+       "      <th>median</th>\n",
+       "      <th>max</th>\n",
+       "      <th>min</th>\n",
+       "      <th>IQR</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>entity</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>stp</th>\n",
+       "      <td>42</td>\n",
+       "      <td>364</td>\n",
+       "      <td>0.09</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.64</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ccg</th>\n",
+       "      <td>106</td>\n",
+       "      <td>706</td>\n",
+       "      <td>0.12</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.61</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>pcn</th>\n",
+       "      <td>1257</td>\n",
+       "      <td>1294</td>\n",
+       "      <td>0.13</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.49</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>practice</th>\n",
+       "      <td>6499</td>\n",
+       "      <td>1274</td>\n",
+       "      <td>0.14</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.44</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "             n  chemicals  median  max  min   IQR\n",
+       "entity                                           \n",
+       "stp         42        364    0.09  1.0  0.0  0.64\n",
+       "ccg        106        706    0.12  1.0  0.0  0.61\n",
+       "pcn       1257       1294    0.13  1.0  0.0  0.49\n",
+       "practice  6499       1274    0.14  1.0  0.0  0.44"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "### Extracting the summary statistics for the ratio\n",
+    "ratio_tmp = all_summary[all_summary.index.isin([\"ratio\"], level=1)]\n",
+    "\n",
+    "### Calculating IQR, removing the row index and rounding to 2dp\n",
+    "ratio_summary = ( ratio_tmp\n",
+    "         .assign( IQR = ratio_tmp[\"75%\"]-ratio_tmp[\"25%\"] )\n",
+    "         .droplevel(level=1)\n",
+    "         .round(2) )\n",
+    "\n",
+    "ratio_summary.join( all_counts )[metrics_to_show]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "jupytext": {
+   "cell_metadata_filter": "all",
+   "encoding": "# -*- coding: utf-8 -*-",
+   "notebook_metadata_filter": "all,-language_info",
+   "text_representation": {
+    "extension": ".py",
+    "format_name": "light",
+    "format_version": "1.5",
+    "jupytext_version": "1.3.4"
+   }
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/diffable_python/calculate_summary_statistics.py b/notebooks/diffable_python/calculate_summary_statistics.py
new file mode 100644
index 000000000..71287c4ea
--- /dev/null
+++ b/notebooks/diffable_python/calculate_summary_statistics.py
@@ -0,0 +1,117 @@
+# -*- coding: utf-8 -*-
+# ---
+# jupyter:
+#   jupytext:
+#     cell_metadata_filter: all
+#     notebook_metadata_filter: all,-language_info
+#     text_representation:
+#       extension: .py
+#       format_name: light
+#       format_version: '1.5'
+#       jupytext_version: 1.3.4
+#   kernelspec:
+#     display_name: Python 3
+#     language: python
+#     name: python3
+# ---
+
+from lib.outliers import Runner
+from datetime import date
+import pandas as pd
+
+from_date = date(year=2021,month=4,day=1)
+to_date = date(year=2021,month=8,day=1)
+r = Runner(from_date,to_date,5,["practice","ccg","pcn","stp"],False)
+
+r.run()
+
+# +
+### Extracting all the stored z scores etc across organisations
+### so that summary statistics can be calculated
+
+e_data = pd.concat(
+    (d.assign(entity=e) for e, d in r.build.results.items())
+)
+# -
+
+# ## Entity counts
+#
+# Counts of each kind of entity (i.e., organisation).
+
+# +
+### Summarising the number of each kind of entity (organisation)
+
+e_counts = ( e_data.reset_index()[["practice","entity"]]
+            .drop_duplicates()['entity']
+            .value_counts()
+            .to_frame()
+            .rename( columns={'entity':'n'} ) )
+
+e_counts
+# -
+
+# ## Chemical counts
+#
+# Counts of the number of chemicals for which we have data (Z scores etc)
+# within each type of organisation.
+
+# +
+### Summarising the number of unique chemicals analysed within
+### each type of organisation
+
+c_counts = ( e_data.reset_index()[["chemical","entity"]]
+            .drop_duplicates()['entity']
+            .value_counts()
+            .to_frame()
+            .rename( columns={'entity':'chemicals'} ) )
+
+c_counts
+
+# +
+### Combining the entity and chemical counts
+
+all_counts = e_counts.join( c_counts )
+
+
+# +
+### Calculating summary statistics for the ratio and the Z score
+### within each entity type
+
+all_summary = e_data.groupby( "entity" )[["ratio","z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)
+all_summary = all_summary.rename( columns={"50%":"median"}, inplace=False )
+
+### Defining which metrics will be displayed below
+metrics_to_show = [ "n", "chemicals", "median","max","min","IQR" ]
+# -
+
+
+# ## Summary statistics for the z score in each organisation type
+
+# +
+### Extracting the summary statistics for the z scores
+z_tmp = all_summary[all_summary.index.isin(["z_score"], level=1)]
+
+### Calculating IQR, removing the row index and rounding to 2dp
+z_summary = ( z_tmp
+         .assign( IQR = z_tmp["75%"]-z_tmp["25%"] )
+         .droplevel(level=1)
+         .round(2) )
+
+z_summary.join( all_counts )[metrics_to_show]
+# -
+
+# ## Summary statistics for the ratio in each organisation type
+
+# +
+### Extracting the summary statistics for the ratio
+ratio_tmp = all_summary[all_summary.index.isin(["ratio"], level=1)]
+
+### Calculating IQR, removing the row index and rounding to 2dp
+ratio_summary = ( ratio_tmp
+         .assign( IQR = ratio_tmp["75%"]-ratio_tmp["25%"] )
+         .droplevel(level=1)
+         .round(2) )
+
+ratio_summary.join( all_counts )[metrics_to_show]
+# -
+

From d0d319d1a47410c438f6b663bb6f5aeb40f0d61a Mon Sep 17 00:00:00 2001
From: Lisa Hopcroft <54442530+LisaHopcroft@users.noreply.github.com>
Date: Wed, 9 Mar 2022 15:26:52 +0000
Subject: [PATCH 02/11] feat: removing BigQuery key

---
 notebooks/calculate_summary_statistics.ipynb | 101 +------------------
 1 file changed, 2 insertions(+), 99 deletions(-)

diff --git a/notebooks/calculate_summary_statistics.ipynb b/notebooks/calculate_summary_statistics.ipynb
index d13ef013a..c1a306529 100644
--- a/notebooks/calculate_summary_statistics.ipynb
+++ b/notebooks/calculate_summary_statistics.ipynb
@@ -24,106 +24,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=725825577420-unm2gnkiprugilg743tkbig250f4sfsj.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fbigquery&state=7VcN0yeYTzq2E9Z3jdPuuj7sEjFtTb&prompt=consent&access_type=offline\n"
-     ]
-    },
-    {
-     "name": "stdin",
-     "output_type": "stream",
-     "text": [
-      "Enter the authorization code:  4/1AX4XfWiHK9sZGQcEnE9I3G-pDKHnzzLRbZ2FYsyEvv0x8Omm_TOcfbh3Z3A\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Downloading: 100%|██████████| 1/1 [00:00<00:00,  6.93rows/s]\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "e6192a9e28d549babea7ec4d09af9479",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "HBox(children=(FloatProgress(value=0.0, description='SUBMITTING | ', max=6499.0, style=ProgressStyle(descripti…"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "77ffb0a12f3547acb5dd1ad59ea70c89",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "HBox(children=(FloatProgress(value=0.0, description='PROCESSING | ', max=6499.0, style=ProgressStyle(descripti…"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "8bc9cd88d202472ea47dca9b54750c3a",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "HBox(children=(FloatProgress(value=0.0, description='COLLECTING | ', max=6499.0, style=ProgressStyle(descripti…"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n"
-     ]
-    },
-    {
-     "ename": "TypeError",
-     "evalue": "add_item() argument after ** must be a mapping, not BrokenProcessPool",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-3-65f6f5f86dd5>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
-      "\u001b[0;32m/home/app/notebook/lib/outliers.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    788\u001b[0m             \u001b[0;32mfor\u001b[0m \u001b[0mf\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_run_entity_report\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    789\u001b[0m                 \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_results\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 790\u001b[0;31m                 \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_item\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    791\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    792\u001b[0m         \u001b[0;31m# write out toc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;31mTypeError\u001b[0m: add_item() argument after ** must be a mapping, not BrokenProcessPool"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "r.run()"
    ]

From 1ba0a43c5055790ea00dc11ce155711f6973d242 Mon Sep 17 00:00:00 2001
From: Lisa Hopcroft <54442530+LisaHopcroft@users.noreply.github.com>
Date: Wed, 9 Mar 2022 16:36:57 +0000
Subject: [PATCH 03/11] feat: fix to only extract results, rather than generate
 new results

---
 notebooks/calculate_summary_statistics.ipynb              | 3 ++-
 notebooks/diffable_python/calculate_summary_statistics.py | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/notebooks/calculate_summary_statistics.ipynb b/notebooks/calculate_summary_statistics.ipynb
index c1a306529..4af94531b 100644
--- a/notebooks/calculate_summary_statistics.ipynb
+++ b/notebooks/calculate_summary_statistics.ipynb
@@ -28,7 +28,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "r.run()"
+    "r.build.run()\n",
+    "r.build.fetch_results()"
    ]
   },
   {
diff --git a/notebooks/diffable_python/calculate_summary_statistics.py b/notebooks/diffable_python/calculate_summary_statistics.py
index 71287c4ea..b455eb46e 100644
--- a/notebooks/diffable_python/calculate_summary_statistics.py
+++ b/notebooks/diffable_python/calculate_summary_statistics.py
@@ -23,7 +23,8 @@
 to_date = date(year=2021,month=8,day=1)
 r = Runner(from_date,to_date,5,["practice","ccg","pcn","stp"],False)
 
-r.run()
+r.build.run()
+r.build.fetch_results()
 
 # +
 ### Extracting all the stored z scores etc across organisations

From a756eef77a5d24307e9de5df19b7d56cfc34a9ff Mon Sep 17 00:00:00 2001
From: Lisa Hopcroft <54442530+LisaHopcroft@users.noreply.github.com>
Date: Thu, 10 Mar 2022 09:37:44 +0000
Subject: [PATCH 04/11] feat: separating results for higher-than-most and
 lower-than-most

---
 notebooks/calculate_summary_statistics.ipynb  | 355 ++++++++++++++----
 .../calculate_summary_statistics.py           |  69 ++--
 2 files changed, 327 insertions(+), 97 deletions(-)

diff --git a/notebooks/calculate_summary_statistics.ipynb b/notebooks/calculate_summary_statistics.ipynb
index 4af94531b..5a5d93d8a 100644
--- a/notebooks/calculate_summary_statistics.ipynb
+++ b/notebooks/calculate_summary_statistics.ipynb
@@ -24,9 +24,31 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=725825577420-unm2gnkiprugilg743tkbig250f4sfsj.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fbigquery&state=B0aBfO2cFTgPTpWIrbEXhCszrTmcNv&prompt=consent&access_type=offline\n"
+     ]
+    },
+    {
+     "name": "stdin",
+     "output_type": "stream",
+     "text": [
+      "Enter the authorization code:  4/1AX4XfWjWfGmWhSn3IUgFJA9Y1gOE418Hgdc8PD98NKa2Y2AW1-2axRotGlg\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Downloading: 100%|██████████| 1/1 [00:00<00:00,  4.85rows/s]\n"
+     ]
+    }
+   ],
    "source": [
     "r.build.run()\n",
     "r.build.fetch_results()"
@@ -34,7 +56,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -57,7 +79,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
@@ -113,7 +135,7 @@
        "stp         42"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 15,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -136,13 +158,13 @@
    "source": [
     "## Chemical counts\n",
     "\n",
-    "Counts of the number of chemicals for which we have data (Z scores etc)\n",
-    "within each type of organisation."
+    "Counts of the number of unique outlying chemicals (i.e., those identified in the top/bottom\n",
+    "5 z scores) amongst all organisations of the given type."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [
     {
@@ -198,14 +220,14 @@
        "stp             364"
       ]
      },
-     "execution_count": 6,
+     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "### Summarising the number of unique chemicals analysed within\n",
-    "### each type of organisation\n",
+    "### Summarising the number of unique chemicals identified in the\n",
+    "### top/bottom five outliers amongst all organisations of the given type\n",
     "\n",
     "c_counts = ( e_data.reset_index()[[\"chemical\",\"entity\"]]\n",
     "            .drop_duplicates()['entity']\n",
@@ -218,7 +240,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 17,
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -231,32 +253,60 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 18,
    "metadata": {
     "lines_to_next_cell": 2
    },
    "outputs": [],
    "source": [
-    "### Calculating summary statistics for the ratio and the Z score\n",
-    "### within each entity type\n",
+    "### Defining which metrics will be displayed in the summary tables\n",
+    "metrics_to_show = [ \"n\", \"chemicals\", \"median\",\"max\",\"min\",\"IQR\" ]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "### Calculating summary statistics for the Z scores for those chemicals\n",
+    "### identified in the TOP 5 in at least one organisation of the entity type.\n",
+    "### These are the chemicals displayed in the 'Higher than most' table.\n",
     "\n",
-    "all_summary = e_data.groupby( \"entity\" )[[\"ratio\",\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n",
-    "all_summary = all_summary.rename( columns={\"50%\":\"median\"}, inplace=False )\n",
+    "overused_summary = e_data.query('z_score>0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n",
+    "overused_summary = overused_summary.rename( columns={\"50%\":\"median\"}, inplace=False )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "### Calculating summary statistics for the Z scores for those chemicals\n",
+    "### identified in the BOTTOM 5 in at least one organisation of the entity type.\n",
+    "### These are the chemicals displayed in the 'Lower than most' table.\n",
     "\n",
-    "### Defining which metrics will be displayed below\n",
-    "metrics_to_show = [ \"n\", \"chemicals\", \"median\",\"max\",\"min\",\"IQR\" ]"
+    "underused_summary = e_data.query('z_score<0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n",
+    "underused_summary = underused_summary.rename( columns={\"50%\":\"median\"}, inplace=False )"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Summary statistics for the z score in each organisation type"
+    "## Summary statistics for outlying Z scores in each organisation type\n",
+    "\n",
+    "### Higher than most chemicals\n",
+    "\n",
+    "The table below summarises the Z scores for the high outlying (i.e., top 5) chemicals\n",
+    "in each type of organisation. These chemicals are seen to be used more often\n",
+    "in a particular organisation than its peers."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [
     {
@@ -302,79 +352,83 @@
        "      <th>stp</th>\n",
        "      <td>42</td>\n",
        "      <td>364</td>\n",
-       "      <td>3.82</td>\n",
+       "      <td>6.14</td>\n",
        "      <td>6.33</td>\n",
-       "      <td>-6.33</td>\n",
-       "      <td>8.90</td>\n",
+       "      <td>3.68</td>\n",
+       "      <td>0.78</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>ccg</th>\n",
        "      <td>106</td>\n",
        "      <td>706</td>\n",
-       "      <td>3.57</td>\n",
+       "      <td>7.36</td>\n",
        "      <td>10.20</td>\n",
-       "      <td>-10.20</td>\n",
-       "      <td>10.14</td>\n",
+       "      <td>3.56</td>\n",
+       "      <td>3.29</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>pcn</th>\n",
        "      <td>1257</td>\n",
        "      <td>1294</td>\n",
-       "      <td>2.58</td>\n",
+       "      <td>7.26</td>\n",
        "      <td>543.19</td>\n",
-       "      <td>-141.33</td>\n",
-       "      <td>9.86</td>\n",
+       "      <td>2.58</td>\n",
+       "      <td>5.11</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>practice</th>\n",
        "      <td>6499</td>\n",
        "      <td>1274</td>\n",
-       "      <td>0.00</td>\n",
+       "      <td>7.28</td>\n",
        "      <td>5512.02</td>\n",
-       "      <td>-711.87</td>\n",
-       "      <td>9.72</td>\n",
+       "      <td>1.13</td>\n",
+       "      <td>5.96</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "             n  chemicals  median      max     min    IQR\n",
-       "entity                                                   \n",
-       "stp         42        364    3.82     6.33   -6.33   8.90\n",
-       "ccg        106        706    3.57    10.20  -10.20  10.14\n",
-       "pcn       1257       1294    2.58   543.19 -141.33   9.86\n",
-       "practice  6499       1274    0.00  5512.02 -711.87   9.72"
+       "             n  chemicals  median      max   min   IQR\n",
+       "entity                                                \n",
+       "stp         42        364    6.14     6.33  3.68  0.78\n",
+       "ccg        106        706    7.36    10.20  3.56  3.29\n",
+       "pcn       1257       1294    7.26   543.19  2.58  5.11\n",
+       "practice  6499       1274    7.28  5512.02  1.13  5.96"
       ]
      },
-     "execution_count": 9,
+     "execution_count": 21,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "### Extracting the summary statistics for the z scores\n",
-    "z_tmp = all_summary[all_summary.index.isin([\"z_score\"], level=1)]\n",
+    "overused_tmp = overused_summary[overused_summary.index.isin([\"z_score\"], level=1)]\n",
     "\n",
     "### Calculating IQR, removing the row index and rounding to 2dp\n",
-    "z_summary = ( z_tmp\n",
-    "         .assign( IQR = z_tmp[\"75%\"]-z_tmp[\"25%\"] )\n",
+    "overused_toprint = ( overused_tmp\n",
+    "         .assign( IQR = overused_tmp[\"75%\"]-overused_tmp[\"25%\"] )\n",
     "         .droplevel(level=1)\n",
     "         .round(2) )\n",
     "\n",
-    "z_summary.join( all_counts )[metrics_to_show]"
+    "overused_toprint.join( all_counts )[metrics_to_show]"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Summary statistics for the ratio in each organisation type"
+    "### Lower than most chemicals\n",
+    "\n",
+    "The table below summarises the Z scores for the low outlying (i.e., bottom 5) chemicals\n",
+    "in each type of organisation. These chemicals are seen to be used less often\n",
+    "in a particular organisation than its peers."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 22,
    "metadata": {
     "lines_to_next_cell": 0
    },
@@ -422,75 +476,224 @@
        "      <th>stp</th>\n",
        "      <td>42</td>\n",
        "      <td>364</td>\n",
-       "      <td>0.09</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.64</td>\n",
+       "      <td>-2.77</td>\n",
+       "      <td>-1.78</td>\n",
+       "      <td>-6.33</td>\n",
+       "      <td>1.15</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>ccg</th>\n",
        "      <td>106</td>\n",
        "      <td>706</td>\n",
-       "      <td>0.12</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.61</td>\n",
+       "      <td>-2.78</td>\n",
+       "      <td>-1.47</td>\n",
+       "      <td>-10.20</td>\n",
+       "      <td>1.16</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>pcn</th>\n",
        "      <td>1257</td>\n",
        "      <td>1294</td>\n",
-       "      <td>0.13</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.49</td>\n",
+       "      <td>-2.61</td>\n",
+       "      <td>-1.49</td>\n",
+       "      <td>-141.33</td>\n",
+       "      <td>1.03</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>practice</th>\n",
        "      <td>6499</td>\n",
        "      <td>1274</td>\n",
-       "      <td>0.14</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.44</td>\n",
+       "      <td>-2.44</td>\n",
+       "      <td>-1.13</td>\n",
+       "      <td>-711.87</td>\n",
+       "      <td>1.07</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "             n  chemicals  median  max  min   IQR\n",
-       "entity                                           \n",
-       "stp         42        364    0.09  1.0  0.0  0.64\n",
-       "ccg        106        706    0.12  1.0  0.0  0.61\n",
-       "pcn       1257       1294    0.13  1.0  0.0  0.49\n",
-       "practice  6499       1274    0.14  1.0  0.0  0.44"
+       "             n  chemicals  median   max     min   IQR\n",
+       "entity                                               \n",
+       "stp         42        364   -2.77 -1.78   -6.33  1.15\n",
+       "ccg        106        706   -2.78 -1.47  -10.20  1.16\n",
+       "pcn       1257       1294   -2.61 -1.49 -141.33  1.03\n",
+       "practice  6499       1274   -2.44 -1.13 -711.87  1.07"
       ]
      },
-     "execution_count": 10,
+     "execution_count": 22,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "### Extracting the summary statistics for the z scores\n",
-    "ratio_tmp = all_summary[all_summary.index.isin([\"ratio\"], level=1)]\n",
+    "underused_tmp = underused_summary[underused_summary.index.isin([\"z_score\"], level=1)]\n",
     "\n",
     "### Calculating IQR, removing the row index and rounding to 2dp\n",
-    "ratio_summary = ( ratio_tmp\n",
-    "         .assign( IQR = ratio_tmp[\"75%\"]-ratio_tmp[\"25%\"] )\n",
+    "underused_toprint = ( underused_tmp\n",
+    "         .assign( IQR = underused_tmp[\"75%\"]-underused_tmp[\"25%\"] )\n",
     "         .droplevel(level=1)\n",
     "         .round(2) )\n",
     "\n",
-    "ratio_summary.join( all_counts )[metrics_to_show]"
+    "underused_toprint.join( all_counts )[metrics_to_show]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Summary\n",
+    "\n",
+    "Below is a summary table that combines the 'Higher than most' and 'Lower than most'\n",
+    "results displayed above."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 23,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead tr th {\n",
+       "        text-align: left;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead tr:last-of-type th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <th colspan=\"6\" halign=\"left\">Higher than most</th>\n",
+       "      <th colspan=\"4\" halign=\"left\">Lower than most</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <th>n</th>\n",
+       "      <th>chemicals</th>\n",
+       "      <th>median</th>\n",
+       "      <th>max</th>\n",
+       "      <th>min</th>\n",
+       "      <th>IQR</th>\n",
+       "      <th>median</th>\n",
+       "      <th>max</th>\n",
+       "      <th>min</th>\n",
+       "      <th>IQR</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>entity</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>stp</th>\n",
+       "      <td>42</td>\n",
+       "      <td>364</td>\n",
+       "      <td>6.14</td>\n",
+       "      <td>6.33</td>\n",
+       "      <td>3.68</td>\n",
+       "      <td>0.78</td>\n",
+       "      <td>-2.77</td>\n",
+       "      <td>-1.78</td>\n",
+       "      <td>-6.33</td>\n",
+       "      <td>1.15</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ccg</th>\n",
+       "      <td>106</td>\n",
+       "      <td>706</td>\n",
+       "      <td>7.36</td>\n",
+       "      <td>10.20</td>\n",
+       "      <td>3.56</td>\n",
+       "      <td>3.29</td>\n",
+       "      <td>-2.78</td>\n",
+       "      <td>-1.47</td>\n",
+       "      <td>-10.20</td>\n",
+       "      <td>1.16</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>pcn</th>\n",
+       "      <td>1257</td>\n",
+       "      <td>1294</td>\n",
+       "      <td>7.26</td>\n",
+       "      <td>543.19</td>\n",
+       "      <td>2.58</td>\n",
+       "      <td>5.11</td>\n",
+       "      <td>-2.61</td>\n",
+       "      <td>-1.49</td>\n",
+       "      <td>-141.33</td>\n",
+       "      <td>1.03</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>practice</th>\n",
+       "      <td>6499</td>\n",
+       "      <td>1274</td>\n",
+       "      <td>7.28</td>\n",
+       "      <td>5512.02</td>\n",
+       "      <td>1.13</td>\n",
+       "      <td>5.96</td>\n",
+       "      <td>-2.44</td>\n",
+       "      <td>-1.13</td>\n",
+       "      <td>-711.87</td>\n",
+       "      <td>1.07</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         Higher than most                                        \\\n",
+       "                        n chemicals median      max   min   IQR   \n",
+       "entity                                                            \n",
+       "stp                    42       364   6.14     6.33  3.68  0.78   \n",
+       "ccg                   106       706   7.36    10.20  3.56  3.29   \n",
+       "pcn                  1257      1294   7.26   543.19  2.58  5.11   \n",
+       "practice             6499      1274   7.28  5512.02  1.13  5.96   \n",
+       "\n",
+       "         Lower than most                      \n",
+       "                  median   max     min   IQR  \n",
+       "entity                                        \n",
+       "stp                -2.77 -1.78   -6.33  1.15  \n",
+       "ccg                -2.78 -1.47  -10.20  1.16  \n",
+       "pcn                -2.61 -1.49 -141.33  1.03  \n",
+       "practice           -2.44 -1.13 -711.87  1.07  "
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pd.concat([overused_toprint.join( all_counts )[metrics_to_show],\n",
+    "           underused_toprint[metrics_to_show[2:]]],\n",
+    "          keys=[\"Higher than most\", \"Lower than most\"],axis=1)"
+   ]
   }
  ],
  "metadata": {
diff --git a/notebooks/diffable_python/calculate_summary_statistics.py b/notebooks/diffable_python/calculate_summary_statistics.py
index b455eb46e..652720e3d 100644
--- a/notebooks/diffable_python/calculate_summary_statistics.py
+++ b/notebooks/diffable_python/calculate_summary_statistics.py
@@ -53,12 +53,12 @@
 
 # ## Chemical counts
 #
-# Counts of the number of chemicals for which we have data (Z scores etc)
-# within each type of organisation.
+# Counts of the number of unique outlying chemicals (i.e., those identified in the top/bottom
+# 5 z scores) amongst all organisations of the given type.
 
 # +
-### Summarising the number of unique chemicals analysed within
-### each type of organisation
+### Summarising the number of unique chemicals identified in the
+### top/bottom five outliers amongst all organisations of the given type
 
 c_counts = ( e_data.reset_index()[["chemical","entity"]]
             .drop_duplicates()['entity']
@@ -75,44 +75,71 @@
 
 
 # +
-### Calculating summary statistics for the ratio and the Z score
-### within each entity type
+### Defining which metrics will be displayed in the summary tables
+metrics_to_show = [ "n", "chemicals", "median","max","min","IQR" ]
 
-all_summary = e_data.groupby( "entity" )[["ratio","z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)
-all_summary = all_summary.rename( columns={"50%":"median"}, inplace=False )
 
-### Defining which metrics will be displayed below
-metrics_to_show = [ "n", "chemicals", "median","max","min","IQR" ]
-# -
+# +
+### Calculating summary statistics for the Z scores for those chemicals
+### identified in the TOP 5 in at least one organisation of the entity type.
+### These are the chemicals displayed in the 'Higher than most' table.
+
+overused_summary = e_data.query('z_score>0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)
+overused_summary = overused_summary.rename( columns={"50%":"median"}, inplace=False )
+
+# +
+### Calculating summary statistics for the Z scores for those chemicals
+### identified in the BOTTOM 5 in at least one organisation of the entity type.
+### These are the chemicals displayed in the 'Lower than most' table.
 
+underused_summary = e_data.query('z_score<0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)
+underused_summary = underused_summary.rename( columns={"50%":"median"}, inplace=False )
+# -
 
-# ## Summary statistics for the z score in each organisation type
+# ## Summary statistics for outlying Z scores in each organisation type
+#
+# ### Higher than most chemicals
+#
+# The table below summarises the Z scores for the high outlying (i.e., top 5) chemicals
+# in each type of organisation. These chemicals are seen to be used more often
+# in a particular organisation than its peers.
 
 # +
 ### Extracting the summary statistics for the z scores
-z_tmp = all_summary[all_summary.index.isin(["z_score"], level=1)]
+overused_tmp = overused_summary[overused_summary.index.isin(["z_score"], level=1)]
 
 ### Calculating IQR, removing the row index and rounding to 2dp
-z_summary = ( z_tmp
-         .assign( IQR = z_tmp["75%"]-z_tmp["25%"] )
+overused_toprint = ( overused_tmp
+         .assign( IQR = overused_tmp["75%"]-overused_tmp["25%"] )
          .droplevel(level=1)
          .round(2) )
 
-z_summary.join( all_counts )[metrics_to_show]
+overused_toprint.join( all_counts )[metrics_to_show]
 # -
 
-# ## Summary statistics for the ratio in each organisation type
+# ### Lower than most chemicals
+#
+# The table below summarises the Z scores for the low outlying (i.e., bottom 5) chemicals
+# in each type of organisation. These chemicals are seen to be used less often
+# in a particular organisation than its peers.
 
 # +
 ### Extracting the summary statistics for the z scores
-ratio_tmp = all_summary[all_summary.index.isin(["ratio"], level=1)]
+underused_tmp = underused_summary[underused_summary.index.isin(["z_score"], level=1)]
 
 ### Calculating IQR, removing the row index and rounding to 2dp
-ratio_summary = ( ratio_tmp
-         .assign( IQR = ratio_tmp["75%"]-ratio_tmp["25%"] )
+underused_toprint = ( underused_tmp
+         .assign( IQR = underused_tmp["75%"]-underused_tmp["25%"] )
          .droplevel(level=1)
          .round(2) )
 
-ratio_summary.join( all_counts )[metrics_to_show]
+underused_toprint.join( all_counts )[metrics_to_show]
 # -
+# ### Summary
+#
+# Below is a summary table that combines the 'Higher than most' and 'Lower than most'
+# results displayed above.
 
+pd.concat([overused_toprint.join( all_counts )[metrics_to_show],
+           underused_toprint[metrics_to_show[2:]]],
+          keys=["Higher than most", "Lower than most"],axis=1)

From 6fd6f91c6fc398ee1e73d9c861c325a7487320da Mon Sep 17 00:00:00 2001
From: Lisa Hopcroft <54442530+LisaHopcroft@users.noreply.github.com>
Date: Wed, 13 Apr 2022 13:57:56 +0100
Subject: [PATCH 05/11] feat: update to notebook to include the top/bottom ten
 results

---
 ...alculate_summary_statistics_outlying.ipynb | 736 ++++++++++++++++++
 .../calculate_summary_statistics_outlying.py  | 147 ++++
 2 files changed, 883 insertions(+)
 create mode 100644 notebooks/calculate_summary_statistics_outlying.ipynb
 create mode 100644 notebooks/diffable_python/calculate_summary_statistics_outlying.py

diff --git a/notebooks/calculate_summary_statistics_outlying.ipynb b/notebooks/calculate_summary_statistics_outlying.ipynb
new file mode 100644
index 000000000..24f2beb90
--- /dev/null
+++ b/notebooks/calculate_summary_statistics_outlying.ipynb
@@ -0,0 +1,736 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from lib.outliers import Runner\n",
+    "from datetime import date\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from_date = date(year=2021,month=6,day=1)\n",
+    "to_date = date(year=2021,month=12,day=1)\n",
+    "r = Runner(from_date,to_date,10,[\"practice\",\"ccg\",\"pcn\",\"stp\"],False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Downloading: 100%|██████████| 1/1 [00:00<00:00,  6.53rows/s]\n",
+      "Downloading: 100%|██████████| 129532/129532 [00:11<00:00, 11161.65rows/s]\n",
+      "Downloading: 100%|██████████| 302743/302743 [00:11<00:00, 26079.48rows/s]\n",
+      "Downloading: 100%|██████████| 1346/1346 [00:40<00:00, 33.64rows/s]\n",
+      "Downloading: 100%|██████████| 2121/2121 [00:00<00:00, 4642.75rows/s]\n",
+      "Downloading: 100%|██████████| 10905/10905 [00:00<00:00, 14726.74rows/s]\n",
+      "Downloading: 100%|██████████| 1138/1138 [00:00<00:00, 1338.28rows/s]\n",
+      "Downloading: 100%|██████████| 25140/25140 [00:02<00:00, 10127.85rows/s]\n",
+      "Downloading: 100%|██████████| 89049/89049 [00:05<00:00, 17194.56rows/s]\n",
+      "Downloading: 100%|██████████| 1416/1416 [00:08<00:00, 171.31rows/s]\n",
+      "Downloading: 100%|██████████| 842/842 [00:00<00:00, 3119.24rows/s]\n",
+      "Downloading: 100%|██████████| 3992/3992 [00:00<00:00, 11083.61rows/s]\n",
+      "Downloading: 100%|██████████| 680/680 [00:00<00:00, 1650.28rows/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "r.build.run()\n",
+    "r.build.fetch_results()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "### Extracting all the stored z scores etc across organisations\n",
+    "### so that summary statistics can be calculated\n",
+    "\n",
+    "e_data = pd.concat(\n",
+    "    (d.assign(entity=e) for e, d in r.build.results.items())\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Entity counts\n",
+    "\n",
+    "Counts of each kind of entity (i.e., organisation)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>n</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>practice</th>\n",
+       "      <td>6476</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>pcn</th>\n",
+       "      <td>1257</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ccg</th>\n",
+       "      <td>106</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>stp</th>\n",
+       "      <td>42</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "             n\n",
+       "practice  6476\n",
+       "pcn       1257\n",
+       "ccg        106\n",
+       "stp         42"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "### Summarising the number of each kind of entity (organisation)\n",
+    "\n",
+    "e_counts = ( e_data.reset_index()[[\"practice\",\"entity\"]]\n",
+    "            .drop_duplicates()['entity']\n",
+    "            .value_counts()\n",
+    "            .to_frame()\n",
+    "            .rename( columns={'entity':'n'} ) )\n",
+    "\n",
+    "e_counts"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Chemical counts\n",
+    "\n",
+    "Counts of the number of unique outlying chemicals (i.e., those identified in the top/bottom\n",
+    "10 z scores) amongst all organisations of the given type."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>chemicals</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>pcn</th>\n",
+       "      <td>1416</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>practice</th>\n",
+       "      <td>1346</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ccg</th>\n",
+       "      <td>1138</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>stp</th>\n",
+       "      <td>680</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "          chemicals\n",
+       "pcn            1416\n",
+       "practice       1346\n",
+       "ccg            1138\n",
+       "stp             680"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "### Summarising the number of unique chemicals identified in the\n",
+    "### top/bottom ten outliers amongst all organisations of the given type\n",
+    "\n",
+    "c_counts = ( e_data.reset_index()[[\"chemical\",\"entity\"]]\n",
+    "            .drop_duplicates()['entity']\n",
+    "            .value_counts()\n",
+    "            .to_frame()\n",
+    "            .rename( columns={'entity':'chemicals'} ) )\n",
+    "\n",
+    "c_counts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
+   "outputs": [],
+   "source": [
+    "### Combining the entity and chemical counts\n",
+    "\n",
+    "all_counts = e_counts.join( c_counts )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
+   "outputs": [],
+   "source": [
+    "### Defining which metrics will be displayed in the summary tables\n",
+    "metrics_to_show = [ \"n\", \"chemicals\", \"median\",\"max\",\"min\",\"IQR\" ]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "### Calculating summary statistics for the Z scores for those chemicals\n",
+    "### identified in the TOP 10 in at least one organisation of the entity type.\n",
+    "### These are the chemicals displayed in the 'Higher than most' table.\n",
+    "\n",
+    "overused_summary = e_data.query('z_score>0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n",
+    "overused_summary = overused_summary.rename( columns={\"50%\":\"median\"}, inplace=False )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "### Calculating summary statistics for the Z scores for those chemicals\n",
+    "### identified in the BOTTOM 10 in at least one organisation of the entity type.\n",
+    "### These are the chemicals displayed in the 'Lower than most' table.\n",
+    "\n",
+    "underused_summary = e_data.query('z_score<0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n",
+    "underused_summary = underused_summary.rename( columns={\"50%\":\"median\"}, inplace=False )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Summary statistics for outlying Z scores in each organisation type\n",
+    "\n",
+    "### Higher than most chemicals\n",
+    "\n",
+    "The table below summarises the Z scores for the high outlying (i.e., top 10) chemicals\n",
+    "in each type of organisation. These chemicals are seen to be used more often\n",
+    "in a particular organisation than its peers."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>n</th>\n",
+       "      <th>chemicals</th>\n",
+       "      <th>median</th>\n",
+       "      <th>max</th>\n",
+       "      <th>min</th>\n",
+       "      <th>IQR</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>entity</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>stp</th>\n",
+       "      <td>42</td>\n",
+       "      <td>680</td>\n",
+       "      <td>5.42</td>\n",
+       "      <td>6.33</td>\n",
+       "      <td>2.68</td>\n",
+       "      <td>1.64</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ccg</th>\n",
+       "      <td>106</td>\n",
+       "      <td>1138</td>\n",
+       "      <td>5.79</td>\n",
+       "      <td>10.20</td>\n",
+       "      <td>2.76</td>\n",
+       "      <td>3.18</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>pcn</th>\n",
+       "      <td>1257</td>\n",
+       "      <td>1416</td>\n",
+       "      <td>5.28</td>\n",
+       "      <td>2528.09</td>\n",
+       "      <td>2.26</td>\n",
+       "      <td>3.39</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>practice</th>\n",
+       "      <td>6476</td>\n",
+       "      <td>1346</td>\n",
+       "      <td>5.23</td>\n",
+       "      <td>6825.50</td>\n",
+       "      <td>1.21</td>\n",
+       "      <td>3.84</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "             n  chemicals  median      max   min   IQR\n",
+       "entity                                                \n",
+       "stp         42        680    5.42     6.33  2.68  1.64\n",
+       "ccg        106       1138    5.79    10.20  2.76  3.18\n",
+       "pcn       1257       1416    5.28  2528.09  2.26  3.39\n",
+       "practice  6476       1346    5.23  6825.50  1.21  3.84"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "### Extracting the summary statistics for the z scores\n",
+    "overused_tmp = overused_summary[overused_summary.index.isin([\"z_score\"], level=1)]\n",
+    "\n",
+    "### Calculating IQR, removing the row index and rounding to 2dp\n",
+    "overused_toprint = ( overused_tmp\n",
+    "         .assign( IQR = overused_tmp[\"75%\"]-overused_tmp[\"25%\"] )\n",
+    "         .droplevel(level=1)\n",
+    "         .round(2) )\n",
+    "\n",
+    "overused_toprint.join( all_counts )[metrics_to_show]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Lower than most chemicals\n",
+    "\n",
+    "The table below summarises the Z scores for the low outlying (i.e., bottom 5) chemicals\n",
+    "in each type of organisation. These chemicals are seen to be used less often\n",
+    "in a particular organisation than its peers."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "lines_to_next_cell": 0
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>n</th>\n",
+       "      <th>chemicals</th>\n",
+       "      <th>median</th>\n",
+       "      <th>max</th>\n",
+       "      <th>min</th>\n",
+       "      <th>IQR</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>entity</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>stp</th>\n",
+       "      <td>42</td>\n",
+       "      <td>680</td>\n",
+       "      <td>-2.35</td>\n",
+       "      <td>-1.47</td>\n",
+       "      <td>-6.33</td>\n",
+       "      <td>0.79</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ccg</th>\n",
+       "      <td>106</td>\n",
+       "      <td>1138</td>\n",
+       "      <td>-2.30</td>\n",
+       "      <td>-1.33</td>\n",
+       "      <td>-10.20</td>\n",
+       "      <td>0.82</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>pcn</th>\n",
+       "      <td>1257</td>\n",
+       "      <td>1416</td>\n",
+       "      <td>-2.18</td>\n",
+       "      <td>-1.30</td>\n",
+       "      <td>-159.77</td>\n",
+       "      <td>0.77</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>practice</th>\n",
+       "      <td>6476</td>\n",
+       "      <td>1346</td>\n",
+       "      <td>-2.08</td>\n",
+       "      <td>-0.05</td>\n",
+       "      <td>-307.23</td>\n",
+       "      <td>0.81</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "             n  chemicals  median   max     min   IQR\n",
+       "entity                                               \n",
+       "stp         42        680   -2.35 -1.47   -6.33  0.79\n",
+       "ccg        106       1138   -2.30 -1.33  -10.20  0.82\n",
+       "pcn       1257       1416   -2.18 -1.30 -159.77  0.77\n",
+       "practice  6476       1346   -2.08 -0.05 -307.23  0.81"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "### Extracting the summary statistics for the z scores\n",
+    "underused_tmp = underused_summary[underused_summary.index.isin([\"z_score\"], level=1)]\n",
+    "\n",
+    "### Calculating IQR, removing the row index and rounding to 2dp\n",
+    "underused_toprint = ( underused_tmp\n",
+    "         .assign( IQR = underused_tmp[\"75%\"]-underused_tmp[\"25%\"] )\n",
+    "         .droplevel(level=1)\n",
+    "         .round(2) )\n",
+    "\n",
+    "underused_toprint.join( all_counts )[metrics_to_show]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Summary\n",
+    "\n",
+    "Below is a summary table that combines the 'Higher than most' and 'Lower than most'\n",
+    "results displayed above."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead tr th {\n",
+       "        text-align: left;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead tr:last-of-type th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <th colspan=\"6\" halign=\"left\">Higher than most</th>\n",
+       "      <th colspan=\"4\" halign=\"left\">Lower than most</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <th>n</th>\n",
+       "      <th>chemicals</th>\n",
+       "      <th>median</th>\n",
+       "      <th>max</th>\n",
+       "      <th>min</th>\n",
+       "      <th>IQR</th>\n",
+       "      <th>median</th>\n",
+       "      <th>max</th>\n",
+       "      <th>min</th>\n",
+       "      <th>IQR</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>entity</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>stp</th>\n",
+       "      <td>42</td>\n",
+       "      <td>680</td>\n",
+       "      <td>5.42</td>\n",
+       "      <td>6.33</td>\n",
+       "      <td>2.68</td>\n",
+       "      <td>1.64</td>\n",
+       "      <td>-2.35</td>\n",
+       "      <td>-1.47</td>\n",
+       "      <td>-6.33</td>\n",
+       "      <td>0.79</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ccg</th>\n",
+       "      <td>106</td>\n",
+       "      <td>1138</td>\n",
+       "      <td>5.79</td>\n",
+       "      <td>10.20</td>\n",
+       "      <td>2.76</td>\n",
+       "      <td>3.18</td>\n",
+       "      <td>-2.30</td>\n",
+       "      <td>-1.33</td>\n",
+       "      <td>-10.20</td>\n",
+       "      <td>0.82</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>pcn</th>\n",
+       "      <td>1257</td>\n",
+       "      <td>1416</td>\n",
+       "      <td>5.28</td>\n",
+       "      <td>2528.09</td>\n",
+       "      <td>2.26</td>\n",
+       "      <td>3.39</td>\n",
+       "      <td>-2.18</td>\n",
+       "      <td>-1.30</td>\n",
+       "      <td>-159.77</td>\n",
+       "      <td>0.77</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>practice</th>\n",
+       "      <td>6476</td>\n",
+       "      <td>1346</td>\n",
+       "      <td>5.23</td>\n",
+       "      <td>6825.50</td>\n",
+       "      <td>1.21</td>\n",
+       "      <td>3.84</td>\n",
+       "      <td>-2.08</td>\n",
+       "      <td>-0.05</td>\n",
+       "      <td>-307.23</td>\n",
+       "      <td>0.81</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         Higher than most                                        \\\n",
+       "                        n chemicals median      max   min   IQR   \n",
+       "entity                                                            \n",
+       "stp                    42       680   5.42     6.33  2.68  1.64   \n",
+       "ccg                   106      1138   5.79    10.20  2.76  3.18   \n",
+       "pcn                  1257      1416   5.28  2528.09  2.26  3.39   \n",
+       "practice             6476      1346   5.23  6825.50  1.21  3.84   \n",
+       "\n",
+       "         Lower than most                      \n",
+       "                  median   max     min   IQR  \n",
+       "entity                                        \n",
+       "stp                -2.35 -1.47   -6.33  0.79  \n",
+       "ccg                -2.30 -1.33  -10.20  0.82  \n",
+       "pcn                -2.18 -1.30 -159.77  0.77  \n",
+       "practice           -2.08 -0.05 -307.23  0.81  "
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pd.concat([overused_toprint.join( all_counts )[metrics_to_show],\n",
+    "           underused_toprint[metrics_to_show[2:]]],\n",
+    "          keys=[\"Higher than most\", \"Lower than most\"],axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "jupytext": {
+   "cell_metadata_filter": "all",
+   "encoding": "# -*- coding: utf-8 -*-",
+   "notebook_metadata_filter": "all,-language_info",
+   "text_representation": {
+    "extension": ".py",
+    "format_name": "light",
+    "format_version": "1.5",
+    "jupytext_version": "1.3.4"
+   }
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/diffable_python/calculate_summary_statistics_outlying.py b/notebooks/diffable_python/calculate_summary_statistics_outlying.py
new file mode 100644
index 000000000..6fb6d9d3b
--- /dev/null
+++ b/notebooks/diffable_python/calculate_summary_statistics_outlying.py
@@ -0,0 +1,147 @@
+# -*- coding: utf-8 -*-
+# ---
+# jupyter:
+#   jupytext:
+#     cell_metadata_filter: all
+#     notebook_metadata_filter: all,-language_info
+#     text_representation:
+#       extension: .py
+#       format_name: light
+#       format_version: '1.5'
+#       jupytext_version: 1.3.4
+#   kernelspec:
+#     display_name: Python 3
+#     language: python
+#     name: python3
+# ---
+
+from lib.outliers import Runner
+from datetime import date
+import pandas as pd
+
+from_date = date(year=2021,month=6,day=1)
+to_date = date(year=2021,month=12,day=1)
+r = Runner(from_date,to_date,10,["practice","ccg","pcn","stp"],False)
+
+r.build.run()
+r.build.fetch_results()
+
+# +
+### Extracting all the stored z scores etc across organisations
+### so that summary statistics can be calculated
+
+e_data = pd.concat(
+    (d.assign(entity=e) for e, d in r.build.results.items())
+)
+# -
+
+# ## Entity counts
+#
+# Counts of each kind of entity (i.e., organisation).
+
+# +
+### Summarising the number of each kind of entity (organisation)
+
+e_counts = ( e_data.reset_index()[["practice","entity"]]
+            .drop_duplicates()['entity']
+            .value_counts()
+            .to_frame()
+            .rename( columns={'entity':'n'} ) )
+
+e_counts
+# -
+
+# ## Chemical counts
+#
+# Counts of the number of unique outlying chemicals (i.e., those identified in the top/bottom
+# 5 z scores) amongst all organisations of the given type.
+
+# +
+### Summarising the number of unique chemicals identified in the
+### top/bottom five outliers amongst all organisations of the given type
+
+c_counts = ( e_data.reset_index()[["chemical","entity"]]
+            .drop_duplicates()['entity']
+            .value_counts()
+            .to_frame()
+            .rename( columns={'entity':'chemicals'} ) )
+
+c_counts
+
+# +
+### Combining the entity and chemical counts
+
+all_counts = e_counts.join( c_counts )
+
+
+# +
+### Defining which metrics will be displayed in the summary tables
+metrics_to_show = [ "n", "chemicals", "median","max","min","IQR" ]
+
+
+# +
+### Calculating summary statistics for the Z scores for those chemicals
+### identified in the TOP 5 in at least one organisation of the entity type.
+### These are the chemicals displayed in the 'Higher than most' table.
+
+overused_summary = e_data.query('z_score>0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)
+overused_summary = overused_summary.rename( columns={"50%":"median"}, inplace=False )
+
+# +
+### Calculating summary statistics for the Z scores for those chemicals
+### identified in the BOTTOM 5 in at least one organisation of the entity type.
+### These are the chemicals displayed in the 'Lower than most' table.
+
+underused_summary = e_data.query('z_score<0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)
+underused_summary = underused_summary.rename( columns={"50%":"median"}, inplace=False )
+# -
+
+# ## Summary statistics for outlying Z scores in each organisation type
+#
+# ### Higher than most chemicals
+#
+# The table below summarises the Z scores for the high outlying (i.e., top 5) chemicals
+# in each type of organisation. These chemicals are seen to be used more often
+# in a particular organisation than its peers.
+
+# +
+### Extracting the summary statistics for the z scores
+overused_tmp = overused_summary[overused_summary.index.isin(["z_score"], level=1)]
+
+### Calculating IQR, removing the row index and rounding to 2dp
+overused_toprint = ( overused_tmp
+         .assign( IQR = overused_tmp["75%"]-overused_tmp["25%"] )
+         .droplevel(level=1)
+         .round(2) )
+
+overused_toprint.join( all_counts )[metrics_to_show]
+# -
+
+# ### Lower than most chemicals
+#
+# The table below summarises the Z scores for the low outlying (i.e., bottom 5) chemicals
+# in each type of organisation. These chemicals are seen to be used less often
+# in a particular organisation than its peers.
+
+# +
+### Extracting the summary statistics for the z scores
+underused_tmp = underused_summary[underused_summary.index.isin(["z_score"], level=1)]
+
+### Calculating IQR, removing the row index and rounding to 2dp
+underused_toprint = ( underused_tmp
+         .assign( IQR = underused_tmp["75%"]-underused_tmp["25%"] )
+         .droplevel(level=1)
+         .round(2) )
+
+underused_toprint.join( all_counts )[metrics_to_show]
+# -
+# ### Summary
+#
+# Below is a summary table that combines the 'Higher than most' and 'Lower than most'
+# results displayed above.
+
+pd.concat([overused_toprint.join( all_counts )[metrics_to_show],
+           underused_toprint[metrics_to_show[2:]]],
+          keys=["Higher than most", "Lower than most"],axis=1)
+
+

From d1f8f222aaf50b8900eb9aa1e5e166194f35ed74 Mon Sep 17 00:00:00 2001
From: Lisa Hopcroft <54442530+LisaHopcroft@users.noreply.github.com>
Date: Thu, 14 Apr 2022 11:05:28 +0100
Subject: [PATCH 06/11] tidy: removing old notebook

---
 notebooks/calculate_summary_statistics.ipynb  | 731 ------------------
 .../calculate_summary_statistics.py           | 145 ----
 2 files changed, 876 deletions(-)
 delete mode 100644 notebooks/calculate_summary_statistics.ipynb
 delete mode 100644 notebooks/diffable_python/calculate_summary_statistics.py

diff --git a/notebooks/calculate_summary_statistics.ipynb b/notebooks/calculate_summary_statistics.ipynb
deleted file mode 100644
index 5a5d93d8a..000000000
--- a/notebooks/calculate_summary_statistics.ipynb
+++ /dev/null
@@ -1,731 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from lib.outliers import Runner\n",
-    "from datetime import date\n",
-    "import pandas as pd"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from_date = date(year=2021,month=4,day=1)\n",
-    "to_date = date(year=2021,month=8,day=1)\n",
-    "r = Runner(from_date,to_date,5,[\"practice\",\"ccg\",\"pcn\",\"stp\"],False)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=725825577420-unm2gnkiprugilg743tkbig250f4sfsj.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fbigquery&state=B0aBfO2cFTgPTpWIrbEXhCszrTmcNv&prompt=consent&access_type=offline\n"
-     ]
-    },
-    {
-     "name": "stdin",
-     "output_type": "stream",
-     "text": [
-      "Enter the authorization code:  4/1AX4XfWjWfGmWhSn3IUgFJA9Y1gOE418Hgdc8PD98NKa2Y2AW1-2axRotGlg\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Downloading: 100%|██████████| 1/1 [00:00<00:00,  4.85rows/s]\n"
-     ]
-    }
-   ],
-   "source": [
-    "r.build.run()\n",
-    "r.build.fetch_results()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "### Extracting all the stored z scores etc across organisations\n",
-    "### so that summary statistics can be calculated\n",
-    "\n",
-    "e_data = pd.concat(\n",
-    "    (d.assign(entity=e) for e, d in r.build.results.items())\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Entity counts\n",
-    "\n",
-    "Counts of each kind of entity (i.e., organisation)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>n</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>practice</th>\n",
-       "      <td>6499</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>pcn</th>\n",
-       "      <td>1257</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>ccg</th>\n",
-       "      <td>106</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>stp</th>\n",
-       "      <td>42</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "             n\n",
-       "practice  6499\n",
-       "pcn       1257\n",
-       "ccg        106\n",
-       "stp         42"
-      ]
-     },
-     "execution_count": 15,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "### Summarising the number of each kind of entity (organisation)\n",
-    "\n",
-    "e_counts = ( e_data.reset_index()[[\"practice\",\"entity\"]]\n",
-    "            .drop_duplicates()['entity']\n",
-    "            .value_counts()\n",
-    "            .to_frame()\n",
-    "            .rename( columns={'entity':'n'} ) )\n",
-    "\n",
-    "e_counts"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Chemical counts\n",
-    "\n",
-    "Counts of the number of unique outlying chemicals (i.e., those identified in the top/bottom\n",
-    "5 z scores) amongst all organisations of the given type."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>chemicals</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>pcn</th>\n",
-       "      <td>1294</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>practice</th>\n",
-       "      <td>1274</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>ccg</th>\n",
-       "      <td>706</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>stp</th>\n",
-       "      <td>364</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "          chemicals\n",
-       "pcn            1294\n",
-       "practice       1274\n",
-       "ccg             706\n",
-       "stp             364"
-      ]
-     },
-     "execution_count": 16,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "### Summarising the number of unique chemicals identified in the\n",
-    "### top/bottom five outliers amongst all organisations of the given type\n",
-    "\n",
-    "c_counts = ( e_data.reset_index()[[\"chemical\",\"entity\"]]\n",
-    "            .drop_duplicates()['entity']\n",
-    "            .value_counts()\n",
-    "            .to_frame()\n",
-    "            .rename( columns={'entity':'chemicals'} ) )\n",
-    "\n",
-    "c_counts"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {
-    "lines_to_next_cell": 2
-   },
-   "outputs": [],
-   "source": [
-    "### Combining the entity and chemical counts\n",
-    "\n",
-    "all_counts = e_counts.join( c_counts )"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "metadata": {
-    "lines_to_next_cell": 2
-   },
-   "outputs": [],
-   "source": [
-    "### Defining which metrics will be displayed in the summary tables\n",
-    "metrics_to_show = [ \"n\", \"chemicals\", \"median\",\"max\",\"min\",\"IQR\" ]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "### Calculating summary statistics for the Z scores for those chemicals\n",
-    "### identified in the TOP 5 in at least one organisation of the entity type.\n",
-    "### There are the chemicals displayed in the 'Higher than most' table.\n",
-    "\n",
-    "overused_summary = e_data.query('z_score>0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n",
-    "overused_summary = overused_summary.rename( columns={\"50%\":\"median\"}, inplace=False )"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "### Calculating summary statistics for the Z scores for those chemicals\n",
-    "### identified in the BOTTOM 5 in at least one organisation of the entity type.\n",
-    "### There are the chemicals displayed in the 'Lower than most' table.\n",
-    "\n",
-    "underused_summary = e_data.query('z_score<0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n",
-    "underused_summary = underused_summary.rename( columns={\"50%\":\"median\"}, inplace=False )"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Summary statistics for outlying Z scores in each organisation type\n",
-    "\n",
-    "### Higher than most chemicals\n",
-    "\n",
-    "The table below summarises the Z scores for the high outlying (i.e., top 5) chemicals\n",
-    "in each type of organisation. These are chemicals are seen to be used more often\n",
-    "in a particular organisation than its peers."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>n</th>\n",
-       "      <th>chemicals</th>\n",
-       "      <th>median</th>\n",
-       "      <th>max</th>\n",
-       "      <th>min</th>\n",
-       "      <th>IQR</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>entity</th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>stp</th>\n",
-       "      <td>42</td>\n",
-       "      <td>364</td>\n",
-       "      <td>6.14</td>\n",
-       "      <td>6.33</td>\n",
-       "      <td>3.68</td>\n",
-       "      <td>0.78</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>ccg</th>\n",
-       "      <td>106</td>\n",
-       "      <td>706</td>\n",
-       "      <td>7.36</td>\n",
-       "      <td>10.20</td>\n",
-       "      <td>3.56</td>\n",
-       "      <td>3.29</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>pcn</th>\n",
-       "      <td>1257</td>\n",
-       "      <td>1294</td>\n",
-       "      <td>7.26</td>\n",
-       "      <td>543.19</td>\n",
-       "      <td>2.58</td>\n",
-       "      <td>5.11</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>practice</th>\n",
-       "      <td>6499</td>\n",
-       "      <td>1274</td>\n",
-       "      <td>7.28</td>\n",
-       "      <td>5512.02</td>\n",
-       "      <td>1.13</td>\n",
-       "      <td>5.96</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "             n  chemicals  median      max   min   IQR\n",
-       "entity                                                \n",
-       "stp         42        364    6.14     6.33  3.68  0.78\n",
-       "ccg        106        706    7.36    10.20  3.56  3.29\n",
-       "pcn       1257       1294    7.26   543.19  2.58  5.11\n",
-       "practice  6499       1274    7.28  5512.02  1.13  5.96"
-      ]
-     },
-     "execution_count": 21,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "### Extracting the summary statistics for the z scores\n",
-    "overused_tmp = overused_summary[overused_summary.index.isin([\"z_score\"], level=1)]\n",
-    "\n",
-    "### Calculating IQR, removing the row index and rounding to 2dp\n",
-    "overused_toprint = ( overused_tmp\n",
-    "         .assign( IQR = overused_tmp[\"75%\"]-overused_tmp[\"25%\"] )\n",
-    "         .droplevel(level=1)\n",
-    "         .round(2) )\n",
-    "\n",
-    "overused_toprint.join( all_counts )[metrics_to_show]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Lower than most chemicals\n",
-    "\n",
-    "The table below summarises the Z scores for the low outlying (i.e., bottom 5) chemicals\n",
-    "in each type of organisation. These are chemicals are seen to be used less often\n",
-    "in a particular organisation than its peers."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "metadata": {
-    "lines_to_next_cell": 0
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>n</th>\n",
-       "      <th>chemicals</th>\n",
-       "      <th>median</th>\n",
-       "      <th>max</th>\n",
-       "      <th>min</th>\n",
-       "      <th>IQR</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>entity</th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>stp</th>\n",
-       "      <td>42</td>\n",
-       "      <td>364</td>\n",
-       "      <td>-2.77</td>\n",
-       "      <td>-1.78</td>\n",
-       "      <td>-6.33</td>\n",
-       "      <td>1.15</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>ccg</th>\n",
-       "      <td>106</td>\n",
-       "      <td>706</td>\n",
-       "      <td>-2.78</td>\n",
-       "      <td>-1.47</td>\n",
-       "      <td>-10.20</td>\n",
-       "      <td>1.16</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>pcn</th>\n",
-       "      <td>1257</td>\n",
-       "      <td>1294</td>\n",
-       "      <td>-2.61</td>\n",
-       "      <td>-1.49</td>\n",
-       "      <td>-141.33</td>\n",
-       "      <td>1.03</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>practice</th>\n",
-       "      <td>6499</td>\n",
-       "      <td>1274</td>\n",
-       "      <td>-2.44</td>\n",
-       "      <td>-1.13</td>\n",
-       "      <td>-711.87</td>\n",
-       "      <td>1.07</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "             n  chemicals  median   max     min   IQR\n",
-       "entity                                               \n",
-       "stp         42        364   -2.77 -1.78   -6.33  1.15\n",
-       "ccg        106        706   -2.78 -1.47  -10.20  1.16\n",
-       "pcn       1257       1294   -2.61 -1.49 -141.33  1.03\n",
-       "practice  6499       1274   -2.44 -1.13 -711.87  1.07"
-      ]
-     },
-     "execution_count": 22,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "### Extracting the summary statistics for the z scores\n",
-    "underused_tmp = underused_summary[underused_summary.index.isin([\"z_score\"], level=1)]\n",
-    "\n",
-    "### Calculating IQR, removing the row index and rounding to 2dp\n",
-    "underused_toprint = ( underused_tmp\n",
-    "         .assign( IQR = underused_tmp[\"75%\"]-underused_tmp[\"25%\"] )\n",
-    "         .droplevel(level=1)\n",
-    "         .round(2) )\n",
-    "\n",
-    "underused_toprint.join( all_counts )[metrics_to_show]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Summary\n",
-    "\n",
-    "Below is a summary table that combines the 'Higher than most' and 'Lower than most'\n",
-    "results displayed above."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead tr th {\n",
-       "        text-align: left;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead tr:last-of-type th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr>\n",
-       "      <th></th>\n",
-       "      <th colspan=\"6\" halign=\"left\">Higher than most</th>\n",
-       "      <th colspan=\"4\" halign=\"left\">Lower than most</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th></th>\n",
-       "      <th>n</th>\n",
-       "      <th>chemicals</th>\n",
-       "      <th>median</th>\n",
-       "      <th>max</th>\n",
-       "      <th>min</th>\n",
-       "      <th>IQR</th>\n",
-       "      <th>median</th>\n",
-       "      <th>max</th>\n",
-       "      <th>min</th>\n",
-       "      <th>IQR</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>entity</th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>stp</th>\n",
-       "      <td>42</td>\n",
-       "      <td>364</td>\n",
-       "      <td>6.14</td>\n",
-       "      <td>6.33</td>\n",
-       "      <td>3.68</td>\n",
-       "      <td>0.78</td>\n",
-       "      <td>-2.77</td>\n",
-       "      <td>-1.78</td>\n",
-       "      <td>-6.33</td>\n",
-       "      <td>1.15</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>ccg</th>\n",
-       "      <td>106</td>\n",
-       "      <td>706</td>\n",
-       "      <td>7.36</td>\n",
-       "      <td>10.20</td>\n",
-       "      <td>3.56</td>\n",
-       "      <td>3.29</td>\n",
-       "      <td>-2.78</td>\n",
-       "      <td>-1.47</td>\n",
-       "      <td>-10.20</td>\n",
-       "      <td>1.16</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>pcn</th>\n",
-       "      <td>1257</td>\n",
-       "      <td>1294</td>\n",
-       "      <td>7.26</td>\n",
-       "      <td>543.19</td>\n",
-       "      <td>2.58</td>\n",
-       "      <td>5.11</td>\n",
-       "      <td>-2.61</td>\n",
-       "      <td>-1.49</td>\n",
-       "      <td>-141.33</td>\n",
-       "      <td>1.03</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>practice</th>\n",
-       "      <td>6499</td>\n",
-       "      <td>1274</td>\n",
-       "      <td>7.28</td>\n",
-       "      <td>5512.02</td>\n",
-       "      <td>1.13</td>\n",
-       "      <td>5.96</td>\n",
-       "      <td>-2.44</td>\n",
-       "      <td>-1.13</td>\n",
-       "      <td>-711.87</td>\n",
-       "      <td>1.07</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "         Higher than most                                        \\\n",
-       "                        n chemicals median      max   min   IQR   \n",
-       "entity                                                            \n",
-       "stp                    42       364   6.14     6.33  3.68  0.78   \n",
-       "ccg                   106       706   7.36    10.20  3.56  3.29   \n",
-       "pcn                  1257      1294   7.26   543.19  2.58  5.11   \n",
-       "practice             6499      1274   7.28  5512.02  1.13  5.96   \n",
-       "\n",
-       "         Lower than most                      \n",
-       "                  median   max     min   IQR  \n",
-       "entity                                        \n",
-       "stp                -2.77 -1.78   -6.33  1.15  \n",
-       "ccg                -2.78 -1.47  -10.20  1.16  \n",
-       "pcn                -2.61 -1.49 -141.33  1.03  \n",
-       "practice           -2.44 -1.13 -711.87  1.07  "
-      ]
-     },
-     "execution_count": 23,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "pd.concat([overused_toprint.join( all_counts )[metrics_to_show],\n",
-    "           underused_toprint[metrics_to_show[2:]]],\n",
-    "          keys=[\"Higher than most\", \"Lower than most\"],axis=1)"
-   ]
-  }
- ],
- "metadata": {
-  "jupytext": {
-   "cell_metadata_filter": "all",
-   "encoding": "# -*- coding: utf-8 -*-",
-   "notebook_metadata_filter": "all,-language_info",
-   "text_representation": {
-    "extension": ".py",
-    "format_name": "light",
-    "format_version": "1.5",
-    "jupytext_version": "1.3.4"
-   }
-  },
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.1"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/notebooks/diffable_python/calculate_summary_statistics.py b/notebooks/diffable_python/calculate_summary_statistics.py
deleted file mode 100644
index 652720e3d..000000000
--- a/notebooks/diffable_python/calculate_summary_statistics.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# -*- coding: utf-8 -*-
-# ---
-# jupyter:
-#   jupytext:
-#     cell_metadata_filter: all
-#     notebook_metadata_filter: all,-language_info
-#     text_representation:
-#       extension: .py
-#       format_name: light
-#       format_version: '1.5'
-#       jupytext_version: 1.3.4
-#   kernelspec:
-#     display_name: Python 3
-#     language: python
-#     name: python3
-# ---
-
-from lib.outliers import Runner
-from datetime import date
-import pandas as pd
-
-from_date = date(year=2021,month=4,day=1)
-to_date = date(year=2021,month=8,day=1)
-r = Runner(from_date,to_date,5,["practice","ccg","pcn","stp"],False)
-
-r.build.run()
-r.build.fetch_results()
-
-# +
-### Extracting all the stored z scores etc across organisations
-### so that summary statistics can be calculated
-
-e_data = pd.concat(
-    (d.assign(entity=e) for e, d in r.build.results.items())
-)
-# -
-
-# ## Entity counts
-#
-# Counts of each kind of entity (i.e., organisation).
-
-# +
-### Summarising the number of each kind of entity (organisation)
-
-e_counts = ( e_data.reset_index()[["practice","entity"]]
-            .drop_duplicates()['entity']
-            .value_counts()
-            .to_frame()
-            .rename( columns={'entity':'n'} ) )
-
-e_counts
-# -
-
-# ## Chemical counts
-#
-# Counts of the number of unique outlying chemicals (i.e., those identified in the top/bottom
-# 5 z scores) amongst all organisations of the given type.
-
-# +
-### Summarising the number of unique chemicals identified in the
-### top/bottom five outliers amongst all organisations of the given type
-
-c_counts = ( e_data.reset_index()[["chemical","entity"]]
-            .drop_duplicates()['entity']
-            .value_counts()
-            .to_frame()
-            .rename( columns={'entity':'chemicals'} ) )
-
-c_counts
-
-# +
-### Combining the entity and chemical counts
-
-all_counts = e_counts.join( c_counts )
-
-
-# +
-### Defining which metrics will be displayed in the summary tables
-metrics_to_show = [ "n", "chemicals", "median","max","min","IQR" ]
-
-
-# +
-### Calculating summary statistics for the Z scores for those chemicals
-### identified in the TOP 5 in at least one organisation of the entity type.
-### There are the chemicals displayed in the 'Higher than most' table.
-
-overused_summary = e_data.query('z_score>0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)
-overused_summary = overused_summary.rename( columns={"50%":"median"}, inplace=False )
-
-# +
-### Calculating summary statistics for the Z scores for those chemicals
-### identified in the BOTTOM 5 in at least one organisation of the entity type.
-### There are the chemicals displayed in the 'Lower than most' table.
-
-underused_summary = e_data.query('z_score<0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)
-underused_summary = underused_summary.rename( columns={"50%":"median"}, inplace=False )
-# -
-
-# ## Summary statistics for outlying Z scores in each organisation type
-#
-# ### Higher than most chemicals
-#
-# The table below summarises the Z scores for the high outlying (i.e., top 5) chemicals
-# in each type of organisation. These are chemicals are seen to be used more often
-# in a particular organisation than its peers.
-
-# +
-### Extracting the summary statistics for the z scores
-overused_tmp = overused_summary[overused_summary.index.isin(["z_score"], level=1)]
-
-### Calculating IQR, removing the row index and rounding to 2dp
-overused_toprint = ( overused_tmp
-         .assign( IQR = overused_tmp["75%"]-overused_tmp["25%"] )
-         .droplevel(level=1)
-         .round(2) )
-
-overused_toprint.join( all_counts )[metrics_to_show]
-# -
-
-# ### Lower than most chemicals
-#
-# The table below summarises the Z scores for the low outlying (i.e., bottom 5) chemicals
-# in each type of organisation. These are chemicals are seen to be used less often
-# in a particular organisation than its peers.
-
-# +
-### Extracting the summary statistics for the z scores
-underused_tmp = underused_summary[underused_summary.index.isin(["z_score"], level=1)]
-
-### Calculating IQR, removing the row index and rounding to 2dp
-underused_toprint = ( underused_tmp
-         .assign( IQR = underused_tmp["75%"]-underused_tmp["25%"] )
-         .droplevel(level=1)
-         .round(2) )
-
-underused_toprint.join( all_counts )[metrics_to_show]
-# -
-# ### Summary
-#
-# Below is a summary table that combines the 'Higher than most' and 'Lower than most'
-# results displayed above.
-
-pd.concat([overused_toprint.join( all_counts )[metrics_to_show],
-           underused_toprint[metrics_to_show[2:]]],
-          keys=["Higher than most", "Lower than most"],axis=1)

From 0aac79520f757e23eb567e7fa3519bda3c1e0dd2 Mon Sep 17 00:00:00 2001
From: Lisa Hopcroft <54442530+LisaHopcroft@users.noreply.github.com>
Date: Thu, 14 Apr 2022 11:55:14 +0100
Subject: [PATCH 07/11] feat: updating queries on e_data object to extract the
 outlying chemical values for describe()

---
 ...alculate_summary_statistics_outlying.ipynb | 56 ++++++++-----------
 .../calculate_summary_statistics_outlying.py  |  4 +-
 2 files changed, 24 insertions(+), 36 deletions(-)

diff --git a/notebooks/calculate_summary_statistics_outlying.ipynb b/notebooks/calculate_summary_statistics_outlying.ipynb
index 24f2beb90..57224c678 100644
--- a/notebooks/calculate_summary_statistics_outlying.ipynb
+++ b/notebooks/calculate_summary_statistics_outlying.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -13,7 +13,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -24,26 +24,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Downloading: 100%|██████████| 1/1 [00:00<00:00,  6.53rows/s]\n",
-      "Downloading: 100%|██████████| 129532/129532 [00:11<00:00, 11161.65rows/s]\n",
-      "Downloading: 100%|██████████| 302743/302743 [00:11<00:00, 26079.48rows/s]\n",
-      "Downloading: 100%|██████████| 1346/1346 [00:40<00:00, 33.64rows/s]\n",
-      "Downloading: 100%|██████████| 2121/2121 [00:00<00:00, 4642.75rows/s]\n",
-      "Downloading: 100%|██████████| 10905/10905 [00:00<00:00, 14726.74rows/s]\n",
-      "Downloading: 100%|██████████| 1138/1138 [00:00<00:00, 1338.28rows/s]\n",
-      "Downloading: 100%|██████████| 25140/25140 [00:02<00:00, 10127.85rows/s]\n",
-      "Downloading: 100%|██████████| 89049/89049 [00:05<00:00, 17194.56rows/s]\n",
-      "Downloading: 100%|██████████| 1416/1416 [00:08<00:00, 171.31rows/s]\n",
-      "Downloading: 100%|██████████| 842/842 [00:00<00:00, 3119.24rows/s]\n",
-      "Downloading: 100%|██████████| 3992/3992 [00:00<00:00, 11083.61rows/s]\n",
-      "Downloading: 100%|██████████| 680/680 [00:00<00:00, 1650.28rows/s]\n"
+      "Downloading: 100%|██████████| 1/1 [00:00<00:00,  5.83rows/s]\n"
      ]
     }
    ],
@@ -54,7 +42,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -77,7 +65,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -133,7 +121,7 @@
        "stp         42"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -162,7 +150,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -218,7 +206,7 @@
        "stp             680"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -238,7 +226,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 7,
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -251,7 +239,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 8,
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -263,7 +251,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -271,7 +259,7 @@
     "### identified in the TOP 5 in at least one organisation of the entity type.\n",
     "### There are the chemicals displayed in the 'Higher than most' table.\n",
     "\n",
-    "overused_summary = e_data.query('z_score>0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n",
+    "overused_summary = e_data.query('rank_high<=10').query('z_score>=0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n",
     "overused_summary = overused_summary.rename( columns={\"50%\":\"median\"}, inplace=False )"
    ]
   },
@@ -285,7 +273,7 @@
     "### identified in the BOTTOM 5 in at least one organisation of the entity type.\n",
     "### There are the chemicals displayed in the 'Lower than most' table.\n",
     "\n",
-    "underused_summary = e_data.query('z_score<0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n",
+    "underused_summary = e_data.query('rank_low<=10').query('z_score<=0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n",
     "underused_summary = underused_summary.rename( columns={\"50%\":\"median\"}, inplace=False )"
    ]
   },
@@ -379,7 +367,7 @@
        "      <td>1346</td>\n",
        "      <td>5.23</td>\n",
        "      <td>6825.50</td>\n",
-       "      <td>1.21</td>\n",
+       "      <td>0.00</td>\n",
        "      <td>3.84</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
@@ -392,7 +380,7 @@
        "stp         42        680    5.42     6.33  2.68  1.64\n",
        "ccg        106       1138    5.79    10.20  2.76  3.18\n",
        "pcn       1257       1416    5.28  2528.09  2.26  3.39\n",
-       "practice  6476       1346    5.23  6825.50  1.21  3.84"
+       "practice  6476       1346    5.23  6825.50  0.00  3.84"
       ]
      },
      "execution_count": 13,
@@ -502,7 +490,7 @@
        "      <td>6476</td>\n",
        "      <td>1346</td>\n",
        "      <td>-2.08</td>\n",
-       "      <td>-0.05</td>\n",
+       "      <td>-0.99</td>\n",
        "      <td>-307.23</td>\n",
        "      <td>0.81</td>\n",
        "    </tr>\n",
@@ -516,7 +504,7 @@
        "stp         42        680   -2.35 -1.47   -6.33  0.79\n",
        "ccg        106       1138   -2.30 -1.33  -10.20  0.82\n",
        "pcn       1257       1416   -2.18 -1.30 -159.77  0.77\n",
-       "practice  6476       1346   -2.08 -0.05 -307.23  0.81"
+       "practice  6476       1346   -2.08 -0.99 -307.23  0.81"
       ]
      },
      "execution_count": 14,
@@ -653,10 +641,10 @@
        "      <td>1346</td>\n",
        "      <td>5.23</td>\n",
        "      <td>6825.50</td>\n",
-       "      <td>1.21</td>\n",
+       "      <td>0.00</td>\n",
        "      <td>3.84</td>\n",
        "      <td>-2.08</td>\n",
-       "      <td>-0.05</td>\n",
+       "      <td>-0.99</td>\n",
        "      <td>-307.23</td>\n",
        "      <td>0.81</td>\n",
        "    </tr>\n",
@@ -671,7 +659,7 @@
        "stp                    42       680   5.42     6.33  2.68  1.64   \n",
        "ccg                   106      1138   5.79    10.20  2.76  3.18   \n",
        "pcn                  1257      1416   5.28  2528.09  2.26  3.39   \n",
-       "practice             6476      1346   5.23  6825.50  1.21  3.84   \n",
+       "practice             6476      1346   5.23  6825.50  0.00  3.84   \n",
        "\n",
        "         Lower than most                      \n",
        "                  median   max     min   IQR  \n",
@@ -679,7 +667,7 @@
        "stp                -2.35 -1.47   -6.33  0.79  \n",
        "ccg                -2.30 -1.33  -10.20  0.82  \n",
        "pcn                -2.18 -1.30 -159.77  0.77  \n",
-       "practice           -2.08 -0.05 -307.23  0.81  "
+       "practice           -2.08 -0.99 -307.23  0.81  "
       ]
      },
      "execution_count": 15,
diff --git a/notebooks/diffable_python/calculate_summary_statistics_outlying.py b/notebooks/diffable_python/calculate_summary_statistics_outlying.py
index 6fb6d9d3b..c483c1de5 100644
--- a/notebooks/diffable_python/calculate_summary_statistics_outlying.py
+++ b/notebooks/diffable_python/calculate_summary_statistics_outlying.py
@@ -84,7 +84,7 @@
 ### identified in the TOP 5 in at least one organisation of the entity type.
 ### There are the chemicals displayed in the 'Higher than most' table.
 
-overused_summary = e_data.query('z_score>0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)
+overused_summary = e_data.query('rank_high<=10').query('z_score>=0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)
 overused_summary = overused_summary.rename( columns={"50%":"median"}, inplace=False )
 
 # +
@@ -92,7 +92,7 @@
 ### identified in the BOTTOM 5 in at least one organisation of the entity type.
 ### There are the chemicals displayed in the 'Lower than most' table.
 
-underused_summary = e_data.query('z_score<0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)
+underused_summary = e_data.query('rank_low<=10').query('z_score<=0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)
 underused_summary = underused_summary.rename( columns={"50%":"median"}, inplace=False )
 # -
 

From 7cae036cced259392b9c6555a4febcbaef7ec83c Mon Sep 17 00:00:00 2001
From: Lisa Hopcroft <54442530+LisaHopcroft@users.noreply.github.com>
Date: Thu, 14 Apr 2022 12:02:03 +0100
Subject: [PATCH 08/11] feat: update to notebook

---
 ...alculate_summary_statistics_outlying.ipynb | 38 +++++++++----------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/notebooks/calculate_summary_statistics_outlying.ipynb b/notebooks/calculate_summary_statistics_outlying.ipynb
index 57224c678..5abe1334e 100644
--- a/notebooks/calculate_summary_statistics_outlying.ipynb
+++ b/notebooks/calculate_summary_statistics_outlying.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -13,7 +13,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -24,14 +24,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Downloading: 100%|██████████| 1/1 [00:00<00:00,  5.83rows/s]\n"
+      "Downloading: 100%|██████████| 1/1 [00:00<00:00,  7.05rows/s]\n"
      ]
     }
    ],
@@ -42,7 +42,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -65,7 +65,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [
     {
@@ -121,7 +121,7 @@
        "stp         42"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 20,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -150,7 +150,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [
     {
@@ -206,7 +206,7 @@
        "stp             680"
       ]
      },
-     "execution_count": 6,
+     "execution_count": 21,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -226,7 +226,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 22,
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -239,7 +239,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 23,
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -251,7 +251,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -265,7 +265,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 25,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -292,7 +292,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 26,
    "metadata": {},
    "outputs": [
     {
@@ -383,7 +383,7 @@
        "practice  6476       1346    5.23  6825.50  0.00  3.84"
       ]
      },
-     "execution_count": 13,
+     "execution_count": 26,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -414,7 +414,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 27,
    "metadata": {
     "lines_to_next_cell": 0
    },
@@ -507,7 +507,7 @@
        "practice  6476       1346   -2.08 -0.99 -307.23  0.81"
       ]
      },
-     "execution_count": 14,
+     "execution_count": 27,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -537,7 +537,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 28,
    "metadata": {},
    "outputs": [
     {
@@ -670,7 +670,7 @@
        "practice           -2.08 -0.99 -307.23  0.81  "
       ]
      },
-     "execution_count": 15,
+     "execution_count": 28,
      "metadata": {},
      "output_type": "execute_result"
     }

From da7ce8b35370ec566141e53e3e95d2a653efff33 Mon Sep 17 00:00:00 2001
From: Lisa Hopcroft <54442530+LisaHopcroft@users.noreply.github.com>
Date: Thu, 14 Apr 2022 12:20:35 +0100
Subject: [PATCH 09/11] feat: removing z scores of 0 from summary statistics

---
 ...alculate_summary_statistics_outlying.ipynb | 155 ++++++++++++++++--
 .../calculate_summary_statistics_outlying.py  |   6 +-
 2 files changed, 145 insertions(+), 16 deletions(-)

diff --git a/notebooks/calculate_summary_statistics_outlying.ipynb b/notebooks/calculate_summary_statistics_outlying.ipynb
index 5abe1334e..72fed35ec 100644
--- a/notebooks/calculate_summary_statistics_outlying.ipynb
+++ b/notebooks/calculate_summary_statistics_outlying.ipynb
@@ -251,7 +251,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 29,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -259,13 +259,13 @@
     "### identified in the TOP 5 in at least one organisation of the entity type.\n",
     "### There are the chemicals displayed in the 'Higher than most' table.\n",
     "\n",
-    "overused_summary = e_data.query('rank_high<=10').query('z_score>=0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n",
+    "overused_summary = e_data.query('rank_high<=10').query('z_score>0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n",
     "overused_summary = overused_summary.rename( columns={\"50%\":\"median\"}, inplace=False )"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 37,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -273,7 +273,7 @@
     "### identified in the BOTTOM 5 in at least one organisation of the entity type.\n",
     "### There are the chemicals displayed in the 'Lower than most' table.\n",
     "\n",
-    "underused_summary = e_data.query('rank_low<=10').query('z_score<=0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n",
+    "underused_summary = e_data.query('rank_low<=10').query('z_score<0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n",
     "underused_summary = underused_summary.rename( columns={\"50%\":\"median\"}, inplace=False )"
    ]
   },
@@ -292,7 +292,134 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th>count</th>\n",
+       "      <th>mean</th>\n",
+       "      <th>std</th>\n",
+       "      <th>min</th>\n",
+       "      <th>25%</th>\n",
+       "      <th>median</th>\n",
+       "      <th>75%</th>\n",
+       "      <th>max</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>entity</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>stp</th>\n",
+       "      <th>z_score</th>\n",
+       "      <td>420.0</td>\n",
+       "      <td>-2.650387</td>\n",
+       "      <td>0.932614</td>\n",
+       "      <td>-6.326451</td>\n",
+       "      <td>-2.872841</td>\n",
+       "      <td>-2.352648</td>\n",
+       "      <td>-2.081660</td>\n",
+       "      <td>-1.473260</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ccg</th>\n",
+       "      <th>z_score</th>\n",
+       "      <td>1060.0</td>\n",
+       "      <td>-2.667010</td>\n",
+       "      <td>1.228738</td>\n",
+       "      <td>-10.198503</td>\n",
+       "      <td>-2.813851</td>\n",
+       "      <td>-2.302282</td>\n",
+       "      <td>-1.990392</td>\n",
+       "      <td>-1.325057</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>pcn</th>\n",
+       "      <th>z_score</th>\n",
+       "      <td>12570.0</td>\n",
+       "      <td>-2.609923</td>\n",
+       "      <td>2.474167</td>\n",
+       "      <td>-159.768459</td>\n",
+       "      <td>-2.673923</td>\n",
+       "      <td>-2.183928</td>\n",
+       "      <td>-1.900201</td>\n",
+       "      <td>-1.296016</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>practice</th>\n",
+       "      <th>z_score</th>\n",
+       "      <td>64760.0</td>\n",
+       "      <td>-2.495317</td>\n",
+       "      <td>3.925005</td>\n",
+       "      <td>-307.234735</td>\n",
+       "      <td>-2.569772</td>\n",
+       "      <td>-2.076544</td>\n",
+       "      <td>-1.756406</td>\n",
+       "      <td>-0.987765</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                    count      mean       std         min       25%    median  \\\n",
+       "entity                                                                          \n",
+       "stp      z_score    420.0 -2.650387  0.932614   -6.326451 -2.872841 -2.352648   \n",
+       "ccg      z_score   1060.0 -2.667010  1.228738  -10.198503 -2.813851 -2.302282   \n",
+       "pcn      z_score  12570.0 -2.609923  2.474167 -159.768459 -2.673923 -2.183928   \n",
+       "practice z_score  64760.0 -2.495317  3.925005 -307.234735 -2.569772 -2.076544   \n",
+       "\n",
+       "                       75%       max  \n",
+       "entity                                \n",
+       "stp      z_score -2.081660 -1.473260  \n",
+       "ccg      z_score -1.990392 -1.325057  \n",
+       "pcn      z_score -1.900201 -1.296016  \n",
+       "practice z_score -1.756406 -0.987765  "
+      ]
+     },
+     "execution_count": 38,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "underused_summary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
    "metadata": {},
    "outputs": [
     {
@@ -367,7 +494,7 @@
        "      <td>1346</td>\n",
        "      <td>5.23</td>\n",
        "      <td>6825.50</td>\n",
-       "      <td>0.00</td>\n",
+       "      <td>1.21</td>\n",
        "      <td>3.84</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
@@ -380,10 +507,10 @@
        "stp         42        680    5.42     6.33  2.68  1.64\n",
        "ccg        106       1138    5.79    10.20  2.76  3.18\n",
        "pcn       1257       1416    5.28  2528.09  2.26  3.39\n",
-       "practice  6476       1346    5.23  6825.50  0.00  3.84"
+       "practice  6476       1346    5.23  6825.50  1.21  3.84"
       ]
      },
-     "execution_count": 26,
+     "execution_count": 32,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -414,7 +541,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 33,
    "metadata": {
     "lines_to_next_cell": 0
    },
@@ -507,7 +634,7 @@
        "practice  6476       1346   -2.08 -0.99 -307.23  0.81"
       ]
      },
-     "execution_count": 27,
+     "execution_count": 33,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -537,7 +664,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 34,
    "metadata": {},
    "outputs": [
     {
@@ -641,7 +768,7 @@
        "      <td>1346</td>\n",
        "      <td>5.23</td>\n",
        "      <td>6825.50</td>\n",
-       "      <td>0.00</td>\n",
+       "      <td>1.21</td>\n",
        "      <td>3.84</td>\n",
        "      <td>-2.08</td>\n",
        "      <td>-0.99</td>\n",
@@ -659,7 +786,7 @@
        "stp                    42       680   5.42     6.33  2.68  1.64   \n",
        "ccg                   106      1138   5.79    10.20  2.76  3.18   \n",
        "pcn                  1257      1416   5.28  2528.09  2.26  3.39   \n",
-       "practice             6476      1346   5.23  6825.50  0.00  3.84   \n",
+       "practice             6476      1346   5.23  6825.50  1.21  3.84   \n",
        "\n",
        "         Lower than most                      \n",
        "                  median   max     min   IQR  \n",
@@ -670,7 +797,7 @@
        "practice           -2.08 -0.99 -307.23  0.81  "
       ]
      },
-     "execution_count": 28,
+     "execution_count": 34,
      "metadata": {},
      "output_type": "execute_result"
     }
diff --git a/notebooks/diffable_python/calculate_summary_statistics_outlying.py b/notebooks/diffable_python/calculate_summary_statistics_outlying.py
index c483c1de5..fbf4e84ad 100644
--- a/notebooks/diffable_python/calculate_summary_statistics_outlying.py
+++ b/notebooks/diffable_python/calculate_summary_statistics_outlying.py
@@ -84,7 +84,7 @@
 ### identified in the TOP 5 in at least one organisation of the entity type.
 ### There are the chemicals displayed in the 'Higher than most' table.
 
-overused_summary = e_data.query('rank_high<=10').query('z_score>=0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)
+overused_summary = e_data.query('rank_high<=10').query('z_score>0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)
 overused_summary = overused_summary.rename( columns={"50%":"median"}, inplace=False )
 
 # +
@@ -92,7 +92,7 @@
 ### identified in the BOTTOM 5 in at least one organisation of the entity type.
 ### There are the chemicals displayed in the 'Lower than most' table.
 
-underused_summary = e_data.query('rank_low<=10').query('z_score<=0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)
+underused_summary = e_data.query('rank_low<=10').query('z_score<0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)
 underused_summary = underused_summary.rename( columns={"50%":"median"}, inplace=False )
 # -
 
@@ -104,6 +104,8 @@
 # in each type of organisation. These are chemicals are seen to be used more often
 # in a particular organisation than its peers.
 
+underused_summary
+
 # +
 ### Extracting the summary statistics for the z scores
 overused_tmp = overused_summary[overused_summary.index.isin(["z_score"], level=1)]

From 64c81ba4dff4e69cb0095fe3461446da9dba41a3 Mon Sep 17 00:00:00 2001
From: Lisa Hopcroft <54442530+LisaHopcroft@users.noreply.github.com>
Date: Thu, 14 Apr 2022 15:29:36 +0100
Subject: [PATCH 10/11] tidy: removing unnecessary cell

---
 ...alculate_summary_statistics_outlying.ipynb | 165 ++----------------
 .../calculate_summary_statistics_outlying.py  |   2 -
 2 files changed, 19 insertions(+), 148 deletions(-)

diff --git a/notebooks/calculate_summary_statistics_outlying.ipynb b/notebooks/calculate_summary_statistics_outlying.ipynb
index 72fed35ec..4c7e0e5ae 100644
--- a/notebooks/calculate_summary_statistics_outlying.ipynb
+++ b/notebooks/calculate_summary_statistics_outlying.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -13,7 +13,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -24,14 +24,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Downloading: 100%|██████████| 1/1 [00:00<00:00,  7.05rows/s]\n"
+      "Downloading: 100%|██████████| 1/1 [00:00<00:00,  3.01rows/s]\n"
      ]
     }
    ],
@@ -42,7 +42,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -65,7 +65,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -121,7 +121,7 @@
        "stp         42"
       ]
      },
-     "execution_count": 20,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -150,7 +150,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -206,7 +206,7 @@
        "stp             680"
       ]
      },
-     "execution_count": 21,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -226,7 +226,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 7,
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -239,7 +239,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 8,
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -251,7 +251,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -265,7 +265,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -292,134 +292,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th>count</th>\n",
-       "      <th>mean</th>\n",
-       "      <th>std</th>\n",
-       "      <th>min</th>\n",
-       "      <th>25%</th>\n",
-       "      <th>median</th>\n",
-       "      <th>75%</th>\n",
-       "      <th>max</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>entity</th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>stp</th>\n",
-       "      <th>z_score</th>\n",
-       "      <td>420.0</td>\n",
-       "      <td>-2.650387</td>\n",
-       "      <td>0.932614</td>\n",
-       "      <td>-6.326451</td>\n",
-       "      <td>-2.872841</td>\n",
-       "      <td>-2.352648</td>\n",
-       "      <td>-2.081660</td>\n",
-       "      <td>-1.473260</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>ccg</th>\n",
-       "      <th>z_score</th>\n",
-       "      <td>1060.0</td>\n",
-       "      <td>-2.667010</td>\n",
-       "      <td>1.228738</td>\n",
-       "      <td>-10.198503</td>\n",
-       "      <td>-2.813851</td>\n",
-       "      <td>-2.302282</td>\n",
-       "      <td>-1.990392</td>\n",
-       "      <td>-1.325057</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>pcn</th>\n",
-       "      <th>z_score</th>\n",
-       "      <td>12570.0</td>\n",
-       "      <td>-2.609923</td>\n",
-       "      <td>2.474167</td>\n",
-       "      <td>-159.768459</td>\n",
-       "      <td>-2.673923</td>\n",
-       "      <td>-2.183928</td>\n",
-       "      <td>-1.900201</td>\n",
-       "      <td>-1.296016</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>practice</th>\n",
-       "      <th>z_score</th>\n",
-       "      <td>64760.0</td>\n",
-       "      <td>-2.495317</td>\n",
-       "      <td>3.925005</td>\n",
-       "      <td>-307.234735</td>\n",
-       "      <td>-2.569772</td>\n",
-       "      <td>-2.076544</td>\n",
-       "      <td>-1.756406</td>\n",
-       "      <td>-0.987765</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                    count      mean       std         min       25%    median  \\\n",
-       "entity                                                                          \n",
-       "stp      z_score    420.0 -2.650387  0.932614   -6.326451 -2.872841 -2.352648   \n",
-       "ccg      z_score   1060.0 -2.667010  1.228738  -10.198503 -2.813851 -2.302282   \n",
-       "pcn      z_score  12570.0 -2.609923  2.474167 -159.768459 -2.673923 -2.183928   \n",
-       "practice z_score  64760.0 -2.495317  3.925005 -307.234735 -2.569772 -2.076544   \n",
-       "\n",
-       "                       75%       max  \n",
-       "entity                                \n",
-       "stp      z_score -2.081660 -1.473260  \n",
-       "ccg      z_score -1.990392 -1.325057  \n",
-       "pcn      z_score -1.900201 -1.296016  \n",
-       "practice z_score -1.756406 -0.987765  "
-      ]
-     },
-     "execution_count": 38,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "underused_summary"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -510,7 +383,7 @@
        "practice  6476       1346    5.23  6825.50  1.21  3.84"
       ]
      },
-     "execution_count": 32,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -541,7 +414,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 12,
    "metadata": {
     "lines_to_next_cell": 0
    },
@@ -634,7 +507,7 @@
        "practice  6476       1346   -2.08 -0.99 -307.23  0.81"
       ]
      },
-     "execution_count": 33,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -664,7 +537,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
@@ -797,7 +670,7 @@
        "practice           -2.08 -0.99 -307.23  0.81  "
       ]
      },
-     "execution_count": 34,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
diff --git a/notebooks/diffable_python/calculate_summary_statistics_outlying.py b/notebooks/diffable_python/calculate_summary_statistics_outlying.py
index fbf4e84ad..3d689bd72 100644
--- a/notebooks/diffable_python/calculate_summary_statistics_outlying.py
+++ b/notebooks/diffable_python/calculate_summary_statistics_outlying.py
@@ -104,8 +104,6 @@
 # in each type of organisation. These are chemicals are seen to be used more often
 # in a particular organisation than its peers.
 
-underused_summary
-
 # +
 ### Extracting the summary statistics for the z scores
 overused_tmp = overused_summary[overused_summary.index.isin(["z_score"], level=1)]

From eeee9e9c54d587d7623d2eb542f2c91df7aa69a5 Mon Sep 17 00:00:00 2001
From: Lisa Hopcroft <54442530+LisaHopcroft@users.noreply.github.com>
Date: Mon, 22 Aug 2022 11:32:24 +0100
Subject: [PATCH 11/11] fix: amending text and comments to read top/bottom 10,
 not top/bottom 5

---
 ...alculate_summary_statistics_outlying.ipynb | 498 +-----------------
 .../calculate_summary_statistics_outlying.py  |  13 +-
 2 files changed, 37 insertions(+), 474 deletions(-)

diff --git a/notebooks/calculate_summary_statistics_outlying.ipynb b/notebooks/calculate_summary_statistics_outlying.ipynb
index 4c7e0e5ae..06048f41a 100644
--- a/notebooks/calculate_summary_statistics_outlying.ipynb
+++ b/notebooks/calculate_summary_statistics_outlying.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -13,7 +13,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -24,17 +24,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Downloading: 100%|██████████| 1/1 [00:00<00:00,  3.01rows/s]\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "r.build.run()\n",
     "r.build.fetch_results()"
@@ -42,7 +34,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -65,67 +57,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>n</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>practice</th>\n",
-       "      <td>6476</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>pcn</th>\n",
-       "      <td>1257</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>ccg</th>\n",
-       "      <td>106</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>stp</th>\n",
-       "      <td>42</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "             n\n",
-       "practice  6476\n",
-       "pcn       1257\n",
-       "ccg        106\n",
-       "stp         42"
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "### Summarising the number of each kind of entity (organisation)\n",
     "\n",
@@ -145,72 +79,14 @@
     "## Chemical counts\n",
     "\n",
     "Counts of the number of unique outlying chemicals (i.e., those identified in the top/bottom\n",
-    "5 z scores) amongst all organisations of the given type."
+    "10 z scores) amongst all organisations of the given type."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>chemicals</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>pcn</th>\n",
-       "      <td>1416</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>practice</th>\n",
-       "      <td>1346</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>ccg</th>\n",
-       "      <td>1138</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>stp</th>\n",
-       "      <td>680</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "          chemicals\n",
-       "pcn            1416\n",
-       "practice       1346\n",
-       "ccg            1138\n",
-       "stp             680"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "### Summarising the number of unique chemicals identified in the\n",
     "### top/bottom five outliers amongst all organisations of the given type\n",
@@ -226,7 +102,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -239,7 +115,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -251,12 +127,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "### Calculating summary statistics for the Z scores for those chemicals\n",
-    "### identified in the TOP 5 in at least one organisation of the entity type.\n",
+    "### identified in the TOP 10 in at least one organisation of the entity type.\n",
     "### There are the chemicals displayed in the 'Higher than most' table.\n",
     "\n",
     "overused_summary = e_data.query('rank_high<=10').query('z_score>0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n",
@@ -265,12 +141,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "### Calculating summary statistics for the Z scores for those chemicals\n",
-    "### identified in the BOTTOM 5 in at least one organisation of the entity type.\n",
+    "### identified in the BOTTOM 10 in at least one organisation of the entity type.\n",
     "### There are the chemicals displayed in the 'Lower than most' table.\n",
     "\n",
     "underused_summary = e_data.query('rank_low<=10').query('z_score<0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n",
@@ -285,109 +161,16 @@
     "\n",
     "### Higher than most chemicals\n",
     "\n",
-    "The table below summarises the Z scores for the high outlying (i.e., top 5) chemicals\n",
+    "The table below summarises the Z scores for the high outlying (i.e., top 10) chemicals\n",
     "in each type of organisation. These are chemicals are seen to be used more often\n",
     "in a particular organisation than its peers."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>n</th>\n",
-       "      <th>chemicals</th>\n",
-       "      <th>median</th>\n",
-       "      <th>max</th>\n",
-       "      <th>min</th>\n",
-       "      <th>IQR</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>entity</th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>stp</th>\n",
-       "      <td>42</td>\n",
-       "      <td>680</td>\n",
-       "      <td>5.42</td>\n",
-       "      <td>6.33</td>\n",
-       "      <td>2.68</td>\n",
-       "      <td>1.64</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>ccg</th>\n",
-       "      <td>106</td>\n",
-       "      <td>1138</td>\n",
-       "      <td>5.79</td>\n",
-       "      <td>10.20</td>\n",
-       "      <td>2.76</td>\n",
-       "      <td>3.18</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>pcn</th>\n",
-       "      <td>1257</td>\n",
-       "      <td>1416</td>\n",
-       "      <td>5.28</td>\n",
-       "      <td>2528.09</td>\n",
-       "      <td>2.26</td>\n",
-       "      <td>3.39</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>practice</th>\n",
-       "      <td>6476</td>\n",
-       "      <td>1346</td>\n",
-       "      <td>5.23</td>\n",
-       "      <td>6825.50</td>\n",
-       "      <td>1.21</td>\n",
-       "      <td>3.84</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "             n  chemicals  median      max   min   IQR\n",
-       "entity                                                \n",
-       "stp         42        680    5.42     6.33  2.68  1.64\n",
-       "ccg        106       1138    5.79    10.20  2.76  3.18\n",
-       "pcn       1257       1416    5.28  2528.09  2.26  3.39\n",
-       "practice  6476       1346    5.23  6825.50  1.21  3.84"
-      ]
-     },
-     "execution_count": 11,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "### Extracting the summary statistics for the z scores\n",
     "overused_tmp = overused_summary[overused_summary.index.isin([\"z_score\"], level=1)]\n",
@@ -407,111 +190,18 @@
    "source": [
     "### Lower than most chemicals\n",
     "\n",
-    "The table below summarises the Z scores for the low outlying (i.e., bottom 5) chemicals\n",
+    "The table below summarises the Z scores for the low outlying (i.e., bottom 10) chemicals\n",
     "in each type of organisation. These are chemicals are seen to be used less often\n",
     "in a particular organisation than its peers."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
    "metadata": {
     "lines_to_next_cell": 0
    },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>n</th>\n",
-       "      <th>chemicals</th>\n",
-       "      <th>median</th>\n",
-       "      <th>max</th>\n",
-       "      <th>min</th>\n",
-       "      <th>IQR</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>entity</th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>stp</th>\n",
-       "      <td>42</td>\n",
-       "      <td>680</td>\n",
-       "      <td>-2.35</td>\n",
-       "      <td>-1.47</td>\n",
-       "      <td>-6.33</td>\n",
-       "      <td>0.79</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>ccg</th>\n",
-       "      <td>106</td>\n",
-       "      <td>1138</td>\n",
-       "      <td>-2.30</td>\n",
-       "      <td>-1.33</td>\n",
-       "      <td>-10.20</td>\n",
-       "      <td>0.82</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>pcn</th>\n",
-       "      <td>1257</td>\n",
-       "      <td>1416</td>\n",
-       "      <td>-2.18</td>\n",
-       "      <td>-1.30</td>\n",
-       "      <td>-159.77</td>\n",
-       "      <td>0.77</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>practice</th>\n",
-       "      <td>6476</td>\n",
-       "      <td>1346</td>\n",
-       "      <td>-2.08</td>\n",
-       "      <td>-0.99</td>\n",
-       "      <td>-307.23</td>\n",
-       "      <td>0.81</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "             n  chemicals  median   max     min   IQR\n",
-       "entity                                               \n",
-       "stp         42        680   -2.35 -1.47   -6.33  0.79\n",
-       "ccg        106       1138   -2.30 -1.33  -10.20  0.82\n",
-       "pcn       1257       1416   -2.18 -1.30 -159.77  0.77\n",
-       "practice  6476       1346   -2.08 -0.99 -307.23  0.81"
-      ]
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "### Extracting the summary statistics for the z scores\n",
     "underused_tmp = underused_summary[underused_summary.index.isin([\"z_score\"], level=1)]\n",
@@ -537,144 +227,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead tr th {\n",
-       "        text-align: left;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead tr:last-of-type th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr>\n",
-       "      <th></th>\n",
-       "      <th colspan=\"6\" halign=\"left\">Higher than most</th>\n",
-       "      <th colspan=\"4\" halign=\"left\">Lower than most</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th></th>\n",
-       "      <th>n</th>\n",
-       "      <th>chemicals</th>\n",
-       "      <th>median</th>\n",
-       "      <th>max</th>\n",
-       "      <th>min</th>\n",
-       "      <th>IQR</th>\n",
-       "      <th>median</th>\n",
-       "      <th>max</th>\n",
-       "      <th>min</th>\n",
-       "      <th>IQR</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>entity</th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>stp</th>\n",
-       "      <td>42</td>\n",
-       "      <td>680</td>\n",
-       "      <td>5.42</td>\n",
-       "      <td>6.33</td>\n",
-       "      <td>2.68</td>\n",
-       "      <td>1.64</td>\n",
-       "      <td>-2.35</td>\n",
-       "      <td>-1.47</td>\n",
-       "      <td>-6.33</td>\n",
-       "      <td>0.79</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>ccg</th>\n",
-       "      <td>106</td>\n",
-       "      <td>1138</td>\n",
-       "      <td>5.79</td>\n",
-       "      <td>10.20</td>\n",
-       "      <td>2.76</td>\n",
-       "      <td>3.18</td>\n",
-       "      <td>-2.30</td>\n",
-       "      <td>-1.33</td>\n",
-       "      <td>-10.20</td>\n",
-       "      <td>0.82</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>pcn</th>\n",
-       "      <td>1257</td>\n",
-       "      <td>1416</td>\n",
-       "      <td>5.28</td>\n",
-       "      <td>2528.09</td>\n",
-       "      <td>2.26</td>\n",
-       "      <td>3.39</td>\n",
-       "      <td>-2.18</td>\n",
-       "      <td>-1.30</td>\n",
-       "      <td>-159.77</td>\n",
-       "      <td>0.77</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>practice</th>\n",
-       "      <td>6476</td>\n",
-       "      <td>1346</td>\n",
-       "      <td>5.23</td>\n",
-       "      <td>6825.50</td>\n",
-       "      <td>1.21</td>\n",
-       "      <td>3.84</td>\n",
-       "      <td>-2.08</td>\n",
-       "      <td>-0.99</td>\n",
-       "      <td>-307.23</td>\n",
-       "      <td>0.81</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "         Higher than most                                        \\\n",
-       "                        n chemicals median      max   min   IQR   \n",
-       "entity                                                            \n",
-       "stp                    42       680   5.42     6.33  2.68  1.64   \n",
-       "ccg                   106      1138   5.79    10.20  2.76  3.18   \n",
-       "pcn                  1257      1416   5.28  2528.09  2.26  3.39   \n",
-       "practice             6476      1346   5.23  6825.50  1.21  3.84   \n",
-       "\n",
-       "         Lower than most                      \n",
-       "                  median   max     min   IQR  \n",
-       "entity                                        \n",
-       "stp                -2.35 -1.47   -6.33  0.79  \n",
-       "ccg                -2.30 -1.33  -10.20  0.82  \n",
-       "pcn                -2.18 -1.30 -159.77  0.77  \n",
-       "practice           -2.08 -0.99 -307.23  0.81  "
-      ]
-     },
-     "execution_count": 13,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "pd.concat([overused_toprint.join( all_counts )[metrics_to_show],\n",
     "           underused_toprint[metrics_to_show[2:]]],\n",
@@ -717,6 +272,11 @@
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.8.1"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "de1343822d6e7d7aeea8796be9d48304b0fa3610166e8740495ec86b33c71a9e"
+   }
   }
  },
  "nbformat": 4,
diff --git a/notebooks/diffable_python/calculate_summary_statistics_outlying.py b/notebooks/diffable_python/calculate_summary_statistics_outlying.py
index 3d689bd72..7bac1e5b4 100644
--- a/notebooks/diffable_python/calculate_summary_statistics_outlying.py
+++ b/notebooks/diffable_python/calculate_summary_statistics_outlying.py
@@ -13,6 +13,9 @@
 #     display_name: Python 3
 #     language: python
 #     name: python3
+#   vscode:
+#     interpreter:
+#       hash: de1343822d6e7d7aeea8796be9d48304b0fa3610166e8740495ec86b33c71a9e
 # ---
 
 from lib.outliers import Runner
@@ -54,7 +57,7 @@
 # ## Chemical counts
 #
 # Counts of the number of unique outlying chemicals (i.e., those identified in the top/bottom
-# 5 z scores) amongst all organisations of the given type.
+# 10 z scores) amongst all organisations of the given type.
 
 # +
 ### Summarising the number of unique chemicals identified in the
@@ -81,7 +84,7 @@
 
 # +
 ### Calculating summary statistics for the Z scores for those chemicals
-### identified in the TOP 5 in at least one organisation of the entity type.
+### identified in the TOP 10 in at least one organisation of the entity type.
 ### These are the chemicals displayed in the 'Higher than most' table.
 
 overused_summary = e_data.query('rank_high<=10').query('z_score>0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)
@@ -89,7 +92,7 @@
 
 # +
 ### Calculating summary statistics for the Z scores for those chemicals
-### identified in the BOTTOM 5 in at least one organisation of the entity type.
+### identified in the BOTTOM 10 in at least one organisation of the entity type.
 ### These are the chemicals displayed in the 'Lower than most' table.
 
 underused_summary = e_data.query('rank_low<=10').query('z_score<0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)
@@ -100,7 +103,7 @@
 #
 # ### Higher than most chemicals
 #
-# The table below summarises the Z scores for the high outlying (i.e., top 5) chemicals
+# The table below summarises the Z scores for the high outlying (i.e., top 10) chemicals
 # in each type of organisation. These chemicals are seen to be used more often
 # in a particular organisation than its peers.
 
@@ -119,7 +122,7 @@
 
 # ### Lower than most chemicals
 #
-# The table below summarises the Z scores for the low outlying (i.e., bottom 5) chemicals
+# The table below summarises the Z scores for the low outlying (i.e., bottom 10) chemicals
 # in each type of organisation. These chemicals are seen to be used less often
 # in a particular organisation than its peers.