-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #43 from ebmdatalab/calculate-summary-statistics
Notebook to calculate summary statistics
- Loading branch information
Showing
2 changed files
with
434 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,284 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from lib.outliers import Runner\n", | ||
"from datetime import date\n", | ||
"import pandas as pd" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from_date = date(year=2021,month=6,day=1)\n", | ||
"to_date = date(year=2021,month=12,day=1)\n", | ||
"r = Runner(from_date,to_date,10,[\"practice\",\"ccg\",\"pcn\",\"stp\"],False)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"r.build.run()\n", | ||
"r.build.fetch_results()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"### Extracting all the stored z scores etc across organisations\n", | ||
"### so that summary statistics can be calculated\n", | ||
"\n", | ||
"e_data = pd.concat(\n", | ||
" (d.assign(entity=e) for e, d in r.build.results.items())\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Entity counts\n", | ||
"\n", | ||
"Counts of each kind of entity (i.e., organisation)." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"### Summarising the number of each kind of entity (organisation)\n", | ||
"\n", | ||
"e_counts = ( e_data.reset_index()[[\"practice\",\"entity\"]]\n", | ||
" .drop_duplicates()['entity']\n", | ||
" .value_counts()\n", | ||
" .to_frame()\n", | ||
" .rename( columns={'entity':'n'} ) )\n", | ||
"\n", | ||
"e_counts" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Chemical counts\n", | ||
"\n", | ||
"Counts of the number of unique outlying chemicals (i.e., those identified in the top/bottom\n", | ||
"10 z scores) amongst all organisations of the given type." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"### Summarising the number of unique chemicals identified in the\n", | ||
"### top/bottom five outliers amongst all organisations of the given type\n", | ||
"\n", | ||
"c_counts = ( e_data.reset_index()[[\"chemical\",\"entity\"]]\n", | ||
" .drop_duplicates()['entity']\n", | ||
" .value_counts()\n", | ||
" .to_frame()\n", | ||
" .rename( columns={'entity':'chemicals'} ) )\n", | ||
"\n", | ||
"c_counts" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"lines_to_next_cell": 2 | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"### Combining the entity and chemical counts\n", | ||
"\n", | ||
"all_counts = e_counts.join( c_counts )" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"lines_to_next_cell": 2 | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"### Defining which metrics will be displayed in the summary tables\n", | ||
"metrics_to_show = [ \"n\", \"chemicals\", \"median\",\"max\",\"min\",\"IQR\" ]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"### Calculating summary statistics for the Z scores for those chemicals\n", | ||
"### identified in the TOP 10 in at least one organisation of the entity type.\n", | ||
"### There are the chemicals displayed in the 'Higher than most' table.\n", | ||
"\n", | ||
"overused_summary = e_data.query('rank_high<=10').query('z_score>0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n", | ||
"overused_summary = overused_summary.rename( columns={\"50%\":\"median\"}, inplace=False )" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"### Calculating summary statistics for the Z scores for those chemicals\n", | ||
"### identified in the BOTTOM 10 in at least one organisation of the entity type.\n", | ||
"### There are the chemicals displayed in the 'Lower than most' table.\n", | ||
"\n", | ||
"underused_summary = e_data.query('rank_low<=10').query('z_score<0').groupby( \"entity\" )[[\"z_score\"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)\n", | ||
"underused_summary = underused_summary.rename( columns={\"50%\":\"median\"}, inplace=False )" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Summary statistics for outlying Z scores in each organisation type\n", | ||
"\n", | ||
"### Higher than most chemicals\n", | ||
"\n", | ||
"The table below summarises the Z scores for the high outlying (i.e., top 10) chemicals\n", | ||
"in each type of organisation. These are chemicals are seen to be used more often\n", | ||
"in a particular organisation than its peers." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"### Extracting the summary statistics for the z scores\n", | ||
"overused_tmp = overused_summary[overused_summary.index.isin([\"z_score\"], level=1)]\n", | ||
"\n", | ||
"### Calculating IQR, removing the row index and rounding to 2dp\n", | ||
"overused_toprint = ( overused_tmp\n", | ||
" .assign( IQR = overused_tmp[\"75%\"]-overused_tmp[\"25%\"] )\n", | ||
" .droplevel(level=1)\n", | ||
" .round(2) )\n", | ||
"\n", | ||
"overused_toprint.join( all_counts )[metrics_to_show]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Lower than most chemicals\n", | ||
"\n", | ||
"The table below summarises the Z scores for the low outlying (i.e., bottom 10) chemicals\n", | ||
"in each type of organisation. These are chemicals are seen to be used less often\n", | ||
"in a particular organisation than its peers." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"lines_to_next_cell": 0 | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"### Extracting the summary statistics for the z scores\n", | ||
"underused_tmp = underused_summary[underused_summary.index.isin([\"z_score\"], level=1)]\n", | ||
"\n", | ||
"### Calculating IQR, removing the row index and rounding to 2dp\n", | ||
"underused_toprint = ( underused_tmp\n", | ||
" .assign( IQR = underused_tmp[\"75%\"]-underused_tmp[\"25%\"] )\n", | ||
" .droplevel(level=1)\n", | ||
" .round(2) )\n", | ||
"\n", | ||
"underused_toprint.join( all_counts )[metrics_to_show]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Summary\n", | ||
"\n", | ||
"Below is a summary table that combines the 'Higher than most' and 'Lower than most'\n", | ||
"results displayed above." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"pd.concat([overused_toprint.join( all_counts )[metrics_to_show],\n", | ||
" underused_toprint[metrics_to_show[2:]]],\n", | ||
" keys=[\"Higher than most\", \"Lower than most\"],axis=1)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"jupytext": { | ||
"cell_metadata_filter": "all", | ||
"encoding": "# -*- coding: utf-8 -*-", | ||
"notebook_metadata_filter": "all,-language_info", | ||
"text_representation": { | ||
"extension": ".py", | ||
"format_name": "light", | ||
"format_version": "1.5", | ||
"jupytext_version": "1.3.4" | ||
} | ||
}, | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.8.1" | ||
}, | ||
"vscode": { | ||
"interpreter": { | ||
"hash": "de1343822d6e7d7aeea8796be9d48304b0fa3610166e8740495ec86b33c71a9e" | ||
} | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 4 | ||
} |
Oops, something went wrong.