Skip to content

Commit

Permalink
Update member list and sector word count
Browse files Browse the repository at this point in the history
  • Loading branch information
luke-strange committed Jun 7, 2024
1 parent b8be988 commit 7519069
Show file tree
Hide file tree
Showing 6 changed files with 220 additions and 63 deletions.
163 changes: 127 additions & 36 deletions pipelines/truenorth/analyse_members_list.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,22 @@
"cells": [
{
"cell_type": "code",
"execution_count": 471,
"execution_count": 678,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.chdir('../..')\n",
"from pipelines.util import *\n",
"import pandas as pd\n",
"from datetime import datetime"
"from datetime import datetime\n",
"from collections import Counter\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 472,
"execution_count": 679,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -24,7 +26,7 @@
},
{
"cell_type": "code",
"execution_count": 473,
"execution_count": 680,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -34,7 +36,7 @@
},
{
"cell_type": "code",
"execution_count": 474,
"execution_count": 681,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -86,7 +88,7 @@
" <td>174</td>\n",
" <td>141</td>\n",
" <td>132</td>\n",
" <td>186</td>\n",
" <td>185</td>\n",
" <td>45</td>\n",
" <td>186</td>\n",
" <td>45</td>\n",
Expand All @@ -103,7 +105,7 @@
" <td>163</td>\n",
" <td>70</td>\n",
" <td>50</td>\n",
" <td>145</td>\n",
" <td>150</td>\n",
" <td>18</td>\n",
" <td>13</td>\n",
" <td>7</td>\n",
Expand Down Expand Up @@ -201,8 +203,8 @@
"top_percent_of_count 53.3 \n",
"\n",
" Company name City Industry sector location \\\n",
"count 174 141 132 186 45 \n",
"unique 163 70 50 145 18 \n",
"count 174 141 132 185 45 \n",
"unique 163 70 50 150 18 \n",
"top Brabners LLP London Legal Services Legal Lancashire \n",
"freq 5 19 12 7 9 \n",
"top_percent_of_count 2.9 13.5 9.1 3.8 20.0 \n",
Expand All @@ -229,7 +231,7 @@
"top_percent_of_count 44.4 "
]
},
"execution_count": 474,
"execution_count": 681,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -240,7 +242,7 @@
},
{
"cell_type": "code",
"execution_count": 475,
"execution_count": 682,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -314,7 +316,7 @@
},
{
"cell_type": "code",
"execution_count": 476,
"execution_count": 683,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -327,7 +329,7 @@
},
{
"cell_type": "code",
"execution_count": 477,
"execution_count": 684,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -337,7 +339,7 @@
},
{
"cell_type": "code",
"execution_count": 478,
"execution_count": 685,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -347,7 +349,7 @@
},
{
"cell_type": "code",
"execution_count": 479,
"execution_count": 686,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -366,7 +368,7 @@
},
{
"cell_type": "code",
"execution_count": 480,
"execution_count": 687,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -379,7 +381,7 @@
},
{
"cell_type": "code",
"execution_count": 481,
"execution_count": 688,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -390,7 +392,7 @@
},
{
"cell_type": "code",
"execution_count": 482,
"execution_count": 689,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -400,7 +402,7 @@
},
{
"cell_type": "code",
"execution_count": 483,
"execution_count": 690,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -417,7 +419,108 @@
},
{
"cell_type": "code",
"execution_count": 484,
"execution_count": 691,
"metadata": {},
"outputs": [],
"source": [
"total_members = len(data.index)\n",
"\n",
"total_companies = summary.loc['unique', 'Company name']\n",
"\n",
"top_company_size = summary.loc['top', 'company_size']\n",
"\n",
"top_company_size_pct = summary.loc['top_percent_of_count', 'company_size']\n",
"\n",
"top_industry = summary.loc['top', 'Industry']\n",
"\n",
"top_industry_pct = summary.loc['top_percent_of_count', 'Industry']\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Word frequency in the sector column"
]
},
{
"cell_type": "code",
"execution_count": 692,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"'nan' is not a stirng type. Converting to string\n",
"\n",
"The most common words, with at least 2 occurences are:\n",
"'services' with 20 occurences\n",
"'construction' with 19 occurences\n",
"'education' with 16 occurences\n",
"'manufacturing' with 15 occurences\n",
"'engineering' with 14 occurences\n",
"'finance' with 13 occurences\n",
"'professional' with 13 occurences\n",
"'technology' with 12 occurences\n",
"'business' with 11 occurences\n",
"'estate' with 11 occurences\n",
"'real' with 10 occurences\n",
"'development' with 10 occurences\n"
]
}
],
"source": [
"def normalize_string(s):\n",
" # Convert to lowercase and remove non-alphanumeric characters (keeping spaces)\n",
" try:\n",
" s = s.lower()\n",
" except:\n",
" print(f\"'{s}' is not a stirng type. Converting to string\\n\")\n",
" return str(s)\n",
" s = re.sub(r'[^a-z0-9\\s]', '', s)\n",
" return s\n",
"data['normalized_sector'] = data['sector'].apply(normalize_string).str.split(';')\n",
" \n",
"# Flatten the list and further split by spaces to handle multi-word strings\n",
"all_words = [word for sublist in data['normalized_sector'] for item in sublist for word in item.split()]\n",
" \n",
"# Count the occurrences of each word\n",
"word_counts = Counter(all_words)\n",
" \n",
"# Find the most common word\n",
"most_common_words = word_counts.most_common(10)\n",
"least_common_words = word_counts.most_common()\n",
"\n",
"banned_words = ['and', 'or', 'of', 'it', 'the', 'for', 'with', 'we']\n",
"# print(\"The 10 most common words, not including 'and' are:\") \n",
"# for word, count in most_common_words:\n",
"# if word == banned_words:\n",
"# continue\n",
"# print(f\"'{word}' with {count} occurences\")\n",
"\n",
"print(\"The most common words, with at least 2 occurences are:\")\n",
"words = []\n",
"counts = [] \n",
"for word, count in least_common_words:\n",
" if word in banned_words:\n",
" continue\n",
" if count >= 10:\n",
" print(f\"'{word}' with {count} occurences\")\n",
" words.append(word)\n",
" counts.append(count)\n",
"\n",
"sector_counts = pd.DataFrame(data={'name': words, 'count': counts}).set_index('name')\n",
"# sector_counts['colour'] = round((sector_counts['count'] - min(sector_counts['count'])) / sector_counts['count'].max(), 3)\n",
"sector_counts.to_csv(os.path.join(SRC_DIR, 'themes/true-north/membership/_data/sector_word_counts.csv'))\n",
"# sector_strings = [item for sublist in data['sector'].str.split(';') for item in sublist]\n",
"# string_counts = Counter(sector_strings)\n",
"# print(string_counts.most_common())\n"
]
},
{
"cell_type": "code",
"execution_count": 693,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -485,7 +588,7 @@
" <th>4</th>\n",
" <td>Top industry</td>\n",
" <td>9.1</td>\n",
" <td>Of members work in Legal</td>\n",
" <td>Of members work in Legal Services</td>\n",
" <td>%</td>\n",
" <td></td>\n",
" </tr>\n",
Expand Down Expand Up @@ -542,7 +645,7 @@
"1 Unique companies \n",
"2 Since last month % \n",
"3 Of members work in companies of this size % \n",
"4 Of members work in Legal % \n",
"4 Of members work in Legal Services % \n",
"5 placeholder \n",
"6 Members represent the network on the True Nort... \n",
"7 Companies have been featured as Northern Stars \n",
Expand All @@ -560,24 +663,12 @@
"8 "
]
},
"execution_count": 484,
"execution_count": 693,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"total_members = len(data.index)\n",
"\n",
"total_companies = summary.loc['unique', 'Company name']\n",
"\n",
"top_company_size = summary.loc['top', 'company_size']\n",
"\n",
"top_company_size_pct = summary.loc['top_percent_of_count', 'company_size']\n",
"\n",
"top_industry = summary.loc['top', 'sector']\n",
"\n",
"top_industry_pct = summary.loc['top_percent_of_count', 'Industry']\n",
"\n",
"advisory_council = 11\n",
"\n",
"if summary.loc['top', \"Are you currently a B Corp or in the process of becoming a B Corp?\"] == 'No':\n",
Expand Down Expand Up @@ -629,7 +720,7 @@
},
{
"cell_type": "code",
"execution_count": 485,
"execution_count": 694,
"metadata": {},
"outputs": [],
"source": [
Expand Down
Loading

0 comments on commit 7519069

Please sign in to comment.