Update member list and sector word count

open-innovations · Jun 7, 2024 · 7519069 · 7519069
1 parent b8be988
commit 7519069
Show file tree

Hide file tree

Showing 6 changed files with 220 additions and 63 deletions.
diff --git a/pipelines/truenorth/analyse_members_list.ipynb b/pipelines/truenorth/analyse_members_list.ipynb
@@ -2,20 +2,22 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 471,
+   "execution_count": 678,
    "metadata": {},
    "outputs": [],
    "source": [
     "import os\n",
     "os.chdir('../..')\n",
     "from pipelines.util import *\n",
     "import pandas as pd\n",
-    "from datetime import datetime"
+    "from datetime import datetime\n",
+    "from collections import Counter\n",
+    "import re"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 472,
+   "execution_count": 679,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -24,7 +26,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 473,
+   "execution_count": 680,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -34,7 +36,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 474,
+   "execution_count": 681,
    "metadata": {},
    "outputs": [
     {
@@ -86,7 +88,7 @@
        "      <td>174</td>\n",
        "      <td>141</td>\n",
        "      <td>132</td>\n",
-       "      <td>186</td>\n",
+       "      <td>185</td>\n",
        "      <td>45</td>\n",
        "      <td>186</td>\n",
        "      <td>45</td>\n",
@@ -103,7 +105,7 @@
        "      <td>163</td>\n",
        "      <td>70</td>\n",
        "      <td>50</td>\n",
-       "      <td>145</td>\n",
+       "      <td>150</td>\n",
        "      <td>18</td>\n",
        "      <td>13</td>\n",
        "      <td>7</td>\n",
@@ -201,8 +203,8 @@
        "top_percent_of_count                                               53.3                                \n",
        "\n",
        "                      Company name    City        Industry sector    location  \\\n",
-       "count                          174     141             132    186          45   \n",
-       "unique                         163      70              50    145          18   \n",
+       "count                          174     141             132    185          45   \n",
+       "unique                         163      70              50    150          18   \n",
        "top                   Brabners LLP  London  Legal Services  Legal  Lancashire   \n",
        "freq                             5      19              12      7           9   \n",
        "top_percent_of_count           2.9    13.5             9.1    3.8        20.0   \n",
@@ -229,7 +231,7 @@
        "top_percent_of_count         44.4  "
       ]
      },
-     "execution_count": 474,
+     "execution_count": 681,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -240,7 +242,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 475,
+   "execution_count": 682,
    "metadata": {},
    "outputs": [
     {
@@ -314,7 +316,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 476,
+   "execution_count": 683,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -327,7 +329,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 477,
+   "execution_count": 684,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -337,7 +339,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 478,
+   "execution_count": 685,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -347,7 +349,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 479,
+   "execution_count": 686,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -366,7 +368,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 480,
+   "execution_count": 687,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -379,7 +381,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 481,
+   "execution_count": 688,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -390,7 +392,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 482,
+   "execution_count": 689,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -400,7 +402,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 483,
+   "execution_count": 690,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -417,7 +419,108 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 484,
+   "execution_count": 691,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "total_members = len(data.index)\n",
+    "\n",
+    "total_companies = summary.loc['unique', 'Company name']\n",
+    "\n",
+    "top_company_size = summary.loc['top', 'company_size']\n",
+    "\n",
+    "top_company_size_pct = summary.loc['top_percent_of_count', 'company_size']\n",
+    "\n",
+    "top_industry = summary.loc['top', 'Industry']\n",
+    "\n",
+    "top_industry_pct = summary.loc['top_percent_of_count', 'Industry']\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Word frequency in the sector column"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 692,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "'nan' is not a stirng type. Converting to string\n",
+      "\n",
+      "The most common words, with at least 2 occurences are:\n",
+      "'services' with 20 occurences\n",
+      "'construction' with 19 occurences\n",
+      "'education' with 16 occurences\n",
+      "'manufacturing' with 15 occurences\n",
+      "'engineering' with 14 occurences\n",
+      "'finance' with 13 occurences\n",
+      "'professional' with 13 occurences\n",
+      "'technology' with 12 occurences\n",
+      "'business' with 11 occurences\n",
+      "'estate' with 11 occurences\n",
+      "'real' with 10 occurences\n",
+      "'development' with 10 occurences\n"
+     ]
+    }
+   ],
+   "source": [
+    "def normalize_string(s):\n",
+    "    \"\"\"Lowercase `s` and strip punctuation, keeping ';' and spaces; non-strings give ''.\"\"\"\n",
+    "    if not isinstance(s, str):\n",
+    "        # Missing values (e.g. NaN) are not text and must not be counted as the word 'nan'\n",
+    "        return ''\n",
+    "    s = s.lower()\n",
+    "    # ';' is kept because the caller splits the normalised string on it afterwards\n",
+    "    s = re.sub(r'[^a-z0-9\\s;]', '', s)\n",
+    "    return s\n",
+    "# Normalise every sector entry, then split it on ';' into its parts\n",
+    "data['normalized_sector'] = data['sector'].apply(normalize_string).str.split(';')\n",
+    "\n",
+    "# Flatten the per-row lists and split each part on whitespace to get single words\n",
+    "all_words = [word for sublist in data['normalized_sector'] for item in sublist for word in item.split()]\n",
+    "\n",
+    "# Count the occurrences of each word\n",
+    "word_counts = Counter(all_words)\n",
+    "\n",
+    "# The ten most frequent words, and the full ranking from most to least frequent\n",
+    "most_common_words = word_counts.most_common(10)\n",
+    "least_common_words = word_counts.most_common()\n",
+    "\n",
+    "# Filler words that are excluded from the report\n",
+    "banned_words = ['and', 'or', 'of', 'it', 'the', 'for', 'with', 'we']\n",
+    "# Minimum number of occurrences a word needs to be printed and exported\n",
+    "min_occurrences = 10\n",
+    "\n",
+    "# The heading quotes the threshold that the loop below actually applies\n",
+    "print(f\"The most common words, with at least {min_occurrences} occurences are:\")\n",
+    "words = []\n",
+    "counts = [] \n",
+    "for word, count in least_common_words:\n",
+    "    # Filler words are never reported, however frequent they are\n",
+    "    if word in banned_words:\n",
+    "        continue\n",
+    "    # Keep only the words that reach the threshold\n",
+    "    if count >= min_occurrences:\n",
+    "        print(f\"'{word}' with {count} occurences\")\n",
+    "        words.append(word)\n",
+    "        counts.append(count)\n",
+    "\n",
+    "# One row per reported word, indexed by the word itself\n",
+    "sector_counts = pd.DataFrame(data={'name': words, 'count': counts}).set_index('name')\n",
+    "# Save the table as sector_word_counts.csv in the membership theme's _data directory\n",
+    "sector_counts.to_csv(os.path.join(SRC_DIR, 'themes/true-north/membership/_data/sector_word_counts.csv'))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 693,
    "metadata": {},
    "outputs": [
     {
@@ -485,7 +588,7 @@
        "      <th>4</th>\n",
        "      <td>Top industry</td>\n",
        "      <td>9.1</td>\n",
-       "      <td>Of members work in Legal</td>\n",
+       "      <td>Of members work in Legal Services</td>\n",
        "      <td>%</td>\n",
        "      <td></td>\n",
        "    </tr>\n",
@@ -542,7 +645,7 @@
        "1                                   Unique companies        \n",
        "2                                   Since last month    %   \n",
        "3          Of members work in companies of this size    %   \n",
-       "4                           Of members work in Legal    %   \n",
+       "4                  Of members work in Legal Services    %   \n",
        "5                                        placeholder        \n",
        "6  Members represent the network on the True Nort...        \n",
        "7     Companies have been featured as Northern Stars        \n",
@@ -560,24 +663,12 @@
        "8                                      "
       ]
      },
-     "execution_count": 484,
+     "execution_count": 693,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "total_members = len(data.index)\n",
-    "\n",
-    "total_companies = summary.loc['unique', 'Company name']\n",
-    "\n",
-    "top_company_size = summary.loc['top', 'company_size']\n",
-    "\n",
-    "top_company_size_pct = summary.loc['top_percent_of_count', 'company_size']\n",
-    "\n",
-    "top_industry = summary.loc['top', 'sector']\n",
-    "\n",
-    "top_industry_pct = summary.loc['top_percent_of_count', 'Industry']\n",
-    "\n",
     "advisory_council = 11\n",
     "\n",
     "if summary.loc['top', \"Are you currently a B Corp or in the process of becoming a B Corp?\"] == 'No':\n",
@@ -629,7 +720,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 485,
+   "execution_count": 694,
    "metadata": {},
    "outputs": [],
    "source": [