Adds scripts for synthetic test set generation

NERC-CEH · Nov 15, 2024 · 23343d7 · 23343d7 · github-actions · Nov 15, 2024
1 parent 0592fcd
commit 23343d7
Show file tree

Hide file tree

Showing 3 changed files with 261 additions and 21 deletions.
diff --git a/notebooks/ragas_synth.ipynb b/notebooks/ragas_synth.ipynb
@@ -10,21 +10,32 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/mpc/github/llm-eval/.testvenv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
    "source": [
     "import nest_asyncio\n",
     "from langchain_community.chat_models import ChatOllama\n",
     "from langchain_community.embeddings import OllamaEmbeddings\n",
     "from ragas.run_config import RunConfig\n",
     "from ragas.testset.evolutions import multi_context, reasoning, simple\n",
-    "from ragas.testset.generator import TestsetGenerator"
+    "from ragas.testset.generator import TestsetGenerator\n",
+    "from langchain.docstore.document import Document\n",
+    "import json"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -40,7 +51,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -61,11 +72,47 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
-    "docs = []  # load a set of langchain docs to base the synthetic test set generation on"
+    "with open(\"../data/extracted_metadata.json\") as f:\n",
+    "    json_data = json.load(f)\n",
+    "    docs = [\n",
+    "        Document(\n",
+    "            page_content=metadata[\"value\"],\n",
+    "            metadata={\"id\": metadata[\"id\"], \"field\": metadata[\"field\"]},\n",
+    "        )\n",
+    "        for metadata in json_data\n",
+    "    ]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(metadata={'id': 'b77ce981-d038-4774-a620-f50da5dd3d31', 'field': 'title'}, page_content='Land Cover Map 2017 (land parcels, GB)'),\n",
+       " Document(metadata={'id': 'b77ce981-d038-4774-a620-f50da5dd3d31', 'field': 'description'}, page_content=\"This is the land parcels (polygon) dataset for the UKCEH Land Cover Map of 2017 (LCM2017) representing Great Britain. It describes Great Britain's land cover in 2017 using UKCEH Land Cover Classes, which are based on UK Biodiversity Action Plan broad habitats.  This dataset was derived from the corresponding LCM2017 20m classified pixels dataset.  All further LCM2017 datasets for Great Britain are derived from this land parcel product.  A range of land parcel attributes are provided.  These include the dominant UKCEH Land Cover Class given as an integer value, and a range of per-parcel pixel statistics to help assessing classification confidence and accuracy; for a full explanation please refer to the dataset documentation.\\n\\nThis work was supported by the Natural Environment Research Council award number NE/R016429/1 as part of the UK-SCAPE programme delivering National Capability.\\n\\n\"),\n",
+       " Document(metadata={'id': 'b77ce981-d038-4774-a620-f50da5dd3d31', 'field': 'lineage'}, page_content='The Land Parcels datasets of the LCM2017, LCM2018 and LCM2019 product range were derived from the corresponding 20m Classified Pixels datasets. They give per-parcel land cover information in 21 classes based on UK Biodiversity Action Plan broad habitats. A UK spatial framework of land parcels (vector-polygons) representing real-world objects (for example: fields, lakes, urban areas and so forth) was intersected with the 20m Classified Pixels to generate per-parcel pixel statistics. These include a histogram recording pixel frequency per land cover class, the modal land cover class, the total number of pixels and three attribute indicators of classification confidence. Land parcel classification results were validated against independently collected ground observations. All calculations were performed within a PosGIS database. Results were extracted from PostGIS using the QGIS export facility to give the final product as a SpatiaLite vector geodatabse. SpatiaLite is an open standard file format for geospatial vector databases and SpatiLite files can be read by most GIS software.'),\n",
+       " Document(metadata={'id': 'f3310fe1-a6ea-4cdd-b9f6-f7fc66e4652e', 'field': 'title'}, page_content='Land Cover Map 2021 (25m rasterised land parcels, N. Ireland)'),\n",
+       " Document(metadata={'id': 'f3310fe1-a6ea-4cdd-b9f6-f7fc66e4652e', 'field': 'description'}, page_content='This is a 25m pixel data set representing the land surface of Northern Ireland, classified into 21 UKCEH land cover classes, based upon Biodiversity Action Plan broad habitats. It is a three-band raster in GeoTiff format, produced by rasterising three properties of the classified land parcels dataset. The first band gives the most likely land cover type; the second band gives the per-parcel probability of the land cover, the third band is a measure of parcel purity. The probability and purity bands (scaled 0 to 100) combine to give an indication of uncertainty.  A full description of this and all UKCEH LCM2021 products are available from the LCM2021 product documentation accompanying this dataset.'),\n",
+       " Document(metadata={'id': 'f3310fe1-a6ea-4cdd-b9f6-f7fc66e4652e', 'field': 'lineage'}, page_content='UKCEH’s automated land cover algorithms classify 10 m pixels across the whole of UK. Training data were automatically selected from stable land covers over the interval of 2018 to 2020. A Random Forest classifier used these to classify four composite images representing per season median surface reflectance. Seasonal images were integrated with context layers (e.g., height, aspect, slope, coastal proximity, urban proximity and so forth) to reduce confusion among classes with similar spectra.\\n\\nLand cover was validated by organising the 10 m pixel classification into a land parcel framework (the LCM2021 classified land parcels product). The classified land parcels were compared to known land cover producing a confusion matrix to determine overall and per class accuracy. Details are available from the product documentation accompanying this dataset.\\n\\nThe 25 m rasterised land parcels product is created by pixelating the corresponding land parcel product.'),\n",
+       " Document(metadata={'id': '36343ace-d56a-43ea-9d48-2f434dafcb26', 'field': 'title'}, page_content='Land Cover Map 2020 (land parcels, N. Ireland)'),\n",
+       " Document(metadata={'id': '36343ace-d56a-43ea-9d48-2f434dafcb26', 'field': 'description'}, page_content=\"This is the land parcel (polygon) dataset for the UKCEH Land Cover Map of 2020 (LCM2020) representing Northern Ireland. It describes Northern Ireland's land cover in 2020 using UKCEH Land Cover Classes, which are based on UK Biodiversity Action Plan broad habitats.   A range of land parcel attributes are provided. These include the dominant UKCEH Land Cover Class given as an integer value and a range of per-parcel pixel statistics to help assess classification confidence and accuracy; for a full explanation please refer to the dataset documentation accompanying this dataset.\\n\\nLCM2020 represents a suite of geospatial land cover datasets (raster and polygon) describing the UK land surface in 2020. These were produced at the UK Centre for Ecology & Hydrology by classifying satellite images from 2020.   These are one of a series of UKCEH land cover maps, which began with the 1990 Land Cover Map of Great Britain (now usually referred to as LCM1990) followed by UK-wide land cover maps in 2000, 2007, 2015 and annually since 2017.\\n\\nThis work was supported by the Natural Environment Research Council award number NE/R016429/1 as part of the UK-SCAPE programme delivering National Capability. \"),\n",
+       " Document(metadata={'id': '36343ace-d56a-43ea-9d48-2f434dafcb26', 'field': 'lineage'}, page_content='UKCEH’s automated land cover algorithms generated 10m classified pixels from Sentinel-2 satellite data. Training data were automatically selected from stable land covers over the interval of 2017 to 2019. A Random Forest classifier used these to classify four composite images representing per season median surface reflectance. Seasonal images were integrated with context layers (e.g., height, aspect, slope, coastal proximity, urban proximity and so forth) to reduce confusion among classes with similar spectra.\\n\\nLand cover was validated by organising the pixel classification into a land parcel spatial framework (the LCM2020 Classified Land Parcels product). The classified land parcels were compared to known land cover producing confusion matrix to determine overall and per class accuracy. Details are available from the product documentation.\\n\\nThis product represents the LCM2020 Classified Land Parcel, Northern Ireland.')]"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "docs"
    ]
   },
   {
@@ -77,9 +124,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Filename and doc_id are the same for all nodes.                 \n",
+      "Generating: 100%|██████████| 5/5 [01:10<00:00, 14.03s/it]\n"
+     ]
+    }
+   ],
    "source": [
     "testset = gen.generate_with_langchain_docs(docs, 5, dist, is_async=False)"
    ]
@@ -93,19 +149,146 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>question</th>\n",
+       "      <th>contexts</th>\n",
+       "      <th>ground_truth</th>\n",
+       "      <th>evolution_type</th>\n",
+       "      <th>metadata</th>\n",
+       "      <th>episode_done</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>What are the total number of land parcels iden...</td>\n",
+       "      <td>[Land Cover Map 2020 (land parcels, N. Ireland)]</td>\n",
+       "      <td>The answer to given question is not present in...</td>\n",
+       "      <td>simple</td>\n",
+       "      <td>[{'id': '36343ace-d56a-43ea-9d48-2f434dafcb26'...</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>What are the land parcels included in the Land...</td>\n",
+       "      <td>[Land Cover Map 2017 (land parcels, GB)]</td>\n",
+       "      <td>The answer to given question is not present in...</td>\n",
+       "      <td>simple</td>\n",
+       "      <td>[{'id': 'b77ce981-d038-4774-a620-f50da5dd3d31'...</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>What are the main land cover classes used in t...</td>\n",
+       "      <td>[This is the land parcels (polygon) dataset fo...</td>\n",
+       "      <td>The main land cover classes used in the LCM201...</td>\n",
+       "      <td>simple</td>\n",
+       "      <td>[{'id': 'b77ce981-d038-4774-a620-f50da5dd3d31'...</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>What did RF ID in NI (LCM2020) vs UK (LCM2021)?</td>\n",
+       "      <td>[UKCEH’s automated land cover algorithms gener...</td>\n",
+       "      <td>The answer to given question is not present in...</td>\n",
+       "      <td>multi_context</td>\n",
+       "      <td>[{'id': '36343ace-d56a-43ea-9d48-2f434dafcb26'...</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>What's NI's top two land covers from '21 &amp; '20...</td>\n",
+       "      <td>[Land Cover Map 2021 (25m rasterised land parc...</td>\n",
+       "      <td>The top two land covers in Northern Ireland fo...</td>\n",
+       "      <td>reasoning</td>\n",
+       "      <td>[{'id': 'f3310fe1-a6ea-4cdd-b9f6-f7fc66e4652e'...</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                            question  \\\n",
+       "0  What are the total number of land parcels iden...   \n",
+       "1  What are the land parcels included in the Land...   \n",
+       "2  What are the main land cover classes used in t...   \n",
+       "3    What did RF ID in NI (LCM2020) vs UK (LCM2021)?   \n",
+       "4  What's NI's top two land covers from '21 & '20...   \n",
+       "\n",
+       "                                            contexts  \\\n",
+       "0   [Land Cover Map 2020 (land parcels, N. Ireland)]   \n",
+       "1           [Land Cover Map 2017 (land parcels, GB)]   \n",
+       "2  [This is the land parcels (polygon) dataset fo...   \n",
+       "3  [UKCEH’s automated land cover algorithms gener...   \n",
+       "4  [Land Cover Map 2021 (25m rasterised land parc...   \n",
+       "\n",
+       "                                        ground_truth evolution_type  \\\n",
+       "0  The answer to given question is not present in...         simple   \n",
+       "1  The answer to given question is not present in...         simple   \n",
+       "2  The main land cover classes used in the LCM201...         simple   \n",
+       "3  The answer to given question is not present in...  multi_context   \n",
+       "4  The top two land covers in Northern Ireland fo...      reasoning   \n",
+       "\n",
+       "                                            metadata  episode_done  \n",
+       "0  [{'id': '36343ace-d56a-43ea-9d48-2f434dafcb26'...          True  \n",
+       "1  [{'id': 'b77ce981-d038-4774-a620-f50da5dd3d31'...          True  \n",
+       "2  [{'id': 'b77ce981-d038-4774-a620-f50da5dd3d31'...          True  \n",
+       "3  [{'id': '36343ace-d56a-43ea-9d48-2f434dafcb26'...          True  \n",
+       "4  [{'id': 'f3310fe1-a6ea-4cdd-b9f6-f7fc66e4652e'...          True  "
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "df = testset.to_pandas()\n",
-    "df.to_csv(\"data/synthetic-datasets/test-set.csv\", index=False)\n",
     "df"
    ]
   }
  ],
  "metadata": {
+  "kernelspec": {
+   "display_name": ".testvenv",
+   "language": "python",
+   "name": "python3"
+  },
   "language_info": {
-   "name": "python"
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
   }
  },
  "nbformat": 4,

diff --git a/pyproject.toml b/pyproject.toml
@@ -2,21 +2,22 @@
 name = "llm-eval"
 version = "0.1.0"
 dependencies = [
-    "plotly == 5.24.1",
-    "pandas == 2.2.3",
+    "plotly == 5.22.0",
+    "pandas == 2.2.2",
     "numpy == 1.26.4",
     "kaleido == 0.2.1",
     "dvc[s3] == 3.2.0 ",
     "bitsandbytes == 0.44.1",
-    "haystack-ai == 2.6.0",
+    "haystack-ai == 2.2.3",
     "accelerate == 1.0.0",
-    "sentence-transformers == 3.1.1",
-    "chromadb == 0.5.15",
+    "sentence-transformers == 3.0.1",
+    "chromadb == 0.5.3",
     "ollama-haystack == 0.0.7",
-    "chroma-haystack == 0.22.1",
+    "chroma-haystack == 0.18.0",
     "ragas == 0.1.10",
-    "nltk == 3.9.1",
-    "nbformat == 4.2.0",
+    "nltk == 3.8.1",
+    "nbformat == 5.10.4",
+    "langchain == 0.2.7",
 ]
 
 [project.optional-dependencies]

diff --git a/scripts/generate_synthetic_testset.py b/scripts/generate_synthetic_testset.py
@@ -0,0 +1,56 @@
+import nest_asyncio
+from langchain_community.chat_models import ChatOllama
+from langchain_community.embeddings import OllamaEmbeddings
+from ragas.run_config import RunConfig
+from ragas.testset.evolutions import multi_context, reasoning, simple
+from ragas.testset.generator import TestsetGenerator
+from langchain.docstore.document import Document
+import json
+from argparse import ArgumentParser
+
+
+def load_metadata(metadata_file):
+    with open(metadata_file) as f:
+        json_data = json.load(f)
+        return [
+            Document(
+                page_content=metadata["value"],
+                metadata={"id": metadata["id"], "field": metadata["field"]},
+            )
+            for metadata in json_data
+        ]
+
+
+def main(metadata_file, testset_output_file, testset_size=5):
+    nest_asyncio.apply()
+    docs = load_metadata(metadata_file)
+    llm = ChatOllama(model="mistral-nemo", num_ctx=16384)
+    embeddings = OllamaEmbeddings(model="mistral-nemo", num_ctx=16384)
+    gen = TestsetGenerator.from_langchain(
+        llm, llm, embeddings, run_config=RunConfig(max_workers=1, max_retries=1)
+    )
+    dist = {simple: 0.6, multi_context: 0.2, reasoning: 0.2}
+    testset = gen.generate_with_langchain_docs(docs, testset_size, dist, is_async=False)
+    df = testset.to_pandas()
+    df.to_csv(testset_output_file, index=False)
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser("generate_synthetic_testset.py")
+    parser.add_argument(
+        "metadata_file",
+        help="Input file containing metadata to base the synthetic test questions on.",
+    )
+    parser.add_argument(
+        "testset_file",
+        help="Output file to write the test set to.",
+    )
+    parser.add_argument(
+        "testset_size",
+        help="How many questions to generate in the test set.",
+        type=int,
+        nargs="?",
+        const=5,
+    )
+    args = parser.parse_args()
+    main(args.metadata_file, args.testset_file, args.testset_size)