Skip to content

Commit

Permalink
Adds scripts for synthetic test set generation
Browse files Browse the repository at this point in the history
  • Loading branch information
matthewcoole committed Nov 15, 2024
1 parent 0592fcd commit 23343d7
Show file tree
Hide file tree
Showing 3 changed files with 261 additions and 21 deletions.
209 changes: 196 additions & 13 deletions notebooks/ragas_synth.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,32 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/mpc/github/llm-eval/.testvenv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import nest_asyncio\n",
"from langchain_community.chat_models import ChatOllama\n",
"from langchain_community.embeddings import OllamaEmbeddings\n",
"from ragas.run_config import RunConfig\n",
"from ragas.testset.evolutions import multi_context, reasoning, simple\n",
"from ragas.testset.generator import TestsetGenerator"
"from ragas.testset.generator import TestsetGenerator\n",
"from langchain.docstore.document import Document\n",
"import json"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -40,7 +51,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -61,11 +72,47 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"docs = [] # load a set of langchain docs to base the synthetic test set generation on"
"with open(\"../data/extracted_metadata.json\") as f:\n",
" json_data = json.load(f)\n",
" docs = [\n",
" Document(\n",
" page_content=metadata[\"value\"],\n",
" metadata={\"id\": metadata[\"id\"], \"field\": metadata[\"field\"]},\n",
" )\n",
" for metadata in json_data\n",
" ]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(metadata={'id': 'b77ce981-d038-4774-a620-f50da5dd3d31', 'field': 'title'}, page_content='Land Cover Map 2017 (land parcels, GB)'),\n",
" Document(metadata={'id': 'b77ce981-d038-4774-a620-f50da5dd3d31', 'field': 'description'}, page_content=\"This is the land parcels (polygon) dataset for the UKCEH Land Cover Map of 2017 (LCM2017) representing Great Britain. It describes Great Britain's land cover in 2017 using UKCEH Land Cover Classes, which are based on UK Biodiversity Action Plan broad habitats. This dataset was derived from the corresponding LCM2017 20m classified pixels dataset. All further LCM2017 datasets for Great Britain are derived from this land parcel product. A range of land parcel attributes are provided. These include the dominant UKCEH Land Cover Class given as an integer value, and a range of per-parcel pixel statistics to help assessing classification confidence and accuracy; for a full explanation please refer to the dataset documentation.\\n\\nThis work was supported by the Natural Environment Research Council award number NE/R016429/1 as part of the UK-SCAPE programme delivering National Capability.\\n\\n\"),\n",
" Document(metadata={'id': 'b77ce981-d038-4774-a620-f50da5dd3d31', 'field': 'lineage'}, page_content='The Land Parcels datasets of the LCM2017, LCM2018 and LCM2019 product range were derived from the corresponding 20m Classified Pixels datasets. They give per-parcel land cover information in 21 classes based on UK Biodiversity Action Plan broad habitats. A UK spatial framework of land parcels (vector-polygons) representing real-world objects (for example: fields, lakes, urban areas and so forth) was intersected with the 20m Classified Pixels to generate per-parcel pixel statistics. These include a histogram recording pixel frequency per land cover class, the modal land cover class, the total number of pixels and three attribute indicators of classification confidence. Land parcel classification results were validated against independently collected ground observations. All calculations were performed within a PosGIS database. Results were extracted from PostGIS using the QGIS export facility to give the final product as a SpatiaLite vector geodatabse. SpatiaLite is an open standard file format for geospatial vector databases and SpatiLite files can be read by most GIS software.'),\n",
" Document(metadata={'id': 'f3310fe1-a6ea-4cdd-b9f6-f7fc66e4652e', 'field': 'title'}, page_content='Land Cover Map 2021 (25m rasterised land parcels, N. Ireland)'),\n",
" Document(metadata={'id': 'f3310fe1-a6ea-4cdd-b9f6-f7fc66e4652e', 'field': 'description'}, page_content='This is a 25m pixel data set representing the land surface of Northern Ireland, classified into 21 UKCEH land cover classes, based upon Biodiversity Action Plan broad habitats. It is a three-band raster in GeoTiff format, produced by rasterising three properties of the classified land parcels dataset. The first band gives the most likely land cover type; the second band gives the per-parcel probability of the land cover, the third band is a measure of parcel purity. The probability and purity bands (scaled 0 to 100) combine to give an indication of uncertainty. A full description of this and all UKCEH LCM2021 products are available from the LCM2021 product documentation accompanying this dataset.'),\n",
" Document(metadata={'id': 'f3310fe1-a6ea-4cdd-b9f6-f7fc66e4652e', 'field': 'lineage'}, page_content='UKCEH’s automated land cover algorithms classify 10 m pixels across the whole of UK. Training data were automatically selected from stable land covers over the interval of 2018 to 2020. A Random Forest classifier used these to classify four composite images representing per season median surface reflectance. Seasonal images were integrated with context layers (e.g., height, aspect, slope, coastal proximity, urban proximity and so forth) to reduce confusion among classes with similar spectra.\\n\\nLand cover was validated by organising the 10 m pixel classification into a land parcel framework (the LCM2021 classified land parcels product). The classified land parcels were compared to known land cover producing a confusion matrix to determine overall and per class accuracy. Details are available from the product documentation accompanying this dataset.\\n\\nThe 25 m rasterised land parcels product is created by pixelating the corresponding land parcel product.'),\n",
" Document(metadata={'id': '36343ace-d56a-43ea-9d48-2f434dafcb26', 'field': 'title'}, page_content='Land Cover Map 2020 (land parcels, N. Ireland)'),\n",
" Document(metadata={'id': '36343ace-d56a-43ea-9d48-2f434dafcb26', 'field': 'description'}, page_content=\"This is the land parcel (polygon) dataset for the UKCEH Land Cover Map of 2020 (LCM2020) representing Northern Ireland. It describes Northern Ireland's land cover in 2020 using UKCEH Land Cover Classes, which are based on UK Biodiversity Action Plan broad habitats. A range of land parcel attributes are provided. These include the dominant UKCEH Land Cover Class given as an integer value and a range of per-parcel pixel statistics to help assess classification confidence and accuracy; for a full explanation please refer to the dataset documentation accompanying this dataset.\\n\\nLCM2020 represents a suite of geospatial land cover datasets (raster and polygon) describing the UK land surface in 2020. These were produced at the UK Centre for Ecology & Hydrology by classifying satellite images from 2020. These are one of a series of UKCEH land cover maps, which began with the 1990 Land Cover Map of Great Britain (now usually referred to as LCM1990) followed by UK-wide land cover maps in 2000, 2007, 2015 and annually since 2017.\\n\\nThis work was supported by the Natural Environment Research Council award number NE/R016429/1 as part of the UK-SCAPE programme delivering National Capability. \"),\n",
" Document(metadata={'id': '36343ace-d56a-43ea-9d48-2f434dafcb26', 'field': 'lineage'}, page_content='UKCEH’s automated land cover algorithms generated 10m classified pixels from Sentinel-2 satellite data. Training data were automatically selected from stable land covers over the interval of 2017 to 2019. A Random Forest classifier used these to classify four composite images representing per season median surface reflectance. Seasonal images were integrated with context layers (e.g., height, aspect, slope, coastal proximity, urban proximity and so forth) to reduce confusion among classes with similar spectra.\\n\\nLand cover was validated by organising the pixel classification into a land parcel spatial framework (the LCM2020 Classified Land Parcels product). The classified land parcels were compared to known land cover producing confusion matrix to determine overall and per class accuracy. Details are available from the product documentation.\\n\\nThis product represents the LCM2020 Classified Land Parcel, Northern Ireland.')]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs"
]
},
{
Expand All @@ -77,9 +124,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Filename and doc_id are the same for all nodes. \n",
"Generating: 100%|██████████| 5/5 [01:10<00:00, 14.03s/it]\n"
]
}
],
"source": [
"testset = gen.generate_with_langchain_docs(docs, 5, dist, is_async=False)"
]
Expand All @@ -93,19 +149,146 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>question</th>\n",
" <th>contexts</th>\n",
" <th>ground_truth</th>\n",
" <th>evolution_type</th>\n",
" <th>metadata</th>\n",
" <th>episode_done</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>What are the total number of land parcels iden...</td>\n",
" <td>[Land Cover Map 2020 (land parcels, N. Ireland)]</td>\n",
" <td>The answer to given question is not present in...</td>\n",
" <td>simple</td>\n",
" <td>[{'id': '36343ace-d56a-43ea-9d48-2f434dafcb26'...</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>What are the land parcels included in the Land...</td>\n",
" <td>[Land Cover Map 2017 (land parcels, GB)]</td>\n",
" <td>The answer to given question is not present in...</td>\n",
" <td>simple</td>\n",
" <td>[{'id': 'b77ce981-d038-4774-a620-f50da5dd3d31'...</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>What are the main land cover classes used in t...</td>\n",
" <td>[This is the land parcels (polygon) dataset fo...</td>\n",
" <td>The main land cover classes used in the LCM201...</td>\n",
" <td>simple</td>\n",
" <td>[{'id': 'b77ce981-d038-4774-a620-f50da5dd3d31'...</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>What did RF ID in NI (LCM2020) vs UK (LCM2021)?</td>\n",
" <td>[UKCEH’s automated land cover algorithms gener...</td>\n",
" <td>The answer to given question is not present in...</td>\n",
" <td>multi_context</td>\n",
" <td>[{'id': '36343ace-d56a-43ea-9d48-2f434dafcb26'...</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>What's NI's top two land covers from '21 &amp; '20...</td>\n",
" <td>[Land Cover Map 2021 (25m rasterised land parc...</td>\n",
" <td>The top two land covers in Northern Ireland fo...</td>\n",
" <td>reasoning</td>\n",
" <td>[{'id': 'f3310fe1-a6ea-4cdd-b9f6-f7fc66e4652e'...</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" question \\\n",
"0 What are the total number of land parcels iden... \n",
"1 What are the land parcels included in the Land... \n",
"2 What are the main land cover classes used in t... \n",
"3 What did RF ID in NI (LCM2020) vs UK (LCM2021)? \n",
"4 What's NI's top two land covers from '21 & '20... \n",
"\n",
" contexts \\\n",
"0 [Land Cover Map 2020 (land parcels, N. Ireland)] \n",
"1 [Land Cover Map 2017 (land parcels, GB)] \n",
"2 [This is the land parcels (polygon) dataset fo... \n",
"3 [UKCEH’s automated land cover algorithms gener... \n",
"4 [Land Cover Map 2021 (25m rasterised land parc... \n",
"\n",
" ground_truth evolution_type \\\n",
"0 The answer to given question is not present in... simple \n",
"1 The answer to given question is not present in... simple \n",
"2 The main land cover classes used in the LCM201... simple \n",
"3 The answer to given question is not present in... multi_context \n",
"4 The top two land covers in Northern Ireland fo... reasoning \n",
"\n",
" metadata episode_done \n",
"0 [{'id': '36343ace-d56a-43ea-9d48-2f434dafcb26'... True \n",
"1 [{'id': 'b77ce981-d038-4774-a620-f50da5dd3d31'... True \n",
"2 [{'id': 'b77ce981-d038-4774-a620-f50da5dd3d31'... True \n",
"3 [{'id': '36343ace-d56a-43ea-9d48-2f434dafcb26'... True \n",
"4 [{'id': 'f3310fe1-a6ea-4cdd-b9f6-f7fc66e4652e'... True "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = testset.to_pandas()\n",
"df.to_csv(\"data/synthetic-datasets/test-set.csv\", index=False)\n",
"df"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".testvenv",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python"
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
Expand Down
17 changes: 9 additions & 8 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,22 @@
name = "llm-eval"
version = "0.1.0"
dependencies = [
"plotly == 5.24.1",
"pandas == 2.2.3",
"plotly == 5.22.0",
"pandas == 2.2.2",
"numpy == 1.26.4",
"kaleido == 0.2.1",
"dvc[s3] == 3.2.0 ",
"bitsandbytes == 0.44.1",
"haystack-ai == 2.6.0",
"haystack-ai == 2.2.3",
"accelerate == 1.0.0",
"sentence-transformers == 3.1.1",
"chromadb == 0.5.15",
"sentence-transformers == 3.0.1",
"chromadb == 0.5.3",
"ollama-haystack == 0.0.7",
"chroma-haystack == 0.22.1",
"chroma-haystack == 0.18.0",
"ragas == 0.1.10",
"nltk == 3.9.1",
"nbformat == 4.2.0",
"nltk == 3.8.1",
"nbformat == 5.10.4",
"langchain == 0.2.7",
]

[project.optional-dependencies]
Expand Down
56 changes: 56 additions & 0 deletions scripts/generate_synthetic_testset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import nest_asyncio
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings
from ragas.run_config import RunConfig
from ragas.testset.evolutions import multi_context, reasoning, simple
from ragas.testset.generator import TestsetGenerator
from langchain.docstore.document import Document
import json
from argparse import ArgumentParser


def load_metadata(metadata_file):
    """Load metadata records from a JSON file as langchain documents.

    The file must contain a JSON array of objects, each providing the keys
    ``"value"``, ``"id"`` and ``"field"``.

    Args:
        metadata_file: Path of the JSON file to read.

    Returns:
        A list with one ``Document`` per record. The record's ``"value"``
        becomes the page content; its ``"id"`` and ``"field"`` are kept in
        the document metadata.

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    # JSON is UTF-8 by specification. Being explicit keeps non-ASCII text
    # intact on platforms whose locale default encoding is something else.
    with open(metadata_file, encoding="utf-8") as f:
        records = json.load(f)
    return [
        Document(
            page_content=record["value"],
            metadata={"id": record["id"], "field": record["field"]},
        )
        for record in records
    ]


def main(metadata_file, testset_output_file, testset_size=5):
    """Generate a synthetic test set from a metadata file and save it as CSV.

    Args:
        metadata_file: JSON metadata file handed to ``load_metadata``.
        testset_output_file: Destination path of the CSV (written without
            the index column).
        testset_size: Number of test questions requested from the generator.
    """
    nest_asyncio.apply()
    source_docs = load_metadata(metadata_file)

    # One Ollama chat model is passed for both LLM arguments of the generator.
    chat_model = ChatOllama(model="mistral-nemo", num_ctx=16384)
    embedder = OllamaEmbeddings(model="mistral-nemo", num_ctx=16384)
    run_config = RunConfig(max_workers=1, max_retries=1)
    generator = TestsetGenerator.from_langchain(
        chat_model, chat_model, embedder, run_config=run_config
    )

    # Share of each question evolution type in the generated set.
    distribution = {simple: 0.6, multi_context: 0.2, reasoning: 0.2}
    generated = generator.generate_with_langchain_docs(
        source_docs, testset_size, distribution, is_async=False
    )
    generated.to_pandas().to_csv(testset_output_file, index=False)


def build_parser():
    """Build the command-line parser for this script.

    Returns:
        An ``ArgumentParser`` with two required positionals
        (``metadata_file``, ``testset_file``) and one optional positional
        (``testset_size``, an int defaulting to 5).
    """
    parser = ArgumentParser("generate_synthetic_testset.py")
    parser.add_argument(
        "metadata_file",
        help="Input file containing metadata to base the synthetic test questions on.",
    )
    parser.add_argument(
        "testset_file",
        help="Output file to write the test set to.",
    )
    parser.add_argument(
        "testset_size",
        help="How many questions to generate in the test set.",
        type=int,
        nargs="?",
        # argparse only consults ``const`` for option flags; an omitted
        # positional with nargs="?" takes ``default``. Without it the value
        # would be None and override main()'s own default.
        default=5,
    )
    return parser


if __name__ == "__main__":
    args = build_parser().parse_args()
    main(args.metadata_file, args.testset_file, args.testset_size)

1 comment on commit 23343d7

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

context_recall: 0.5344457898461468
answer_correctness: 0.49286836983792814
answer_relevancy: 0.4926250621684178
context_precision: 0.4961491211510996

Please sign in to comment.