From 2af31c173d95eb6f73d3c5b5debf827becf1e852 Mon Sep 17 00:00:00 2001 From: Jerry Liu Date: Tue, 12 Sep 2023 16:53:59 -0700 Subject: [PATCH] Add "finetune + RAG" evaluation to knowledge fine-tuning notebook (#7643) --- .../knowledge/finetune_knowledge.ipynb | 355 ++++++++++++++---- 1 file changed, 289 insertions(+), 66 deletions(-) diff --git a/docs/examples/finetuning/knowledge/finetune_knowledge.ipynb b/docs/examples/finetuning/knowledge/finetune_knowledge.ipynb index fe12cf9d5092e..6be00f37f94a3 100644 --- a/docs/examples/finetuning/knowledge/finetune_knowledge.ipynb +++ b/docs/examples/finetuning/knowledge/finetune_knowledge.ipynb @@ -23,7 +23,8 @@ "import os\n", "import openai\n", "from llama_index import ServiceContext\n", - "from llama_index.llms import OpenAI" + "from llama_index.llms import OpenAI\n", + "from llama_index import VectorStoreIndex" ] }, { @@ -548,6 +549,25 @@ "ft_model" ] }, + { + "cell_type": "code", + "execution_count": 285, + "id": "baf60cc8-6f19-459c-9b7d-7d17b641a8a3", + "metadata": {}, + "outputs": [], + "source": [ + "# [Optional] use fine-tuned model in RAG system too\n", + "from llama_index import ServiceContext\n", + "\n", + "ft_context = ServiceContext.from_defaults(\n", + " llm=ft_model,\n", + " callback_manager=callback_manager,\n", + ")\n", + "# baseline RAG system\n", + "ft_index = VectorStoreIndex(nodes, service_context=ft_context)\n", + "ft_query_engine = ft_index.as_query_engine()" + ] + }, { "cell_type": "markdown", "id": "bdde395f-7a26-48f1-8c7d-038858866313", @@ -700,8 +720,6 @@ "outputs": [], "source": [ "# baseline RAG system\n", - "from llama_index import VectorStoreIndex\n", - "\n", "base_index = VectorStoreIndex(nodes, service_context=gpt_35_context)\n", "base_query_engine = base_index.as_query_engine()" ] @@ -759,7 +777,7 @@ }, { "cell_type": "code", - "execution_count": 278, + "execution_count": 289, "id": "71c86f72-bf55-4edf-a911-0935c4a57a4c", "metadata": {}, "outputs": [], @@ -807,11 +825,13 @@ " raw_responses = []\n", " for eval_dict in tqdm(eval_dicts):\n", " gt_response = eval_dict[\"response\"]\n", + " ft_rag_response = str(ft_query_engine.query(eval_dict[\"query\"]))\n", " ft_response = str(query_model(ft_model, eval_dict))\n", " rag_response = str(base_query_engine.query(eval_dict[\"query\"]))\n", " base_response = str(query_model(base_model, eval_dict))\n", "\n", " # try evaluations\n", + " ft_rag_eval = eval_match_gt(eval_dict[\"query\"], gt_response, ft_rag_response)\n", " ft_eval = eval_match_gt(eval_dict[\"query\"], gt_response, ft_response)\n", " rag_eval = eval_match_gt(eval_dict[\"query\"], gt_response, rag_response)\n", " base_eval = eval_match_gt(eval_dict[\"query\"], gt_response, base_response)\n", @@ -819,9 +839,11 @@ " response_dict = {\n", " \"query\": eval_dict[\"query\"],\n", " \"gt_response\": gt_response,\n", + " \"ft_rag_response\": ft_rag_response,\n", " \"ft_response\": ft_response,\n", " \"rag_response\": rag_response,\n", " \"base_response\": base_response,\n", + " \"ft_rag_eval\": ft_rag_eval,\n", " \"ft_eval\": ft_eval,\n", " \"rag_eval\": rag_eval,\n", " \"base_eval\": base_eval,\n", @@ -832,17 +854,29 @@ " raw_responses_df = pd.DataFrame(raw_responses)\n", "\n", " eval_dict = {\n", + " \"ft_rag_score\": raw_responses_df[\"ft_rag_eval\"].mean(),\n", " \"ft_score\": raw_responses_df[\"ft_eval\"].mean(),\n", " \"rag_score\": raw_responses_df[\"rag_eval\"].mean(),\n", " \"base_score\": raw_responses_df[\"base_eval\"].mean(),\n", " }\n", "\n", - " return eval_dict, raw_responses_df" + " sub_responses_df = 
raw_responses_df[\n", + " [\n", + " \"query\",\n", + " \"gt_response\",\n", + " \"ft_rag_response\",\n", + " \"ft_response\",\n", + " \"rag_response\",\n", + " \"base_response\",\n", + " ]\n", + " ]\n", + "\n", + " return eval_dict, raw_responses_df, sub_responses_df" ] }, { "cell_type": "code", - "execution_count": 274, + "execution_count": 290, "id": "76713b7d-1a31-4f65-a324-534f58b70269", "metadata": {}, "outputs": [], @@ -862,14 +896,45 @@ }, { "cell_type": "code", - "execution_count": 275, + "execution_count": 291, "id": "0076956d-338e-4f22-a10f-253774e5b41e", "metadata": {}, "outputs": [ + { + "data": { + "application/json": { + "ascii": false, + "bar_format": null, + "colour": null, + "elapsed": 0.009070158004760742, + "initial": 0, + "n": 0, + "ncols": null, + "nrows": 37, + "postfix": null, + "prefix": "", + "rate": null, + "total": 1, + "unit": "it", + "unit_divisor": 1000, + "unit_scale": false + }, + "application/vnd.jupyter.widget-view+json": { + "model_id": "8a893aed4d0748a0afff9788c3ac626d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00\n", " query\n", " gt_response\n", + " ft_rag_response\n", " ft_response\n", " rag_response\n", " base_response\n", - " ft_eval\n", - " rag_eval\n", - " base_eval\n", " \n", " \n", " \n", @@ -911,12 +974,10 @@ " 0\n", " How is the decision made whether to use safety context distillation or not?\n", " The decision to use safety context distillation is made based on the reward model score. The safety reward model is leveraged to determine whether the context-distilled output receives a better reward model score than the original answer. If the context-distilled output gets a better reward model score, it is kept. This approach helps limit the negative impact of context distillation while still utilizing it in cases where it improves the reward model score.\n", - " assistant: The decision to use safety context distillation is made based on the reward model score. If the reward model score is above a certain threshold, safety context distillation is used.\n", + " The decision on whether to use safety context distillation or not is made based on the reward model score. The safety reward model is leveraged to determine whether the context-distilled output is preferred over the original answer. Safety context distillation is only kept on examples where it receives a better reward model score. This approach helps to limit the negative impact of context distillation while still improving the model's responses on prompts that it is not good at.\n", + " assistant: The decision to use safety context distillation is made based on the reward model score. If the reward model score is higher than a certain threshold, safety context distillation is used.\n", " The decision to use safety context distillation is made based on the reward model score. The safety reward model is used to evaluate whether the context-distilled output gets a better reward model score than the original answer. If the context-distilled output receives a better reward model score, it is kept. This approach helps limit the negative impact of context distillation while still improving the safety of the model's responses.\n", - " assistant: The Llama 2 paper does not provide specific criteria for deciding when to use safety context distillation. The choice to use this method would likely depend on the specific requirements of the task and the potential risks involved. 
Safety context distillation is used to ensure that the model behaves safely even in situations that were not covered in the training data. If the task involves high-risk decisions or is in a domain where unexpected situations are likely to occur, it might be more important to use safety context distillation. However, this would likely be a decision made on a case-by-case basis, considering factors such as the complexity of the task, the quality and diversity of the training data, and the potential consequences of unsafe behavior.\n", - " 1\n", - " 1\n", - " 0\n", + " assistant: The decision to use safety context distillation in the Llama 2 paper is based on the nature of the situation and the need for safety. If the model is in a situation where it needs to generate safe responses, then safety context distillation is used. This is particularly important when the model is interacting with users in real-time, where there's a need to ensure that the outputs are safe and appropriate. The decision is not explicitly mentioned in the paper but is inferred from the context and the purpose of safety context distillation.\n", " \n", " \n", "\n", @@ -929,17 +990,17 @@ " gt_response \\\n", "0 The decision to use safety context distillation is made based on the reward model score. The safety reward model is leveraged to determine whether the context-distilled output receives a better reward model score than the original answer. If the context-distilled output gets a better reward model score, it is kept. This approach helps limit the negative impact of context distillation while still utilizing it in cases where it improves the reward model score. \n", "\n", - " ft_response \\\n", - "0 assistant: The decision to use safety context distillation is made based on the reward model score. If the reward model score is above a certain threshold, safety context distillation is used. \n", + " ft_rag_response \\\n", + "0 The decision on whether to use safety context distillation or not is made based on the reward model score. The safety reward model is leveraged to determine whether the context-distilled output is preferred over the original answer. Safety context distillation is only kept on examples where it receives a better reward model score. This approach helps to limit the negative impact of context distillation while still improving the model's responses on prompts that it is not good at. \n", + "\n", + " ft_response \\\n", + "0 assistant: The decision to use safety context distillation is made based on the reward model score. If the reward model score is higher than a certain threshold, safety context distillation is used. \n", "\n", " rag_response \\\n", "0 The decision to use safety context distillation is made based on the reward model score. The safety reward model is used to evaluate whether the context-distilled output gets a better reward model score than the original answer. If the context-distilled output receives a better reward model score, it is kept. This approach helps limit the negative impact of context distillation while still improving the safety of the model's responses. \n", "\n", - " base_response \\\n", - "0 assistant: The Llama 2 paper does not provide specific criteria for deciding when to use safety context distillation. The choice to use this method would likely depend on the specific requirements of the task and the potential risks involved. 
Safety context distillation is used to ensure that the model behaves safely even in situations that were not covered in the training data. If the task involves high-risk decisions or is in a domain where unexpected situations are likely to occur, it might be more important to use safety context distillation. However, this would likely be a decision made on a case-by-case basis, considering factors such as the complexity of the task, the quality and diversity of the training data, and the potential consequences of unsafe behavior. \n", - "\n", - " ft_eval rag_eval base_eval \n", - "0 1 1 0 " + " base_response \n", + "0 assistant: The decision to use safety context distillation in the Llama 2 paper is based on the nature of the situation and the need for safety. If the model is in a situation where it needs to generate safe responses, then safety context distillation is used. This is particularly important when the model is interacting with users in real-time, where there's a need to ensure that the outputs are safe and appropriate. The decision is not explicitly mentioned in the paper but is inferred from the context and the purpose of safety context distillation. " ] }, "metadata": {}, @@ -947,21 +1008,52 @@ } ], "source": [ - "eval_dict, raw_response_df = run_evals(train_dicts[7:8])\n", + "eval_dict, raw_response_df, sub_responses_df = run_evals(train_dicts[7:8])\n", "display(eval_dict)\n", - "display(raw_response_df)" + "display(sub_responses_df)" ] }, { "cell_type": "code", - "execution_count": 276, + "execution_count": 292, "id": "edc4e854-f8e0-4e58-93ae-a3d6c3d8c0ed", "metadata": {}, "outputs": [ + { + "data": { + "application/json": { + "ascii": false, + "bar_format": null, + "colour": null, + "elapsed": 0.005460977554321289, + "initial": 0, + "n": 0, + "ncols": null, + "nrows": 37, + "postfix": null, + "prefix": "", + "rate": null, + "total": 1, + "unit": "it", + "unit_divisor": 1000, + "unit_scale": false + }, + "application/vnd.jupyter.widget-view+json": { + "model_id": "71282adcc74b4626b31a9fb6727428d6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00\n", " query\n", " gt_response\n", + " ft_rag_response\n", " ft_response\n", " rag_response\n", " base_response\n", - " ft_eval\n", - " rag_eval\n", - " base_eval\n", " \n", " \n", " \n", @@ -1003,12 +1093,10 @@ " 0\n", " What model is used to predict the truthfulness and informativeness of the generated outputs from LLMs?\n", " A fine-tuned GPT-3 model, referred to as \"GPT-judge,\" is used to predict the truthfulness and informativeness of the generated outputs from LLMs.\n", + " A fine-tuned GPT-3 model, also known as a \"GPT-judge,\" is used to predict the truthfulness and informativeness of the generated outputs from LLMs.\n", " assistant: The model used to predict the truthfulness and informativeness of the generated outputs from LLMs is called TruthfulQA.\n", - " A fine-tuned GPT-3 model, referred to as \"GPT-judge,\" is used to predict the truthfulness and informativeness of the generated outputs from LLMs.\n", - " assistant: The Llama 2 paper does not specify a particular model used to predict the truthfulness and informativeness of the generated outputs from LLMs (Language Model). The paper primarily focuses on the limitations and potential risks of large language models. 
If you're referring to a different paper or model, please provide more details.\n", - " 0\n", - " 1\n", - " 0\n", + " A fine-tuned GPT-3 model, referred to as a \"GPT-judge,\" is used to predict the truthfulness and informativeness of the generated outputs from LLMs.\n", + " assistant: The Llama 2 paper does not specify a particular model used to predict the truthfulness and informativeness of the generated outputs from Language Models. The paper primarily focuses on the limitations and risks of large language models and does not delve into specific methods or models for evaluating truthfulness or informativeness.\n", " \n", " \n", "\n", @@ -1021,17 +1109,17 @@ " gt_response \\\n", "0 A fine-tuned GPT-3 model, referred to as \"GPT-judge,\" is used to predict the truthfulness and informativeness of the generated outputs from LLMs. \n", "\n", + " ft_rag_response \\\n", + "0 A fine-tuned GPT-3 model, also known as a \"GPT-judge,\" is used to predict the truthfulness and informativeness of the generated outputs from LLMs. \n", + "\n", " ft_response \\\n", "0 assistant: The model used to predict the truthfulness and informativeness of the generated outputs from LLMs is called TruthfulQA. \n", "\n", - " rag_response \\\n", - "0 A fine-tuned GPT-3 model, referred to as \"GPT-judge,\" is used to predict the truthfulness and informativeness of the generated outputs from LLMs. \n", - "\n", - " base_response \\\n", - "0 assistant: The Llama 2 paper does not specify a particular model used to predict the truthfulness and informativeness of the generated outputs from LLMs (Language Model). The paper primarily focuses on the limitations and potential risks of large language models. If you're referring to a different paper or model, please provide more details. \n", + " rag_response \\\n", + "0 A fine-tuned GPT-3 model, referred to as a \"GPT-judge,\" is used to predict the truthfulness and informativeness of the generated outputs from LLMs. \n", "\n", - " ft_eval rag_eval base_eval \n", - "0 0 1 0 " + " base_response \n", + "0 assistant: The Llama 2 paper does not specify a particular model used to predict the truthfulness and informativeness of the generated outputs from Language Models. The paper primarily focuses on the limitations and risks of large language models and does not delve into specific methods or models for evaluating truthfulness or informativeness. 
" ] }, "metadata": {}, @@ -1039,9 +1127,9 @@ } ], "source": [ - "eval_dict, raw_response_df = run_evals(eval_dicts[6:7])\n", + "eval_dict, raw_response_df, sub_responses_df = run_evals(eval_dicts[6:7])\n", "display(eval_dict)\n", - "display(raw_response_df)" + "display(sub_responses_df)" ] }, { @@ -1056,7 +1144,7 @@ }, { "cell_type": "code", - "execution_count": 281, + "execution_count": 297, "id": "72f7c8e3-435d-473b-a4a1-faa2667337d3", "metadata": {}, "outputs": [], @@ -1071,7 +1159,7 @@ }, { "cell_type": "code", - "execution_count": 282, + "execution_count": 298, "id": "50caca49-396b-471b-a1ec-b78cd357776c", "metadata": {}, "outputs": [ @@ -1081,11 +1169,11 @@ "ascii": false, "bar_format": null, "colour": null, - "elapsed": 0.007797956466674805, + "elapsed": 0.006827116012573242, "initial": 0, "n": 0, "ncols": null, - "nrows": 28, + "nrows": 37, "postfix": null, "prefix": "", "rate": null, @@ -1095,7 +1183,7 @@ "unit_scale": false }, "application/vnd.jupyter.widget-view+json": { - "model_id": "22763a7cef1e4d1db40ed89c11c0c0d0", + "model_id": "d35989347b4442e78e8307ce1662a661", "version_major": 2, "version_minor": 0 }, @@ -1109,7 +1197,7 @@ { "data": { "text/plain": [ - "{'ft_score': 0.425, 'rag_score': 0.7, 'base_score': 0.225}" + "{'ft_rag_score': 0.75, 'ft_score': 0.45, 'rag_score': 0.7, 'base_score': 0.3}" ] }, "metadata": {}, @@ -1117,14 +1205,145 @@ } ], "source": [ - "eval_result, raw_response_df = run_evals(train_dicts_sample)\n", - "display(eval_result)\n", + "result_train, raw_response_df, sub_responses_df = run_evals(train_dicts_sample)\n", + "display(result_train)\n", "# display(raw_response_df)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 301, + "id": "554296bc-be25-4beb-bc3f-5afe7117e890", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
querygt_responseft_rag_responseft_responserag_responsebase_responseft_rag_evalft_evalrag_evalbase_eval
11What is the five-shot performance of Llama 2 with a 70B model on the Massive Multitask Language Understanding (MMLU) benchmark?The five-shot performance of Llama 2 with a 70B model on the Massive Multitask Language Understanding (MMLU) benchmark is 78.5.The five-shot performance of Llama 2 with a 70B model on the Massive Multitask Language Understanding (MMLU) benchmark is 78.5.assistant: The five-shot performance of Llama 2 with a 70B model on the Massive Multitask Language Understanding (MMLU) benchmark is 82.7.The five-shot performance of Llama 2 with a 70B model on the Massive Multitask Language Understanding (MMLU) benchmark is 85.0.assistant: The Llama 2 paper does not provide specific information on the five-shot performance of a 70B model on the Massive Multitask Language Understanding (MMLU) benchmark. Please refer to the original paper or contact the authors for the most accurate information.1000
22What is the role of third-party SFT data in the fine-tuning process of the Llama 2 model?The role of third-party SFT data in the fine-tuning process of the Llama 2 model is to provide additional examples for aligning the model towards dialogue-style instructions. However, the context does not provide specific details about how the third-party SFT data is utilized in the fine-tuning process.The role of third-party SFT data in the fine-tuning process of the Llama 2 model is to align LLMs (Language Model Models) towards dialogue-style instructions. However, it has been found that many of these third-party SFT datasets have insufficient diversity and quality. As a result, the focus was shifted to collecting several thousand examples of high-quality SFT data. By using fewer but higher-quality examples from vendor-based annotation efforts, the results notably improved. The findings are similar to previous research that suggests a limited set of clean instruction-tuning data can be sufficient to achieve a high level of quality. It was found that tens of thousands of SFT annotations were enough to achieve a high-quality result in the fine-tuning process of the Llama 2 model.assistant: The role of third-party SFT data in the fine-tuning process of the Llama 2 model is to provide additional training data for certain categories. This data is used to improve the performance of the model in those specific categories.The context does not provide information about the role of third-party SFT data in the fine-tuning process of the Llama 2 model.assistant: I'm sorry for any confusion, but as of my current knowledge, there is no specific model called \"Llama 2\". Therefore, I can't provide information about the role of third-party SFT data in its fine-tuning process. However, I can tell you that in general, third-party SFT (Supervised Fine-Tuning) data is often used in machine learning to improve the performance of a model by training it on a specific task. If you have any more details or if there's a specific context, I'd be happy to help further!1100
35What measures have been taken to mitigate the generation of harmful, offensive, or biased content by Llama 2?Efforts have been made to mitigate the generation of harmful, offensive, or biased content by Llama 2. The model has undergone fine-tuning to address these issues, although some issues may still remain, especially for languages other than English where publicly available datasets were not available. OpenAI acknowledges the potential for harmful, offensive, or biased content due to the training on publicly available online datasets. They have attempted to balance safety with helpfulness, but in some instances, the safety tuning may go too far, resulting in an overly cautious approach. OpenAI encourages users of Llama 2 to be cautious and take extra steps in tuning and deployment as described in their Responsible Use Guide.Instruction fine-tuning has been used to mitigate the generation of harmful, offensive, or biased content by Llama 2. This process has shown to effectively reduce the percentage of toxic model generations, resulting in an effectively zero percentage of toxic generations for the fine-tuned Llama 2-Chat models of all sizes. The results of the toxicity analysis for the fine-tuned models are not presented in Table 45 because of this significant reduction in toxic content generation.assistant: Llama 2 has been trained on a filtered and curated dataset to mitigate the generation of harmful, offensive, or biased content. Additionally, a reward model has been used to fine-tune Llama 2-Chat to further align the model with human preferences and to reduce the generation of harmful and untruthful outputs. The reward model is based on the comparison of model responses, where human annotators rank the responses according to their quality. This iterative process of training and evaluation has helped in mitigating the generation of harmful, offensive, or biased content by Llama 2.Llama 2 has taken measures to mitigate the generation of harmful, offensive, or biased content. One of the measures is the use of the ToxiGen dataset, which contains implicitly toxic and benign sentences mentioning minority groups. Llama 2 uses a revised version of this dataset to reduce noise and filter out prompts for which annotators disagree on the target demographic group. Additionally, Llama 2 employs a default ToxiGen classifier tuned on RoBERTa to measure the toxicity of its generations. Furthermore, Llama 2 conducts a sentiment analysis using the Valence Aware Dictionary and Sentiment Reasoner (VADER) to evaluate the sentiments conveyed by the combination of prompt prefix and model generation. These measures help in analyzing and addressing the potential biases and toxicity in the generated content.assistant: The Llama 2 paper does not provide specific details on measures taken to mitigate the generation of harmful, offensive, or biased content. However, it's common for AI models like Llama 2 to incorporate various strategies to ensure responsible use. These may include the use of reinforcement learning from human feedback to reduce harmful and untruthful outputs, and the use of guidelines for human reviewers during the fine-tuning process to avoid potential biases. Additionally, efforts are often made to improve the default behavior of the system, and provide users with customization options to define the AI's values within broad bounds. Please refer to the original source or the organization behind Llama 2 for more specific information.1100
\n", + "
" + ], + "text/plain": [ + " query \\\n", + "11 What is the five-shot performance of Llama 2 with a 70B model on the Massive Multitask Language Understanding (MMLU) benchmark? \n", + "22 What is the role of third-party SFT data in the fine-tuning process of the Llama 2 model? \n", + "35 What measures have been taken to mitigate the generation of harmful, offensive, or biased content by Llama 2? \n", + "\n", + " gt_response \\\n", + "11 The five-shot performance of Llama 2 with a 70B model on the Massive Multitask Language Understanding (MMLU) benchmark is 78.5. \n", + "22 The role of third-party SFT data in the fine-tuning process of the Llama 2 model is to provide additional examples for aligning the model towards dialogue-style instructions. However, the context does not provide specific details about how the third-party SFT data is utilized in the fine-tuning process. \n", + "35 Efforts have been made to mitigate the generation of harmful, offensive, or biased content by Llama 2. The model has undergone fine-tuning to address these issues, although some issues may still remain, especially for languages other than English where publicly available datasets were not available. OpenAI acknowledges the potential for harmful, offensive, or biased content due to the training on publicly available online datasets. They have attempted to balance safety with helpfulness, but in some instances, the safety tuning may go too far, resulting in an overly cautious approach. OpenAI encourages users of Llama 2 to be cautious and take extra steps in tuning and deployment as described in their Responsible Use Guide. \n", + "\n", + " ft_rag_response \\\n", + "11 The five-shot performance of Llama 2 with a 70B model on the Massive Multitask Language Understanding (MMLU) benchmark is 78.5. \n", + "22 The role of third-party SFT data in the fine-tuning process of the Llama 2 model is to align LLMs (Language Model Models) towards dialogue-style instructions. However, it has been found that many of these third-party SFT datasets have insufficient diversity and quality. As a result, the focus was shifted to collecting several thousand examples of high-quality SFT data. By using fewer but higher-quality examples from vendor-based annotation efforts, the results notably improved. The findings are similar to previous research that suggests a limited set of clean instruction-tuning data can be sufficient to achieve a high level of quality. It was found that tens of thousands of SFT annotations were enough to achieve a high-quality result in the fine-tuning process of the Llama 2 model. \n", + "35 Instruction fine-tuning has been used to mitigate the generation of harmful, offensive, or biased content by Llama 2. This process has shown to effectively reduce the percentage of toxic model generations, resulting in an effectively zero percentage of toxic generations for the fine-tuned Llama 2-Chat models of all sizes. The results of the toxicity analysis for the fine-tuned models are not presented in Table 45 because of this significant reduction in toxic content generation. \n", + "\n", + " ft_response \\\n", + "11 assistant: The five-shot performance of Llama 2 with a 70B model on the Massive Multitask Language Understanding (MMLU) benchmark is 82.7. \n", + "22 assistant: The role of third-party SFT data in the fine-tuning process of the Llama 2 model is to provide additional training data for certain categories. This data is used to improve the performance of the model in those specific categories. 
\n", + "35 assistant: Llama 2 has been trained on a filtered and curated dataset to mitigate the generation of harmful, offensive, or biased content. Additionally, a reward model has been used to fine-tune Llama 2-Chat to further align the model with human preferences and to reduce the generation of harmful and untruthful outputs. The reward model is based on the comparison of model responses, where human annotators rank the responses according to their quality. This iterative process of training and evaluation has helped in mitigating the generation of harmful, offensive, or biased content by Llama 2. \n", + "\n", + " rag_response \\\n", + "11 The five-shot performance of Llama 2 with a 70B model on the Massive Multitask Language Understanding (MMLU) benchmark is 85.0. \n", + "22 The context does not provide information about the role of third-party SFT data in the fine-tuning process of the Llama 2 model. \n", + "35 Llama 2 has taken measures to mitigate the generation of harmful, offensive, or biased content. One of the measures is the use of the ToxiGen dataset, which contains implicitly toxic and benign sentences mentioning minority groups. Llama 2 uses a revised version of this dataset to reduce noise and filter out prompts for which annotators disagree on the target demographic group. Additionally, Llama 2 employs a default ToxiGen classifier tuned on RoBERTa to measure the toxicity of its generations. Furthermore, Llama 2 conducts a sentiment analysis using the Valence Aware Dictionary and Sentiment Reasoner (VADER) to evaluate the sentiments conveyed by the combination of prompt prefix and model generation. These measures help in analyzing and addressing the potential biases and toxicity in the generated content. \n", + "\n", + " base_response \\\n", + "11 assistant: The Llama 2 paper does not provide specific information on the five-shot performance of a 70B model on the Massive Multitask Language Understanding (MMLU) benchmark. Please refer to the original paper or contact the authors for the most accurate information. \n", + "22 assistant: I'm sorry for any confusion, but as of my current knowledge, there is no specific model called \"Llama 2\". Therefore, I can't provide information about the role of third-party SFT data in its fine-tuning process. However, I can tell you that in general, third-party SFT (Supervised Fine-Tuning) data is often used in machine learning to improve the performance of a model by training it on a specific task. If you have any more details or if there's a specific context, I'd be happy to help further! \n", + "35 assistant: The Llama 2 paper does not provide specific details on measures taken to mitigate the generation of harmful, offensive, or biased content. However, it's common for AI models like Llama 2 to incorporate various strategies to ensure responsible use. These may include the use of reinforcement learning from human feedback to reduce harmful and untruthful outputs, and the use of guidelines for human reviewers during the fine-tuning process to avoid potential biases. Additionally, efforts are often made to improve the default behavior of the system, and provide users with customization options to define the AI's values within broad bounds. Please refer to the original source or the organization behind Llama 2 for more specific information. 
\n", + "\n", + " ft_rag_eval ft_eval rag_eval base_eval \n", + "11 1 0 0 0 \n", + "22 1 1 0 0 \n", + "35 1 1 0 0 " + ] + }, + "execution_count": 301, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# look at where ft_rag_score did well but rag didn't\n", + "d = raw_response_df\n", + "d[(d[\"ft_rag_eval\"] == 1) & (d[\"rag_eval\"] == 0)]" + ] + }, + { + "cell_type": "code", + "execution_count": 295, "id": "b737d2a3-a8c3-46f5-9709-cb16b56365a9", "metadata": {}, "outputs": [ @@ -1134,11 +1353,11 @@ "ascii": false, "bar_format": null, "colour": null, - "elapsed": 0.008716106414794922, + "elapsed": 0.009910106658935547, "initial": 0, "n": 0, "ncols": null, - "nrows": 28, + "nrows": 37, "postfix": null, "prefix": "", "rate": null, @@ -1148,7 +1367,7 @@ "unit_scale": false }, "application/vnd.jupyter.widget-view+json": { - "model_id": "900f482111954273aec557a292681958", + "model_id": "f8f8685cefb14e529baf3e97c2e469e2", "version_major": 2, "version_minor": 0 }, @@ -1158,21 +1377,25 @@ }, "metadata": {}, "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "{'ft_rag_score': 0.825,\n", + " 'ft_score': 0.375,\n", + " 'rag_score': 0.775,\n", + " 'base_score': 0.225}" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "eval_result, raw_response_df = run_evals(eval_dicts_sample)\n", - "display(eval_result)\n", + "result_eval, raw_response_df, sub_responses_df = run_evals(eval_dicts_sample)\n", + "display(result_eval)\n", "# display(raw_response_df)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8e6295bc-3637-49ab-9c87-060232b6c3da", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": {