From 23e6ade54e4769dcd36875601c7597ffc172886c Mon Sep 17 00:00:00 2001 From: bilgeyucel Date: Wed, 30 Oct 2024 21:27:24 +0300 Subject: [PATCH] Remove some output cells from the evaluating ai with Haystack cookbook --- notebooks/evaluating_ai_with_haystack.ipynb | 1744 ++----------------- 1 file changed, 141 insertions(+), 1603 deletions(-) diff --git a/notebooks/evaluating_ai_with_haystack.ipynb b/notebooks/evaluating_ai_with_haystack.ipynb index 6eaa888..386ba87 100644 --- a/notebooks/evaluating_ai_with_haystack.ipynb +++ b/notebooks/evaluating_ai_with_haystack.ipynb @@ -10,7 +10,7 @@ "\n", "by Bilge Yucel ([X](https://x.com/bilgeycl), [Linkedin](https://www.linkedin.com/in/bilge-yucel/))\n", "\n", - "In this cookbook, we walktrough the [Evaluators](https://docs.haystack.deepset.ai/docs/evaluators) in Haystack, create an evaluation pipeline, streamline the evaluation with [`EvaluationHarness`](https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/evaluation/harness) and try different Evaluation Frameworks like [Ragas](https://haystack.deepset.ai/integrations/ragas) and [FlowJudge](https://haystack.deepset.ai/integrations/flow-judge). \n", + "In this cookbook, we walk through the [Evaluators](https://docs.haystack.deepset.ai/docs/evaluators) in Haystack, create an evaluation pipeline, streamline the evaluation with [`EvaluationHarness`](https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/evaluation/harness) and try different Evaluation Frameworks like [Ragas](https://haystack.deepset.ai/integrations/ragas) and [FlowJudge](https://haystack.deepset.ai/integrations/flow-judge). \n", "\n", "šŸ“š **Useful Resources:**\n", "* [Article: Benchmarking Haystack Pipelines for Optimal Performance](https://haystack.deepset.ai/blog/benchmarking-haystack-pipelines)\n", @@ -950,7 +950,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -960,16 +960,102 @@ "id": "f1rGBuvocwLB", "outputId": "d607b987-2f03-47dd-f3ee-0cedd73fe4b9" }, + "outputs": [], + "source": [ + "eval_results.to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "collapsed": true, + "id": "dFrIfDXiaP41", + "outputId": "4f187782-7933-4bc0-89b9-51c6672c20ec" + }, + "outputs": [], + "source": [ + "index = 2\n", + "print(eval_pipeline_results['context_relevance'][\"individual_scores\"][index], \"\\nQuestion:\", questions[index],\"\\nTrue Answer:\", answers[index], \"\\nAnswer:\", predicted_answers[index])\n", + "print(\"\".join([doc.content for doc in retrieved_context[index]]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OmkHqAsQZhFr" + }, + "source": [ + "## Evaluation Harness (Step 4, 5, and 6)\n", + "\n", + "* Runs the RAG pipeline\n", + "* Runs the evaluation\n", + "\n", + "> Try `EvaluationHarness` and give us feedback [on Github](https://github.com/deepset-ai/haystack-experimental/discussions/74)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "IQlMUUOeZkpT" + }, + "outputs": [], + "source": [ + "from haystack_experimental.evaluation.harness.rag import (\n", + " DefaultRAGArchitecture,\n", + " RAGEvaluationHarness,\n", + " RAGEvaluationMetric,\n", + " RAGEvaluationInput\n", + ")\n", + "\n", + "pipeline_eval_harness = RAGEvaluationHarness(\n", + " rag_pipeline = basic_rag,\n", + " 
rag_components=DefaultRAGArchitecture.GENERATION_WITH_EMBEDDING_RETRIEVAL, # query_embedder, retriever, prompt_builder, generator\n", + " metrics={\n", + " RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY,\n", + " RAGEvaluationMetric.FAITHFULNESS,\n", + " RAGEvaluationMetric.CONTEXT_RELEVANCE,\n", + " }\n", + ")\n", + "\n", + "eval_harness_input = RAGEvaluationInput(\n", + " queries=questions,\n", + " ground_truth_answers=answers,\n", + " rag_pipeline_inputs={\n", + " \"prompt_builder\": {\"question\": list(questions)},\n", + " },\n", + ")\n", + "\n", + "harness_eval_run= pipeline_eval_harness.run(inputs=eval_harness_input, run_name=run_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 143 + }, + "collapsed": true, + "id": "edZNrdB9sKwm", + "outputId": "b4d8ad56-578e-4953-bdea-4c1d1d4f1a54" + }, "outputs": [ { "data": { "application/vnd.google.colaboratory.intrinsic+json": { - "summary": "{\n \"name\": \"eval_results\",\n \"rows\": 15,\n \"fields\": [\n {\n \"column\": \"questions\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 15,\n \"samples\": [\n \"What detailed methodology does LLaMA utilize to ensure the diversity of its pre-training data, particularly in the context of filtering and language identification?\",\n \"What specific enhancements are recommended for language models to bridge the knowledge application gap identified through comprehensive multitask testing?\",\n \"What are the two main tasks BERT is pre-trained on?\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"contexts\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"true_answers\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 15,\n \"samples\": [\n \"LLaMA's methodology for ensuring data diversity includes comprehensive filtering criteria to remove low-quality content, language identification to support multilingual capabilities, and a balanced inclusion of various data sources such as Wikipedia, web text, and books, emphasizing the representation of a wide range of topics and languages.\",\n \"Enhancements should focus on developing models with improved procedural knowledge, superior calculation abilities, and a more accurate calibration between confidence and actual performance, directly addressing the weaknesses uncovered in current evaluations.\",\n \"Masked LM (MLM) and Next Sentence Prediction (NSP).\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"predicted_answers\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 13,\n \"samples\": [\n \"DetectGPT's detection approach is significant in the context of evolving large language model (LLM) capabilities as it addresses the challenges posed by these models, which are increasingly adept at generating coherent and convincing text. As LLMs improve, they become more attractive tools for various applications, including education and journalism; however, this rise brings with it the potential for misuse, such as the production of misleading or inaccurate information. \\n\\nDetectGPT leverages the hypothesis that machine-generated texts typically occupy regions of negative curvature in the log probability function of the generating model. 
By comparing the log probabilities of original passages with those of slightly modified perturbations, DetectGPT can effectively differentiate between human-written and model-generated text. This method not only enhances detection accuracy but also adapts to the rapid advancements in LLMs, making it a timely solution as these models proliferate.\\n\\nFurthermore, as LLMs are deployed in real-world contexts, such as automatic essay writing or news generation with minimal human oversight, the risks of fraudulence and misinformation escalate. DetectGPT provides a potential safeguard by improving the reliability of detecting AI-generated content, helping educators and news consumers ascertain the authenticity of the texts they engage with. \\n\\nOverall, DetectGPT's innovative detection strategy is crucial for mitigating the associated risks of LLM misuse, ensuring that the growing use of these technologies does not compromise academic integrity or the credibility of information disseminated to the public. Its ongoing development and improvement could serve as a vital countermeasure in an era of AI where trustworthiness becomes increasingly difficult to verify.\",\n \"The context does not provide specific enhancements recommended for language models to bridge the knowledge application gap identified through comprehensive multitask testing. It mentions that models need \\\"substantial improvements\\\" and highlights issues with accuracy on socially important subjects like morality and law, but does not detail specific enhancements. Therefore, the answer is:\\n\\nNone\",\n \"The two main tasks BERT is pre-trained on are the \\\"masked language model\\\" (MLM) task and the \\\"next sentence prediction\\\" (NSP) task.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"context_relevance\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"faithfulness\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.46147910349544863,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 4,\n \"samples\": [\n 0.0,\n 0.6666666666666666\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sas\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.3054620861350202,\n \"min\": -0.005470070987939835,\n \"max\": 0.9627792835235596,\n \"num_unique_values\": 15,\n \"samples\": [\n -0.005470070987939835,\n 0.3704226613044739\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "summary": "{\n \"name\": \"harness_eval_run\",\n \"rows\": 3,\n \"fields\": [\n {\n \"column\": \"metrics\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"metric_context_relevance\",\n \"metric_sas\",\n \"metric_faithfulness\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.24119916658007876,\n \"min\": 0.26666666666666666,\n \"max\": 0.7477777777777778,\n \"num_unique_values\": 3,\n \"samples\": [\n 0.26666666666666666,\n 0.5377212050060431,\n 0.7477777777777778\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", "type": "dataframe" }, "text/html": [ "\n", - "
\n" - ], - "text/plain": [ - " metrics score\n", - "0 metric_context_relevance 0.266667\n", - "1 metric_sas 0.654073\n", - "2 metric_faithfulness 0.796429" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "harness_eval_run_gpt4.results.score_report()" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "collapsed": true, - "id": "3NVSvHc28TbS", - "outputId": "acd217d6-32c3-49db-a648-00c3b969fba3" - }, - "outputs": [ - { - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "summary": "{\n \"name\": \"harness_eval_run\",\n \"rows\": 15,\n \"fields\": [\n {\n \"column\": \"questions\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 15,\n \"samples\": [\n \"What detailed methodology does LLaMA utilize to ensure the diversity of its pre-training data, particularly in the context of filtering and language identification?\",\n \"What specific enhancements are recommended for language models to bridge the knowledge application gap identified through comprehensive multitask testing?\",\n \"What are the two main tasks BERT is pre-trained on?\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"contexts\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"responses\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 13,\n \"samples\": [\n \"DetectGPT's detection approach is significant within the context of evolving large language models (LLMs) due to its ability to effectively identify machine-generated text against the backdrop of increasingly sophisticated AI-generated content. As LLMs, such as GPT-3 and others, continue to improve in their ability to produce coherent and articulate text, the challenge of distinguishing between human-written and machine-generated content becomes more pronounced. This is particularly concerning in applications such as education and journalism, where the integrity of written material is paramount.\\n\\nDetectGPT leverages the hypothesis that machine-generated text often occupies areas of negative curvature within a model\\u2019s log probability function, allowing it to distinguish between human and AI outputs based on how perturbations to the text affect its likelihood score. This methodology not only enhances the accuracy of machine-generated text detection but also highlights the inherent differences in how models generate content compared to human authors. \\n\\nThe potential for misuse of LLMs amplifies the importance of a robust detection mechanism like DetectGPT. As AI continues to be adopted for generating news articles, essays, and even creative content, there is a growing risk of misinformation and academic dishonesty. DetectGPT provides educators, journalists, and content consumers with a valuable tool to evaluate the authenticity of text, thereby empowering them to address issues related to AI-generated content's proliferation, which can include factual inaccuracies and ethical dilemmas associated with authorship and originality.\\n\\nMoreover, as LLM capabilities evolve, the methods and tactics for evading detection, such as paraphrasing or using alternative decoding strategies, are likely to emerge. 
The findings of DetectGPT emphasize the necessity for continuous advancements in detection technologies to keep pace with the innovations in content generation. This ongoing arms race between generation and detection underscores the critical role that tools like DetectGPT will play in maintaining the trustworthiness of written content in a rapidly changing digital landscape.\",\n \"The context does not provide specific enhancements recommended for language models to bridge the knowledge application gap identified through comprehensive multitask testing. It discusses the shortcomings of current models, such as their lopsided performance and difficulties with socially relevant subjects, but does not outline particular recommendations for improvements. Therefore, the answer is:\\n\\nNone\",\n \"The two main tasks BERT is pre-trained on are the masked language model (MLM) and next sentence prediction (NSP).\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ground_truth_answers\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 15,\n \"samples\": [\n \"LLaMA's methodology for ensuring data diversity includes comprehensive filtering criteria to remove low-quality content, language identification to support multilingual capabilities, and a balanced inclusion of various data sources such as Wikipedia, web text, and books, emphasizing the representation of a wide range of topics and languages.\",\n \"Enhancements should focus on developing models with improved procedural knowledge, superior calculation abilities, and a more accurate calibration between confidence and actual performance, directly addressing the weaknesses uncovered in current evaluations.\",\n \"Masked LM (MLM) and Next Sentence Prediction (NSP).\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rag_eval_metric_context_relevance\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rag_eval_metric_sas\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.30258921111168324,\n \"min\": -0.005470070987939835,\n \"max\": 0.8942041397094727,\n \"num_unique_values\": 15,\n \"samples\": [\n -0.005470070987939835,\n 0.31024250388145447\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rag_eval_metric_faithfulness\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.40172577448254165,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 5,\n \"samples\": [\n 0.0,\n 0.75\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"harness_eval_run_gpt4_metric_context_relevance\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"harness_eval_run_gpt4_metric_sas\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.1794187585093937,\n \"min\": 0.22081966698169708,\n \"max\": 0.9491991400718689,\n \"num_unique_values\": 15,\n \"samples\": [\n 0.6814706325531006,\n 0.45860764384269714\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"harness_eval_run_gpt4_metric_faithfulness\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.36557644484599805,\n \"min\": 0.0,\n \"max\": 1.0,\n 
\"num_unique_values\": 5,\n \"samples\": [\n 0.8571428571428571,\n 0.7142857142857143\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", - "type": "dataframe" - }, - "text/html": [ - "\n", - "
\n" - ], - "text/plain": [ - " questions \\\n", - "0 What are the two main tasks BERT is pre-traine... \n", - "1 What model sizes are reported for BERT, and wh... \n", - "2 How does BERT's architecture facilitate the us... \n", - "3 Can you describe the modifications LLaMA makes... \n", - "4 How does LLaMA's approach to embedding layer o... \n", - "5 How were the questions for the multitask test ... \n", - "6 How does BERT's performance on the GLUE benchm... \n", - "7 What significant improvements does BERT bring ... \n", - "8 What unique aspect of the LLaMA training datas... \n", - "9 What detailed methodology does LLaMA utilize t... \n", - "10 What are the specific domains covered by the m... \n", - "11 What specific enhancements are recommended for... \n", - "12 What methodology does DetectGPT use to generat... \n", - "13 Discuss the significance of DetectGPT's detect... \n", - "14 How is the student model, DistilBERT, initiali... \n", - "\n", - " contexts \\\n", - "0 [pre-trained with Ima-\\ngeNet (Deng et al., 20... \n", - "1 [the\\ntraining loss for 336M and 752M BERT mod... \n", - "2 [BERT: Pre-training of Deep Bidirectional Tran... \n", - "3 [to the transformer\\narchitecture (Vaswani et ... \n", - "4 [to the transformer\\narchitecture (Vaswani et ... \n", - "5 [of subjects that either do not neatly ļ¬t into... \n", - "6 [GLUE provides a lightweight classiļ¬cation API... \n", - "7 [ļ¬ne-tuning data shufļ¬‚ing and clas-\\nsiļ¬er lay... \n", - "8 [model, Gopher, has worse\\nperformance than Ch... \n", - "9 [the description and satisļ¬es the\\ntest cases.... \n", - "10 [of subjects that either do not neatly ļ¬t into... \n", - "11 [Published as a conference paper at ICLR 2021\\... \n", - "12 [of the data distribution on DetectGPT, partic... \n", - "13 [different from the\\nsource model, detection p... \n", - "14 [works focus on building task-speciļ¬c distilla... \n", - "\n", - " responses \\\n", - "0 The two main tasks BERT is pre-trained on are ... \n", - "1 The model sizes reported for BERT and their sp... \n", - "2 BERT's architecture facilitates the use of a u... \n", - "3 None \n", - "4 None \n", - "5 The questions for the multitask test were manu... \n", - "6 BERT significantly outperforms previous state-... \n", - "7 BERT brings significant improvements to the SQ... \n", - "8 LLaMA was trained exclusively on publicly avai... \n", - "9 None \n", - "10 The specific domains covered by the multitask ... \n", - "11 The context does not provide specific enhancem... \n", - "12 DetectGPT generates minor perturbations in the... \n", - "13 DetectGPT's detection approach is significant ... \n", - "14 The student model, DistilBERT, is initialized ... \n", - "\n", - " ground_truth_answers \\\n", - "0 Masked LM (MLM) and Next Sentence Prediction (... \n", - "1 BERTBASE (L=12, H=768, A=12, Total Parameters=... \n", - "2 BERT uses a multi-layer bidirectional Transfor... \n", - "3 LLaMA incorporates pre-normalization (using R... \n", - "4 LLaMA introduces optimizations in its embeddin... \n", - "5 Questions were manually collected by graduate ... \n", - "6 BERT achieved new state-of-the-art on the GLUE... \n", - "7 BERT set new records on SQuAD v1.1 and v2.0, s... \n", - "8 LLaMA's training dataset is distinctive for b... \n", - "9 LLaMA's methodology for ensuring data diversit... \n", - "10 The test covers 57 subjects across STEM, human... \n", - "11 Enhancements should focus on developing models... \n", - "12 DetectGPT generates minor perturbations using ... 
\n", - "13 DtectGPT's approach is significant as it provi... \n", - "14 DistilBERT is initialized from the teacher mod... \n", - "\n", - " rag_eval_metric_context_relevance rag_eval_metric_sas \\\n", - "0 0 0.593595 \n", - "1 0 0.626480 \n", - "2 1 0.878212 \n", - "3 0 0.015276 \n", - "4 0 0.075397 \n", - "5 0 0.639905 \n", - "6 0 0.808857 \n", - "7 0 0.653101 \n", - "8 0 0.894204 \n", - "9 0 -0.005470 \n", - "10 1 0.581956 \n", - "11 0 0.310243 \n", - "12 1 0.780353 \n", - "13 0 0.491360 \n", - "14 1 0.722349 \n", - "\n", - " rag_eval_metric_faithfulness \\\n", - "0 1.000000 \n", - "1 1.000000 \n", - "2 1.000000 \n", - "3 0.000000 \n", - "4 0.000000 \n", - "5 0.800000 \n", - "6 1.000000 \n", - "7 1.000000 \n", - "8 1.000000 \n", - "9 0.000000 \n", - "10 0.666667 \n", - "11 1.000000 \n", - "12 1.000000 \n", - "13 1.000000 \n", - "14 0.750000 \n", - "\n", - " harness_eval_run_gpt4_metric_context_relevance \\\n", - "0 0 \n", - "1 0 \n", - "2 1 \n", - "3 0 \n", - "4 0 \n", - "5 0 \n", - "6 0 \n", - "7 0 \n", - "8 0 \n", - "9 0 \n", - "10 1 \n", - "11 0 \n", - "12 1 \n", - "13 0 \n", - "14 1 \n", - "\n", - " harness_eval_run_gpt4_metric_sas \\\n", - "0 0.220820 \n", - "1 0.762167 \n", - "2 0.697250 \n", - "3 0.563944 \n", - "4 0.626173 \n", - "5 0.611838 \n", - "6 0.853133 \n", - "7 0.662145 \n", - "8 0.949199 \n", - "9 0.681471 \n", - "10 0.532457 \n", - "11 0.458608 \n", - "12 0.822207 \n", - "13 0.566447 \n", - "14 0.803231 \n", - "\n", - " harness_eval_run_gpt4_metric_faithfulness \n", - "0 1.000000 \n", - "1 1.000000 \n", - "2 1.000000 \n", - "3 0.857143 \n", - "4 1.000000 \n", - "5 1.000000 \n", - "6 1.000000 \n", - "7 0.375000 \n", - "8 1.000000 \n", - "9 0.000000 \n", - "10 0.714286 \n", - "11 1.000000 \n", - "12 1.000000 \n", - "13 1.000000 \n", - "14 0.000000 " - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "harness_eval_run.results.comparative_individual_scores_report(harness_eval_run_gpt4.results)" ]
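},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> **Note:** `harness_eval_run_gpt4` in the comparison above is presumably a second harness run of the same RAG pipeline with the generator switched to GPT-4. Below is a minimal sketch of how such a run could be produced, assuming the experimental `RAGEvaluationOverrides` API from the same module; the override dict and `run_name` are illustrative, not taken from this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from haystack_experimental.evaluation.harness.rag import RAGEvaluationOverrides\n",
"\n",
"# Sketch: override only the generator's model for a second harness run;\n",
"# every other component and the chosen metrics stay as configured above.\n",
"gpt4_overrides = RAGEvaluationOverrides(\n",
"    rag_pipeline={\"generator\": {\"model\": \"gpt-4\"}},\n",
")\n",
"\n",
"# Re-run the harness on the same inputs, then compare the two runs\n",
"# with comparative_individual_scores_report() as shown above.\n",
"harness_eval_run_gpt4 = pipeline_eval_harness.run(\n",
"    inputs=eval_harness_input,\n",
"    overrides=gpt4_overrides,\n",
"    run_name=\"harness_eval_run_gpt4\",\n",
")"
]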