From 23e6ade54e4769dcd36875601c7597ffc172886c Mon Sep 17 00:00:00 2001 From: bilgeyucel Date: Wed, 30 Oct 2024 21:27:24 +0300 Subject: [PATCH] Remove some output cells from the evaluating ai with Haystack cookbook --- notebooks/evaluating_ai_with_haystack.ipynb | 1744 ++----------------- 1 file changed, 141 insertions(+), 1603 deletions(-) diff --git a/notebooks/evaluating_ai_with_haystack.ipynb b/notebooks/evaluating_ai_with_haystack.ipynb index 6eaa888..386ba87 100644 --- a/notebooks/evaluating_ai_with_haystack.ipynb +++ b/notebooks/evaluating_ai_with_haystack.ipynb @@ -10,7 +10,7 @@ "\n", "by Bilge Yucel ([X](https://x.com/bilgeycl), [Linkedin](https://www.linkedin.com/in/bilge-yucel/))\n", "\n", - "In this cookbook, we walktrough the [Evaluators](https://docs.haystack.deepset.ai/docs/evaluators) in Haystack, create an evaluation pipeline, streamline the evaluation with [`EvaluationHarness`](https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/evaluation/harness) and try different Evaluation Frameworks like [Ragas](https://haystack.deepset.ai/integrations/ragas) and [FlowJudge](https://haystack.deepset.ai/integrations/flow-judge). \n", + "In this cookbook, we walk through the [Evaluators](https://docs.haystack.deepset.ai/docs/evaluators) in Haystack, create an evaluation pipeline, streamline the evaluation with [`EvaluationHarness`](https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/evaluation/harness) and try different Evaluation Frameworks like [Ragas](https://haystack.deepset.ai/integrations/ragas) and [FlowJudge](https://haystack.deepset.ai/integrations/flow-judge). \n", "\n", "šŸ“š **Useful Resources:**\n", "* [Article: Benchmarking Haystack Pipelines for Optimal Performance](https://haystack.deepset.ai/blog/benchmarking-haystack-pipelines)\n", @@ -950,7 +950,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -960,16 +960,102 @@ "id": "f1rGBuvocwLB", "outputId": "d607b987-2f03-47dd-f3ee-0cedd73fe4b9" }, + "outputs": [], + "source": [ + "eval_results.to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "collapsed": true, + "id": "dFrIfDXiaP41", + "outputId": "4f187782-7933-4bc0-89b9-51c6672c20ec" + }, + "outputs": [], + "source": [ + "index = 2\n", + "print(eval_pipeline_results['context_relevance'][\"individual_scores\"][index], \"\\nQuestion:\", questions[index],\"\\nTrue Answer:\", answers[index], \"\\nAnswer:\", predicted_answers[index])\n", + "print(\"\".join([doc.content for doc in retrieved_context[index]]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OmkHqAsQZhFr" + }, + "source": [ + "## Evaluation Harness (Step 4, 5, and 6)\n", + "\n", + "* Runs the RAG pipeline\n", + "* Runs the evaluation\n", + "\n", + "> Try `EvaluationHarness` and give us feedback [on Github](https://github.com/deepset-ai/haystack-experimental/discussions/74)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "IQlMUUOeZkpT" + }, + "outputs": [], + "source": [ + "from haystack_experimental.evaluation.harness.rag import (\n", + " DefaultRAGArchitecture,\n", + " RAGEvaluationHarness,\n", + " RAGEvaluationMetric,\n", + " RAGEvaluationInput\n", + ")\n", + "\n", + "pipeline_eval_harness = RAGEvaluationHarness(\n", + " rag_pipeline = basic_rag,\n", + " 
rag_components=DefaultRAGArchitecture.GENERATION_WITH_EMBEDDING_RETRIEVAL, # query_embedder, retriever, prompt_builder, generator\n", + " metrics={\n", + " RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY,\n", + " RAGEvaluationMetric.FAITHFULNESS,\n", + " RAGEvaluationMetric.CONTEXT_RELEVANCE,\n", + " }\n", + ")\n", + "\n", + "eval_harness_input = RAGEvaluationInput(\n", + " queries=questions,\n", + " ground_truth_answers=answers,\n", + " rag_pipeline_inputs={\n", + " \"prompt_builder\": {\"question\": list(questions)},\n", + " },\n", + ")\n", + "\n", + "harness_eval_run= pipeline_eval_harness.run(inputs=eval_harness_input, run_name=run_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 143 + }, + "collapsed": true, + "id": "edZNrdB9sKwm", + "outputId": "b4d8ad56-578e-4953-bdea-4c1d1d4f1a54" + }, "outputs": [ { "data": { "application/vnd.google.colaboratory.intrinsic+json": { - "summary": "{\n \"name\": \"eval_results\",\n \"rows\": 15,\n \"fields\": [\n {\n \"column\": \"questions\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 15,\n \"samples\": [\n \"What detailed methodology does LLaMA utilize to ensure the diversity of its pre-training data, particularly in the context of filtering and language identification?\",\n \"What specific enhancements are recommended for language models to bridge the knowledge application gap identified through comprehensive multitask testing?\",\n \"What are the two main tasks BERT is pre-trained on?\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"contexts\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"true_answers\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 15,\n \"samples\": [\n \"LLaMA's methodology for ensuring data diversity includes comprehensive filtering criteria to remove low-quality content, language identification to support multilingual capabilities, and a balanced inclusion of various data sources such as Wikipedia, web text, and books, emphasizing the representation of a wide range of topics and languages.\",\n \"Enhancements should focus on developing models with improved procedural knowledge, superior calculation abilities, and a more accurate calibration between confidence and actual performance, directly addressing the weaknesses uncovered in current evaluations.\",\n \"Masked LM (MLM) and Next Sentence Prediction (NSP).\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"predicted_answers\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 13,\n \"samples\": [\n \"DetectGPT's detection approach is significant in the context of evolving large language model (LLM) capabilities as it addresses the challenges posed by these models, which are increasingly adept at generating coherent and convincing text. As LLMs improve, they become more attractive tools for various applications, including education and journalism; however, this rise brings with it the potential for misuse, such as the production of misleading or inaccurate information. \\n\\nDetectGPT leverages the hypothesis that machine-generated texts typically occupy regions of negative curvature in the log probability function of the generating model. 
By comparing the log probabilities of original passages with those of slightly modified perturbations, DetectGPT can effectively differentiate between human-written and model-generated text. This method not only enhances detection accuracy but also adapts to the rapid advancements in LLMs, making it a timely solution as these models proliferate.\\n\\nFurthermore, as LLMs are deployed in real-world contexts, such as automatic essay writing or news generation with minimal human oversight, the risks of fraudulence and misinformation escalate. DetectGPT provides a potential safeguard by improving the reliability of detecting AI-generated content, helping educators and news consumers ascertain the authenticity of the texts they engage with. \\n\\nOverall, DetectGPT's innovative detection strategy is crucial for mitigating the associated risks of LLM misuse, ensuring that the growing use of these technologies does not compromise academic integrity or the credibility of information disseminated to the public. Its ongoing development and improvement could serve as a vital countermeasure in an era of AI where trustworthiness becomes increasingly difficult to verify.\",\n \"The context does not provide specific enhancements recommended for language models to bridge the knowledge application gap identified through comprehensive multitask testing. It mentions that models need \\\"substantial improvements\\\" and highlights issues with accuracy on socially important subjects like morality and law, but does not detail specific enhancements. Therefore, the answer is:\\n\\nNone\",\n \"The two main tasks BERT is pre-trained on are the \\\"masked language model\\\" (MLM) task and the \\\"next sentence prediction\\\" (NSP) task.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"context_relevance\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"faithfulness\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.46147910349544863,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 4,\n \"samples\": [\n 0.0,\n 0.6666666666666666\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sas\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.3054620861350202,\n \"min\": -0.005470070987939835,\n \"max\": 0.9627792835235596,\n \"num_unique_values\": 15,\n \"samples\": [\n -0.005470070987939835,\n 0.3704226613044739\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "summary": "{\n \"name\": \"harness_eval_run\",\n \"rows\": 3,\n \"fields\": [\n {\n \"column\": \"metrics\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"metric_context_relevance\",\n \"metric_sas\",\n \"metric_faithfulness\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.24119916658007876,\n \"min\": 0.26666666666666666,\n \"max\": 0.7477777777777778,\n \"num_unique_values\": 3,\n \"samples\": [\n 0.26666666666666666,\n 0.5377212050060431,\n 0.7477777777777778\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", "type": "dataframe" }, "text/html": [ "\n", - "
\n" - ], - "text/plain": [ - " metrics score\n", - "0 metric_context_relevance 0.266667\n", - "1 metric_sas 0.654073\n", - "2 metric_faithfulness 0.796429" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "harness_eval_run_gpt4.results.score_report()" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "collapsed": true, - "id": "3NVSvHc28TbS", - "outputId": "acd217d6-32c3-49db-a648-00c3b969fba3" - }, - "outputs": [ - { - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "summary": "{\n \"name\": \"harness_eval_run\",\n \"rows\": 15,\n \"fields\": [\n {\n \"column\": \"questions\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 15,\n \"samples\": [\n \"What detailed methodology does LLaMA utilize to ensure the diversity of its pre-training data, particularly in the context of filtering and language identification?\",\n \"What specific enhancements are recommended for language models to bridge the knowledge application gap identified through comprehensive multitask testing?\",\n \"What are the two main tasks BERT is pre-trained on?\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"contexts\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"responses\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 13,\n \"samples\": [\n \"DetectGPT's detection approach is significant within the context of evolving large language models (LLMs) due to its ability to effectively identify machine-generated text against the backdrop of increasingly sophisticated AI-generated content. As LLMs, such as GPT-3 and others, continue to improve in their ability to produce coherent and articulate text, the challenge of distinguishing between human-written and machine-generated content becomes more pronounced. This is particularly concerning in applications such as education and journalism, where the integrity of written material is paramount.\\n\\nDetectGPT leverages the hypothesis that machine-generated text often occupies areas of negative curvature within a model\\u2019s log probability function, allowing it to distinguish between human and AI outputs based on how perturbations to the text affect its likelihood score. This methodology not only enhances the accuracy of machine-generated text detection but also highlights the inherent differences in how models generate content compared to human authors. \\n\\nThe potential for misuse of LLMs amplifies the importance of a robust detection mechanism like DetectGPT. As AI continues to be adopted for generating news articles, essays, and even creative content, there is a growing risk of misinformation and academic dishonesty. DetectGPT provides educators, journalists, and content consumers with a valuable tool to evaluate the authenticity of text, thereby empowering them to address issues related to AI-generated content's proliferation, which can include factual inaccuracies and ethical dilemmas associated with authorship and originality.\\n\\nMoreover, as LLM capabilities evolve, the methods and tactics for evading detection, such as paraphrasing or using alternative decoding strategies, are likely to emerge. 
The findings of DetectGPT emphasize the necessity for continuous advancements in detection technologies to keep pace with the innovations in content generation. This ongoing arms race between generation and detection underscores the critical role that tools like DetectGPT will play in maintaining the trustworthiness of written content in a rapidly changing digital landscape.\",\n \"The context does not provide specific enhancements recommended for language models to bridge the knowledge application gap identified through comprehensive multitask testing. It discusses the shortcomings of current models, such as their lopsided performance and difficulties with socially relevant subjects, but does not outline particular recommendations for improvements. Therefore, the answer is:\\n\\nNone\",\n \"The two main tasks BERT is pre-trained on are the masked language model (MLM) and next sentence prediction (NSP).\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ground_truth_answers\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 15,\n \"samples\": [\n \"LLaMA's methodology for ensuring data diversity includes comprehensive filtering criteria to remove low-quality content, language identification to support multilingual capabilities, and a balanced inclusion of various data sources such as Wikipedia, web text, and books, emphasizing the representation of a wide range of topics and languages.\",\n \"Enhancements should focus on developing models with improved procedural knowledge, superior calculation abilities, and a more accurate calibration between confidence and actual performance, directly addressing the weaknesses uncovered in current evaluations.\",\n \"Masked LM (MLM) and Next Sentence Prediction (NSP).\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rag_eval_metric_context_relevance\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rag_eval_metric_sas\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.30258921111168324,\n \"min\": -0.005470070987939835,\n \"max\": 0.8942041397094727,\n \"num_unique_values\": 15,\n \"samples\": [\n -0.005470070987939835,\n 0.31024250388145447\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rag_eval_metric_faithfulness\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.40172577448254165,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 5,\n \"samples\": [\n 0.0,\n 0.75\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"harness_eval_run_gpt4_metric_context_relevance\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"harness_eval_run_gpt4_metric_sas\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.1794187585093937,\n \"min\": 0.22081966698169708,\n \"max\": 0.9491991400718689,\n \"num_unique_values\": 15,\n \"samples\": [\n 0.6814706325531006,\n 0.45860764384269714\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"harness_eval_run_gpt4_metric_faithfulness\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.36557644484599805,\n \"min\": 0.0,\n \"max\": 1.0,\n 
\"num_unique_values\": 5,\n \"samples\": [\n 0.8571428571428571,\n 0.7142857142857143\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", - "type": "dataframe" - }, - "text/html": [ - "\n", - "
\n" - ], - "text/plain": [ - " questions \\\n", - "0 What are the two main tasks BERT is pre-traine... \n", - "1 What model sizes are reported for BERT, and wh... \n", - "2 How does BERT's architecture facilitate the us... \n", - "3 Can you describe the modifications LLaMA makes... \n", - "4 How does LLaMA's approach to embedding layer o... \n", - "5 How were the questions for the multitask test ... \n", - "6 How does BERT's performance on the GLUE benchm... \n", - "7 What significant improvements does BERT bring ... \n", - "8 What unique aspect of the LLaMA training datas... \n", - "9 What detailed methodology does LLaMA utilize t... \n", - "10 What are the specific domains covered by the m... \n", - "11 What specific enhancements are recommended for... \n", - "12 What methodology does DetectGPT use to generat... \n", - "13 Discuss the significance of DetectGPT's detect... \n", - "14 How is the student model, DistilBERT, initiali... \n", - "\n", - " contexts \\\n", - "0 [pre-trained with Ima-\\ngeNet (Deng et al., 20... \n", - "1 [the\\ntraining loss for 336M and 752M BERT mod... \n", - "2 [BERT: Pre-training of Deep Bidirectional Tran... \n", - "3 [to the transformer\\narchitecture (Vaswani et ... \n", - "4 [to the transformer\\narchitecture (Vaswani et ... \n", - "5 [of subjects that either do not neatly ļ¬t into... \n", - "6 [GLUE provides a lightweight classiļ¬cation API... \n", - "7 [ļ¬ne-tuning data shufļ¬‚ing and clas-\\nsiļ¬er lay... \n", - "8 [model, Gopher, has worse\\nperformance than Ch... \n", - "9 [the description and satisļ¬es the\\ntest cases.... \n", - "10 [of subjects that either do not neatly ļ¬t into... \n", - "11 [Published as a conference paper at ICLR 2021\\... \n", - "12 [of the data distribution on DetectGPT, partic... \n", - "13 [different from the\\nsource model, detection p... \n", - "14 [works focus on building task-speciļ¬c distilla... \n", - "\n", - " responses \\\n", - "0 The two main tasks BERT is pre-trained on are ... \n", - "1 The model sizes reported for BERT and their sp... \n", - "2 BERT's architecture facilitates the use of a u... \n", - "3 None \n", - "4 None \n", - "5 The questions for the multitask test were manu... \n", - "6 BERT significantly outperforms previous state-... \n", - "7 BERT brings significant improvements to the SQ... \n", - "8 LLaMA was trained exclusively on publicly avai... \n", - "9 None \n", - "10 The specific domains covered by the multitask ... \n", - "11 The context does not provide specific enhancem... \n", - "12 DetectGPT generates minor perturbations in the... \n", - "13 DetectGPT's detection approach is significant ... \n", - "14 The student model, DistilBERT, is initialized ... \n", - "\n", - " ground_truth_answers \\\n", - "0 Masked LM (MLM) and Next Sentence Prediction (... \n", - "1 BERTBASE (L=12, H=768, A=12, Total Parameters=... \n", - "2 BERT uses a multi-layer bidirectional Transfor... \n", - "3 LLaMA incorporates pre-normalization (using R... \n", - "4 LLaMA introduces optimizations in its embeddin... \n", - "5 Questions were manually collected by graduate ... \n", - "6 BERT achieved new state-of-the-art on the GLUE... \n", - "7 BERT set new records on SQuAD v1.1 and v2.0, s... \n", - "8 LLaMA's training dataset is distinctive for b... \n", - "9 LLaMA's methodology for ensuring data diversit... \n", - "10 The test covers 57 subjects across STEM, human... \n", - "11 Enhancements should focus on developing models... \n", - "12 DetectGPT generates minor perturbations using ... 
\n", - "13 DtectGPT's approach is significant as it provi... \n", - "14 DistilBERT is initialized from the teacher mod... \n", - "\n", - " rag_eval_metric_context_relevance rag_eval_metric_sas \\\n", - "0 0 0.593595 \n", - "1 0 0.626480 \n", - "2 1 0.878212 \n", - "3 0 0.015276 \n", - "4 0 0.075397 \n", - "5 0 0.639905 \n", - "6 0 0.808857 \n", - "7 0 0.653101 \n", - "8 0 0.894204 \n", - "9 0 -0.005470 \n", - "10 1 0.581956 \n", - "11 0 0.310243 \n", - "12 1 0.780353 \n", - "13 0 0.491360 \n", - "14 1 0.722349 \n", - "\n", - " rag_eval_metric_faithfulness \\\n", - "0 1.000000 \n", - "1 1.000000 \n", - "2 1.000000 \n", - "3 0.000000 \n", - "4 0.000000 \n", - "5 0.800000 \n", - "6 1.000000 \n", - "7 1.000000 \n", - "8 1.000000 \n", - "9 0.000000 \n", - "10 0.666667 \n", - "11 1.000000 \n", - "12 1.000000 \n", - "13 1.000000 \n", - "14 0.750000 \n", - "\n", - " harness_eval_run_gpt4_metric_context_relevance \\\n", - "0 0 \n", - "1 0 \n", - "2 1 \n", - "3 0 \n", - "4 0 \n", - "5 0 \n", - "6 0 \n", - "7 0 \n", - "8 0 \n", - "9 0 \n", - "10 1 \n", - "11 0 \n", - "12 1 \n", - "13 0 \n", - "14 1 \n", - "\n", - " harness_eval_run_gpt4_metric_sas \\\n", - "0 0.220820 \n", - "1 0.762167 \n", - "2 0.697250 \n", - "3 0.563944 \n", - "4 0.626173 \n", - "5 0.611838 \n", - "6 0.853133 \n", - "7 0.662145 \n", - "8 0.949199 \n", - "9 0.681471 \n", - "10 0.532457 \n", - "11 0.458608 \n", - "12 0.822207 \n", - "13 0.566447 \n", - "14 0.803231 \n", - "\n", - " harness_eval_run_gpt4_metric_faithfulness \n", - "0 1.000000 \n", - "1 1.000000 \n", - "2 1.000000 \n", - "3 0.857143 \n", - "4 1.000000 \n", - "5 1.000000 \n", - "6 1.000000 \n", - "7 0.375000 \n", - "8 1.000000 \n", - "9 0.000000 \n", - "10 0.714286 \n", - "11 1.000000 \n", - "12 1.000000 \n", - "13 1.000000 \n", - "14 0.000000 " - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "harness_eval_run.results.comparative_individual_scores_report(harness_eval_run_gpt4.results)" ]
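},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> **Note:** `harness_eval_run_gpt4` in the comparison above is presumably a second harness run of the same RAG pipeline with the generator switched to GPT-4. Below is a minimal sketch of how such a run could be produced, assuming the experimental `RAGEvaluationOverrides` API from the same module; the override dict and `run_name` are illustrative, not taken from this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from haystack_experimental.evaluation.harness.rag import RAGEvaluationOverrides\n",
"\n",
"# Sketch: override only the generator's model for a second harness run;\n",
"# every other component and the chosen metrics stay as configured above.\n",
"gpt4_overrides = RAGEvaluationOverrides(\n",
"    rag_pipeline={\"generator\": {\"model\": \"gpt-4\"}},\n",
")\n",
"\n",
"# Re-run the harness on the same inputs, then compare the two runs\n",
"# with comparative_individual_scores_report() as shown above.\n",
"harness_eval_run_gpt4 = pipeline_eval_harness.run(\n",
"    inputs=eval_harness_input,\n",
"    overrides=gpt4_overrides,\n",
"    run_name=\"harness_eval_run_gpt4\",\n",
")"
]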