diff --git a/Colab_RAG_Eval.ipynb b/Colab_RAG_Eval.ipynb new file mode 100644 index 0000000..a4523f3 --- /dev/null +++ b/Colab_RAG_Eval.ipynb @@ -0,0 +1,136 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Prepare Colab environment" + ], + "metadata": { + "id": "7qU5hiy9W3Wc" + } + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "AxBJHd2ckD0c" + }, + "outputs": [], + "source": [ + "# clone github repo\n", + "!git clone https://github.com/sjoerdoffringa/RAGMeUp.git\n", + "\n", + "# delete torch from requirements for Colab\n", + "!sed -i '/torch/d' RAGMeUp/server/requirements.txt\n", + "\n", + "# change ragas version\n", + "!sed -i '/ragas/d' RAGMeUp/server/requirements.txt && echo \"ragas==0.2.6\" >> RAGMeUp/server/requirements.txt\n", + "\n", + "# install requirements\n", + "!pip install -r RAGMeUp/server/requirements.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "Ia_OPalmnHKo" + }, + "outputs": [], + "source": [ + "# set working directory in server folder\n", + "%cd RAGMeUp/server\n", + "\n", + "# copy environment template\n", + "!mv .env.evaltemplate .env" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "wX4rdS7RDRR2" + }, + "outputs": [], + "source": [ + "from huggingface_hub import login\n", + "from dotenv import load_dotenv\n", + "import os\n", + "\n", + "!git config --global credential.helper store\n", + "\n", + "# login by inserting token manually\n", + "!huggingface-cli login" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Run scripts" + ], + "metadata": { + "id": "JOHcyUi1XOV8" + } + }, + { + "cell_type": "code", + "source": [ + "# Run testset generation\n", + "\n", + "# load environment\n", + "load_dotenv()\n", + "\n", + "# change environment variables\n", + "os.environ['eval_qa_pairs'] = \"5\"\n", + "os.environ['llm_model'] = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n", + "\n", + "# run script\n", + "!python eval_create_testset.py" + ], + "metadata": { + "id": "O5sa6kRJWG8G" + }, + "execution_count": 8, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Run RAG evaluation\n", + "\n", + "# load environment\n", + "load_dotenv()\n", + "\n", + "# change environment variables\n", + "os.environ[\"llm_model\"] = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n", + "os.environ[\"rerank_model\"] = \"cross-encoder/ms-marco-TinyBERT-L-2-v2\"\n", + "os.environ[\"eval_testset_directory\"] = \"testsets/30QA/\"\n", + "os.environ[\"eval_RAG_instance_name\"] = \"3.1-8B_TinyBERT\"\n", + "\n", + "# run script\n", + "!python eval_evaluate_RAG.py" + ], + "metadata": { + "id": "veM8VorXZWnA" + }, + "execution_count": 9, + "outputs": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/README.md b/README.md index 1f05b87..dc0f9f4 100644 --- a/README.md +++ b/README.md @@ -1,180 +1,29 @@ -# RAG Me Up -RAG Me Up is a generic framework (server + UIs) that enables you do to RAG on your own dataset **easily**. Its essence is a small and lightweight server and a couple of ways to run UIs to communicate with the server (or write your own). -RAG Me Up can run on CPU but is best run on any GPU with at least 16GB of vRAM when using the default instruct model. 
- -Combine the power of RAG with the power of fine-tuning - check out our [LLaMa2Lang repository](https://github.com/UnderstandLingBV/LLaMa2Lang) on fine-tuning LLMs which can then be used in RAG Me Up. - -# Updates -- **2024-09-23** Hybrid retrieval with Postgres only (dense vectors with pgvector and sparse BM25 with pg_search) -- **2024-09-06** Implemented [Re2](https://arxiv.org/abs/2309.06275) -- **2024-09-04** Added an evaluation script that uses Ragas to evaluate your RAG pipeline -- **2024-08-30** Added Ollama compatibility -- **2024-08-27** Using cross encoders now so you can specify your own reranking model -- **2024-07-30** Added multiple provenance attribution methods -- **2024-06-26** Updated readme, added more file types, robust self-inflection -- **2024-06-05** Upgraded to Langchain v0.2 - -# Installation -## Server -``` -git clone https://github.com/UnderstandLingBV/RAGMeUp.git -cd server -pip install -r requirements.txt -``` -Then run the server using `python server.py` from the server subfolder. - -## Scala UI -Make sure you have JDK 17+. Download and install [SBT](https://www.scala-sbt.org/) and run `sbt run` from the `server/scala` directory or alternatively download the [compiled binary](https://github.com/UnderstandLingBV/RAGMeUp/releases/tag/scala-ui) and run `bin/ragemup(.bat)` - -## Using Postgres (adviced for production) -RAG Me Up supports Postgres as hybrid retrieval database with both pgvector and pg_search installed. To run Postgres instead of Milvus, follow these steps. - -- In the postgres folder is a Dockerfile, build it using `docker build -t ragmeup-pgvector-pgsearch .` -- Run the container using `docker run --name ragmeup-pgvector-pgsearch -e POSTGRES_USER=langchain -e POSTGRES_PASSWORD=langchain -e POSTGRES_DB=langchain -p 6024:5432 -d ragmeup-pgvector-pgsearch` -- Once in use, our custom PostgresBM25Retriever will automatically create the right indexes for you. -- pgvector however, will not do this automatically so you have to create them yourself (perhaps after loading the documents first so the right tables are created): - - Make sure the vector column is an actual vector (it's not by default): `ALTER TABLE langchain_pg_embedding ALTER COLUMN embedding TYPE vector(384);` - - Create the index (may take a while with a lot of data): `CREATE INDEX ON langchain_pg_embedding USING hnsw (embedding vector_cosine_ops) WITH (m = 16, ef_construction = 64);` -- Be sure to set up the right paths in your .env file `vector_store_uri='postgresql+psycopg://langchain:langchain@localhost:6024/langchain'` and `vector_store_sparse_uri='postgresql://langchain:langchain@localhost:6024/langchain'` - -# How does RAG Me Up work? -RAG Me Up aims to provide a robust RAG pipeline that is configurable without necessarily writing any code. To achieve this, a couple of strategies are used to make sure that the user query can be accurately answered through the documents provided. - -The RAG pipeline is visualized in the image below: -![RAG pipeline drawing](./ragmeup.drawio.svg) - -The following steps are executed. Take note that some steps are optional and can be turned off through configuring the `.env` file. - -__Top part - Indexing__ -1. You collect and make your documents available to RAG Me Up. -2. Using different file type loaders, RAG Me Up will read the contents of your documents. Note that for some document types like JSON and XML, you need to specify additional configuration to tell RAG Me Up what to extract. -3. Your documents get chunked using a recursive splitter. -4. 
The chunks get converted into document (chunk) embeddings using an embedding model. Note that this model is usually a different one than the LLM you intend to use for chat. -5. RAG Me Up uses a hybrid search strategy, combining dense vectors in the vector database with sparse vectors using BM25. By default, RAG Me Up uses a local [Milvus database](https://milvus.io/). - -__Bottom part - Inference__ -1. Inference starts with a user asking a query. This query can either be an initial query or a follow-up query with an associated history and documents retrieved before. Note that both (chat history, documents) need to be passed on by a UI to properly handle follow-up querying. -2. A check is done if new documents need to be fetched, this can be due to one of two cases: - - There is no history given in which case we always need to fetch documents - - **[OPTIONAL]** The LLM itself will judge whether or not the question - in isolation - is phrased in such a way that new documents are fetched or whether it is a follow-up question on existing documents. A flag called `fetch_new_documents` is set to indicate whether or not new documents need to be fetched. -3. Documents are fetched from both the vector database (dense) and the BM25 index (sparse). Only executed if `fetch_new_documents` is set. -4. **[OPTIONAL]** Reranking is applied to extract the most relevant documents returned by the previous step. Only executed if `fetch_new_documents` is set. -5. **[OPTIONAL]** The LLM is asked to judge whether or not the documents retrieved contain an accurate answer to the user's query. Only executed if `fetch_new_documents` is set. - - If this is not the case, the LLM is used to rewrite the query with the instruction to optimize for distance based similarity search. This is then fed back into step 3. **but only once** to avoid lengthy or infinite loops. -6. The documents are injected into the prompt with the user query. The documents can come from: - - The retrieval and reranking of the document databases, if `fetch_new_documents` is set. - - The history passed on with the initial user query, if `fetch_new_documents` is **not** set. -7. The LLM is asked to answer the query with the given chat history and documents. -8. The answer, chat history and documents are returned. - -# Configuration -RAG Me Up uses a `.env` file for configuration, see `.env.template`. The following fields can be configured: - -## LLM configuration -- `llm_model` This is the main LLM (instruct or chat) model to use that you will converse with. Default is LLaMa3-8B -- `llm_assistant_token` This should contain the unique query (sub)string that indicates where in a prompt template the assistant's answer starts -- `embedding_model` The model used to convert your documents' chunks into vectors that will be stored in the vector store -- `trust_remote_code` Set this to true if your LLM needs to execute remote code -- `force_cpu` When set to True, forces RAG Me Up to run fully on CPU (not recommended) - -### Use OpenAI -If you want to use OpenAI as LLM backend, make sure to set `use_openai` to True and make sure you (externally) set the environment variable `OPENAI_API_KEY` to be your OpenAI API Key. - -### Use Gemini -If you want to use Gemini as LLM backend, make sure to set `use_gemini` to True and make sure you (externally) set the environment variable `GOOGLE_API_KEY` to be your Gemini API Key. 
- -### Use Azure OpenAI -If you want to use Azure OpenAI as LLM backend, make sure to set `use_azure` to True and make sure you (externally) set the following environment variables: -- `AZURE_OPENAI_API_KEY` -- `AZURE_OPENAI_API_VERSION` -- `AZURE_OPENAI_ENDPOINT` -- `AZURE_OPENAI_CHAT_DEPLOYMENT_NAME` - -## Use Ollama -If you want to use Ollama as LLM backend, make sure to install Ollama and set `use_ollama` to True. The model to use should be given in `ollama_model`. - -## RAG Provenance -One of the biggest, arguably unsolved, challenges of RAG is to do good provenance attribution: tracking which of the source documents retrieved from your database led to the LLM generating its answer (the most). RAG Me Up implements several ways of achieving this, each with its own pros and cons. - -The following environment variables can be set for provenance attribution. - -- `provenance_method` Can be one of `rerank, attention, similarity, llm`. If `rerank` is `False` and the value of `provenance_method` is either `rerank` or none of the allowed values, provenance attribution is turned completely off -- `provenance_similarity_llm` If `provenance_method` is set to `similarity`, this model will be used to compute the similarity scores -- `provenance_include_query` Set to True or False to include the query itself when attributing provenance -- `provenance_llm_prompt` If `provenance_method` is set to `llm`, this prompt will be used to let the LLM attribute the provenance of each document in isolation. - -The different provenance attribution metrics are described below. - -### `provenance_method=rerank` (preferred for closed LLMs) -This uses the reranker as the provenance method. While the reranking is already used when retrieving documents (if reranking is turned on), this only applies the rerankers cross-attention to the documents and the *query*. For provenance attribution, we use the same reranking to apply cross-attention to the *answer* (and potentially the query too). - -### `provenance_method=attention` (preferred for OS LLMs) -This is probably the most accurate way of tracking provenance but it can only be used with OS LLMs that allow to return the attention weights. The way we track provenance is by looking at the actual attention weights (of the last attention layer in the model) for each token from the answer to the document and vice versa, optionally we do the same for the query if `provenance_include_query=True`. - -### `provenance_method=similarity` -This method uses a sentence transformer (LM) to get dense vectors for each document as well as for the answer (and potentially query). We then use a cosine similarity to get the similarity of the document vectors to the answer (+ query). - -### `provenance_method=llm` -The LLM that is used to generate messages is now also used to attribute the provenance of each document in isolation. We use the `provenance_llm_prompt` as the prompt to ask the LLM to perform this task. Note that the outcome of this provenance method is highly influenced by the prompt and the strength of the model. As a good practice, make sure you force the LLM to return numbers on a relatively small scale (eg. score from 1 to 3). Using something like a percentage for each document will likely result in random outcomes. - -## Data configuration -- `data_directory` The directory that contains your (initial) documents to load into the vector store -- `file_types` Comma-separated list of file types to load. 
Supported file types: `PDF, JSON, DOCX, XSLX, PPTX, CSV, XML` -- `json_schema` If you are loading JSON, this should be the schema (using `jq_schema`). For example, use `.` for the root of a JSON object if your data contains JSON objects only and `.[0]` for the first element in each JSON array if your data contains JSON arrays with one JSON object in them -- `json_text_content` Whether or not the JSON data should be loaded as textual content or as structured content (in case of a JSON object) -- `xml_xpath` If you are loading XML, this should be the XPath of the documents to load (the tags that contain your text) - -## Retrieval configuration -- `vector_store_uri` RAG Me Up caches your vector store on disk if possible to make loading a next time faster. This is the location where the vector store is stored. Remove this file to force a reload of all your documents -- `vector_store_k` The number of documents to retrieve from the vector store -- `rerank` Set to either True or False to enable reranking -- `rerank_k` The number of documents to keep after reranking. Note that if you use reranking, this should be your final target for `k` and `vector_store_k` should be set (significantly) higher. For example, set `vector_store_k` to 10 and `rerank_k` to 3 -- `rerank_model` The cross encoder reranking retrieval model to use. Sensible defaults are `cross-encoder/ms-marco-TinyBERT-L-2-v2` for speed and `colbert-ir/colbertv2.0` for accuracy (`antoinelouis/colbert-xm` for multilingual). Set this value to `flashrank` to use the FlashrankReranker. - -## LLM parameters -- `temperature` The chat LLM's temperature. Increase this to create more diverse answers -- `repetition_penalty` The penalty for repeating outputs in the chat answers. Some models are very sensitive to this parameter and need a value bigger than 1.0 (penalty) while others benefit from inversing it (lower than 1.0) -- `max_new_tokens` This caps how much tokens the LLM can generate in its answer. More tokens means slower throughput and more memory usage - -## Prompt configuration -- `rag_instruction` An instruction message for the LLM to let it know what to do. Should include a mentioning of it performing RAG and that documents will be given as input context to generate the answer from. -- `rag_question_initial` The initial question prompt that will be given to the LLM only for the first question a user asks, that is, without chat history -- `rag_question_followup` This is a follow-up question the user is asking. While the context resulting from the prompt will be populated by RAG from the vector store, if chat history is present, this prompt will be used instead of `rag_question_initial` - -### Document retrieval -- `rag_fetch_new_instruction` RAG Me Up automatically determines whether or not new documents should be fetched from the vector store or whether the user is asking a follow-up question on the already fetched documents by leveraging the same LLM that is used for chat. This environment variable determines the prompt to use to make this decision. Be very sure to instruct your LLM to answer with yes or no only and make sure your LLM is capable enough to follow this instruction -- `rag_fetch_new_question` The question prompt used in conjunction with `rag_fetch_new_instruction` to decide if new documents should be fetched or not - -### Rewriting (self-inflection) -- `user_rewrite_loop` Set to either True or False to enable the rewriting of the initial query. 
Note that a rewrite will always occur at most once -- `rewrite_query_instruction` This is the instruction of the prompt that is used to ask the LLM to judge whether a rewrite is necessary or not. Make sure you force the LLM to answer with yes or no only -- `rewrite_query_question` This is the actual query part of the prompt that isued to ask the LLM to judge a rewrite -- `rewrite_query_prompt` If the rewrite loop is on and the LLM judges a rewrite is required, this is the instruction with question asked to the LLM to rewrite the user query into a phrasing more optimized for RAG. Make sure to instruct your model adequately. - -### Re2 -- `use_re2` Set to either True or False to enable [Re2 (Re-reading)](https://arxiv.org/abs/2309.06275) which repeats the question, generally improving the quality of the answer generated by the LLM. -- `re2_prompt` The prompt used in between the question and the repeated question to signal that we are re-asking. - -## Document splitting configuration -- `splitter` The Langchain document splitter to use. Supported splitters are `RecursiveCharacterTextSplitter` and `SemanticChunker`. -- `chunk_size` The chunk size to use when splitting up documents for `RecursiveCharacterTextSplitter` -- `chunk_overlap` The chunk overlap for `RecursiveCharacterTextSplitter` -- `breakpoint_threshold_type` Sets the breakpoint threshold type when using the `SemanticChunker` ([see here](https://python.langchain.com/v0.2/docs/how_to/semantic-chunker/)). Can be one of: percentile, standard_deviation, interquartile, gradient -- `breakpoint_threshold_amount` The amount to use for the threshold type, in float. Set to `None` to leave default -- `number_of_chunks` The number of chunks to use for the threshold type, in int. Set to `None` to leave default - -# Evaluation -While RAG evaluation is difficult and subjective to begin with, frameworks such as [Ragas](https://docs.ragas.io/en/stable/) can give some metrics as to how well your RAG pipeline and its prompts are working, allowing us to benchmark one approach over the other quantitatively. - -RAG Me Up uses Ragas to evaluate your pipeline. You can run an evaluation based on your `.env` using `python Ragas_eval.py`. The following configuration parameters can be set for evaluation: - -- `ragas_sample_size` The amount of document (chunks) to use in evaluation. These are sampled from your data directory after chunking. -- `ragas_qa_pairs` Ragas works upon questions and ground truth answers. The amount of such pairs to create based on the sampled document chunks is set by this parameter. -- `ragas_question_instruction` The instruction prompt used to generate the questions of the Ragas input pairs. -- `ragas_question_query` The query prompt used to generate the questions of the Ragas input pairs. -- `ragas_answer_instruction` The instruction prompt used to generate the answers of the Ragas input pairs. -- `ragas_answer_query` The query prompt used to generate the answers of the Ragas input pairs. - -# Funding -We are actively looking for funding to democratize AI and advance its applications. Contact us at info@commandos.ai if you want to invest. +# RAG Evaluation + +This repository builds on the RAGMeUp framework. This README is specific to the added RAG Evaluation framework. The framework was ran from Google Colab. It is advised to run the scripts from [`Colab_RAG_Eval.ipynb`](./Colab_RAG_Eval.ipynb) in the Colab environment. The file uses a .env template for evaluation. 
This template is loaded as the .env file, and the variables described below can be changed by writing to this environment. Lastly, ensure you have a Hugging Face token at hand to insert when prompted.
+
+Run the [`eval_create_testset.py`](./server/eval_create_testset.py) script to create a testset: a dataset of QA-pairs saved as a .csv file in the `testsets` folder, which is created inside the server directory if it does not exist yet. Within this folder, a new subfolder is created that holds the `testset.csv` file and a `rag_chunks.pickle` file, which stores the chunks parsed from the documents.\
+The following variables can be adjusted before creating the testset (a usage sketch is given at the end of this README):
+- `chunk_size` Sets the size of the chunks the script generates questions from.
+- `rerank_k` Defines how many chunks the LLM uses to generate a question from (it is advised to keep `rerank` set to True for RAG Evaluation).
+- `eval_qa_pairs` Sets the number of Question-Answer pairs to generate.
+- `eval_sample_size` Sets the number of chunks to sample from when generating QA-pairs.
+- `eval_question_query` Sets the prompt for generating questions.
+- `eval_catch_irrelevant_chunks` Sets whether a prompt should be added to the question query that allows the LLM to decline generating a question from irrelevant chunks (True/False).
+- `eval_catch_irrelevant_chunks_prompt` Sets the prompt to use if the previous variable is True.
+- `eval_check_sample_relevance` Sets whether the LLM should first judge whether a chunk is relevant enough to generate a question from (True/False).
+- `eval_check_sample_relevance_instruction` Sets the instruction prompt if `eval_check_sample_relevance` is True.
+- `eval_check_sample_relevance_query` Sets the query prompt if `eval_check_sample_relevance` is True.
+- `eval_retrieve_samples` Sets whether the same samples as a previously generated testset should be reused (True/False).
+- `eval_retrieve_samples_folder` Sets the folder from which that testset should be retrieved if the previous variable is True.
+- `eval_use_example_questions` Sets whether a prompt with example questions should be added to the question query.
+- `eval_example_questions` Sets the example questions if the previous variable is True. Provide them as a string containing a list.
+- `eval_example_questions_prompt` Sets the prompt that instructs the LLM what to do with the example questions if `eval_use_example_questions` is True.
+
+Run the [`eval_evaluate_RAG.py`](./server/eval_evaluate_RAG.py) script to evaluate a RAG instance on a specified testset. The RAG's retrieved chunks and generated answers are added to the testset, and Recall and Recall-top-k are computed and printed. The resulting evalset is saved in the same way as the testset, and additionally as an Excel file for inspection. The following variables can be adjusted before evaluating the RAG:
+- `eval_testset_directory` Sets the directory that contains the testset to use.
+- `eval_RAG_instance_name` Sets the name of the RAG instance, so that instances can be compared by their column names.
+- `eval_ragas` Sets whether the Ragas library should be used to compute evaluation metrics. Note that this is expected to give a timeout or an out-of-memory error when running in Colab.
+
+The repository includes the data used in the analysis in the [server/data](./server/data) folder, but this can be replaced with any set of documents. An example testset based on this data is also included in [server/testsets/30QA](./server/testsets/30QA).
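+
+## Example usage
+
+The snippet below is a minimal sketch of the workflow the Colab notebook follows, assuming it is executed from the `server` directory with `.env.evaltemplate` available as `.env`. The example questions are hypothetical placeholders; the testset directory and instance name refer to the included `testsets/30QA` example.
+
+```python
+import os
+from dotenv import load_dotenv
+
+# Load the evaluation .env (copied from .env.evaltemplate) into the environment
+load_dotenv()
+
+# Override testset-generation variables for this run
+os.environ["eval_qa_pairs"] = "5"
+os.environ["eval_use_example_questions"] = "True"
+# Example questions are provided as a string containing a list (placeholders)
+os.environ["eval_example_questions"] = "['What is the attention mechanism?', 'What does LDA model?']"
+
+# Generate the testset; it is written to testsets/<n>/testset.csv
+os.system("python eval_create_testset.py")
+
+# Evaluate a RAG instance on an existing testset
+os.environ["eval_testset_directory"] = "testsets/30QA/"
+os.environ["eval_RAG_instance_name"] = "3.1-8B_TinyBERT"
+os.system("python eval_evaluate_RAG.py")
+```
+
+`os.system` is used so the overridden environment variables are inherited by the scripts' processes, which mirrors setting `os.environ` and then calling `!python ...` in the notebook cells.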
\ No newline at end of file diff --git a/server/.env.evaltemplate b/server/.env.evaltemplate new file mode 100644 index 0000000..ff21d20 --- /dev/null +++ b/server/.env.evaltemplate @@ -0,0 +1,113 @@ +HF_TOKEN= +llm_model=meta-llama/Meta-Llama-3.1-8B-Instruct +llm_assistant_token="<|eot_id|>assistant\n\n" +embedding_model=avsolatorio/GIST-small-Embedding-v0 +trust_remote_code=True +force_cpu=False + +provenance_method=rerank +provenance_similarity_llm=sentence-transformers/distiluse-base-multilingual-cased-v2 +provenance_include_query=False +provenance_llm_prompt="Instruction: You are a provenance auditor that needs to exactly determine how much an answer given to a user question was based on a given input document, knowing that more than just that one document were considered. Documents may be fully used verbatim, partially used or even translated. You need to give a score indicating how much a source document was used in creating the answer given to a user query, this score must be 0 = source document is not used at all, 1 = barely used, 2 = moderately used, 3 = mostly used, 4 = almost fully used and 5 = full text included in answer. You are forced to always answer only with the score from 0 to 5, don't explain yourself or add more text than just the score. + +The user's query is: + +{query} + +The answer given is to this user query is: + +{answer} + +The source document that you need to score is the following: + +{context}" + +data_directory='data' +file_types="pdf,json,docx,pptx,xslx,csv,xml" +json_schema="." +json_text_content=False +xml_xpath="//" + +vector_store=milvus +vector_store_uri='data.db' +vector_store_collection=ragmeup_documents +vector_store_sparse_uri=bm25_db.pickle +vector_store_initial_load=True +vector_store_k=10 +document_chunks_pickle=rag_chunks.pickle +rerank=True +rerank_k=1 +rerank_model=flashrank + +temperature=0.2 +repetition_penalty=1.1 +max_new_tokens=1000 + +rag_instruction="Instruction: You are a digital librarian that can answer generic questions on relevant content quickly and succinctly. Here are a few documents from the library that you can use to answer the user's question, retrieved as documents from a database. Be sure to motivate your answer and always mention your source, so which of the documents you used to formulate the answer: + +{context}" +rag_question_initial="The initial question you have to answer: + +{question}" +rag_question_followup="The follow-up question you have to answer: + +{question}" +rag_fetch_new_instruction="Instruction: You are a digital librarian with a database that contains relevant documents for user queries. Users want to ask questions based on those documents and ask questions that either need you to fetch new documents from the database or that are a followup question on previously obtained documents. You need to decide whether you are going to fetch new documents or whether the user is asking a follow-up question but you don't get to see the actual documents the user potentially is looking at.\nShould new documents be fetched from the database based on this user query? Answer with yes or no." +rag_fetch_new_question="The user question is the following: \"{question}\"\n" + +use_rewrite_loop=False +rewrite_query_instruction="You have to answer a user question based on documents retrieved from a document database. It is your task to decide whether or not the documents contain the answer to the user's query. You can always only answer with exactly yes or no. 
The documents that are currently fetched from the database are: + +{context}" +rewrite_query_question="The user's question is: + +{question}" +rewrite_query_prompt="You are given a user query that should be answered by looking up documents that from a document store using a distance based similarity measure. The documents fetched from the document store were found to be irrelevant to answer the question. Rewrite the following question into an alternative that increases the likelihood of finding relevant documents from the database. You may only answer with the exact rephrasing. The original question is: {question}" + +use_re2=True +re2_prompt="Read the question again: " + +splitter='RecursiveCharacterTextSplitter' +chunk_size=1024 +chunk_overlap=40 +breakpoint_threshold_type=percentile +breakpoint_threshold_amount=None +number_of_chunks=None + +use_openai=False +openai_model_name='gpt-4o-mini' +use_gemini=False +gemini_model_name='gemini-pro' +use_azure=False +use_ollama=False +ollama_model='llama3.1' + +eval_sample_size=200 +eval_qa_pairs=10 +eval_timeout=300 +eval_max_workers=1 + +eval_catch_irrelevant_chunks=False +eval_catch_irrelevant_chunks_prompt="If it does not make sense to ask a question about the document at all, reply only with 'None'" +eval_check_sample_relevance=False +eval_check_sample_relevance_instruction="You judge documents on the potential to ask a meaningful question about its content. if this is the case, reply with 'True'. If not, reply with 'False'. You cannot reply with anything else. Document: {context}" +eval_check_sample_relevance_query="Judge the document on its potential to ask a meaningful question about its content. Output nothing else but 'True' or 'False'." +eval_retrieve_samples=False +eval_retrieve_samples_folder=1 +eval_use_example_questions=False +eval_example_questions=None +eval_example_questions_prompt="Here are a few example questions. generate the question in a similar fashion:\n" +eval_question_instruction="You direct another LLM with questions. Write a question we can ask to an LLM that it will be able to answer based on these existing documents. Make sure the question can be accurately answered using the documents' contents and never ever reply with anything else but the question we need to supply to the LLM: + +{context}" +eval_question_query="Generate a question to that can be answered given the input documents, nothing else but the question and no explanation." +eval_answer_instruction="You are a digital librarian and need to answer questions based on input documents. 
Here are the documents you are forced to base your answer on: + +{context}" +eval_answer_query="Answer the following question, never give any explanation or other output than the generated article itself: + +{question}" + +eval_testset_directory='testsets/30QA/' +eval_RAG_instance_name='Model1' +eval_ragas=False \ No newline at end of file diff --git a/server/.env.template b/server/.env.template index 064d37b..b35a05c 100644 --- a/server/.env.template +++ b/server/.env.template @@ -82,7 +82,7 @@ use_azure=False use_ollama=False ollama_model='llama3.1' -ragas_sample_size=200 +ragas_sample_size=5 # default is 200 ragas_qa_pairs=10 ragas_timeout=300 ragas_max_workers=1 diff --git a/server/data/2021-SSCI-AComparativeStudyOfFuzzyTopicModelsAndLDAInTermsOfInterpretability.pdf b/server/data/2021-SSCI-AComparativeStudyOfFuzzyTopicModelsAndLDAInTermsOfInterpretability.pdf new file mode 100644 index 0000000..2ea9f79 Binary files /dev/null and b/server/data/2021-SSCI-AComparativeStudyOfFuzzyTopicModelsAndLDAInTermsOfInterpretability.pdf differ diff --git a/server/data/2023-IFSA-InterpretingTopicModelsWithChatGPT (1).pdf b/server/data/2023-IFSA-InterpretingTopicModelsWithChatGPT (1).pdf new file mode 100644 index 0000000..03432b3 Binary files /dev/null and b/server/data/2023-IFSA-InterpretingTopicModelsWithChatGPT (1).pdf differ diff --git a/server/data/A Joint Introduction to Natural language processing and deep learning.pdf b/server/data/A Joint Introduction to Natural language processing and deep learning.pdf new file mode 100644 index 0000000..19a12e6 Binary files /dev/null and b/server/data/A Joint Introduction to Natural language processing and deep learning.pdf differ diff --git a/server/data/BERTopic-NeuralTopicModelingWithAClassBasedTF-IDFprocedure.pdf b/server/data/BERTopic-NeuralTopicModelingWithAClassBasedTF-IDFprocedure.pdf new file mode 100644 index 0000000..06dfe18 Binary files /dev/null and b/server/data/BERTopic-NeuralTopicModelingWithAClassBasedTF-IDFprocedure.pdf differ diff --git a/server/data/Efficient Estimation of Word Representations in vector space.pdf b/server/data/Efficient Estimation of Word Representations in vector space.pdf new file mode 100644 index 0000000..aa17ab0 Binary files /dev/null and b/server/data/Efficient Estimation of Word Representations in vector space.pdf differ diff --git a/server/data/LanguageModelsAreFewShotLearners.pdf b/server/data/LanguageModelsAreFewShotLearners.pdf new file mode 100644 index 0000000..8cfe482 Binary files /dev/null and b/server/data/LanguageModelsAreFewShotLearners.pdf differ diff --git a/server/data/LatenDirichletAllocation.pdf b/server/data/LatenDirichletAllocation.pdf new file mode 100644 index 0000000..1b7cfaa Binary files /dev/null and b/server/data/LatenDirichletAllocation.pdf differ diff --git a/server/data/attention is all you need.pdf b/server/data/attention is all you need.pdf new file mode 100644 index 0000000..2b8c574 Binary files /dev/null and b/server/data/attention is all you need.pdf differ diff --git a/server/data/enriching word vectors with subword information.pdf b/server/data/enriching word vectors with subword information.pdf new file mode 100644 index 0000000..56c428e Binary files /dev/null and b/server/data/enriching word vectors with subword information.pdf differ diff --git a/server/eval_create_testset.py b/server/eval_create_testset.py new file mode 100644 index 0000000..e4f9b36 --- /dev/null +++ b/server/eval_create_testset.py @@ -0,0 +1,240 @@ +import random +import logging +from dotenv import 
load_dotenv +import os +from random import sample +import pandas as pd +import shutil +import pickle +import ast + +from RAGHelper_cloud import RAGHelperCloud +from RAGHelper_local import RAGHelperLocal +from RAGHelper import RAGHelper + +from datasets import Dataset + +from langchain.prompts import PromptTemplate +from langchain.prompts import ChatPromptTemplate +from langchain.schema.runnable import RunnablePassthrough + + +# Load RAG +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +use_cloud = False +if os.getenv("use_openai") == "True" or os.getenv("use_gemini") == "True" or os.getenv("use_azure") == "True": + raghelper = RAGHelperCloud(logger) + use_cloud = True +else: + raghelper = RAGHelperLocal(logger) + + +# Set variables from environment +use_n_documents = int(os.getenv("vector_store_k")) +if os.getenv("rerank") == "True": + use_n_documents = int(os.getenv("rerank_k")) +end_string = os.getenv("llm_assistant_token") +sample_size = int(os.getenv("eval_sample_size")) +n_qa_pairs = int(os.getenv("eval_qa_pairs")) + +use_example_questions = os.getenv("eval_use_example_questions").lower() == "true" +catch_irrelevant_chunks = os.getenv("eval_catch_irrelevant_chunks").lower() == "true" +check_sample_relevance = os.getenv("eval_check_sample_relevance").lower() == "true" +retrieve_samples = os.getenv("eval_retrieve_samples").lower() == "true" + + +# configure example questions +if use_example_questions: + example_questions = ast.literal_eval(os.getenv("eval_example_questions")) + current_q_query = os.getenv('eval_question_query') + prompt_text = os.getenv('eval_example_questions_prompt') + for i, question in enumerate(example_questions): + prompt_text += f'{i+1}. {question}\n' + new_q_query = current_q_query + prompt_text + os.environ['eval_question_query'] = new_q_query + +# configure catching irrelevant chunks (denying question generation) +if catch_irrelevant_chunks: + current_q_query = os.getenv('eval_question_query') + prompt_text = os.getenv('eval_catch_irrelevant_chunks_prompt') + new_q_query = f'{current_q_query} {prompt_text}' + os.environ['eval_question_query'] = new_q_query + +# configure using the same samples as defined testset +if retrieve_samples: + testset_path = f'./testsets/{os.getenv("eval_retrieve_samples_folder")}/' + + # load data + testset = pd.read_csv(testset_path + 'testset.csv') + testset['true_doc_ids'] = testset['true_doc_ids'].apply(ast.literal_eval) + n_qa_pairs = len(testset) + + # Path to chunks + testset_chunks_path = testset_path + 'rag_chunks.pickle' + + # Load chunks + with open(testset_chunks_path, "rb") as file: + rag_chunks = pickle.load(file) + + sampled_docs = testset['true_doc_ids'].apply( + lambda ids: [doc for doc in rag_chunks if doc.metadata['id'] in ids] + ).to_list() + + + +# Set up the documents and get a sample +documents = raghelper.chunked_documents +document_sample = sample(documents, sample_size) + +# Prepare template for checking sample relevance +if check_sample_relevance: + if use_cloud: + thread = [ + ('system', os.getenv('eval_sample_relevance_instruction')), + ('human', os.getenv('eval_sample_relevance_query')) + ] + prompt = ChatPromptTemplate.from_messages(thread) + else: + thread = [ + {'role': 'system', 'content': os.getenv("eval_sample_relevance_instruction")}, + {'role': 'user', 'content': os.getenv("eval_sample_relevance_query")} + ] + prompt_template = raghelper.tokenizer.apply_chat_template(thread, tokenize=False) + prompt = PromptTemplate( + input_variables=["context"], + 
template=prompt_template, + ) + + rag_sample_relevance = prompt | raghelper.llm + +# Prepare template for generating questions +if use_cloud: + thread = [ + ('system', os.getenv('eval_question_instruction')), + ('human', os.getenv('eval_question_query')) + ] + prompt = ChatPromptTemplate.from_messages(thread) +else: + thread = [ + {'role': 'system', 'content': os.getenv("eval_question_instruction")}, + {'role': 'user', 'content': os.getenv("eval_question_query")} + ] + prompt_template = raghelper.tokenizer.apply_chat_template(thread, tokenize=False) + prompt = PromptTemplate( + input_variables=["context"], + template=prompt_template, + ) + +rag_question = prompt | raghelper.llm + +# Prepare template for generating answers with our questions +if use_cloud: + thread = [ + ('system', os.getenv('eval_answer_instruction')), + ('human', os.getenv('eval_answer_query')) + ] + prompt = ChatPromptTemplate.from_messages(thread) +else: + thread = [ + {'role': 'system', 'content': os.getenv("eval_answer_instruction")}, + {'role': 'user', 'content': os.getenv("eval_answer_query")} + ] + prompt_template = raghelper.tokenizer.apply_chat_template(thread, tokenize=False) + prompt = PromptTemplate( + input_variables=["context", "question"], + template=prompt_template, + ) + +rag_answer = prompt | raghelper.llm + +# Create test set +qa_pairs = [] +qa_generated = 0 +rejected_samples = [] + +while qa_generated < n_qa_pairs: + if retrieve_samples: + selected_docs = sampled_docs[qa_generated] + else: + selected_docs = random.sample(document_sample, min(use_n_documents, len(document_sample))) + formatted_docs = RAGHelper.format_documents(selected_docs) + + if check_sample_relevance: + sample_relevance_chain = ({"context": RunnablePassthrough()} | rag_sample_relevance) + response = sample_relevance_chain.invoke(formatted_docs) + if use_cloud: + if hasattr(response, 'content'): + sample_relevance = response.content + elif hasattr(response, 'answer'): + sample_relevance = response.answer + elif 'answer' in response: + sample_relevance = response["answer"] + else: + sample_relevance = response[response.rindex(end_string)+len(end_string):] + + if 'false' in sample_relevance.lower(): + rejected_samples.append(selected_docs) + continue + + question_chain = ({"context": RunnablePassthrough()} | rag_question) + response = question_chain.invoke(formatted_docs) + if use_cloud: + if hasattr(response, 'content'): + question = response.content + elif hasattr(response, 'answer'): + question = response.answer + elif 'answer' in response: + question = response["answer"] + else: + question = response[response.rindex(end_string)+len(end_string):] + + answer_chain = ({"context": RunnablePassthrough(), "question": RunnablePassthrough()} | rag_answer) + response = answer_chain.invoke({"context": formatted_docs, "question": question}) + if use_cloud: + if hasattr(response, 'content'): + answer = response.content + elif hasattr(response, 'answer'): + answer = response.answer + elif 'answer' in response: + answer = response["answer"] + else: + answer = response[response.rindex(end_string)+len(end_string):] + + id_set = set([d.metadata.get("id", "no id found") for d in selected_docs]) + + qa_pairs.append({"question": question, "ground_truth": answer, "true_doc_ids": id_set}) + + qa_generated += 1 + +# convert type dataset to pandas dataframe +df_testset = pd.DataFrame(qa_pairs) + +# Save testset in folder +rag_chunks_path = "rag_chunks.pickle" +testsets_folder = "testsets" + +# Ensure the base testsets folder exists 
+os.makedirs(testsets_folder, exist_ok=True) + +# Determine the next available folder name +folder_number = 1 +while True: + target_folder = os.path.join(testsets_folder, str(folder_number)) + if not os.path.exists(target_folder): + break + folder_number += 1 + +# Create the new folder +os.makedirs(target_folder) + +# Save the DataFrame as CSV in the new folder +testset_csv_path = os.path.join(target_folder, "testset.csv") +df_testset.to_csv(testset_csv_path, index=False) + +# Copy the rag_chunks file to the new folder +rag_chunks_copy_path = os.path.join(target_folder, "rag_chunks.pickle") +shutil.copy(rag_chunks_path, rag_chunks_copy_path) + +print(f"Testset saved in folder: {target_folder}") \ No newline at end of file diff --git a/server/eval_evaluate_RAG.py b/server/eval_evaluate_RAG.py new file mode 100644 index 0000000..1e03ff4 --- /dev/null +++ b/server/eval_evaluate_RAG.py @@ -0,0 +1,181 @@ +import random +import logging +from dotenv import load_dotenv +import os +from random import sample +import pandas as pd +import shutil +import pickle +import ast + +from RAGHelper_cloud import RAGHelperCloud +from RAGHelper_local import RAGHelperLocal +from RAGHelper import RAGHelper + +from datasets import Dataset + +from langchain.prompts import PromptTemplate +from langchain.prompts import ChatPromptTemplate +from langchain.schema.runnable import RunnablePassthrough + +from ragas import EvaluationDataset +from ragas import evaluate +from ragas.metrics import context_precision, context_recall, faithfulness, answer_relevancy +from ragas.run_config import RunConfig + + + +# Load RAG +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +use_cloud = False +if os.getenv("use_openai") == "True" or os.getenv("use_gemini") == "True" or os.getenv("use_azure") == "True": + raghelper = RAGHelperCloud(logger) + use_cloud = True +else: + raghelper = RAGHelperLocal(logger) + +# set variables +end_string = os.getenv("llm_assistant_token") +k = int(os.getenv("rerank_k")) +eval_ragas = os.getenv("eval_ragas").lower() == "true" +testset_path = os.getenv("eval_testset_directory") +modelname = os.getenv("eval_RAG_instance_name") + +# load testset +testset = pd.read_csv(testset_path + 'testset.csv') +testset['true_doc_ids'] = testset['true_doc_ids'].apply(ast.literal_eval) + +# run RAG on each question in the testset +evalset = testset.copy() + +RAG_answers = [] +RAG_doc_ids = [] +for question in testset['question']: + (nh, response) = raghelper.handle_user_interaction(question, []) + docs = response['docs'] + if use_cloud: + if hasattr(response, 'content'): + answer = response.content + elif hasattr(response, 'answer'): + answer = response.answer + elif 'answer' in response: + answer = response["answer"] + else: + answer = response['text'][response['text'].rindex(end_string)+len(end_string):] + + RAG_answers.append(answer) + RAG_doc_ids.append([d.metadata.get("id", "no id found") for d in docs]) + +evalset[f'{modelname}_answer'] = RAG_answers +evalset[f'{modelname}_doc_ids'] = RAG_doc_ids + + +# Compute evaluation metrics + +# Ensure `true_doc_ids` is parsed from string to list +if isinstance(evalset["true_doc_ids"].iloc[0], str): + evalset["true_doc_ids"] = evalset["true_doc_ids"].apply(ast.literal_eval) + +# Compute the count of intersecting IDs +n_docs_colname = f'{modelname}_n_docs_identified' +evalset[n_docs_colname] = evalset.apply( + lambda row: len(set(row["true_doc_ids"]) & set(row[f"{modelname}_doc_ids"])), + axis=1, +) + +n_docs_top_k_colname = 
f'{modelname}_n_docs_identified_top_{k}' +evalset[n_docs_top_k_colname] = evalset.apply( + lambda row: len(set(row["true_doc_ids"]) & set(row[f"{modelname}_doc_ids"][:k])), + axis=1, +) + +recall_colname = f'{modelname}_recall' +recall_top_k_colname = f'{modelname}_recall_top_{k}' +evalset[recall_colname] = evalset[n_docs_colname] / evalset['true_doc_ids'].apply(len) +evalset[recall_top_k_colname] = evalset[n_docs_top_k_colname] / evalset['true_doc_ids'].apply(len) + +mean_recall = evalset[recall_colname].mean() +mean_recall_top_k = evalset[recall_top_k_colname].mean() +print(f'Model {modelname} has a recall of {mean_recall}') +print(f'Model {modelname} has a recall-top-{k} of {mean_recall_top_k}') + +# Add true doc contexts column +with open(testset_path + 'rag_chunks.pickle', "rb") as file: + rag_chunks = pickle.load(file) + +evalset['true_doc_contexts'] = evalset['true_doc_ids'].apply( + lambda ids: [doc.page_content for doc in rag_chunks if doc.metadata['id'] in ids] + ) + +# Ensure the base evalsets folder exists +evalsets_folder = "evalsets" +os.makedirs(evalsets_folder, exist_ok=True) + +# Determine the next available folder name +folder_number = 1 +while True: + target_folder = os.path.join(evalsets_folder, str(folder_number)) + if not os.path.exists(target_folder): + break + folder_number += 1 + +# Create the new folder +os.makedirs(target_folder) + +evalset.to_csv(os.path.join(target_folder, "evalset.csv"), index=False) +evalset.to_excel(os.path.join(target_folder, "evalset.xlsx"), index=False) +shutil.copy(testset_path + 'rag_chunks.pickle', os.path.join(target_folder, "rag_chunks.pickle")) + + + +# Create ragas formatted data +if eval_ragas: + ragas_data = evalset.copy() + ragas_cols = ['question', 'ground_truth', 'answer', 'contexts'] + + # get cols of first specified model + answer_col = [col for col in ragas_data.columns if col.endswith('answer')][0] + context_col = [col for col in ragas_data.columns if col.endswith('doc_ids')][1] + + # rename answer_col to 'answer' + ragas_data = ragas_data.rename(columns={answer_col: 'answer'}) + + # Path to chunks + file_path = "rag_chunks.pickle" + + # Load chunks + with open(file_path, "rb") as file: + rag_chunks = pickle.load(file) + + ragas_data['contexts'] = ragas_data[context_col].apply( + lambda ids: [doc.page_content for doc in rag_chunks if doc.metadata['id'] in ids] + ) + + # delete columns that are not in ragas_cols + ragas_data = ragas_data[ragas_cols] + + # Convert to new format in newer ragas version + if True: + new_ragas_cols = {"question": "user_input", "ground_truth": "reference", "answer": "response", "contexts": "retrieved_contexts"} + ragas_data = ragas_data.rename(new_ragas_cols, axis=1) + + # create ragas dataset + eval_dataset = EvaluationDataset.from_pandas(ragas_data) + + # Evaluate + results = evaluate( + eval_dataset, + llm=raghelper.llm, + embeddings=raghelper.embeddings, + metrics=[ + context_precision, + faithfulness, + answer_relevancy, + context_recall + ], + run_config=RunConfig(max_workers=1, timeout=600.0) + ) + print("Evaluation Results:") + print(results) \ No newline at end of file diff --git a/server/testsets/30QA/rag_chunks.pickle b/server/testsets/30QA/rag_chunks.pickle new file mode 100644 index 0000000..0c219cb Binary files /dev/null and b/server/testsets/30QA/rag_chunks.pickle differ diff --git a/server/testsets/30QA/testset.csv b/server/testsets/30QA/testset.csv new file mode 100644 index 0000000..8798318 --- /dev/null +++ b/server/testsets/30QA/testset.csv @@ -0,0 +1,31 @@ 
+question,ground_truth,true_doc_ids +What is the estimated energy cost of generating 100 pages of content from a fully trained GPT-3 model?,0.4 kW-hr.,{'f077cb2534244a2238c973c684a52740'} +"What is the name of the journal where the paper ""Latent Dirichlet Allocation"" was published?",The Journal of Machine Learning research,{'393b4f9687ec234833f3b69a07f5cada'} +"What is the name of the conference where Maximilian Köper, Christian Scheible, and Sabine Schulte im Walde presented their work on multilingual reliability and ""semantic"" structure of continuous word spaces?",IWCS 2015.,{'39eb6bd54646482fdb278dda81ff6d24'} +What is the performance of the sisg model trained on 5% of the German G UR350 dataset compared to the cbow baseline?,Our model (sisg) trained on 5% of the data achieves better performance (66) than the cbow baseline.,{'0f7a7ad50dcef75663956e1588ff3d0f'} +"What was the outcome of the supposed ""rationalist wave"" mentioned in the text?","The outcome of the supposed ""rationalist wave"" was not as predicted; instead, the deep learning era arrived in full force.",{'32fe5d2db760dc075bf8447855fb798b'} +What is the proposed alternative scoring function in the subword model?,Each word wis represented as a bag of character n-gram.,{'c0cdf02fdf19a97836f64611ecc04d2a'} +What is the performance comparison between FLSA-W and other models in terms of text classification when the number of topics is 50 or more?,"text classification. FLSA-W performs better than almost all models with 20-word-topics, whereas it only outperforms other models for 50+ topics with the 10-word-topics, solely based",{'0a03fd4fc8d6ca4de3408cd3f5c11d9f'} +What was the parameter count of the transformer in the original paper mentioned in the document?,213 million,{'6b70a0dffda4606d5d39ea5d9f18b0af'} +"What is the name of the publication where Hinton, G., Deng, L., Yu, D., Dahl, G., Mohamed, A.-r., Jaitly, N., Senior, A., Vanhoucke, V., Nguyen, P., Kingsbury, B., and Sainath, T. published their work?","IEEE Signal Processing Magazine, 29.",{'5434dea33c0115d513905024625b8bbc'} +"What was the outcome of the supposed ""new rationalist wave"" mentioned in the text?",The deep learning era arrived in full force instead.,{'32fe5d2db760dc075bf8447855fb798b'} +What is the relationship between the left-hand side of equation (12) and the KL divergence between two probabilities?,The difference between the left-hand side and the right-hand side of the Eq. (12) is the KL divergence between the variational posterior probability and the true posterior probability.,{'a0ef83741083f6ee650d982a7f8b05fb'} +What are some key applications of maximum entropy models in natural language processing?,"A simple introduction to maximum entropy models for natural language processing. 
Technical report, University of Pennsylvania.",{'bd96d91b8a151233f631f6051f7aa59d'} +What is the proposed alternative scoring function in the subword model?,Each word wis represented as a bag of character n-gram.,{'c0cdf02fdf19a97836f64611ecc04d2a'} +"What is the number of parameters for the ""base"" model in Table 3?",65,{'53c15866b0b372542d12cbeb95ec49b9'} +What percentage of accuracy did GPT-3 175B achieve on the task of associating pronouns with participant positions?,GPT-3 175B had the highest accuracy of all the models (64.17%) on this task.,{'9f9e9ff79a6e231bd1a55ae7dfc7d07d'} +What is the estimated energy cost of generating 100 pages of content from a fully trained GPT-3 model?,0.4 kW-hr,{'f077cb2534244a2238c973c684a52740'} +What is the main challenge associated with interpreting the outputs of topic models?,"Despite its wide applications, interpreting the outputs of topic models remains challenging.",{'119041ba53bd0881efd09803bc9b1ccb'} +What is the primary assumption made when leveraging c-TF-IDF representations of topics in BERTopic?,The primary assumption made when leveraging c-TF-IDF representations of topics in BERTopic is that the temporal nature of topics should not influence the creation of global topics.,{'6f9983cb7179ec9925d489875dbb243c'} +"What is the main challenge mentioned in the abstract of the document ""Towards Interpreting Topic Models with ChatGPT""?","Topic modeling has become a popular approach to identify semantic structures in text corpora. Despite its wide applications, interpreting the outputs of topic models remains challenging.",{'119041ba53bd0881efd09803bc9b1ccb'} +What is the name of the software framework mentioned in document 0?,gensim,{'1ee7d87a1476c4603b13f68acb7316cc'} +How many questions are there in total in the dataset described in the document?,There are 18844 questions in total in the dataset described in the document.,{'2eb093406e89481d6fe939fc22c6a891'} +What is the primary difference between the first and second waves of NLP research?,"The empiricism in NLP and speech recognition in this second wave was based on data-intensive machine learning, which we now call “shallow” due to the general lack of abstractions constructed by many-layer or “deep” representations of data which would come in the third wave to be described in the next section.",{'fe50c8189b999fa326b6d18a5af388c9'} +What was the parameter count of the transformer used in the original paper mentioned in the text?,213 million,{'6b70a0dffda4606d5d39ea5d9f18b0af'} +What is the effect on training time when using one epoch versus three epochs for the CBOW model?,Training a model on twice as much data using one epoch gives comparable or better results than iterating over the same data for three epochs.,{'6ccc8206395aa553b57315c7fbc64638'} +What are some key applications of maximum entropy models in natural language processing?,"A simple introduction to maximum entropy models for natural language processing. 
Technical report, University of Pennsylvania.",{'bd96d91b8a151233f631f6051f7aa59d'} +What statistical method was used to compare the accuracy of the participant's scores with those of a control model?,A two-sample Student’s T-Test.,{'4f2c833ac8909a796c12a0c2e245053f'} +What is the estimated energy cost of generating 100 pages of content from a fully trained GPT-3 175B model?,0.4 kW-hr.,{'f077cb2534244a2238c973c684a52740'} +What is the approach used to reduce the size of the feature set in the document classification problem?,Using an LDA model for dimensionality reduction.,{'dab07e093f8b3cc04cba0d5f49c4540d'} +What are some key applications of maximum entropy models in natural language processing?,"A simple introduction to maximum entropy models for natural language processing. Technical report, University of Pennsylvania.",{'bd96d91b8a151233f631f6051f7aa59d'} +What was the primary goal of using the training data in the approaches described?,The training data were used to learn parameters of (shallow) statistical or neural models automatically from data.,{'eca67b69e84d3688f45b57ec192eada4'} diff --git a/ui/scala/automated_run_sbt.bat b/ui/scala/automated_run_sbt.bat new file mode 100644 index 0000000..994a0d5 --- /dev/null +++ b/ui/scala/automated_run_sbt.bat @@ -0,0 +1,8 @@ +:: Open localhost:9000 in the default browser +start http://localhost:9000 + +@echo off +:: Run sbt in the current directory +sbt run + + diff --git a/ui/scala/scala-ui.db b/ui/scala/scala-ui.db new file mode 100644 index 0000000..c71dd6b Binary files /dev/null and b/ui/scala/scala-ui.db differ