From d35055a348f4bec67d945d151efafd4d6875c3c2 Mon Sep 17 00:00:00 2001 From: nik Date: Thu, 31 Aug 2023 18:26:05 +0100 Subject: [PATCH 01/14] Add first adala version --- README.md | 39 ++++++ adala/__init__.py | 1 + adala/analyst.py | 19 +++ adala/engineer.py | 31 +++++ adala/labeler.py | 35 ++++++ adala/main.py | 139 ++++++++++++++++++++++ adala/prompts/analyst.txt | 5 + adala/prompts/engineer_human_message.txt | 3 + adala/prompts/engineer_system_message.txt | 7 ++ adala/prompts/labeler.txt | 13 ++ requirements.txt | 4 + setup.py | 28 +++++ 12 files changed, 324 insertions(+) create mode 100644 adala/__init__.py create mode 100644 adala/analyst.py create mode 100644 adala/engineer.py create mode 100644 adala/labeler.py create mode 100644 adala/main.py create mode 100644 adala/prompts/analyst.txt create mode 100644 adala/prompts/engineer_human_message.txt create mode 100644 adala/prompts/engineer_system_message.txt create mode 100644 adala/prompts/labeler.txt create mode 100644 requirements.txt create mode 100644 setup.py diff --git a/README.md b/README.md index c0e9ca76..1a99ae6b 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,41 @@ # ADALA ADALA: Autonomous Data Labeling Agent + +## Quick Start + +### Install Label Studio + +```bash +pip install label-studio +``` + +and start it with `label-studio` + +Now create a new project `Create > New Project > Labeling Setup > Natural Language Processing > Text Classification` +Get the project ID `label_studio_project_id` from the URL, it will be used later. + +### Install ADALA + +```bash +git clone https://github.com/HumanSignal/ADALA.git +cd ADALA/ +pip install -e . +``` + +### Run ADALA + +```python +import os +import adala +import pandas as pd + +os.environ['LABEL_STUDIO_API_TOKEN'] = 'your_token' +os.environ['LABEL_STUDIO_HOST'] = 'http://localhost:8080' +os.environ['OPENAI_API_KEY'] = 'your_key' + + +df = pd.read_csv('data.csv') + +results = adala.label(df, label_studio_project_id, initial_instructions='Go go!') +results['predicted_df'] +``` \ No newline at end of file diff --git a/adala/__init__.py b/adala/__init__.py new file mode 100644 index 00000000..8108262d --- /dev/null +++ b/adala/__init__.py @@ -0,0 +1 @@ +from .main import Adala, label \ No newline at end of file diff --git a/adala/analyst.py b/adala/analyst.py new file mode 100644 index 00000000..43c4382c --- /dev/null +++ b/adala/analyst.py @@ -0,0 +1,19 @@ +import pandas as pd +import os + +from typing import List, Dict +from langchain import PromptTemplate, OpenAI, LLMChain +from langchain.agents import create_pandas_dataframe_agent + + +class Analyst: + PROMPT_TEMPLATE = open(os.path.join(os.path.dirname(__file__), 'prompts', 'analyst.txt')).read() + + def __init__(self): + self.llm = OpenAI(model_name='gpt-4', temperature=0) + + def __call__(self, df: pd.DataFrame): + agent = create_pandas_dataframe_agent(llm=self.llm, df=df, verbose=True) + explorer_prompt = self.PROMPT_TEMPLATE + observations = agent.run(explorer_prompt) + return observations diff --git a/adala/engineer.py b/adala/engineer.py new file mode 100644 index 00000000..8e804826 --- /dev/null +++ b/adala/engineer.py @@ -0,0 +1,31 @@ +import os + +from langchain.chat_models import ChatOpenAI +from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate +from langchain.chains import LLMChain + + +class Engineer: + + SYSTEM_MESSAGE = open(os.path.join(os.path.dirname(__file__), 'prompts', 'engineer_system_message.txt')).read() + HUMAN_MESSAGE = 
open(os.path.join(os.path.dirname(__file__), 'prompts', 'engineer_human_message.txt')).read() + + def __init__(self): + self.llm = ChatOpenAI(model_name='gpt-4', temperature=0.5) + self.system_message_prompt = SystemMessagePromptTemplate.from_template(self.SYSTEM_MESSAGE) + self.human_message_prompt = HumanMessagePromptTemplate.from_template(self.HUMAN_MESSAGE) + self.chat_prompt = ChatPromptTemplate.from_messages([ + self.system_message_prompt, + self.human_message_prompt + ]) + self.chain = LLMChain( + llm=self.llm, + prompt=self.chat_prompt + ) + + def __call__(self, current_instructions, observations): + new_instructions = self.chain.run( + instructions=current_instructions, + observations=observations + ) + return new_instructions diff --git a/adala/labeler.py b/adala/labeler.py new file mode 100644 index 00000000..238b5bb9 --- /dev/null +++ b/adala/labeler.py @@ -0,0 +1,35 @@ +import pandas as pd +import json +import os +import logging +import difflib + +from typing import List +from langchain import PromptTemplate, OpenAI, LLMChain + +logger = logging.getLogger(__name__) + + +class Labeler: + + PROMPT_TEMPLATE = open(os.path.join(os.path.dirname(__file__), 'prompts', 'labeler.txt')).read() + + def __init__(self): + self.llm = OpenAI(model_name='text-davinci-003', temperature=0) + self.llm_chain = LLMChain( + llm=self.llm, + prompt=PromptTemplate.from_template(self.PROMPT_TEMPLATE) + ) + + def match_labels(self, response: str, original_labels: List[str]): + scores = list(map(lambda l: difflib.SequenceMatcher(None, response, l).ratio(), original_labels)) + return original_labels[scores.index(max(scores))] + + def __call__(self, row: pd.Series, instructions: str, labels: List): + prediction = self.llm_chain.predict( + record=json.dumps(row.to_json()), + instructions=instructions, + labels=str(labels) + ) + safe_prediction = self.match_labels(prediction, labels) + return safe_prediction diff --git a/adala/main.py b/adala/main.py new file mode 100644 index 00000000..fdf3408b --- /dev/null +++ b/adala/main.py @@ -0,0 +1,139 @@ +import logging +import pandas as pd + +from typing import Optional, Dict +from label_studio_sdk.utils import get_or_create_project + +from .analyst import Analyst +from .engineer import Engineer +from .labeler import Labeler + +logger = logging.getLogger(__name__) + + +class Adala: + + def __init__(self): + self.labeler = Labeler() + self.analyst = Analyst() + self.engineer = Engineer() + + def run( + self, + df: pd.DataFrame, + initial_instructions: str, + label_studio_project_id: int, + validation_sample_size: Optional[int] = 5, + max_iterations: Optional[int] = 10, + max_accuracy: Optional[float] = 0.9 + ) -> Dict[str, pd.DataFrame]: + """ + Run the ADALA on the input data frame and give back output with instructions and accuracies + :param df: + :param initial_instructions: + :param label_studio_project_id: + :param validation_sample_size: + :param max_iterations: + :param max_accuracy: + :return: + """ + + project = get_or_create_project(project_id=label_studio_project_id) + labels = next(iter(project.parsed_label_config.values()))['labels'] + + logger.info( + f'Connected to project: {project.title} (ID={project.id})\n' + f'Target labels: {labels}' + ) + + current_instructions = initial_instructions + + df = df.copy() + prev_df_val = project.get_dataframe() + logger.debug(f'Retrieved dataframe from project:\n{prev_df_val}') + + history = [] + + for iteration in range(max_iterations): + df_val = df.sample(n=validation_sample_size, axis=0) + 
df.drop(df_val.index) + + predictions = df_val.apply( + func=Labeler(), + axis=1, + instructions=current_instructions, + labels=labels + ) + df_val = df_val.assign(predictions=predictions) + + df_val = project.label_dataframe(df_val, preannotated_from_fields=['predictions']) + + if not prev_df_val.empty: + predictions = prev_df_val.apply( + func=Labeler(), + axis=1, + instructions=current_instructions, + labels=labels + ) + prev_df_val = prev_df_val.assign(predictions=predictions) + df_val = pd.concat([prev_df_val, df_val]) + + logger.debug(f'Updated dataframe:\n{df_val}') + prev_df_val = df_val + accuracy_score = (df_val['predictions'] == df_val['ground_truth']).mean() + accuracy = f'{round(100 * accuracy_score)}%' + logger.info(f'Accuracy: {accuracy}') + history.append({ + 'iteration': iteration, + 'accuracy': accuracy, + 'instructions': current_instructions + }) + + if accuracy_score > max_accuracy: + logger.info(f'Accuracy threshold reached: {accuracy_score} > {max_accuracy}') + break + + observations = self.analyst(df_val) + + new_instructions = self.engineer(current_instructions, observations) + + logger.info(f'Old instructions: {current_instructions}\nNew instructions: {new_instructions}') + + current_instructions = new_instructions + + # run predictions on the rest of the dataset + predictions = df.apply( + func=Labeler(), + axis=1, + instructions=current_instructions, + labels=labels + ) + df = pd.concat( + [prev_df_val, df.assign(predictions=predictions)] + ) + return { + 'predicted_df': df, + 'history': pd.DataFrame.from_records(history) + } + + +def label( + df: pd.DataFrame, + initial_instructions: str, + label_studio_project_id: int, + validation_sample_size: Optional[int] = 5, + max_iterations: Optional[int] = 10, + max_accuracy: Optional[float] = 0.9 +) -> Dict[str, pd.DataFrame]: + """ + Run the ADALA on the input data frame and give back output with instructions and accuracies + :param df: + :param initial_instructions: + :param label_studio_project_id: + :param validation_sample_size: + :param max_iterations: + :param max_accuracy: + :return: + """ + adala = Adala() + return adala.run(df, initial_instructions, label_studio_project_id, validation_sample_size, max_iterations, max_accuracy) \ No newline at end of file diff --git a/adala/prompts/analyst.txt b/adala/prompts/analyst.txt new file mode 100644 index 00000000..e2144d1d --- /dev/null +++ b/adala/prompts/analyst.txt @@ -0,0 +1,5 @@ +You're provided with a dataset containing misclassified examples. The dataset includes various columns with arbitrary names, which serve as the input data for classification. Two additional columns are also present: + +- `ground_truth`: the correct labels. +- `predictions`: the labels predicted by the model. +Your goal is to analyze these columns to identify the most common error patterns in the dataset. \ No newline at end of file diff --git a/adala/prompts/engineer_human_message.txt b/adala/prompts/engineer_human_message.txt new file mode 100644 index 00000000..10037eb5 --- /dev/null +++ b/adala/prompts/engineer_human_message.txt @@ -0,0 +1,3 @@ +Observations: {observations} +Current instructions: {instructions} +New instructions for the next round: \ No newline at end of file diff --git a/adala/prompts/engineer_system_message.txt b/adala/prompts/engineer_system_message.txt new file mode 100644 index 00000000..ff2bfec6 --- /dev/null +++ b/adala/prompts/engineer_system_message.txt @@ -0,0 +1,7 @@ +You are to optimize instructions for the LLM to improve classification accuracy. 
The LLM solely bases its classifications on the provided records, with no access to external contexts. Given a record and associated instructions, it generates predictions. For example, with the text "The sky is blue.", the instruction "Identify if the statement is about nature.", and the labels "Yes" and "No", it predicts "Yes". + +Your task is informed by two sets of observations: + +1. A few concrete error examples where the LLM's predictions were off. +2. Statistical observations derived from an analysis conducted over a pandas dataframe. +Using these observations, assess the prior instruction and then refine it to better guide the LLM's predictions. Respond only with the refined instruction. \ No newline at end of file diff --git a/adala/prompts/labeler.txt b/adala/prompts/labeler.txt new file mode 100644 index 00000000..114a7ee3 --- /dev/null +++ b/adala/prompts/labeler.txt @@ -0,0 +1,13 @@ +Classify the following JSON record [RECORD] based on these instructions [INSTRUCTIONS] and choose from the provided labels [LABELS]. + +Example: +RECORD: {{"text": "The sky is blue."}} +INSTRUCTIONS: Identify if the statement is about nature. +LABELS: [Yes, No] +ANSWER: +Yes + +RECORD: {record} +INSTRUCTIONS: {instructions} +LABELS: {labels} +ANSWER: \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..04207c45 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +pandas +openai +langchain +label-studio-sdk @ git+https://github.com/HumanSignal/label-studio-sdk.git@pd-support \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 00000000..0cd185b4 --- /dev/null +++ b/setup.py @@ -0,0 +1,28 @@ +"""This file and its contents are licensed under the Apache License 2.0. Please see the included NOTICE for copyright information and LICENSE for a copy of the license. 
+""" +import re +import setuptools + +# Module dependencies +requirements, dependency_links = [], [] +with open('requirements.txt') as f: + for line in f.read().splitlines(): + requirements.append(line) + +setuptools.setup( + name='adala', + version='0.0.1', + author='Heartex', + author_email="hello@humansignal.com", + description='ADALA: Automated Data Labeling Agent', + url='https://github.com/HumanSignal/ADALA', + packages=setuptools.find_packages(), + include_package_data=True, + classifiers=[ + 'Programming Language :: Python :: 3', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: OS Independent', + ], + python_requires='>=3.8', + install_requires=requirements +) From 627a030c0e2c20d99530f319a30ebb88cbcb2d77 Mon Sep 17 00:00:00 2001 From: nik Date: Fri, 1 Sep 2023 13:40:50 +0100 Subject: [PATCH 02/14] Add memory and inner loop --- adala/analyst.py | 17 +++++--- adala/labeler.py | 4 +- adala/main.py | 83 +++++++++++++++++---------------------- adala/prompts/analyst.txt | 9 +++-- 4 files changed, 56 insertions(+), 57 deletions(-) diff --git a/adala/analyst.py b/adala/analyst.py index 43c4382c..9d17fdad 100644 --- a/adala/analyst.py +++ b/adala/analyst.py @@ -4,16 +4,23 @@ from typing import List, Dict from langchain import PromptTemplate, OpenAI, LLMChain from langchain.agents import create_pandas_dataframe_agent +from langchain.memory import ConversationBufferMemory class Analyst: - PROMPT_TEMPLATE = open(os.path.join(os.path.dirname(__file__), 'prompts', 'analyst.txt')).read() - - def __init__(self): - self.llm = OpenAI(model_name='gpt-4', temperature=0) + PROMPT_TEMPLATE = open(os.path.join(os.path.dirname(__file__), 'prompts', 'analyst_2.txt')).read() def __call__(self, df: pd.DataFrame): - agent = create_pandas_dataframe_agent(llm=self.llm, df=df, verbose=True) + llm = OpenAI(model_name='gpt-4', temperature=0) + memory = ConversationBufferMemory(memory_key='history') + df = df.copy() + df = df[df['ground_truth'] != df['predictions']] + agent = create_pandas_dataframe_agent( + llm=llm, + df=df, + verbose=True, + memory=memory + ) explorer_prompt = self.PROMPT_TEMPLATE observations = agent.run(explorer_prompt) return observations diff --git a/adala/labeler.py b/adala/labeler.py index 238b5bb9..1263605c 100644 --- a/adala/labeler.py +++ b/adala/labeler.py @@ -26,8 +26,10 @@ def match_labels(self, response: str, original_labels: List[str]): return original_labels[scores.index(max(scores))] def __call__(self, row: pd.Series, instructions: str, labels: List): + row_dict = row.to_dict() + row_dict.pop('ground_truth', None) prediction = self.llm_chain.predict( - record=json.dumps(row.to_json()), + record=json.dumps(row_dict), instructions=instructions, labels=str(labels) ) diff --git a/adala/main.py b/adala/main.py index fdf3408b..81db1712 100644 --- a/adala/main.py +++ b/adala/main.py @@ -26,7 +26,7 @@ def run( validation_sample_size: Optional[int] = 5, max_iterations: Optional[int] = 10, max_accuracy: Optional[float] = 0.9 - ) -> Dict[str, pd.DataFrame]: + ) -> pd.DataFrame: """ Run the ADALA on the input data frame and give back output with instructions and accuracies :param df: @@ -50,56 +50,49 @@ def run( df = df.copy() prev_df_val = project.get_dataframe() - logger.debug(f'Retrieved dataframe from project:\n{prev_df_val}') - - history = [] for iteration in range(max_iterations): df_val = df.sample(n=validation_sample_size, axis=0) df.drop(df_val.index) - predictions = df_val.apply( - func=Labeler(), - axis=1, - instructions=current_instructions, - 
labels=labels - ) - df_val = df_val.assign(predictions=predictions) - - df_val = project.label_dataframe(df_val, preannotated_from_fields=['predictions']) - + # create ground truth + df_val = project.label_dataframe(df_val) if not prev_df_val.empty: - predictions = prev_df_val.apply( + df_val = pd.concat([prev_df_val, df_val]) + + history = [] + max_internal_iterations = 10 + for internal_iteration in range(max_internal_iterations): + predictions = df_val.apply( func=Labeler(), axis=1, instructions=current_instructions, labels=labels ) - prev_df_val = prev_df_val.assign(predictions=predictions) - df_val = pd.concat([prev_df_val, df_val]) - - logger.debug(f'Updated dataframe:\n{df_val}') - prev_df_val = df_val - accuracy_score = (df_val['predictions'] == df_val['ground_truth']).mean() - accuracy = f'{round(100 * accuracy_score)}%' - logger.info(f'Accuracy: {accuracy}') - history.append({ - 'iteration': iteration, - 'accuracy': accuracy, - 'instructions': current_instructions - }) - - if accuracy_score > max_accuracy: - logger.info(f'Accuracy threshold reached: {accuracy_score} > {max_accuracy}') - break - - observations = self.analyst(df_val) - - new_instructions = self.engineer(current_instructions, observations) - - logger.info(f'Old instructions: {current_instructions}\nNew instructions: {new_instructions}') - - current_instructions = new_instructions + prev_df_val = df_val = df_val.assign(predictions=predictions) + accuracy = (df_val['predictions'] == df_val['ground_truth']).mean() + accuracy_str = f'{round(100 * accuracy)}%' + history.append({ + 'iteration': internal_iteration, + 'accuracy': accuracy_str, + 'instructions': current_instructions + }) + logger.info(f'Validation set: {df_val}') + logger.info(f'Current state: {pd.DataFrame.from_records(history)}') + if accuracy > max_accuracy: + logger.info(f'Accuracy threshold reached: {accuracy} > {max_accuracy}') + break + if len(history) >= 3 and (history[-1]['accuracy'] == history[-2]['accuracy'] == history[-3]['accuracy']): + logger.info(f'Accuracy is not improving, trying to collect more data...') + break + + observations = self.analyst(df_val) + + new_instructions = self.engineer(current_instructions, observations) + + logger.info(f'Old instructions: {current_instructions}\nNew instructions: {new_instructions}') + + current_instructions = new_instructions # run predictions on the rest of the dataset predictions = df.apply( @@ -108,13 +101,9 @@ def run( instructions=current_instructions, labels=labels ) - df = pd.concat( - [prev_df_val, df.assign(predictions=predictions)] - ) - return { - 'predicted_df': df, - 'history': pd.DataFrame.from_records(history) - } + df = df.assign(predictions=predictions) + df = pd.concat([prev_df_val, df]) + return df def label( diff --git a/adala/prompts/analyst.txt b/adala/prompts/analyst.txt index e2144d1d..f3f0669d 100644 --- a/adala/prompts/analyst.txt +++ b/adala/prompts/analyst.txt @@ -1,5 +1,6 @@ -You're provided with a dataset containing misclassified examples. The dataset includes various columns with arbitrary names, which serve as the input data for classification. Two additional columns are also present: +You have a list of misclassified examples. Thoroughly analyze the input data to discern prevalent error patterns and identify potential causes for the model's confusion. Ensure your observations about the causes are derived solely from the input data and not based on the distribution of output labels. -- `ground_truth`: the correct labels. 
-- `predictions`: the labels predicted by the model. -Your goal is to analyze these columns to identify the most common error patterns in the dataset. \ No newline at end of file +Previous observations: +{history} + +Your current observations: \ No newline at end of file From 4e01ee8ba3a534056ea3811ad2190441af27d56e Mon Sep 17 00:00:00 2001 From: nik Date: Fri, 1 Sep 2023 22:23:27 +0100 Subject: [PATCH 03/14] try Analyst -> chain --- adala/analyst.py | 63 +++++++++++++++++++++++++++++++++++++++--------- adala/main.py | 21 ++++++++-------- 2 files changed, 62 insertions(+), 22 deletions(-) diff --git a/adala/analyst.py b/adala/analyst.py index 9d17fdad..2bbcd336 100644 --- a/adala/analyst.py +++ b/adala/analyst.py @@ -5,22 +5,61 @@ from langchain import PromptTemplate, OpenAI, LLMChain from langchain.agents import create_pandas_dataframe_agent from langchain.memory import ConversationBufferMemory +from langchain.chat_models import ChatOpenAI +from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate +from langchain.chains import LLMChain class Analyst: - PROMPT_TEMPLATE = open(os.path.join(os.path.dirname(__file__), 'prompts', 'analyst_2.txt')).read() + SYSTEM_MESSAGE = '''\ +You are tasked as the AI guide for a team of labelers at a company specializing in text classification. \ +You'll be presented with a set of misclassified records, where the 'ground_truth' label differs \ +from the 'predictions'. The data is formatted in JSON as follows: +[ +{{"ground_truth": "ground truth label", "predictions": "predicted label", ...}}, +... +] + +Your objective is to closely examine these discrepancies, identify recurrent error patterns, \ +and provide specific guidance to the labelers on how to rectify and avoid these mistakes \ +in future labeling sessions.''' + HUMAN_MESSAGE = '''\ +ERRORS: +{errors} +GUIDANCE: +''' + + # PROMPT_TEMPLATE = open(os.path.join(os.path.dirname(__file__), 'prompts', 'analyst_2.txt')).read() def __call__(self, df: pd.DataFrame): - llm = OpenAI(model_name='gpt-4', temperature=0) - memory = ConversationBufferMemory(memory_key='history') - df = df.copy() - df = df[df['ground_truth'] != df['predictions']] - agent = create_pandas_dataframe_agent( - llm=llm, - df=df, - verbose=True, - memory=memory + self.llm = ChatOpenAI(model_name='gpt-4', temperature=0.5) + self.system_message_prompt = SystemMessagePromptTemplate.from_template(self.SYSTEM_MESSAGE) + self.human_message_prompt = HumanMessagePromptTemplate.from_template(self.HUMAN_MESSAGE) + self.chat_prompt = ChatPromptTemplate.from_messages([ + self.system_message_prompt, + self.human_message_prompt + ]) + self.chain = LLMChain( + llm=self.llm, + prompt=self.chat_prompt ) - explorer_prompt = self.PROMPT_TEMPLATE - observations = agent.run(explorer_prompt) + num_examples = 5 + df = df[df['ground_truth'] != df['predictions']].sample(num_examples) + errors = df.to_json(orient='records') + observations = self.chain.run(errors=errors) + print(observations) return observations + + # llm = OpenAI(model_name='gpt-4', temperature=0) + # memory = ConversationBufferMemory(memory_key='history') + # df = df.copy() + # df = df[df['ground_truth'] != df['predictions']] + # agent = create_pandas_dataframe_agent( + # llm=llm, + # df=df, + # verbose=True, + # agent_executor_kwargs={'memory': memory}, + # ) + # explorer_prompt = self.PROMPT_TEMPLATE + # observations = agent.run(explorer_prompt) + # return observations diff --git a/adala/main.py b/adala/main.py index 81db1712..2d15184e 100644 --- 
a/adala/main.py +++ b/adala/main.py @@ -52,13 +52,14 @@ def run( prev_df_val = project.get_dataframe() for iteration in range(max_iterations): - df_val = df.sample(n=validation_sample_size, axis=0) - df.drop(df_val.index) - - # create ground truth - df_val = project.label_dataframe(df_val) - if not prev_df_val.empty: - df_val = pd.concat([prev_df_val, df_val]) + # df_val = df.sample(n=validation_sample_size, axis=0) + # df.drop(df_val.index) + # + # # create ground truth + # df_val = project.label_dataframe(df_val) + # if not prev_df_val.empty: + # df_val = pd.concat([prev_df_val, df_val]) + df_val = prev_df_val history = [] max_internal_iterations = 10 @@ -82,9 +83,9 @@ def run( if accuracy > max_accuracy: logger.info(f'Accuracy threshold reached: {accuracy} > {max_accuracy}') break - if len(history) >= 3 and (history[-1]['accuracy'] == history[-2]['accuracy'] == history[-3]['accuracy']): - logger.info(f'Accuracy is not improving, trying to collect more data...') - break + # if len(history) >= 3 and (history[-1]['accuracy'] == history[-2]['accuracy'] == history[-3]['accuracy']): + # logger.info(f'Accuracy is not improving, trying to collect more data...') + # break observations = self.analyst(df_val) From 0b23ef73a5c334a9f69c96e561d448ba4b2484a7 Mon Sep 17 00:00:00 2001 From: nik Date: Mon, 4 Sep 2023 16:53:39 +0100 Subject: [PATCH 04/14] Add gen.py --- README.md | 94 +++++++-- adala/__init__.py | 3 +- adala/engineer.py | 75 +++++++ adala/gen.py | 245 +++++++++++++++++++++++ adala/labeler.py | 21 +- adala/prompts/engineer_human_message.txt | 2 +- adala/prompts/labeler.txt | 4 +- 7 files changed, 416 insertions(+), 28 deletions(-) create mode 100644 adala/gen.py diff --git a/README.md b/README.md index 1a99ae6b..e2303ac8 100644 --- a/README.md +++ b/README.md @@ -1,41 +1,93 @@ # ADALA -ADALA: Autonomous Data Labeling Agent +ADALA: Automated Data Labeling Framework. -## Quick Start +ADALA is a framework for automated data labeling. It uses a combination of Large Language Models (LLMs) and Active Learning (AL) to label data. It is designed to be used with [Label Studio](https://labelstud.io/) to provide a human-in-the-loop data labeling experience. -### Install Label Studio +Here is what ADALA does: +- [LLM instructions generation](#llm-instructions-generation) +- [Predicting dataset with LLM given the instructions](#applying-llm-to-the-dataset-given-the-instructions) +- [Active learning with Human-in-the-Loop](#active-learning-with-human-in-the-loop) +- [LLM uncertainty estimation](#llm-uncertainty-estimation) + +## Installation + +Install ADALA: +```bash +git clone https://github.com/HumanSignal/ADALA.git +cd ADALA/ +pip install -e . +``` + +If you're planning to use human-in-the-loop labeling, install Label Studio: ```bash pip install label-studio ``` -and start it with `label-studio` -Now create a new project `Create > New Project > Labeling Setup > Natural Language Processing > Text Classification` -Get the project ID `label_studio_project_id` from the URL, it will be used later. +## LLM instructions generation -### Install ADALA +ADALA uses Large Language Models (LLMs) to generate instructions for data labeling. You need to have an [OpenAI API](https://platform.openai.com/) key to use ADALA. ```bash -git clone https://github.com/HumanSignal/ADALA.git -cd ADALA/ -pip install -e . 
+export OPENAI_API_KEY=your_key ``` -### Run ADALA - +Load the data into a pandas DataFrame: ```python -import os -import adala import pandas as pd +df = pd.read_csv('dataset.csv') +``` -os.environ['LABEL_STUDIO_API_TOKEN'] = 'your_token' -os.environ['LABEL_STUDIO_HOST'] = 'http://localhost:8080' -os.environ['OPENAI_API_KEY'] = 'your_key' +The following method allows you to finetune instructions to classify each row in the DataFrame, given the ground truth labels in the specified column: +```python +import adala as ad +instructions = ad.generate_instructions(df, ground_truth_column='label') +``` -df = pd.read_csv('data.csv') +## Applying LLM to the dataset given the instructions + +ADALA used optimized batch inference to run LLM on the dataset. + +Create LLM predictor: +```python +predictor = ad.LLMPredictor(model='gpt3') +``` -results = adala.label(df, label_studio_project_id, initial_instructions='Go go!') -results['predicted_df'] -``` \ No newline at end of file +Predict the dataset: +```python +predicted_df = predictor.predict(df, instructions=instructions, prediction_column='predictions') +predicted_df['predictions'] +``` + +## Active learning with Human-in-the-Loop + +Combining instructions generation and dataset prediction, ADALA can be used to create a human-in-the-loop automated data labeling experience with Label Studio. + +First [create a Label Studio project](https://labelstud.io/guide/setup_project). + +> Note: Currently ADALA is designed to work with Text Classification projects. Go to `Labeling Setup > Natural Language Processing > Text Classification`. Change label names to match your dataset labels. + +Get the project ID `project_id` from the URL, it will be used later. + +Setup environment variables with [Label Studio API token](https://labelstud.io/guide/api#Authenticate-to-the-API) and Label Studio host: +```bash +export LABEL_STUDIO_API_TOKEN=your_token +export LABEL_STUDIO_HOST=http://localhost:8080 +``` + +Run ADALA human-in-the-loop labeling: +```python +labeled_df = ad.human_in_the_loop(df, label_studio_project_id=project_id, output_column='autolabel') +labeled_df['autolabel'] +``` + +## LLM uncertainty estimation + +ADALA can be used to estimate LLM uncertainty for each row in the dataset. It is useful if you want to detect hallucinations or other forms of LLM errors. + +```python +uncertainty_df = ad.estimate_uncertainty(df, instructions=instructions, prediction_column='predictions', uncertainty_column='uncertainty') +uncertainty_df['uncertainty'] +``` diff --git a/adala/__init__.py b/adala/__init__.py index 8108262d..c3b4c025 100644 --- a/adala/__init__.py +++ b/adala/__init__.py @@ -1 +1,2 @@ -from .main import Adala, label \ No newline at end of file +from .main import Adala, label +from .gen import optimize, generate_instructions, LLMPredictor diff --git a/adala/engineer.py b/adala/engineer.py index 8e804826..9b3b45e8 100644 --- a/adala/engineer.py +++ b/adala/engineer.py @@ -29,3 +29,78 @@ def __call__(self, current_instructions, observations): observations=observations ) return new_instructions + + +class Engineer2: + SYSTEM_MESSAGE = '''\ +Act as an 'Instruction Tuner' for the LLM. You will be given two primary inputs: + +- The [CURRENT INSTRUCTION] used to guide the LLM's classification +- Target set of [LABELS] for the dataset in question. +- [CURRENT ERRORS] that emerged when this instruction was applied to a dataset. 
+
+The ERRORS are presented in JSON format and contain the ground_truth label, \
+the predictions label, and the input data in one or more columns. \
+Here's an example format for the errors:
+
+```json
+[{{"ground_truth":"...","predictions":"...", "input_text": "...", "other": "data", ...}}, {{"ground_truth":"...","predictions":"...", "input_text": "...", "other": "data", ...}}, ...]
+```
+
+Analyze these inputs and craft a revised instruction for the LLM, aiming to enhance classification accuracy for the dataset in question. Deliver your response as the refined instruction.
+'''
+# SYSTEM_MESSAGE = '''\
+# Act as an 'Instruction Tuner' for the LLM. \
+# You will be presented with information from two previous rounds of instructions, \
+# as well as associated errors presented in JSON format, which contain the ground_truth label, \
+# the predictions label, and the input data in one or more columns. \
+# Here's an example format for the errors:
+#
+# ```json
+# [{{"ground_truth":"...","predictions":"...", "input_text": "...", "other": "data", ...}}, {{"ground_truth":"...","predictions":"...", "input_text": "...", "other": "data", ...}}, ...]
+# ```
+#
+# Here's an example format for the instructions:
+#
+# Instruction from Round 1: [previous instruction 1]
+# Error observed from Round 1: [JSON format error data from previous instruction 1]
+# Instruction from Round 2: [previous instruction 2]
+# Error observed from Round 2: [JSON format error data from previous instruction 2]
+#
+# Expected output labels: [LABELS]
+#
+# Your task is to deeply analyze the provided instructions alongside their respective errors. \
+# After understanding the discrepancies, craft a new instruction for the LLM. \
+# This instruction should aim to reduce the observed errors from both rounds. \
+# Provide your refined instruction as the response.
+# ''' + # HUMAN_MESSAGE = open(os.path.join(os.path.dirname(__file__), 'prompts', 'engineer_human_message_2.txt')).read() + HUMAN_MESSAGE = '''\ +CURRENT INSTRUCTION: {instruction} +LABELS: {labels} +CURRENT ERRORS: {errors} + +New refined instruction: +''' + + def __init__(self): + self.llm = ChatOpenAI(model_name='gpt-4', temperature=0.5) + self.system_message_prompt = SystemMessagePromptTemplate.from_template(self.SYSTEM_MESSAGE) + self.human_message_prompt = HumanMessagePromptTemplate.from_template(self.HUMAN_MESSAGE) + self.chat_prompt = ChatPromptTemplate.from_messages([ + self.system_message_prompt, + self.human_message_prompt + ]) + self.chain = LLMChain( + llm=self.llm, + prompt=self.chat_prompt, + # verbose=True + ) + + def __call__(self, instruction, errors, labels): + new_instructions = self.chain.run( + instruction=instruction, + errors=errors, + labels=labels + ) + return new_instructions \ No newline at end of file diff --git a/adala/gen.py b/adala/gen.py new file mode 100644 index 00000000..8154afad --- /dev/null +++ b/adala/gen.py @@ -0,0 +1,245 @@ +import json +import difflib +import pandas as pd +import logging + +from uuid import uuid4 +from copy import deepcopy +from typing import List, Optional + +from langchain.chat_models import ChatOpenAI +from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate +from langchain import PromptTemplate, OpenAI, LLMChain + +logger = logging.getLogger(__name__) + + +class LLMPredictor: + # TODO: use RAG in 'Example' section + PROMPT_TEMPLATE = '''\ +Classify the following JSON record [RECORD] based on these instructions [INSTRUCTIONS] and choose from the provided labels [LABELS]. + +Example: +INSTRUCTIONS: "Identify if the statement is about nature." 
+RECORD: {{"text": "The sky is blue."}} +LABELS: [Yes, No] +ANSWER: +Yes + +INSTRUCTIONS: "{instructions}" +RECORD: {record} +LABELS: {labels} +ANSWER: +''' + + def __init__(self): + self.llm = OpenAI(model_name='text-davinci-003', temperature=0) + self.llm_chain = LLMChain( + llm=self.llm, + prompt=PromptTemplate.from_template(self.PROMPT_TEMPLATE), + # verbose=True + ) + + def match_labels(self, response: str, original_labels: List[str]): + scores = list(map(lambda l: difflib.SequenceMatcher(None, response, l).ratio(), original_labels)) + return original_labels[scores.index(max(scores))] + + def __call__(self, row: pd.Series, instructions: str, labels: List): + row_dict = row.to_dict() + prediction = self.llm_chain.predict( + record=json.dumps(row_dict), + instructions=instructions, + labels=str(labels) + ) + safe_prediction = self.match_labels(prediction, labels) + return safe_prediction + + def predict(self, df: pd.DataFrame, instructions, labels, prediction_column='predictions') -> pd.DataFrame: + predictions = df.apply( + func=self, + axis=1, + instructions=instructions, + labels=labels, + ) + return df.assign(**{prediction_column: predictions}) + + +def predict(instruction, df, labels): + predictions = df.apply( + func=LLMPredictor(), + axis=1, + instructions=instruction, + labels=labels, + ) + df_pred = df.assign(predictions=predictions) + return df_pred + + +def calc_fitness(records, df, labels, ground_truth_column, sample_size=5, top_n=5): + df = df.sample(n=sample_size, axis=0) + output_records = deepcopy(records) + for record in output_records: + df_pred = predict(record['instruction'], df.drop(columns=[ground_truth_column]), labels) + current_matches = (df_pred['predictions'] == df[ground_truth_column]).sum() + examples_seen = record['examples_seen'] + total_examples_seen = examples_seen + sample_size + # iterative formula for calculating accuracy + record['accuracy'] = (examples_seen * record['accuracy'] + current_matches) / total_examples_seen + record['examples_seen'] = total_examples_seen + record['errors'] = df_pred[df_pred['predictions'] != df[ground_truth_column]].to_json(orient='records') + + sorted_results = sorted(output_records, key=lambda x: x['accuracy'], reverse=True) + best_results = sorted_results[:top_n] + return best_results + + +def adapt(current_instruction, errors, labels): + llm = ChatOpenAI(model_name='gpt-4', temperature=0.5) + system_message_prompt = SystemMessagePromptTemplate.from_template('''\ +Act as an 'Instruction Tuner' for the LLM. You will be given the inputs: + +- The [CURRENT INSTRUCTION] used to guide the LLM's classification +- Target set of [LABELS] for the dataset in question. +- [CURRENT ERRORS] that emerged when this instruction was applied to a dataset. + +The ERRORS presented in JSON format, which contain the ground_truth label, \ +the predictions label, and the input data in one or more columns. \ +Here's an example format for the errors: + +```json +[{{"ground_truth":"...","predictions":"...", "input_text": "...", "other": "data", ...}}, {{"ground_truth":"...","predictions":"...", "input_text": "...", "other": "data", ...}}, ...] +``` + +Analyze these inputs and craft a revised instruction for the LLM, aiming to enhance classification accuracy for the dataset in question. Deliver your response as the refined instruction. 
+''') + human_message_prompt = HumanMessagePromptTemplate.from_template('''\ +CURRENT INSTRUCTION: "{instruction}" +LABELS: {labels} +CURRENT ERRORS: {errors} + +New refined instruction: +''') + chat_prompt = ChatPromptTemplate.from_messages([ + system_message_prompt, + human_message_prompt + ]) + chain = LLMChain(llm=llm, prompt=chat_prompt) + new_instructions = chain.run( + instruction=current_instruction, + errors=errors, + labels=labels + ) + return new_instructions + + +def mutate(current_instruction): + llm = ChatOpenAI(model_name='gpt-4', temperature=0.5) + system_message_prompt = SystemMessagePromptTemplate.from_template('''\ +Assume the role of an 'Instruction Optimizer' for the LLM. +Examine the [CURRENT INSTRUCTION] provided. \ +Your task is to infuse it with common sense knowledge while keeping alterations minimal. \ +Deliver a concise, clear, and improved version.''') + human_message_prompt = HumanMessagePromptTemplate.from_template('''\ +CURRENT INSTRUCTION: "{instruction}" + +New instruction: +''') + chat_prompt = ChatPromptTemplate.from_messages([ + system_message_prompt, + human_message_prompt + ]) + chain = LLMChain(llm=llm, prompt=chat_prompt) + new_instructions = chain.run(instruction=current_instruction) + return new_instructions + + +def optimize( + df: pd.DataFrame, + ground_truth_column: str, + initial_instructions: List, + num_generations=10, + top_instructions=5, + validation_sample_size=5, +): + records = [ + { + 'instruction': instruction, + 'errors': '[]', + 'accuracy': 0, + # 'variance': 0, + 'examples_seen': 0, + 'id': uuid4().hex[:4] + } + for instruction in initial_instructions + ] + labels = df[ground_truth_column].unique().tolist() + for generation in range(num_generations): + # calculate fitness value and corresponding errors + logger.info(f'Calculating fitness for {len(records)} instructions') + records = calc_fitness( + records=records, + df=df, + labels=labels, + ground_truth_column=ground_truth_column, + sample_size=validation_sample_size, + top_n=top_instructions, + ) + + # mutate the best instructions with accuracy<100% based on errors + best_results_with_errors = next((x for x in records if x['accuracy'] < 1), None) + if not best_results_with_errors: + # TODO: change this to a more sophisticated mutation + logger.info(f'All instructions have 100% accuracy. 
Mutating the best instruction {records[0]["id"]}...') + new_instruction = mutate(records[0]['instruction']) + else: + logger.info(f'Adapting the instruction {best_results_with_errors["id"]}...') + new_instruction = adapt(best_results_with_errors['instruction'], best_results_with_errors['errors'], labels) + + # save only the best instructions and the new one + records = records + [{ + 'instruction': new_instruction, + 'errors': '[]', + 'accuracy': 0, + # 'variance': 0, + 'examples_seen': 0, + 'id': uuid4().hex[:4] + }] + + logger.info( + f'Results of {generation} generation:\n' + f'{pd.DataFrame.from_records(records)[["id", "instruction", "accuracy", "examples_seen"]]}') + + # calculate fitness on final results + fitness = calc_fitness(records, df, labels, ground_truth_column, validation_sample_size, top_instructions) + logger.info( + f'Final results:\n{pd.DataFrame.from_records(fitness)[["id", "instruction", "accuracy", "examples_seen"]]}') + return fitness + + +def generate_instructions( + df: pd.DataFrame, + ground_truth_column: str, + initial_instructions: Optional[List] = None, + num_generations=10, + top_instructions=5, + validation_sample_size=5, +): + """ + Generates instructions for the LLM to classify the data in the given dataframe. + :param df: + :param ground_truth_column: + :param initial_instructions: + :param num_generations: + :param top_instructions: + :param validation_sample_size: + :return: + """ + results = optimize( + df=df, + ground_truth_column=ground_truth_column, + initial_instructions=initial_instructions or [''], + num_generations=num_generations, + top_instructions=top_instructions, + validation_sample_size=validation_sample_size, + ) + return results[0]['instruction'] diff --git a/adala/labeler.py b/adala/labeler.py index 1263605c..cd80bc5e 100644 --- a/adala/labeler.py +++ b/adala/labeler.py @@ -11,14 +11,29 @@ class Labeler: - - PROMPT_TEMPLATE = open(os.path.join(os.path.dirname(__file__), 'prompts', 'labeler.txt')).read() + PROMPT_TEMPLATE = '''\ +Classify the following JSON record [RECORD] based on these instructions [INSTRUCTIONS] and choose from the provided labels [LABELS]. + +Example: +INSTRUCTIONS: Identify if the statement is about nature. +RECORD: {{"text": "The sky is blue."}} +LABELS: [Yes, No] +ANSWER: +Yes + +INSTRUCTIONS: {instructions} +RECORD: {record} +LABELS: {labels} +ANSWER: +''' + # PROMPT_TEMPLATE = open(os.path.join(os.path.dirname(__file__), 'prompts', 'labeler.txt')).read() def __init__(self): self.llm = OpenAI(model_name='text-davinci-003', temperature=0) self.llm_chain = LLMChain( llm=self.llm, - prompt=PromptTemplate.from_template(self.PROMPT_TEMPLATE) + prompt=PromptTemplate.from_template(self.PROMPT_TEMPLATE), + # verbose=True ) def match_labels(self, response: str, original_labels: List[str]): diff --git a/adala/prompts/engineer_human_message.txt b/adala/prompts/engineer_human_message.txt index 10037eb5..d417e3cf 100644 --- a/adala/prompts/engineer_human_message.txt +++ b/adala/prompts/engineer_human_message.txt @@ -1,3 +1,3 @@ -Observations: {observations} +Errors: {observations} Current instructions: {instructions} New instructions for the next round: \ No newline at end of file diff --git a/adala/prompts/labeler.txt b/adala/prompts/labeler.txt index 114a7ee3..3391851b 100644 --- a/adala/prompts/labeler.txt +++ b/adala/prompts/labeler.txt @@ -1,13 +1,13 @@ Classify the following JSON record [RECORD] based on these instructions [INSTRUCTIONS] and choose from the provided labels [LABELS]. 
Example: -RECORD: {{"text": "The sky is blue."}} INSTRUCTIONS: Identify if the statement is about nature. +RECORD: {{"text": "The sky is blue."}} LABELS: [Yes, No] ANSWER: Yes -RECORD: {record} INSTRUCTIONS: {instructions} +RECORD: {record} LABELS: {labels} ANSWER: \ No newline at end of file From 611d3bfdab143d165619fe607a97f58b9eeb6c6e Mon Sep 17 00:00:00 2001 From: nik Date: Mon, 4 Sep 2023 16:55:29 +0100 Subject: [PATCH 05/14] Formatting readme --- README.md | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index e2303ac8..2164b00a 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,10 @@ The following method allows you to finetune instructions to classify each row in ```python import adala as ad -instructions = ad.generate_instructions(df, ground_truth_column='label') +instructions = ad.generate_instructions( + df, + ground_truth_column='label' +) ``` ## Applying LLM to the dataset given the instructions @@ -57,7 +60,11 @@ predictor = ad.LLMPredictor(model='gpt3') Predict the dataset: ```python -predicted_df = predictor.predict(df, instructions=instructions, prediction_column='predictions') +predicted_df = predictor.predict( + df, + instructions=instructions, + prediction_column='predictions' +) predicted_df['predictions'] ``` @@ -79,7 +86,11 @@ export LABEL_STUDIO_HOST=http://localhost:8080 Run ADALA human-in-the-loop labeling: ```python -labeled_df = ad.human_in_the_loop(df, label_studio_project_id=project_id, output_column='autolabel') +labeled_df = ad.human_in_the_loop( + df, + label_studio_project_id=project_id, + output_column='autolabel' +) labeled_df['autolabel'] ``` @@ -88,6 +99,11 @@ labeled_df['autolabel'] ADALA can be used to estimate LLM uncertainty for each row in the dataset. It is useful if you want to detect hallucinations or other forms of LLM errors. 
```python -uncertainty_df = ad.estimate_uncertainty(df, instructions=instructions, prediction_column='predictions', uncertainty_column='uncertainty') +uncertainty_df = ad.estimate_uncertainty( + df, + instructions=instructions, + prediction_column='predictions', + uncertainty_column='uncertainty' +) uncertainty_df['uncertainty'] ``` From 260e4eafeeabf8a33f773d6f3ee169618b3f3048 Mon Sep 17 00:00:00 2001 From: nik Date: Mon, 4 Sep 2023 16:58:31 +0100 Subject: [PATCH 06/14] Reformat README --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 2164b00a..f5bb64fe 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ export OPENAI_API_KEY=your_key Load the data into a pandas DataFrame: ```python import pandas as pd -df = pd.read_csv('dataset.csv') +input_df = pd.read_csv('dataset.csv') ``` The following method allows you to finetune instructions to classify each row in the DataFrame, given the ground truth labels in the specified column: @@ -44,7 +44,7 @@ The following method allows you to finetune instructions to classify each row in import adala as ad instructions = ad.generate_instructions( - df, + df=input_df, ground_truth_column='label' ) ``` @@ -61,7 +61,7 @@ predictor = ad.LLMPredictor(model='gpt3') Predict the dataset: ```python predicted_df = predictor.predict( - df, + df=input_df, instructions=instructions, prediction_column='predictions' ) @@ -87,11 +87,11 @@ export LABEL_STUDIO_HOST=http://localhost:8080 Run ADALA human-in-the-loop labeling: ```python labeled_df = ad.human_in_the_loop( - df, + df=input_df, label_studio_project_id=project_id, - output_column='autolabel' + output_column='predictions' ) -labeled_df['autolabel'] +labeled_df['predictions'] ``` ## LLM uncertainty estimation @@ -100,10 +100,10 @@ ADALA can be used to estimate LLM uncertainty for each row in the dataset. It is ```python uncertainty_df = ad.estimate_uncertainty( - df, + df=labeled_df, instructions=instructions, prediction_column='predictions', - uncertainty_column='uncertainty' + output_column='uncertainty' ) uncertainty_df['uncertainty'] ``` From 2b33f35635129b526e32e1ece72d6e8b9a3a5a8d Mon Sep 17 00:00:00 2001 From: nik Date: Mon, 4 Sep 2023 16:59:48 +0100 Subject: [PATCH 07/14] Add agents mention --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f5bb64fe..919481eb 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # ADALA ADALA: Automated Data Labeling Framework. -ADALA is a framework for automated data labeling. It uses a combination of Large Language Models (LLMs) and Active Learning (AL) to label data. It is designed to be used with [Label Studio](https://labelstud.io/) to provide a human-in-the-loop data labeling experience. +ADALA is a framework for automated data labeling. It uses a combination of Large Language Models (LLMs) autonomous agents and Active Learning (AL) to label data. It is designed to be used with [Label Studio](https://labelstud.io/) to provide a human-in-the-loop data labeling experience. 
Here is what ADALA does: - [LLM instructions generation](#llm-instructions-generation) From 7bfd3e8d07067c568dc4ca42c50c4097a8c644d8 Mon Sep 17 00:00:00 2001 From: nik Date: Tue, 5 Sep 2023 12:30:21 +0100 Subject: [PATCH 08/14] Add modules, update readme --- README.md | 72 ++++--- adala/instructions/__init__.py | 0 adala/instructions/optimizer.py | 175 ++++++++++++++++++ adala/labeling/__init__.py | 0 adala/labeling/human.py | 12 ++ adala/predictors/__init__.py | 1 + adala/predictors/base.py | 96 ++++++++++ .../prompts/simple_classification.txt | 13 ++ adala/quality/__init__.py | 0 adala/quality/uncertainty.py | 13 ++ 10 files changed, 361 insertions(+), 21 deletions(-) create mode 100644 adala/instructions/__init__.py create mode 100644 adala/instructions/optimizer.py create mode 100644 adala/labeling/__init__.py create mode 100644 adala/labeling/human.py create mode 100644 adala/predictors/__init__.py create mode 100644 adala/predictors/base.py create mode 100644 adala/predictors/prompts/simple_classification.txt create mode 100644 adala/quality/__init__.py create mode 100644 adala/quality/uncertainty.py diff --git a/README.md b/README.md index 919481eb..4500634a 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,16 @@ -# ADALA -ADALA: Automated Data Labeling Framework. +# ADALA +Automated Data Labeling Framework. + +[![PyPI version](https://badge.fury.io/py/adala.svg)](https://badge.fury.io/py/adala) +[![Python version](https://img.shields.io/pypi/pyversions/adala.svg)](https://pypi.python.org/pypi/adala) +[![License](https://img.shields.io/pypi/l/adala.svg)](https://pypi.python.org/pypi/adala) + ADALA is a framework for automated data labeling. It uses a combination of Large Language Models (LLMs) autonomous agents and Active Learning (AL) to label data. It is designed to be used with [Label Studio](https://labelstud.io/) to provide a human-in-the-loop data labeling experience. Here is what ADALA does: - [LLM instructions generation](#llm-instructions-generation) -- [Predicting dataset with LLM given the instructions](#applying-llm-to-the-dataset-given-the-instructions) +- [Predicting dataset with LLM](#predicting-dataset-with-llm) - [Active learning with Human-in-the-Loop](#active-learning-with-human-in-the-loop) - [LLM uncertainty estimation](#llm-uncertainty-estimation) @@ -24,14 +29,12 @@ If you're planning to use human-in-the-loop labeling, install Label Studio: pip install label-studio ``` +## Load dataset +ADALA works with datasets in various formats: +- [Pandas DataFrame](#pandas-dataframe) +- [Spark DataFrame](#spark-dataframe) -## LLM instructions generation - -ADALA uses Large Language Models (LLMs) to generate instructions for data labeling. You need to have an [OpenAI API](https://platform.openai.com/) key to use ADALA. - -```bash -export OPENAI_API_KEY=your_key -``` +### Pandas DataFrame Load the data into a pandas DataFrame: ```python @@ -39,35 +42,62 @@ import pandas as pd input_df = pd.read_csv('dataset.csv') ``` -The following method allows you to finetune instructions to classify each row in the DataFrame, given the ground truth labels in the specified column: -```python -import adala as ad +### Spark DataFrame -instructions = ad.generate_instructions( - df=input_df, - ground_truth_column='label' -) +```python +from pyspark.sql import SparkSession +spark = SparkSession.builder.getOrCreate() ``` -## Applying LLM to the dataset given the instructions -ADALA used optimized batch inference to run LLM on the dataset. 
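A concrete way to read the batch-mode claim: group rows into fixed-size chunks so that each LLM call amortizes its overhead across many records, instead of paying it once per row. The helper below is a sketch, not an ADALA API; `predict_batch` is a hypothetical callable that maps a list of records to a list of labels.

```python
import pandas as pd
from typing import Callable, List

def predict_in_chunks(
    df: pd.DataFrame,
    predict_batch: Callable[[List[dict]], List[str]],  # hypothetical hook
    chunk_size: int = 32,
) -> pd.Series:
    """Illustrative sketch: score fixed-size slices instead of one row per call."""
    labels: List[str] = []
    for start in range(0, len(df), chunk_size):
        chunk = df.iloc[start:start + chunk_size]
        labels.extend(predict_batch(chunk.to_dict(orient='records')))
    return pd.Series(labels, index=df.index)
```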
+## Predicting dataset with LLM + +ADALA inference is optimized to run in the batch mode - it is much faster to predict the whole dataset at once, rather than row-by-row. Create LLM predictor: ```python -predictor = ad.LLMPredictor(model='gpt3') +predictor = ad.LLMPredictor() ``` +There are multiple LLM models available in the table below: +| Model | Initialize predictor | +| -------- | ------- | +| [Any LangChain's LLM](https://python.langchain.com/docs/get_started/introduction.html) | `ad.LangChainLLMPredictor()` | +| [HuggingFace TGI](https://huggingface.co/text-generation-inference) | `ad.HuggingFaceLLMPredictor()` | +| [vLLM](https://vllm.ai/) | `ad.VLLMPredictor()` | +| [llama.cpp](https://github.com/ggerganov/llama.cpp) | `ad.LlamaCppPredictor()` | + + Predict the dataset: ```python predicted_df = predictor.predict( df=input_df, - instructions=instructions, + instruction='Predict sentiment', + labels=['positive', 'negative'], prediction_column='predictions' ) predicted_df['predictions'] ``` + +## LLM instructions generation + +ADALA can generate optimal LLM instructions for data labeling. You need to have an [OpenAI API](https://platform.openai.com/) key to use ADALA. + +```bash +export OPENAI_API_KEY=your_key +``` + +The following method allows you to finetune instructions to classify each row in the DataFrame, given the ground truth labels in the specified column: +```python +import adala as ad + +instructions = ad.generate_instructions( + df=input_df, + ground_truth_column='label' +) +``` + ## Active learning with Human-in-the-Loop Combining instructions generation and dataset prediction, ADALA can be used to create a human-in-the-loop automated data labeling experience with Label Studio. diff --git a/adala/instructions/__init__.py b/adala/instructions/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/adala/instructions/optimizer.py b/adala/instructions/optimizer.py new file mode 100644 index 00000000..f2c018ed --- /dev/null +++ b/adala/instructions/optimizer.py @@ -0,0 +1,175 @@ +import logging +import pandas as pd + +from dataclasses import dataclass +from typing import List +from uuid import uuid4 +from copy import deepcopy +from langchain.chat_models import ChatOpenAI +from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate +from langchain import PromptTemplate, OpenAI, LLMChain +from adala.predictors import LLMPredictor + +logger = logging.getLogger(__name__) + + +def calc_fitness(predictor: LLMPredictor, records, df, labels, ground_truth_column, sample_size=5, top_n=5): + df = df.sample(n=sample_size, axis=0) + output_records = deepcopy(records) + for record in output_records: + df_pred = predictor.predict( + df=df.drop(columns=[ground_truth_column]), + instruction=record['instruction'], + labels=labels, + output_column='predictions' + ) + current_matches = (df_pred['predictions'] == df[ground_truth_column]).sum() + examples_seen = record['examples_seen'] + total_examples_seen = examples_seen + sample_size + # iterative formula for calculating accuracy + record['accuracy'] = (examples_seen * record['accuracy'] + current_matches) / total_examples_seen + record['examples_seen'] = total_examples_seen + record['errors'] = df_pred[df_pred['predictions'] != df[ground_truth_column]].to_json(orient='records') + + sorted_results = sorted(output_records, key=lambda x: x['accuracy'], reverse=True) + best_results = sorted_results[:top_n] + return best_results + + +def regularize(instruction: str): + llm = 
ChatOpenAI(model_name='gpt-4', temperature=0.5)
+    system_message_prompt = SystemMessagePromptTemplate.from_template('''\
+    Assume the role of an 'Instruction Optimizer' for the LLM.
+    Examine the [CURRENT INSTRUCTION] provided. \
+    Your task is to infuse it with common sense knowledge while keeping alterations minimal. \
+    Deliver a concise, clear, and improved version.''')
+    human_message_prompt = HumanMessagePromptTemplate.from_template('''\
+    CURRENT INSTRUCTION: "{instruction}"
+
+    New instruction:
+    ''')
+    chat_prompt = ChatPromptTemplate.from_messages([
+        system_message_prompt,
+        human_message_prompt
+    ])
+    chain = LLMChain(llm=llm, prompt=chat_prompt)
+    new_instructions = chain.run(instruction=instruction)
+    return new_instructions
+
+
+def adapt(current_instruction, errors, labels):
+    llm = ChatOpenAI(model_name='gpt-4', temperature=0.5)
+    system_message_prompt = SystemMessagePromptTemplate.from_template('''\
+Act as an 'Instruction Tuner' for the LLM. You will be given the inputs:
+
+- The [CURRENT INSTRUCTION] used to guide the LLM's classification
+- Target set of [LABELS] for the dataset in question.
+- [CURRENT ERRORS] that emerged when this instruction was applied to a dataset.
+
+The ERRORS are presented in JSON format and contain the ground_truth label, \
+the predictions label, and the input data in one or more columns. \
+Here's an example format for the errors:
+
+```json
+[{{"ground_truth":"...","predictions":"...", "input_text": "...", "other": "data", ...}}, {{"ground_truth":"...","predictions":"...", "input_text": "...", "other": "data", ...}}, ...]
+```
+
+Analyze these inputs and craft a revised instruction for the LLM, aiming to enhance classification accuracy for the dataset in question. Deliver your response as the refined instruction.
+''') + human_message_prompt = HumanMessagePromptTemplate.from_template('''\ +CURRENT INSTRUCTION: "{instruction}" +LABELS: {labels} +CURRENT ERRORS: {errors} + +New refined instruction: +''') + chat_prompt = ChatPromptTemplate.from_messages([ + system_message_prompt, + human_message_prompt + ]) + chain = LLMChain(llm=llm, prompt=chat_prompt) + new_instructions = chain.run( + instruction=current_instruction, + errors=errors, + labels=labels + ) + return new_instructions + + +@dataclass +class GenerateInstructionResult: + """Result of the generate_instruction()""" + instruction: str + benchmark_table: pd.DataFrame + + +def generate_instruction( + predictor: LLMPredictor, + df: pd.DataFrame, + ground_truth_column: str, + initial_instructions: List = None, + num_generations=10, + top_instructions=5, + validation_sample_size=5, +) -> GenerateInstructionResult: + """Optimize the instruction for the LLM.""" + + records = [ + { + 'instruction': instruction, + 'errors': '[]', + 'accuracy': 0, + # 'variance': 0, + 'examples_seen': 0, + 'id': uuid4().hex[:4] + } + for instruction in initial_instructions + ] + labels = df[ground_truth_column].unique().tolist() + for generation in range(num_generations): + # calculate fitness value and corresponding errors + logger.info(f'Calculating fitness for {len(records)} instructions') + records = calc_fitness( + predictor=predictor, + records=records, + df=df, + labels=labels, + ground_truth_column=ground_truth_column, + sample_size=validation_sample_size, + top_n=top_instructions, + ) + + # mutate the best instructions with accuracy<100% based on errors + best_results_with_errors = next((x for x in records if x['accuracy'] < 1), None) + if not best_results_with_errors: + # TODO: change this to a more sophisticated mutation + logger.info(f'All instructions have 100% accuracy. Mutating the best instruction {records[0]["id"]}...') + new_instruction = regularize(records[0]['instruction']) + else: + logger.info(f'Adapting the instruction {best_results_with_errors["id"]}...') + new_instruction = adapt(best_results_with_errors['instruction'], best_results_with_errors['errors'], labels) + + # save only the best instructions and the new one + records = records + [{ + 'instruction': new_instruction, + 'errors': '[]', + 'accuracy': 0, + # 'variance': 0, + 'examples_seen': 0, + 'id': uuid4().hex[:4] + }] + + logger.info( + f'Results of {generation} generation:\n' + f'{pd.DataFrame.from_records(records)[["id", "instruction", "accuracy", "examples_seen"]]}') + + # calculate fitness on final results + fitness = calc_fitness(predictor, records, df, labels, ground_truth_column, validation_sample_size, top_instructions) + benchmark_table = pd.DataFrame.from_records(fitness)[["id", "instruction", "accuracy", "examples_seen"]] + logger.info(f'Final results:\n{benchmark_table}') + + return GenerateInstructionResult( + instruction=fitness[0]['instruction'], + benchmark_table=benchmark_table + ) + diff --git a/adala/labeling/__init__.py b/adala/labeling/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/adala/labeling/human.py b/adala/labeling/human.py new file mode 100644 index 00000000..c49c55ec --- /dev/null +++ b/adala/labeling/human.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def human_in_the_loop( + df: pd.DataFrame, + label_studio_project_id: int, + output_column: str = 'predictions' +) -> pd.DataFrame: + """ + Auto-annotate a pandas DataFrame with human-in-the-loop labeling from Label Studio. 
+ """ + pass \ No newline at end of file diff --git a/adala/predictors/__init__.py b/adala/predictors/__init__.py new file mode 100644 index 00000000..5a07b66a --- /dev/null +++ b/adala/predictors/__init__.py @@ -0,0 +1 @@ +from .base import LLMPredictor \ No newline at end of file diff --git a/adala/predictors/base.py b/adala/predictors/base.py new file mode 100644 index 00000000..b8e093da --- /dev/null +++ b/adala/predictors/base.py @@ -0,0 +1,96 @@ +import pandas as pd +import difflib +import json + +from pathlib import Path +from abc import ABC, abstractmethod +from pydantic import BaseModel, validator, root_validator +from typing import Optional, List +from langchain import PromptTemplate, OpenAI, LLMChain +from langchain.llms import BaseLLM + + +class Predictor(BaseModel, ABC): + """ + Base class for predictors. + """ + pass + + +class LLMPredictor(Predictor): + """ + Base class for LLMPredictors that use Large Language Models (LLMs) + to make sample predictions given text instructions: + prediction = LLM(sample, instructions) + """ + + @abstractmethod + def predict_row(self, row: pd.Series, instruction: str, labels: List[str]) -> str: + """ + Predict a single row from a pandas DataFrame. + To be used with pandas.DataFrame.apply: + df.apply(func=predict_row, axis=1, instruction=instruction) + """ + + @abstractmethod + def predict( + self, + df: pd.DataFrame, + instruction: str, + labels: List[str], + output_column: str = 'predictions' + ) -> pd.DataFrame: + """ + Predict all rows from a pandas DataFrame. + """ + + +class LangChainLLMPredictor(LLMPredictor): + llm: Optional[BaseLLM] = None + llm_chain: Optional[LLMChain] = None + prompt_template: Optional[str] = None + verbose: bool = False + + @root_validator + def initialize_llm(cls, values): + if values.get('prompt_template') is None: + default_file = Path(__file__).parent / 'prompts' / 'simple_classification.txt' + with open(default_file, 'r') as f: + values['prompt_template'] = f.read() + if values.get('llm') is None: + values['llm'] = OpenAI(model_name='text-davinci-003', temperature=0) + if values.get('llm_chain') is None: + values['llm_chain'] = LLMChain( + llm=values['llm'], + prompt=values['prompt_template'], + verbose=values['verbose'] + ) + return values + + def predict_row(self, row: pd.Series, instruction: str, labels: List[str]) -> str: + row_dict = row.to_dict() + prediction = self.llm_chain.predict( + record=json.dumps(row_dict), + instructions=instruction, + labels=str(labels) + ) + # match prediction to labels + scores = list(map(lambda l: difflib.SequenceMatcher(None, prediction, l).ratio(), labels)) + safe_prediction = labels[scores.index(max(scores))] + return safe_prediction + + def predict( + self, + df: pd.DataFrame, + instruction: str, + labels: List[str], + output_column: str='predictions' + ) -> pd.DataFrame: + + predictions = df.apply( + func=self.predict_row, + axis=1, + instructions=instruction, + labels=labels, + ) + return df.assign(**{output_column: predictions}) diff --git a/adala/predictors/prompts/simple_classification.txt b/adala/predictors/prompts/simple_classification.txt new file mode 100644 index 00000000..5346ed08 --- /dev/null +++ b/adala/predictors/prompts/simple_classification.txt @@ -0,0 +1,13 @@ +Classify the following JSON record [RECORD] based on these instructions [INSTRUCTIONS] and choose from the provided labels [LABELS]. + +Example: +INSTRUCTIONS: "Identify if the statement is about nature." 
+RECORD: {{"text": "The sky is blue."}} +LABELS: [Yes, No] +ANSWER: +Yes + +INSTRUCTIONS: "{instructions}" +RECORD: {record} +LABELS: {labels} +ANSWER: diff --git a/adala/quality/__init__.py b/adala/quality/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/adala/quality/uncertainty.py b/adala/quality/uncertainty.py new file mode 100644 index 00000000..276de0e1 --- /dev/null +++ b/adala/quality/uncertainty.py @@ -0,0 +1,13 @@ +import pandas as pd + + +def estimate_uncertainty( + df: pd.DataFrame, + instruction: str, + prediction_column: str = 'predictions', + output_column: str = 'uncertainty' +) -> pd.DataFrame: + """ + Estimate uncertainty in a pandas DataFrame given LLM predictions + """ + pass \ No newline at end of file From 08dbe1a97ce8c3957b937021c9ee72131bcfaca2 Mon Sep 17 00:00:00 2001 From: nik Date: Tue, 5 Sep 2023 23:10:32 +0100 Subject: [PATCH 09/14] Reorg package, update readme --- README.md | 44 +++++-- adala/__init__.py | 4 +- adala/{instructions => agents}/__init__.py | 0 adala/{ => agents}/analyst.py | 0 adala/{ => agents}/engineer.py | 0 adala/{ => agents}/gen.py | 0 adala/{ => agents}/labeler.py | 0 adala/{ => agents}/main.py | 0 adala/{ => agents}/prompts/analyst.txt | 0 .../prompts/engineer_human_message.txt | 0 .../prompts/engineer_system_message.txt | 0 adala/{ => agents}/prompts/labeler.txt | 0 adala/{labeling => labelers}/__init__.py | 0 adala/labelers/base.py | 117 ++++++++++++++++++ .../prompts/simple_classification.txt | 0 adala/labeling/human.py | 12 -- adala/optimizers/__init__.py | 0 .../llm_instructions.py} | 31 +++-- adala/predictors/__init__.py | 1 - adala/predictors/base.py | 96 -------------- 20 files changed, 172 insertions(+), 133 deletions(-) rename adala/{instructions => agents}/__init__.py (100%) rename adala/{ => agents}/analyst.py (100%) rename adala/{ => agents}/engineer.py (100%) rename adala/{ => agents}/gen.py (100%) rename adala/{ => agents}/labeler.py (100%) rename adala/{ => agents}/main.py (100%) rename adala/{ => agents}/prompts/analyst.txt (100%) rename adala/{ => agents}/prompts/engineer_human_message.txt (100%) rename adala/{ => agents}/prompts/engineer_system_message.txt (100%) rename adala/{ => agents}/prompts/labeler.txt (100%) rename adala/{labeling => labelers}/__init__.py (100%) create mode 100644 adala/labelers/base.py rename adala/{predictors => labelers}/prompts/simple_classification.txt (100%) delete mode 100644 adala/labeling/human.py create mode 100644 adala/optimizers/__init__.py rename adala/{instructions/optimizer.py => optimizers/llm_instructions.py} (88%) delete mode 100644 adala/predictors/__init__.py delete mode 100644 adala/predictors/base.py diff --git a/README.md b/README.md index 4500634a..9467a833 100644 --- a/README.md +++ b/README.md @@ -54,29 +54,34 @@ spark = SparkSession.builder.getOrCreate() ADALA inference is optimized to run in the batch mode - it is much faster to predict the whole dataset at once, rather than row-by-row. 
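+The examples below assume an `input_df` with a text column. A minimal
+sketch (the column name and rows are illustrative, not required by ADALA):
+
+```python
+import pandas as pd
+
+input_df = pd.DataFrame({'text': [
+    'The coffee was excellent and the service was friendly.',
+    'My order arrived late and the box was damaged.',
+]})
+```
+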
-Create LLM predictor:
+Create LLM labeler:
+
 ```python
-predictor = ad.LLMPredictor()
+import adala as ad
+
+labeler = ad.OpenAILabeler(model_name='gpt-4')
+labeler.label_string('The sun is white.', instruction='Is it true?', labels=['yes', 'no'])
 ```
 
 There are multiple LLM models available in the table below:
 
 | Model | Initialize predictor |
 | -------- | ------- |
+| [OpenAI API](https://platform.openai.com/) | `ad.OpenAILabeler()` |
 | [Any LangChain's LLM](https://python.langchain.com/docs/get_started/introduction.html) | `ad.LangChainLLMPredictor()` |
 | [HuggingFace TGI](https://huggingface.co/text-generation-inference) | `ad.HuggingFaceLLMPredictor()` |
 | [vLLM](https://vllm.ai/) | `ad.VLLMPredictor()` |
 | [llama.cpp](https://github.com/ggerganov/llama.cpp) | `ad.LlamaCppPredictor()` |
 
-Predict the dataset:
+Predict the whole dataset:
 
 ```python
-predicted_df = predictor.predict(
+labeled_df = labeler.label(
     df=input_df,
     instruction='Predict sentiment',
     labels=['positive', 'negative'],
-    prediction_column='predictions'
+    output_column='predictions'
 )
-predicted_df['predictions']
+labeled_df['predictions']
 ```
 
@@ -92,12 +97,24 @@ The following method allows you to finetune instructions to classify each row in
 
 ```python
 import adala as ad
 
-instructions = ad.generate_instructions(
+result = ad.generate_instruction(
+    labeler=labeler,
     df=input_df,
     ground_truth_column='label'
 )
 ```
 
+Now you can use the generated instructions to label the dataset with LLM:
+```python
+labeled_df = labeler.label(
+    df=input_df,
+    instruction=result.best_instruction,
+    labels=result.labels,
+    output_column='predictions'
+)
+```
+
 
 ## Active learning with Human-in-the-Loop
 
 Combining instructions generation and dataset prediction, ADALA can be used to create a human-in-the-loop automated data labeling experience with Label Studio.
 
@@ -114,12 +131,19 @@
 export LABEL_STUDIO_API_TOKEN=your_token
 export LABEL_STUDIO_HOST=http://localhost:8080
 ```
 
-Run ADALA human-in-the-loop labeling:
+Generate LLM instructions with human-in-the-loop labeling:
+
 ```python
-labeled_df = ad.human_in_the_loop(
+import adala as ad
+
+result = ad.generate_instruction(
+    labeler=labeler,
     df=input_df,
+    # ... other parameters
+    human_in_the_loop=True,
     label_studio_project_id=project_id,
-    output_column='predictions'
+    # use your Label Studio API token and host if not set as environment variables
+    label_studio_api_token='your_token',
+    label_studio_host='your_host'
 )
-labeled_df['predictions']
+result.best_instruction
 ```
diff --git a/adala/__init__.py b/adala/__init__.py
index c3b4c025..1635c586 100644
--- a/adala/__init__.py
+++ b/adala/__init__.py
@@ -1,2 +1,2 @@
-from .main import Adala, label
-from .gen import optimize, generate_instructions, LLMPredictor
+from .labelers.base import OpenAILabeler, LangChainLabeler
+from .optimizers.llm_instructions import generate_instruction
diff --git a/adala/instructions/__init__.py b/adala/agents/__init__.py
similarity index 100%
rename from adala/instructions/__init__.py
rename to adala/agents/__init__.py
diff --git a/adala/analyst.py b/adala/agents/analyst.py
similarity index 100%
rename from adala/analyst.py
rename to adala/agents/analyst.py
diff --git a/adala/engineer.py b/adala/agents/engineer.py
similarity index 100%
rename from adala/engineer.py
rename to adala/agents/engineer.py
diff --git a/adala/gen.py b/adala/agents/gen.py
similarity index 100%
rename from adala/gen.py
rename to adala/agents/gen.py
diff --git a/adala/labeler.py b/adala/agents/labeler.py
similarity index 100%
rename from adala/labeler.py
rename to adala/agents/labeler.py
diff --git a/adala/main.py b/adala/agents/main.py
similarity index 100%
rename from adala/main.py
rename to adala/agents/main.py
diff --git a/adala/prompts/analyst.txt b/adala/agents/prompts/analyst.txt
similarity index 100%
rename from adala/prompts/analyst.txt
rename to adala/agents/prompts/analyst.txt
diff --git a/adala/prompts/engineer_human_message.txt b/adala/agents/prompts/engineer_human_message.txt
similarity index 100%
rename from adala/prompts/engineer_human_message.txt
rename to adala/agents/prompts/engineer_human_message.txt
diff --git a/adala/prompts/engineer_system_message.txt b/adala/agents/prompts/engineer_system_message.txt
similarity index 100%
rename from adala/prompts/engineer_system_message.txt
rename to adala/agents/prompts/engineer_system_message.txt
diff --git a/adala/prompts/labeler.txt b/adala/agents/prompts/labeler.txt
similarity index 100%
rename from adala/prompts/labeler.txt
rename to adala/agents/prompts/labeler.txt
diff --git a/adala/labeling/__init__.py b/adala/labelers/__init__.py
similarity index 100%
rename from adala/labeling/__init__.py
rename to adala/labelers/__init__.py
diff --git a/adala/labelers/base.py b/adala/labelers/base.py
new file mode 100644
index 00000000..5d93ee7e
--- /dev/null
+++ b/adala/labelers/base.py
@@ -0,0 +1,117 @@
+import pandas as pd
+import difflib
+import json
+
+from pathlib import Path
+from abc import ABC, abstractmethod
+
+from pydantic import BaseModel, root_validator
+from typing import Optional, List, Dict
+from langchain import PromptTemplate, OpenAI, LLMChain
+from langchain.llms import BaseLLM
+from langchain.chat_models import ChatOpenAI
+from langchain.prompts import HumanMessagePromptTemplate, ChatPromptTemplate
+
+
+class Labeler(BaseModel, ABC):
+    """
+    Base class for labelers.
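+    Labelers map input records to label predictions given an instruction.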
+ """ + pass + + +class LLMLabeler(Labeler): + """ + Base class for LLMLabeler that use Large Language Models (LLMs) + to generate label predictions given text instructions: + label = LLM(sample, instructions) + """ + + @abstractmethod + def label_string(self, input_string: str, instruction: str, labels: List[str]) -> str: + """ + Label a string with LLM given instruction: + label = LLM(input_string, instruction) + """ + + @abstractmethod + def label_row(self, row: pd.Series, instruction: str, labels: List[str]) -> str: + """ + Label a single row from a pandas DataFrame. + To be used with pandas.DataFrame.apply: + df.apply(func=label_row, axis=1, instruction="Select 'A' or 'B'", labels=['A', 'B']) + """ + + @abstractmethod + def label( + self, + df: pd.DataFrame, + instruction: str, + labels: List[str], + output_column: str = 'predictions' + ) -> pd.DataFrame: + """ + Label all rows from a pandas DataFrame. + """ + + +class LangChainLabeler(LLMLabeler): + model_name: str = 'gpt-3.5-turbo' + temperature: float = 0 + llm: Optional[BaseLLM] = None + llm_chain: Optional[LLMChain] = None + prompt: Optional[str] = None + verbose: bool = False + + @root_validator + def initialize_llm(cls, values): + if values.get('prompt') is None: + default_file = Path(__file__).parent / 'prompts' / 'simple_classification.txt' + with open(default_file, 'r') as f: + values['prompt'] = f.read() + if values.get('llm') is None: + values['llm'] = ChatOpenAI( + model_name=values['model_name'], + temperature=values['temperature'] + ) + if values.get('llm_chain') is None: + prompt = HumanMessagePromptTemplate(prompt=PromptTemplate.from_template(values['prompt'])) + values['llm_chain'] = LLMChain( + llm=values['llm'], + prompt=ChatPromptTemplate.from_messages([prompt]), + verbose=values['verbose'] + ) + return values + + def label_string(self, input_string: str, instruction: str, labels: List[str]): + prediction = self.llm_chain.run( + record=input_string, + instructions=instruction, + labels=str(labels) + ) + # match prediction to actual labels + scores = list(map(lambda l: difflib.SequenceMatcher(None, prediction, l).ratio(), labels)) + return labels[scores.index(max(scores))] + + def label_row(self, row: pd.Series, instruction: str, labels: List[str]) -> str: + return self.label_string(input_string=row.to_json(), instruction=instruction, labels=labels) + + def label( + self, + df: pd.DataFrame, + instruction: str, + labels: List[str], + output_column: str = 'predictions' + ) -> pd.DataFrame: + + predictions = df.apply( + func=self.label_row, + axis=1, + instruction=instruction, + labels=labels, + ) + return df.assign(**{output_column: predictions}) + + +class OpenAILabeler(LangChainLabeler): + model_name = 'gpt-3.5-turbo' diff --git a/adala/predictors/prompts/simple_classification.txt b/adala/labelers/prompts/simple_classification.txt similarity index 100% rename from adala/predictors/prompts/simple_classification.txt rename to adala/labelers/prompts/simple_classification.txt diff --git a/adala/labeling/human.py b/adala/labeling/human.py deleted file mode 100644 index c49c55ec..00000000 --- a/adala/labeling/human.py +++ /dev/null @@ -1,12 +0,0 @@ -import pandas as pd - - -def human_in_the_loop( - df: pd.DataFrame, - label_studio_project_id: int, - output_column: str = 'predictions' -) -> pd.DataFrame: - """ - Auto-annotate a pandas DataFrame with human-in-the-loop labeling from Label Studio. 
- """ - pass \ No newline at end of file diff --git a/adala/optimizers/__init__.py b/adala/optimizers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/adala/instructions/optimizer.py b/adala/optimizers/llm_instructions.py similarity index 88% rename from adala/instructions/optimizer.py rename to adala/optimizers/llm_instructions.py index f2c018ed..9cf0d048 100644 --- a/adala/instructions/optimizer.py +++ b/adala/optimizers/llm_instructions.py @@ -8,16 +8,16 @@ from langchain.chat_models import ChatOpenAI from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate from langchain import PromptTemplate, OpenAI, LLMChain -from adala.predictors import LLMPredictor +from adala.labelers.base import LLMLabeler logger = logging.getLogger(__name__) -def calc_fitness(predictor: LLMPredictor, records, df, labels, ground_truth_column, sample_size=5, top_n=5): +def calc_fitness(labeler: LLMLabeler, records, df, labels, ground_truth_column, sample_size=5, top_n=5): df = df.sample(n=sample_size, axis=0) output_records = deepcopy(records) for record in output_records: - df_pred = predictor.predict( + df_pred = labeler.label( df=df.drop(columns=[ground_truth_column]), instruction=record['instruction'], labels=labels, @@ -99,21 +99,27 @@ def adapt(current_instruction, errors, labels): @dataclass class GenerateInstructionResult: """Result of the generate_instruction()""" - instruction: str + best_instruction: str benchmark_table: pd.DataFrame + labels: List[str] def generate_instruction( - predictor: LLMPredictor, + labeler: LLMLabeler, df: pd.DataFrame, ground_truth_column: str, initial_instructions: List = None, - num_generations=10, - top_instructions=5, - validation_sample_size=5, + num_generations: int = 10, + top_instructions: int = 5, + validation_sample_size: int = 5, + human_in_the_loop: bool = False, + label_studio_project_id: int = None, + label_studio_api_token: str = None, + label_studio_host: str = None, ) -> GenerateInstructionResult: """Optimize the instruction for the LLM.""" + initial_instructions = initial_instructions or [''] records = [ { 'instruction': instruction, @@ -130,7 +136,7 @@ def generate_instruction( # calculate fitness value and corresponding errors logger.info(f'Calculating fitness for {len(records)} instructions') records = calc_fitness( - predictor=predictor, + labeler=labeler, records=records, df=df, labels=labels, @@ -164,12 +170,13 @@ def generate_instruction( f'{pd.DataFrame.from_records(records)[["id", "instruction", "accuracy", "examples_seen"]]}') # calculate fitness on final results - fitness = calc_fitness(predictor, records, df, labels, ground_truth_column, validation_sample_size, top_instructions) + fitness = calc_fitness(labeler, records, df, labels, ground_truth_column, validation_sample_size, top_instructions) benchmark_table = pd.DataFrame.from_records(fitness)[["id", "instruction", "accuracy", "examples_seen"]] logger.info(f'Final results:\n{benchmark_table}') return GenerateInstructionResult( - instruction=fitness[0]['instruction'], - benchmark_table=benchmark_table + best_instruction=fitness[0]['instruction'], + benchmark_table=benchmark_table, + labels=labels ) diff --git a/adala/predictors/__init__.py b/adala/predictors/__init__.py deleted file mode 100644 index 5a07b66a..00000000 --- a/adala/predictors/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .base import LLMPredictor \ No newline at end of file diff --git a/adala/predictors/base.py b/adala/predictors/base.py deleted file mode 
100644 index b8e093da..00000000 --- a/adala/predictors/base.py +++ /dev/null @@ -1,96 +0,0 @@ -import pandas as pd -import difflib -import json - -from pathlib import Path -from abc import ABC, abstractmethod -from pydantic import BaseModel, validator, root_validator -from typing import Optional, List -from langchain import PromptTemplate, OpenAI, LLMChain -from langchain.llms import BaseLLM - - -class Predictor(BaseModel, ABC): - """ - Base class for predictors. - """ - pass - - -class LLMPredictor(Predictor): - """ - Base class for LLMPredictors that use Large Language Models (LLMs) - to make sample predictions given text instructions: - prediction = LLM(sample, instructions) - """ - - @abstractmethod - def predict_row(self, row: pd.Series, instruction: str, labels: List[str]) -> str: - """ - Predict a single row from a pandas DataFrame. - To be used with pandas.DataFrame.apply: - df.apply(func=predict_row, axis=1, instruction=instruction) - """ - - @abstractmethod - def predict( - self, - df: pd.DataFrame, - instruction: str, - labels: List[str], - output_column: str = 'predictions' - ) -> pd.DataFrame: - """ - Predict all rows from a pandas DataFrame. - """ - - -class LangChainLLMPredictor(LLMPredictor): - llm: Optional[BaseLLM] = None - llm_chain: Optional[LLMChain] = None - prompt_template: Optional[str] = None - verbose: bool = False - - @root_validator - def initialize_llm(cls, values): - if values.get('prompt_template') is None: - default_file = Path(__file__).parent / 'prompts' / 'simple_classification.txt' - with open(default_file, 'r') as f: - values['prompt_template'] = f.read() - if values.get('llm') is None: - values['llm'] = OpenAI(model_name='text-davinci-003', temperature=0) - if values.get('llm_chain') is None: - values['llm_chain'] = LLMChain( - llm=values['llm'], - prompt=values['prompt_template'], - verbose=values['verbose'] - ) - return values - - def predict_row(self, row: pd.Series, instruction: str, labels: List[str]) -> str: - row_dict = row.to_dict() - prediction = self.llm_chain.predict( - record=json.dumps(row_dict), - instructions=instruction, - labels=str(labels) - ) - # match prediction to labels - scores = list(map(lambda l: difflib.SequenceMatcher(None, prediction, l).ratio(), labels)) - safe_prediction = labels[scores.index(max(scores))] - return safe_prediction - - def predict( - self, - df: pd.DataFrame, - instruction: str, - labels: List[str], - output_column: str='predictions' - ) -> pd.DataFrame: - - predictions = df.apply( - func=self.predict_row, - axis=1, - instructions=instruction, - labels=labels, - ) - return df.assign(**{output_column: predictions}) From 000bd77afafd42d63ef5346abd95028b2944db25 Mon Sep 17 00:00:00 2001 From: nik Date: Tue, 5 Sep 2023 23:37:20 +0100 Subject: [PATCH 10/14] Add progress bar --- adala/labelers/base.py | 5 +++-- adala/optimizers/llm_instructions.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/adala/labelers/base.py b/adala/labelers/base.py index 5d93ee7e..84c27974 100644 --- a/adala/labelers/base.py +++ b/adala/labelers/base.py @@ -4,7 +4,7 @@ from pathlib import Path from abc import ABC, abstractmethod - +from tqdm import tqdm from pydantic import BaseModel, root_validator from typing import Optional, List, Dict from langchain import PromptTemplate, OpenAI, LLMChain @@ -104,7 +104,8 @@ def label( output_column: str = 'predictions' ) -> pd.DataFrame: - predictions = df.apply( + tqdm.pandas(desc='Labeling') + predictions = df.progress_apply( func=self.label_row, axis=1, 
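+            # pandas forwards the remaining keyword arguments (instruction, labels) to label_row for each row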
instruction=instruction, diff --git a/adala/optimizers/llm_instructions.py b/adala/optimizers/llm_instructions.py index 9cf0d048..0d1c1663 100644 --- a/adala/optimizers/llm_instructions.py +++ b/adala/optimizers/llm_instructions.py @@ -100,7 +100,7 @@ def adapt(current_instruction, errors, labels): class GenerateInstructionResult: """Result of the generate_instruction()""" best_instruction: str - benchmark_table: pd.DataFrame + benchmark: pd.DataFrame labels: List[str] From 178f3782adb69fcfdae22210c7f5de5c312f319f Mon Sep 17 00:00:00 2001 From: nik Date: Tue, 5 Sep 2023 23:41:00 +0100 Subject: [PATCH 11/14] Compute final result on whole df --- adala/optimizers/llm_instructions.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/adala/optimizers/llm_instructions.py b/adala/optimizers/llm_instructions.py index 0d1c1663..ad6eec44 100644 --- a/adala/optimizers/llm_instructions.py +++ b/adala/optimizers/llm_instructions.py @@ -170,13 +170,21 @@ def generate_instruction( f'{pd.DataFrame.from_records(records)[["id", "instruction", "accuracy", "examples_seen"]]}') # calculate fitness on final results - fitness = calc_fitness(labeler, records, df, labels, ground_truth_column, validation_sample_size, top_instructions) - benchmark_table = pd.DataFrame.from_records(fitness)[["id", "instruction", "accuracy", "examples_seen"]] + records = calc_fitness( + labeler=labeler, + records=records, + df=df, + labels=labels, + ground_truth_column=ground_truth_column, + sample_size=len(df), + top_n=top_instructions, + ) + benchmark_table = pd.DataFrame.from_records(records)[["id", "instruction", "accuracy", "examples_seen"]] logger.info(f'Final results:\n{benchmark_table}') return GenerateInstructionResult( - best_instruction=fitness[0]['instruction'], - benchmark_table=benchmark_table, + best_instruction=records[0]['instruction'], + benchmark=benchmark_table, labels=labels ) From 6c2454cb176c1e8601106c60b2784c21404066b0 Mon Sep 17 00:00:00 2001 From: nik Date: Fri, 6 Oct 2023 14:46:04 +0100 Subject: [PATCH 12/14] Add guidance support --- adala/datasets/__init__.py | 0 adala/datasets/base.py | 8 +++ adala/datasets/pd_dataframe.py | 7 ++ adala/labelers/base.py | 102 +++++++++++++++++++++------ adala/optimizers/llm_instructions.py | 71 ++++++++++++------- requirements.txt | 1 + 6 files changed, 142 insertions(+), 47 deletions(-) create mode 100644 adala/datasets/__init__.py create mode 100644 adala/datasets/base.py create mode 100644 adala/datasets/pd_dataframe.py diff --git a/adala/datasets/__init__.py b/adala/datasets/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/adala/datasets/base.py b/adala/datasets/base.py new file mode 100644 index 00000000..3b8f2a33 --- /dev/null +++ b/adala/datasets/base.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel + + +class Dataset(BaseModel): + """ + Base class for datasets. 
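+    Concrete datasets wrap raw data sources such as pandas DataFrames.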
+ """ + pass \ No newline at end of file diff --git a/adala/datasets/pd_dataframe.py b/adala/datasets/pd_dataframe.py new file mode 100644 index 00000000..33964e33 --- /dev/null +++ b/adala/datasets/pd_dataframe.py @@ -0,0 +1,7 @@ +import pandas as pd + +from .base import Dataset + + +class PandasDataframe(Dataset): + pass diff --git a/adala/labelers/base.py b/adala/labelers/base.py index 84c27974..ae6dcc12 100644 --- a/adala/labelers/base.py +++ b/adala/labelers/base.py @@ -1,6 +1,9 @@ import pandas as pd import difflib import json +import re +import openai +import guidance from pathlib import Path from abc import ABC, abstractmethod @@ -10,7 +13,7 @@ from langchain import PromptTemplate, OpenAI, LLMChain from langchain.llms import BaseLLM from langchain.chat_models import ChatOpenAI -from langchain.prompts import HumanMessagePromptTemplate, ChatPromptTemplate +from langchain.prompts import HumanMessagePromptTemplate, SystemMessagePromptTemplate, ChatPromptTemplate class Labeler(BaseModel, ABC): @@ -26,9 +29,11 @@ class LLMLabeler(Labeler): to generate label predictions given text instructions: label = LLM(sample, instructions) """ + prediction_column: str = 'predictions' + score_column: str = 'score' @abstractmethod - def label_string(self, input_string: str, instruction: str, labels: List[str]) -> str: + def label_string(self, input_string: str, instruction: str, labels: List[str]) -> Dict: """ Label a string with LLM given instruction: label = LLM(input_string, instruction) @@ -47,28 +52,74 @@ def label( self, df: pd.DataFrame, instruction: str, - labels: List[str], - output_column: str = 'predictions' + labels: List[str] ) -> pd.DataFrame: """ Label all rows from a pandas DataFrame. """ +class OpenAILabeler(LLMLabeler): + model_name: str = 'gpt-3.5-turbo-instruct' + temperature: float = 0 + prompt_template: str = '{{instruction}}\nInput: {{input}}\nOutput: {{select "output" options=labels logprobs="logprobs"}}' + verbose: bool = False + + _llm = None + + @root_validator + def initialize_llm(cls, values): + values['_llm'] = guidance( + template=values.get('prompt_template'), + llm=guidance.llms.OpenAI(values.get('model_name')), + silent=not values.get('verbose') + ) + return values + + def label_string(self, input_string: str, instruction: str, labels: Optional[List[str]] = None) -> Dict: + result = self._llm(input=input_string, instruction=instruction, labels=labels) + return { + self.prediction_column: result['output'], + self.score_column: result['logprobs'][result['output']] + } + + def label_row(self, row: pd.Series, instruction: str, labels: List[str]) -> Dict: + return self.label_string(row.to_json(force_ascii=False), instruction, labels) + + def label( + self, + df: pd.DataFrame, + instruction: str, + labels: List[str], + output_column: str = 'predictions' + ) -> pd.DataFrame: + tqdm.pandas(desc='Labeling') + df[[self.prediction_column, self.score_column]] = df.progress_apply( + func=self.label_row, + axis=1, + result_type='expand', + instruction=instruction, + labels=labels, + ) + return df + + class LangChainLabeler(LLMLabeler): model_name: str = 'gpt-3.5-turbo' temperature: float = 0 llm: Optional[BaseLLM] = None llm_chain: Optional[LLMChain] = None prompt: Optional[str] = None + system_prompt: Optional[str] = None verbose: bool = False @root_validator def initialize_llm(cls, values): if values.get('prompt') is None: - default_file = Path(__file__).parent / 'prompts' / 'simple_classification.txt' - with open(default_file, 'r') as f: - values['prompt'] = f.read() + # 
default_file = Path(__file__).parent / 'prompts' / 'simple_classification.txt' + # with open(default_file, 'r') as f: + # values['prompt'] = f.read() + values['prompt'] = '{instructions}\n\nInput:\n{record}\n\nOutput:\n' if values.get('llm') is None: values['llm'] = ChatOpenAI( model_name=values['model_name'], @@ -76,31 +127,44 @@ def initialize_llm(cls, values): ) if values.get('llm_chain') is None: prompt = HumanMessagePromptTemplate(prompt=PromptTemplate.from_template(values['prompt'])) + messages = [prompt] + if values.get('system_prompt') is not None: + system_prompt = SystemMessagePromptTemplate(prompt=PromptTemplate.from_template(values['system_prompt'])) + messages.insert(0, system_prompt) values['llm_chain'] = LLMChain( llm=values['llm'], - prompt=ChatPromptTemplate.from_messages([prompt]), + prompt=ChatPromptTemplate.from_messages(messages=messages), verbose=values['verbose'] ) return values - def label_string(self, input_string: str, instruction: str, labels: List[str]): + def label_string(self, input_string: str, instruction: str, labels: Optional[List[str]] = None): prediction = self.llm_chain.run( record=input_string, instructions=instruction, - labels=str(labels) + # labels=str(labels) ) - # match prediction to actual labels - scores = list(map(lambda l: difflib.SequenceMatcher(None, prediction, l).ratio(), labels)) - return labels[scores.index(max(scores))] - - def label_row(self, row: pd.Series, instruction: str, labels: List[str]) -> str: - return self.label_string(input_string=row.to_json(), instruction=instruction, labels=labels) + if labels: + prediction = prediction.strip() + line_predictions = [] + # for line_prediction in prediction.split('\n'): + for line_prediction in re.split(r',|\n', prediction): + + # match prediction to actual labels + scores = list(map(lambda l: difflib.SequenceMatcher(None, line_prediction.strip(), l).ratio(), labels)) + line_prediction = labels[scores.index(max(scores))] + line_predictions.append(line_prediction) + prediction = ','.join(sorted(line_predictions)) + return prediction + + def label_row(self, row: pd.Series, instruction: str, labels: Optional[List[str]] = None) -> str: + return self.label_string(input_string=row.to_json(force_ascii=False), instruction=instruction, labels=labels) def label( self, df: pd.DataFrame, instruction: str, - labels: List[str], + labels: Optional[List[str]] = None, output_column: str = 'predictions' ) -> pd.DataFrame: @@ -112,7 +176,3 @@ def label( labels=labels, ) return df.assign(**{output_column: predictions}) - - -class OpenAILabeler(LangChainLabeler): - model_name = 'gpt-3.5-turbo' diff --git a/adala/optimizers/llm_instructions.py b/adala/optimizers/llm_instructions.py index ad6eec44..c9545623 100644 --- a/adala/optimizers/llm_instructions.py +++ b/adala/optimizers/llm_instructions.py @@ -16,20 +16,29 @@ def calc_fitness(labeler: LLMLabeler, records, df, labels, ground_truth_column, sample_size=5, top_n=5): df = df.sample(n=sample_size, axis=0) output_records = deepcopy(records) - for record in output_records: + for i, record in enumerate(output_records): df_pred = labeler.label( - df=df.drop(columns=[ground_truth_column]), + df=df.drop(columns=[ground_truth_column] + [r['id'] for r in output_records[:i]]), instruction=record['instruction'], - labels=labels, - output_column='predictions' + labels=labels ) - current_matches = (df_pred['predictions'] == df[ground_truth_column]).sum() + df_pred = pd.concat((df_pred, df[[ground_truth_column]]), axis=1) + current_matches = (df_pred['predictions'] == 
df_pred[ground_truth_column]).sum() + current_mismatches = (df_pred['predictions'] != df_pred[ground_truth_column]).sum() examples_seen = record['examples_seen'] total_examples_seen = examples_seen + sample_size # iterative formula for calculating accuracy record['accuracy'] = (examples_seen * record['accuracy'] + current_matches) / total_examples_seen record['examples_seen'] = total_examples_seen - record['errors'] = df_pred[df_pred['predictions'] != df[ground_truth_column]].to_json(orient='records') + record['errors'] = '\n'.join(df_pred[df_pred['predictions'] != df_pred[ground_truth_column]].apply( + lambda r: f'INPUT: {r.drop(["predictions", ground_truth_column]).to_json()}\n' + f'PREDICTED OUTPUT: {r["predictions"]}\n' + f'EXPECTED OUTPUT: {r[ground_truth_column]}', axis=1)) + record['mismatches'] = current_mismatches + + df[record['id']] = df_pred['predictions'] + + logger.info(f'Result df:\n{df}') sorted_results = sorted(output_records, key=lambda x: x['accuracy'], reverse=True) best_results = sorted_results[:top_n] @@ -62,24 +71,32 @@ def adapt(current_instruction, errors, labels): system_message_prompt = SystemMessagePromptTemplate.from_template('''\ Act as an 'Instruction Tuner' for the LLM. You will be given the inputs: -- The [CURRENT INSTRUCTION] used to guide the LLM's classification +- The [CURRENT INSTRUCTION] used to guide the LLM's classification, including specific examples with ground truth labels. - Target set of [LABELS] for the dataset in question. - [CURRENT ERRORS] that emerged when this instruction was applied to a dataset. -The ERRORS presented in JSON format, which contain the ground_truth label, \ -the predictions label, and the input data in one or more columns. \ -Here's an example format for the errors: +The current errors are presented in the following format: +INPUT: [input text] +PREDICTED OUTPUT: [predicted label] +EXPECTED OUTPUT: [ground truth label] -```json -[{{"ground_truth":"...","predictions":"...", "input_text": "...", "other": "data", ...}}, {{"ground_truth":"...","predictions":"...", "input_text": "...", "other": "data", ...}}, ...] -``` +Carefully analyze these errors and craft a revised instruction for the LLM to fit the expected outputs. \ +Include 2-3 examples at the end of your response to demonstrate how the new instruction would be applied. \ +Use the following format for your examples: -Analyze these inputs and craft a revised instruction for the LLM, aiming to enhance classification accuracy for the dataset in question. Deliver your response as the refined instruction. -''') +INPUT: [input text] +OUTPUT: [expected output label] + +Use specific error examples and generalize them to address any observed errors that may occur in the future. +Deliver your response as the refined instruction.''') +#Analyze these inputs and craft a revised instruction for the LLM, aiming to enhance classification accuracy for the dataset in question. Deliver your response as the refined instruction. 
+#''') human_message_prompt = HumanMessagePromptTemplate.from_template('''\ -CURRENT INSTRUCTION: "{instruction}" +CURRENT INSTRUCTION: {instruction} LABELS: {labels} -CURRENT ERRORS: {errors} +CURRENT ERRORS: + +{errors} New refined instruction: ''') @@ -87,7 +104,7 @@ def adapt(current_instruction, errors, labels): system_message_prompt, human_message_prompt ]) - chain = LLMChain(llm=llm, prompt=chat_prompt) + chain = LLMChain(llm=llm, prompt=chat_prompt, verbose=False) new_instructions = chain.run( instruction=current_instruction, errors=errors, @@ -107,10 +124,10 @@ class GenerateInstructionResult: def generate_instruction( labeler: LLMLabeler, df: pd.DataFrame, - ground_truth_column: str, + ground_truth_column: str = 'ground_truth', initial_instructions: List = None, num_generations: int = 10, - top_instructions: int = 5, + top_instructions: int = 3, validation_sample_size: int = 5, human_in_the_loop: bool = False, label_studio_project_id: int = None, @@ -120,6 +137,7 @@ def generate_instruction( """Optimize the instruction for the LLM.""" initial_instructions = initial_instructions or [''] + df = df.dropna(subset=[ground_truth_column]) records = [ { 'instruction': instruction, @@ -132,6 +150,7 @@ def generate_instruction( for instruction in initial_instructions ] labels = df[ground_truth_column].unique().tolist() + # labels = None for generation in range(num_generations): # calculate fitness value and corresponding errors logger.info(f'Calculating fitness for {len(records)} instructions') @@ -145,8 +164,12 @@ def generate_instruction( top_n=top_instructions, ) + logger.info( + f'Results of {generation} generations:\n' + f'{pd.DataFrame.from_records(records)[["id", "instruction", "accuracy", "examples_seen", "mismatches"]]}') + # mutate the best instructions with accuracy<100% based on errors - best_results_with_errors = next((x for x in records if x['accuracy'] < 1), None) + best_results_with_errors = next((x for x in records if x['mismatches'] > 0), None) if not best_results_with_errors: # TODO: change this to a more sophisticated mutation logger.info(f'All instructions have 100% accuracy. 
Mutating the best instruction {records[0]["id"]}...')
             new_instruction = regularize(records[0]['instruction'])
         else:
             logger.info(f'Adapting the instruction {best_results_with_errors["id"]}...')
             new_instruction = adapt(best_results_with_errors['instruction'], best_results_with_errors['errors'], labels)
 
@@ -165,12 +188,6 @@ def generate_instruction(
             'id': uuid4().hex[:4]
         }]
 
-        logger.info(
-            f'Results of {generation} generation:\n'
-            f'{pd.DataFrame.from_records(records)[["id", "instruction", "accuracy", "examples_seen"]]}')
-
     # calculate fitness on final results
     records = calc_fitness(
         labeler=labeler,
@@ -179,7 +198,7 @@ def generate_instruction(
         sample_size=len(df),
         top_n=top_instructions,
     )
-    benchmark_table = pd.DataFrame.from_records(records)[["id", "instruction", "accuracy", "examples_seen"]]
+    benchmark_table = pd.DataFrame.from_records(records)[["id", "instruction", "accuracy", "examples_seen", "mismatches"]]
     logger.info(f'Final results:\n{benchmark_table}')
 
     return GenerateInstructionResult(
diff --git a/requirements.txt b/requirements.txt
index 04207c45..afa72844 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 pandas
 openai
 langchain
+guidance
 label-studio-sdk @ git+https://github.com/HumanSignal/label-studio-sdk.git@pd-support
\ No newline at end of file

From beeb2bdc9c94e623d55906b16d0badaf236c14eb Mon Sep 17 00:00:00 2001
From: Michael Malyuk
Date: Tue, 17 Oct 2023 09:09:19 -0700
Subject: [PATCH 13/14] Adding basic structure for the FastAPI server and
 command line

---
 api.py             | 44 ++++++++++++++++++++++
 data_processing.py | 47 ++++++++++++++++++++++++
 database.py        | 70 ++++++++++++++++++++++++++++++++++
 main.py            | 84 ++++++++++++++++++++++++++++++++++++++++
 server.py          | 19 +++++++++
 utility.py         | 33 ++++++++++++++++
 6 files changed, 297 insertions(+)
 create mode 100644 api.py
 create mode 100644 data_processing.py
 create mode 100644 database.py
 create mode 100644 main.py
 create mode 100644 server.py
 create mode 100644 utility.py

diff --git a/api.py b/api.py
new file mode 100644
index 00000000..0e24ff4e
--- /dev/null
+++ b/api.py
@@ -0,0 +1,44 @@
+
from fastapi import FastAPI, UploadFile, File, HTTPException
from data_processing import DataProcessing

# the FastAPI application served by server.py
app = FastAPI()


@app.post("/upload/")
async def upload_api(file: UploadFile = File(...)):
    with open(f"/temp/path/{file.filename}", "wb") as buffer:
        buffer.write(file.file.read())
    data_processing = DataProcessing()
    return data_processing.send(f"/temp/path/{file.filename}")

@app.post("/learn/")
async def learn_api():
    # ... (similar structure to upload_api)
    pass

@app.post("/predict/")
async def predict_api(input: UploadFile = File(...), output: str = None):
    # ... (similar structure to upload_api)
    pass

@app.post("/add-skill/")
async def add_skill_api():
    # ... (similar structure to upload_api)
    pass

@app.get("/list-skills/")
def list_skills_api():
    # ... (Endpoint logic)
    pass

@app.get("/list-runtimes/")
def list_agents_api():
    # ... (Endpoint logic)
    pass

@app.get("/metrics/")
def get_metrics_api():
    # ... (Endpoint logic)
    pass

@app.get("/info/")
def info_api():
    # ... (Endpoint logic)
    pass

@app.get("/logs/")
def logs_api():
    # ... (Endpoint logic)
    pass

diff --git a/data_processing.py b/data_processing.py
new file mode 100644
index 00000000..e4ba3f1c
--- /dev/null
+++ b/data_processing.py
@@ -0,0 +1,47 @@
+
class DataProcessing:
    """
    A class used to process data, including sending and predicting operations.
    """

    def send(self, file_path=None):
        """
        Processes the specified file for sending operations.

        Parameters:
        ----------
        file_path : str, optional
            The path to the file to be processed. Default is None.

        Returns:
        -------
        dict
            A dictionary containing the processing message or an error message.
        """

        if file_path:
            return {"message": f"File {file_path} processed."}
        else:
            return {"error": "File path not provided"}

    def predict(self, input_path=None, output_path=None):
        """
        Processes the input file for prediction operations and saves the results to the output path.

        Parameters:
        ----------
        input_path : str, optional
            The path to the input file. Default is None.
        output_path : str, optional
            The path where prediction results should be saved. Default is None.

        Returns:
        -------
        dict
            A dictionary containing the prediction results or a processing message.
        """

        if input_path and output_path:
            return {"message": f"Predictions from {input_path} saved to {output_path}."}
        else:
            return {"predictions": "Sample prediction data"}
diff --git a/database.py b/database.py
new file mode 100644
index 00000000..00da7fe4
--- /dev/null
+++ b/database.py
@@ -0,0 +1,70 @@
+
import sqlite3


class Database:
    """
    A class used to manage SQLite database operations.

    Attributes:
    ----------
    connection : sqlite3.Connection
        An SQLite connection object to the database.
    cursor : sqlite3.Cursor
        A cursor object to execute SQL queries.
    """

    def __init__(self, db_path=':memory:'):
        """
        Initializes the Database class with a connection to the specified SQLite database.

        Parameters:
        ----------
        db_path : str
            The path to the SQLite database file. Default is in-memory database.
        """
        self.db_path = db_path
        self.connection = None
        self.cursor = None

    def connect(self):
        self.connection = sqlite3.connect(self.db_path)
        self.cursor = self.connection.cursor()
        return self.connection

    def initialize_metrics_table(self):
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS metrics (
                id INTEGER PRIMARY KEY,
                uptime INTEGER,
                total_requests INTEGER,
                total_processing_time INTEGER,
                labeled_data_points INTEGER,
                active_agents TEXT
            )
        """)
        # Insert default metrics if the table is empty
        self.cursor.execute("INSERT INTO metrics (uptime, total_requests, total_processing_time, labeled_data_points, active_agents) SELECT 0,0,0,0,'engineer,analyst' WHERE NOT EXISTS (SELECT 1 FROM metrics)")
        self.connection.commit()

    def get_metrics(self):
        self.cursor.execute("SELECT * FROM metrics ORDER BY id DESC LIMIT 1")
        return self.cursor.fetchone()

    def update_metrics(self, **kwargs):
        # Sample update operation, can be extended based on specific requirements
        for key, value in kwargs.items():
            self.cursor.execute(f"UPDATE metrics SET {key} = {key} + ? WHERE id = 1", (value,))
        self.connection.commit()

    def close(self):
        if self.connection:
            self.connection.close()
diff --git a/main.py b/main.py
new file mode 100644
index 00000000..763182d4
--- /dev/null
+++ b/main.py
@@ -0,0 +1,84 @@
+
import argparse

from server import Server
from data_processing import DataProcessing
from utility import Utility


parser = argparse.ArgumentParser(description="ADALA Command Line Interface")
subparsers = parser.add_subparsers(dest='command')
server = Server()
data_processing = DataProcessing()
utility = Utility()

# Server
server_parser = subparsers.add_parser('server')
server_parser.add_argument('--port', type=int, default=8000)
server_parser.set_defaults(func=server.start)

# Restart
restart_parser = subparsers.add_parser('restart')
restart_parser.add_argument('--port', type=int, default=8000)
restart_parser.set_defaults(func=server.restart)

# Shutdown
shutdown_parser = subparsers.add_parser('shutdown')
shutdown_parser.set_defaults(func=server.shutdown)

# Send data
send_parser = subparsers.add_parser('send')
send_parser.add_argument('--file', required=True, help='Path to data file')
# adapt the argparse Namespace to the DataProcessing API
send_parser.set_defaults(func=lambda args: print(data_processing.send(args.file)))

# Predict
predict_parser = subparsers.add_parser('predict')
predict_parser.add_argument('--file', required=True, help='Path to test data file')
predict_parser.add_argument('--output', required=True, help='Path to save the output')
predict_parser.set_defaults(func=lambda args: print(data_processing.predict(args.file, args.output)))

# List models
list_models_parser = subparsers.add_parser('list-models')
list_models_parser.set_defaults(func=utility.list_models)

# List agents
list_agents_parser = subparsers.add_parser('list-agents')
list_agents_parser.set_defaults(func=utility.list_agents)

# Logs
logs_parser = subparsers.add_parser('logs')
logs_parser.add_argument('--tail', type=int, default=10)
logs_parser.set_defaults(func=utility.logs)

# Metrics
metrics_parser = subparsers.add_parser('metrics')
metrics_parser.set_defaults(func=utility.metrics)

# Help
help_parser = subparsers.add_parser('help')
help_parser.add_argument('command', type=str, help='Command to get help for')
help_parser.set_defaults(func=utility.help)


if __name__ == "__main__":
    args = parser.parse_args()
    if hasattr(args, 'func'):
        args.func(args)
    else:
        parser.print_help()
diff --git a/server.py b/server.py
new file mode 100644
index 00000000..6f77c418
--- /dev/null
+++ b/server.py
@@ -0,0 +1,19 @@
+
import os
import signal
import threading
import time

import uvicorn

from api import app
from database import Database


class Server:
    def __init__(self):
        self.database = Database()

    def start(self, args):
        global server_thread
        self.database.connect()
        self.database.initialize_metrics_table()
        server_thread = threading.Thread(target=self.run_server, args=(args.port,))
        server_thread.start()

    def run_server(self, port):
        # serve the FastAPI app defined in api.py
        uvicorn.run(app, host='0.0.0.0', port=port)

    def restart(self, args):
        self.shutdown(None)
        time.sleep(2)
        self.start(args)

    def shutdown(self, args):
        os.kill(os.getpid(), signal.SIGINT)
diff --git a/utility.py b/utility.py
new file mode 100644
index 00000000..b9ccfc4f
--- /dev/null
+++ b/utility.py
@@ -0,0 +1,33 @@
+
from database import Database


class Utility:
    def __init__(self):
        self.database = Database()

    def list_models(self, args):
        # Placeholder logic to list models
        print("List of available models:\n- Model1\n- Model2\n- Model3")

    def list_agents(self, args):
        # Placeholder logic to list agents
        print("List of available agents:\n- engineer\n- analyst\n- labeler")

    def logs(self, args):
        # Placeholder logic to display logs
        print(f"Displaying the last {args.tail} log entries...")

    def metrics(self, args):
        self.database.connect()
        metrics = self.database.get_metrics()

        if metrics:
            _, uptime, total_requests, total_processing_time, labeled_data_points, active_agents = metrics
            uptime_hours = uptime // 3600
            average_processing_time = (total_processing_time / total_requests) if total_requests > 0 else 0

            print(f"Server Uptime: {uptime_hours} hours")
            print(f"Total Requests Processed: {total_requests}")
            print(f"Average Processing Time: {average_processing_time:.2f} seconds")
            print(f"Total Labeled Data Points: {labeled_data_points}")
            print(f"Current Active Agents: {active_agents}")

        self.database.close()

    def help(self, args):
        # argparse exposes per-command help via '<prog> <command> --help'
        print(f"Run 'adala {args.command} --help' for details on that command.")
From aad3a0487bf54b8e12bca1d9e41f8c6e192cdecb Mon Sep 17 00:00:00 2001
From: Michael Malyuk
Date: Tue, 17 Oct 2023 09:12:49 -0700
Subject: [PATCH 14/14] Adding updated README and contributions guide

---
 CONTRIBUTION.md |  59 +++++++++
 README2.md      | 263 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 322 insertions(+)
 create mode 100644 CONTRIBUTION.md
 create mode 100644 README2.md

diff --git a/CONTRIBUTION.md b/CONTRIBUTION.md
new file mode 100644
index 00000000..50d07b82
--- /dev/null
+++ b/CONTRIBUTION.md
@@ -0,0 +1,59 @@
+# ADALA Project Contribution Guide: Agent and Skill Development

Thank you for your interest in contributing to the ADALA Project's agent development! The robustness and versatility of our system primarily stem from the diverse agents and skills we deploy. This guide focuses on agent-related contributions, highlighting the importance of domain and task specificity.

## Areas of Contribution:

### Diverse Skills Contributions:

ADALA welcomes agents equipped with a wide range of skills, each offering unique capabilities. From tasks such as classification, anomaly detection, and regression to specialized roles like sentiment analysis or recommendation systems, there's endless potential to broaden our agent spectrum. Skills designed for specific domains (like medical, finance, or nature) or tailored tasks within these areas can considerably amplify the system's efficacy.

### Extending Skills:

Start with the foundational Skill class and extend it so that Adala can acquire new skills. To understand the pattern, examine how the Classification or NamedEntity skills were implemented.

Example:

```python

```

### Domain-Specific Skills

Customize skills for particular domains to provide deeper insights and more actionable feedback.

Example:

```python
```

#### Guidelines for New Skills:

- Uniqueness: Focus on specificity. What unique problem does your skill resolve?
- Integration: Ensure your skill aligns well with the existing ADALA framework.
- Documentation: Offer comprehensive documentation and usage examples for your agent, plus a testing environment (with a ground truth dataset).
- Testing: Incorporate both unit and integration tests to guarantee seamless integration with the ADALA system.

### New Runtimes

Introduce innovative runtimes utilizing varying language models or even distinct model types for labeling tasks.
Enhancing current implementations through performance optimization or new feature introduction is also encouraged.

#### Adding a New Runtime:
To introduce a new runtime, adhere to the structure delineated by the Runtime abstract class. Below is a rudimentary example:

```python

```

## How to Contribute:

- Fork the Repository: Create a fork of the ADALA repository on your GitHub account.
- Clone, Branch, and Develop: Clone your fork, create a new branch for your contribution, and begin development.
- Test and Commit: After making your changes, test them thoroughly. Once you are satisfied, commit with an informative message.
- Push and Pull Request: Push your changes and open a pull request detailing your contribution's value.

## Code of Conduct:
While diverse contributions invigorate our project, it's paramount to sustain a harmonious and cooperative environment. Please adhere to our code of conduct.

## Questions or Discussions:
For inquiries or discussions concerning particular features, agents, or modifications, please open an issue. Your feedback propels the project's advancement.
diff --git a/README2.md b/README2.md
new file mode 100644
index 00000000..d879d16d
--- /dev/null
+++ b/README2.md
@@ -0,0 +1,263 @@
+# ADALA

Adala is an Autonomous DatA (Labeling) Agent framework.

Adala offers a robust framework for implementing agents specialized in
data processing, with a particular emphasis on diverse data labeling
tasks. These agents are autonomous, meaning they can independently
acquire one or more skills through iterative learning. This learning
process is influenced by their operating environment, observations,
and reflections. Users define the environment by providing a ground
truth dataset. Every agent learns and applies its skills in what we
refer to as a "runtime", which is essentially an LLM backend.

Adala is offered as an HTTP server: users can interact with it via the
command line or a RESTful API, and integrate its features directly
into Python notebooks or scripts. The self-learning mechanism
leverages Large Language Models (LLMs) from providers like OpenAI and
VertexAI.

### Why Choose Adala?

- **Specialized in Data Processing**: While our agents excel in diverse
  data labeling tasks, they can be tailored to a wide range of data
  processing needs.
- **Autonomous Learning**: Adala agents aren't just automated;
  they're intelligent. They iteratively and independently develop
  skills based on environment, observations, and reflections.
- **User-Centric Environment Setup**: You have control. Define your
  agent's learning environment simply by providing a ground truth
  dataset.
- **Optimized Runtime**: Our agents operate in a state-of-the-art
  runtime environment, essentially an LLM backend, ensuring efficiency
  and adaptability.
- **Extend to your domain**: Build custom agents and skills focused on
  your specific domain.

## Installation

Install ADALA:

```sh
git clone https://github.com/HumanSignal/ADALA.git
cd ADALA/
pip install -e .
```

If you're planning to use human-in-the-loop labeling, or need a
labeling tool to produce ground truth datasets, we suggest installing
Label Studio. Adala is made to support the Label Studio format right
out of the box.

```sh
pip install label-studio
```

## Quickstart

In this example, we will use ADALA as a standalone library directly
inside a Python notebook. You can open it in Colab right here.

```python
import pandas as pd

from adala import agents, datasets, memories, runtimes, skills

# this is the dataset with product reviews
filepath = ""

agent = agents.SingleShotAgent(
    dataset=datasets.PandasDataFrame(
        df=pd.read_csv(filepath, sep='\t', nrows=100)
    ),

    # create runtime
    runtimes={
        "openai": runtimes.OpenAIGPTRuntime(
            model_name='gpt-3.5-turbo-instruct'
        )
    },

    default_runtime="openai",

    # add agent memory
    memory=memories.FileMemory(
        filepath='long_term_memory.jsonl'
    ),

    skills={
        "classify": skills.ClassificationSkill(
            name='subjectivity_detection',
            description='Understanding subjective and objective statements from text.',
            instructions='Classify a product review as either expressing "Subjective" or "Objective" statements.'
        )
    }
)

step_result = agent.step()

while (step_result.artifact.experience.accuracy < 90 or
       step_result.artifact.experience.learnings is not None):
    print(step_result.summarize())
    step_result = agent.step()

print("Done!")
```

## Running ADALA Server

Initiate the Adala server. Note: Each agent operates as its own web server.

### Starting the Adala Server

```sh
# Start the Adala server on default port 8090
adala start
```

### Uploading Ground Truth Data

Before teaching skills to Adala, you need to set up the environment and upload data.

```sh
# Upload your dataset
adala upload --file sample_dataset_ground_truth.json
```

### Teaching Skills to Adala

Now, define and teach a new skill to Adala.

```sh
# Define a new skill for classifying objects
adala add-skill --name "Object Classification" --description "Classify text into categories." --instruction "Example: Label trees, cars, and buildings."
```

```sh
# Start the learning process
adala learn --skill "Object Classification" --continuous
```

### Monitoring Optimization

Track the progress of the optimization process.

```sh
# Check the optimization status
adala status
```

### Applying Skills and Predictions

You don't need to wait for optimization to finish. Instruct Adala to
apply its skills on new data outside the environment, turning Adala
into a prediction engine. If the predictions generated by the skill
are then verified by human validators or another supervision system,
this provides more ground truth data, enhancing the agent's skills.
Use the learned skills to generate predictions:

```sh
# Apply the 'Object Classification' skill on new data
adala predict --skill "Object Classification" --file sample_dataset_predict.json
```

### Review Metrics

Get insights into Adala's performance.

```sh
# View detailed metrics
adala metrics
```

## Executing ADALA Command Line

```sh
# Start the Adala server on default port 8090
adala start --port 8090 --config config.yml

# Upload your dataset
adala upload --file sample_dataset_ground_truth.json

# Define a new skill for classifying objects
adala add-skill --name "Object Classification" --description "Classify images into categories." --instruction "Example: Label trees, cars, and buildings."

# Start the learning process
adala learn --skill "Object Classification"

# Apply the 'Object Classification' skill on new data.
# If you do not pass a specific skill, every learned skill is applied
adala predict --skill "Object Classification" --file sample_dataset_predict.json

# Check the optimization status
adala status

# View detailed metrics
adala metrics

# Restart the Adala server
adala restart

# Shut down the Adala server
adala shutdown

# List all the skills
adala list-skills

# List all the runtimes
adala list-runtimes

# Retrieve raw logs
adala logs

# Provide help
adala help
```

## Contributing to ADALA

Dive into the heart of Adala by enhancing our Skills, optimizing
Runtimes, or pioneering new Agent Types. Whether you're crafting
nuanced tasks, refining computational environments, or sculpting
specialized agents for unique domains, your contributions will power
Adala's evolution. Join us in shaping the future of intelligent
systems and making Adala more versatile and impactful for users across
the globe.

Read more here.

## How ADALA compares to other agent libraries

## FAQ

- What is an agent?

  An agent is a set of skills plus the runtimes used to execute those
  skills. Each agent has its own unique environment (dataset) attached
  to it. You can define your own agent class with a set of skills
  unique to your domain.

## Interesting Stuff

A skill is a learned ability to solve a specific task. A skill gets
trained from the ground truth dataset.

One useful way to think about a task is step by step:

1. First step (what should happen: input => output)
2. Second step
3. Third step

This suggests a graph of your skills: the idea of a graph is the idea
of the dynamic development of skills. A large task can be split into
smaller ones, for example splitting email labeling into 10 categories,
with one skill per category.

The composability principle applies to such graphs: skills are
building blocks, and composing them interpolates knowledge and enables
you to connect A to B. This is one path toward reliable agents; a
rough sketch of the idea follows.
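The sketch below illustrates the composability idea by reusing the
`ClassificationSkill` API from the quickstart. The two-step email
example and the convention of feeding one skill's output into the next
are hypothetical, not part of the current API:

```python
from adala import skills

# Step 1: route each email into one of a few coarse categories.
categorize = skills.ClassificationSkill(
    name='email_category',
    description='Routing emails into coarse categories.',
    instructions='Classify the email as "Work", "Personal" or "Spam".'
)

# Step 2: a downstream skill consumes the category produced by step 1,
# so the two skills compose like building blocks in a graph.
prioritize = skills.ClassificationSkill(
    name='email_priority',
    description='Assigning urgency to an email given its category.',
    instructions='Given the email text and its category, label it as "Urgent" or "Can wait".'
)
```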