Rename context-leakage-team to prompt-leakage-probing (#27)

airtai · Nov 27, 2024 · efe9c28 · efe9c28
1 parent 6a60a47
commit efe9c28
Show file tree

Hide file tree

Showing 56 changed files with 242 additions and 218 deletions.
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
@@ -3,8 +3,7 @@
     "dockerComposeFile": [
         "./docker-compose.yml"
     ],
-    "service": "python-3.10-context_leakage_team",
-
+    "service": "python-3.10-prompt_leakage_probing",
     "secrets": {
         "OPENAI_API_KEY": {
             "description": "This key is optional and only needed if you are working on OpenAI-related code. Leave it blank if not required. You can always set it later as an environment variable in the codespace terminal."
@@ -17,7 +16,7 @@
         }
     },
     "shutdownAction": "stopCompose",
-    "workspaceFolder": "/workspaces/context_leakage_team",
+    "workspaceFolder": "/workspaces/prompt_leakage_probing",
     // "runArgs": [],
     "remoteEnv": {},
     "features": {

diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml
@@ -2,20 +2,20 @@ version: '3'
 
 services:
   # nosemgrep: yaml.docker-compose.security.writable-filesystem-service.writable-filesystem-service
-  python-3.10-context_leakage_team:
+  python-3.10-prompt_leakage_probing:
     image: mcr.microsoft.com/devcontainers/python:3.10
-    container_name: context_leakage_team-${USER}-python-3.10
+    container_name: prompt_leakage_probing-${USER}-python-3.10
     volumes:
-      - ../:/workspaces/context_leakage_team:cached
+      - ../:/workspaces/prompt_leakage_probing:cached
     command: sleep infinity
 
     env_file:
       - ./devcontainer.env
     security_opt:
       - no-new-privileges:true
     networks:
-      - context_leakage_team-network
+      - prompt_leakage_probing-network
 
 networks:
-  context_leakage_team-network:
-    name: context_leakage_team-${USER}-network
+  prompt_leakage_probing-network:
+    name: prompt_leakage_probing-${USER}-network
diff --git a/README.md b/README.md
@@ -1,88 +1,107 @@
-# context-leakage-team
+# Prompt Leakage Probing
 
 ## Overview
-This project focuses on testing machine learning models while ensuring data confidentiality and preventing context leakage. You will need to configure the tested model and manage confidential and non-confidential information before running the context leakage detection tool.
+
+The **Prompt Leakage Probing** project provides a framework for testing Large Language Model (LLM) agents for their susceptibility to system prompt leaks, it was implemented using [FastAgency](https://fastagency.ai/latest/) and [AutoGen](https://ag2.ai/). It currently implements two attack strategies:
+
+1. **Simple Attack**: Uses `PromptGeneratorAgent` and `PromptLeakageClassifierAgent` to attempt prompt extraction.
+2. **Base64 Attack**: Enables `PromptGeneratorAgent` to encode sensitive parts of the prompt in Base64 to bypass sensitive prompt detection.
 
 ## Prerequisites
 
-Before proceeding, ensure the following:
+Ensure you have the following installed:
+
+- Python >=3.10
 
-- Python >=3.10 installed
+Additionally, ensure that your `OPENAI_API_KEY` is exported to your environment.
 
 ## Setup Instructions
 
-### 1. Clone the Repository
+### 1. Install the Project
 
-\```bash
-git clone https://github.com/airtai/context-leakage-team.git
-cd context-leakage-team
-\```
+Clone the repository and install the dependencies:
 
-### 2. Install Dependencies
+```bash
+pip install ."[dev]"
+```
 
-Install the required Python packages:
+### 2. Run the Project Locally
 
-\```bash
-pip install ."[dev]"
-\```
+Start the application using the provided script:
+
+```bash
+./scripts/run_fastapi_locally.sh
+```
+
+This will start the [FastAgency](https://fastagency.ai/latest/) FastAPI provider and Mesop provider instances. You can then access the application through your browser.
+
+### 3. Use the Devcontainer (Optional)
+
+The project comes with **Devcontainers** configured, enabling a streamlined development environment setup if you use tools like Visual Studio Code with the Devcontainer extension.
+
+## Application Usage
+
+When you open the application in your browser, you'll first see the workflow selection screen.
+
+![Workflow selection](docs/docs/assets/img/workflow_selection.png?raw=true "Workflow selection")
+
+### Running the Tests
+
+After selecting **"Attempt to leak the prompt from selected LLM model"**, you will start a workflow for probing the LLM for prompt leakage. During this process, you will:
+
+1. Select the prompt leakage scenario you want to test.
+2. Choose the model you want to test.
+3. Specify the number of attempts to leak the prompt in the chat.
+
+![Test configuration](docs/docs/assets/img/configuring_testing.png?raw=true "Test configuration")
+
+The `PromptGeneratorAgent` will then generate adversarial prompts aimed at making the tested agent leak its prompt. After each response from the tested agent, the `PromptLeakageClassifierAgent` will analyze the response and report the level of prompt leakage.
+
+#### Prompt generation
 
-### 3. Configure the Model API
+In this step, the `PromptGeneratorAgent` generates adversarial prompts designed to elicit prompt leakage from the tested agent. The tested model is then probed with the generated prompt using a function call.
 
-Before running the tool, you must configure the model's API settings in `tested_model_config/tested_model_api_config.json`.
+![Prompt generation](docs/docs/assets/img/prompt_generation.png?raw=true "Prompt generation")
 
-Example JSON configuration:
+#### Tested agent response
 
-\```json
-{
-    "url": "your model url",
-    "token": "your token"
-}
-\```
+The tested agent's response is the returned value from the function call initiated by the `PromptGeneratorAgent`. This response represents how the agent reacts to the probing prompt and serves as a input for subsequent analysis.
 
-- `url`: The URL where your model is accessible (such as an API endpoint).
-- `token`: The access token required to authenticate requests to the model API.
+![Tested agent response](docs/docs/assets/img/tested_agent_response.png?raw=true "Tested agent response")
 
-### 4. Add Confidential and Non-Confidential Model Information
+#### Response classification
 
-- **Confidential Part**: Add the confidential information related to your tested model in the `tested_model_config/tested_model_confidential.md` file.
+The response is then passed to the `PromptLeakageClassifierAgent`, which evaluates it for signs of prompt leakage. This evaluation informs whether sensitive information from the original prompt has been exposed. Additionally, the response may guide further prompt refinement, enabling an iterative approach to testing the agent's resilience against prompt leakage attempts.
 
-- **Non-Confidential Part**: Add the non-confidential information of your tested model in the `tested_model_config/tested_model_non_confidential.md` file.
+![Response classification](docs/docs/assets/img/response_classification.png?raw=true "Response classification")
 
-### 5. Run the Context Leakage Detection Team
+All response classifications are saved as CSV files in the `reports` folder. These files contain the prompt, response, reasoning, and leakage level. They are used to display the reports flow, which we will now demonstrate.
 
-Once the model configuration is set and the confidential/non-confidential parts are in place, you can run the context leakage detection tool using the following command:
+### Displaying the Reports
 
-\```bash
-fastagency run context_leakage_team/context_leakage_team.py
-\```
+In the workflow selection screen, select **"Report on the prompt leak attempt"**.
+This workflow provides a detailed report for each prompt leakage scenario and model combination that has been tested.
 
-## Folder Structure
+![Report flow](docs/docs/assets/img/report_flow.png?raw=true "Report flow")
 
-This is the basic folder structure, most of the non important files are omitted for simplicity of the overview.
+## Tested Models
 
-\```
-├── tested_model_config
-│   ├── tested_model_api_config.json         # Model API configuration file
-│   ├── tested_model_confidential.md         # Confidential model information
-│   └── tested_model_non_confidential.md     # Non-confidential model information
-├── context_leakage_team
-│   └── context_leakage_team.py              # Script to run context leakage detection
-├── pyproject.toml                           # List of dependencies
-└── README.md                                # Project documentation
-\```
+The project includes three tested model endpoints that are started alongside the service. These endpoints are used to demonstrate the prompt leakage workflow and can be accessed through predefined routes. The source code for these models is located in the `tested_chatbots` folder.
 
-## Example Usage
+### Description of Tested Models
 
-1. Ensure that your `tested_model_api_config.json` is correctly configured with the model URL and token.
-2. Place the confidential and non-confidential information in their respective files.
-3. Run the context leakage detection:
+| **Model** | **Description**                                                                                          | **Endpoint**   |
+|-----------|----------------------------------------------------------------------------------------------------------|----------------|
+| Easy      | Uses a basic prompt without any hardening techniques. No canary words are included, and no LLM guardrail is applied. | `/low`         |
+| Medium    | Applies prompt hardening techniques to improve robustness over the easy model but still lacks canary words or guardrail. | `/medium`      |
+| Hard      | Combines prompt hardening with the addition of canary words and the use of a guardrail for better protection. | `/high`        |
 
-\```bash
-fastagency run context_leakage_team/context_leakage_team.py
-\```
+### Implementation Details
 
-This will analyze the model configuration for potential context leaks.
+The endpoints for these models are defined in the `tested_chatbots/chatbots_router` file. They are part of the FastAPI provider and are available under the following paths:
 
-## License
+- `/low`: Easy model endpoint.
+- `/medium`: Medium model endpoint.
+- `/high`: Hard model endpoint.
 
-Include your license information here.
+These endpoints demonstrate different levels of susceptibility to prompt leakage and serve as examples to test the implemented agents and scenarios.
diff --git a/context_leakage_team/workflow/__init__.py b/context_leakage_team/workflow/__init__.py
diff --git a/context_leakage_team/workflow/agents/__init__.py b/context_leakage_team/workflow/agents/__init__.py
diff --git a/context_leakage_team/workflow/scenarios/context_leak/__init__.py b/context_leakage_team/workflow/scenarios/context_leak/__init__.py
diff --git a/context_leakage_team/workflow/workflow.py b/context_leakage_team/workflow/workflow.py
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -6,7 +6,7 @@ WORKDIR /app
 RUN apt-get update && apt-get install -y --no-install-recommends nginx gettext \
     && rm -rf /var/lib/apt/lists/*
 
-COPY context_leakage_team /app/context_leakage_team
+COPY prompt_leakage_probing /app/prompt_leakage_probing
 
 COPY pyproject.toml README.md /app/
 COPY docker/content/* /app/

diff --git a/docker/content/run_fastagency.sh b/docker/content/run_fastagency.sh
@@ -25,15 +25,15 @@ cat /etc/nginx/conf.d/default.conf
 nginx -g "daemon off;" &
 
 # Run uvicorn server
-uvicorn context_leakage_team.deployment.main_1_fastapi:app --host 0.0.0.0 --port $FASTAPI_PORT > /dev/stdout 2>&1 &
+uvicorn prompt_leakage_probing.deployment.main_1_fastapi:app --host 0.0.0.0 --port $FASTAPI_PORT > /dev/stdout 2>&1 &
 
 # Run gunicorn server
 # Start multiple single-worker gunicorn instances on consecutive ports
 for ((i=1; i<$WORKERS+1; i++))
 do
 	PORT=$((MESOP_PORT + i))
     echo "Starting gunicorn on port $PORT"
-    gunicorn --workers=1 context_leakage_team.deployment.main_2_mesop:app --bind 0.0.0.0:$PORT > /dev/stdout 2>&1 &
+    gunicorn --workers=1 prompt_leakage_probing.deployment.main_2_mesop:app --bind 0.0.0.0:$PORT > /dev/stdout 2>&1 &
 done
 
 # Wait for all background processes

diff --git a/docs/docs/assets/img/configuring_testing.png b/docs/docs/assets/img/configuring_testing.png
diff --git a/docs/docs/assets/img/prompt_generation.png b/docs/docs/assets/img/prompt_generation.png
diff --git a/docs/docs/assets/img/report_flow.png b/docs/docs/assets/img/report_flow.png
diff --git a/docs/docs/assets/img/response_classification.png b/docs/docs/assets/img/response_classification.png
diff --git a/docs/docs/assets/img/tested_agent_response.png b/docs/docs/assets/img/tested_agent_response.png
diff --git a/docs/docs/assets/img/workflow_selection.png b/docs/docs/assets/img/workflow_selection.png
diff --git a/fly.toml b/fly.toml
@@ -1,9 +1,9 @@
-# fly.toml app configuration file generated for context_leakage_team
+# fly.toml app configuration file generated for prompt_leakage_probing
 #
 # See https://fly.io/docs/reference/configuration/ for information about how to use this file.
 #
 
-app = 'context-leakage-team'
+app = 'prompt-leakage-probing'
 primary_region = 'ams'
 
 [build]

diff --git a/context_leakage_team/__about__.py → prompt_leakage_probing/__about__.py b/context_leakage_team/__about__.py → prompt_leakage_probing/__about__.py
diff --git a/context_leakage_team/__init__.py → prompt_leakage_probing/__init__.py b/context_leakage_team/__init__.py → prompt_leakage_probing/__init__.py
diff --git a/context_leakage_team/deployment/__init__.py → ...pt_leakage_probing/deployment/__init__.py b/context_leakage_team/deployment/__init__.py → ...pt_leakage_probing/deployment/__init__.py
diff --git a/...leakage_team/deployment/main_1_fastapi.py → ...kage_probing/deployment/main_1_fastapi.py b/...leakage_team/deployment/main_1_fastapi.py → ...kage_probing/deployment/main_1_fastapi.py
@@ -20,4 +20,4 @@ def list_workflows() -> dict[str, Any]:
 
 
 # start the adapter with the following command
-# uvicorn context_leakage_team.deployment.main_1_fastapi:app --reload
+# uvicorn prompt_leakage_probing.deployment.main_1_fastapi:app --reload
diff --git a/...t_leakage_team/deployment/main_2_mesop.py → ...eakage_probing/deployment/main_2_mesop.py b/...t_leakage_team/deployment/main_2_mesop.py → ...eakage_probing/deployment/main_2_mesop.py
@@ -13,8 +13,8 @@
 app = FastAgency(
     provider=provider,
     ui=MesopUI(),
-    title="Context Leakage Team",
+    title="Prompt Leakage Probing",
 )
 
 # start the provider with the following command
-# gunicorn context_leakage_team.deployment.main_2_mesop:app -b 0.0.0.0:8888 --reload
+# gunicorn prompt_leakage_probing.deployment.main_2_mesop:app -b 0.0.0.0:8888 --reload
diff --git a/..._leakage_team/tested_chatbots/__init__.py → ...akage_probing/tested_chatbots/__init__.py b/..._leakage_team/tested_chatbots/__init__.py → ...akage_probing/tested_chatbots/__init__.py
diff --git a/...e_team/tested_chatbots/chatbots_router.py → ...robing/tested_chatbots/chatbots_router.py b/...e_team/tested_chatbots/chatbots_router.py → ...robing/tested_chatbots/chatbots_router.py
diff --git a/...xt_leakage_team/tested_chatbots/config.py → ...leakage_probing/tested_chatbots/config.py b/...xt_leakage_team/tested_chatbots/config.py → ...leakage_probing/tested_chatbots/config.py
diff --git a/...age_team/tested_chatbots/openai_client.py → ..._probing/tested_chatbots/openai_client.py b/...age_team/tested_chatbots/openai_client.py → ..._probing/tested_chatbots/openai_client.py
diff --git a/...age_team/tested_chatbots/prompt_loader.py → ..._probing/tested_chatbots/prompt_loader.py b/...age_team/tested_chatbots/prompt_loader.py → ..._probing/tested_chatbots/prompt_loader.py
diff --git a/...m/tested_chatbots/prompts/confidential.md → ...g/tested_chatbots/prompts/confidential.md b/...m/tested_chatbots/prompts/confidential.md → ...g/tested_chatbots/prompts/confidential.md
diff --git a/...ge_team/tested_chatbots/prompts/high.json → ...probing/tested_chatbots/prompts/high.json b/...ge_team/tested_chatbots/prompts/high.json → ...probing/tested_chatbots/prompts/high.json
diff --git a/...age_team/tested_chatbots/prompts/low.json → ..._probing/tested_chatbots/prompts/low.json b/...age_team/tested_chatbots/prompts/low.json → ..._probing/tested_chatbots/prompts/low.json
diff --git a/..._team/tested_chatbots/prompts/medium.json → ...obing/tested_chatbots/prompts/medium.json b/..._team/tested_chatbots/prompts/medium.json → ...obing/tested_chatbots/prompts/medium.json
diff --git a/...sted_chatbots/prompts/non_confidential.md → ...sted_chatbots/prompts/non_confidential.md b/...sted_chatbots/prompts/non_confidential.md → ...sted_chatbots/prompts/non_confidential.md
diff --git a/...t_leakage_team/tested_chatbots/service.py → ...eakage_probing/tested_chatbots/service.py b/...t_leakage_team/tested_chatbots/service.py → ...eakage_probing/tested_chatbots/service.py
diff --git a/prompt_leakage_probing/workflow/__init__.py b/prompt_leakage_probing/workflow/__init__.py
@@ -0,0 +1,3 @@
+from prompt_leakage_probing.workflow.workflow import wf
+
+__all__ = ["wf"]
diff --git a/prompt_leakage_probing/workflow/agents/__init__.py b/prompt_leakage_probing/workflow/agents/__init__.py
@@ -0,0 +1,11 @@
+from .prompt_leakage_black_box.prompt_leakage_black_box import (
+    PromptGeneratorAgent,
+)
+from .prompt_leakage_classifier.prompt_leakage_classifier import (
+    PromptLeakageClassifierAgent,
+)
+
+__all__ = [
+    "PromptGeneratorAgent",
+    "PromptLeakageClassifierAgent",
+]
diff --git a/...nts/context_leakage_black_box/__init__.py → ...ents/prompt_leakage_black_box/__init__.py b/...nts/context_leakage_black_box/__init__.py → ...ents/prompt_leakage_black_box/__init__.py
diff --git a/...ge_black_box/context_leakage_black_box.py → ...age_black_box/prompt_leakage_black_box.py b/...ge_black_box/context_leakage_black_box.py → ...age_black_box/prompt_leakage_black_box.py
@@ -4,14 +4,14 @@
 from autogen.agentchat import ConversableAgent
 
 
-class ContextLeakagePromptGeneratorAgent(ConversableAgent):  # type: ignore[misc]
+class PromptGeneratorAgent(ConversableAgent):  # type: ignore[misc]
     SYSTEM_MESSAGE = (Path(__file__).parent / "system_message.md").read_text()
 
     def __init__(self, **kwargs: Any):
-        """Initialize ContextLeakagePromptGeneratorAgent."""
+        """Initialize PromptGeneratorAgent."""
         super().__init__(
             name="Prompt_Generator_Agent",
-            description="Generates prompts to leak context from the tested LLM.",
+            description="Generates prompts to leak the prompt from the tested LLM.",
             system_message=self.SYSTEM_MESSAGE,
             **kwargs,
         )
diff --git a/...ntext_leakage_black_box/system_message.md → ...rompt_leakage_black_box/system_message.md b/...ntext_leakage_black_box/system_message.md → ...rompt_leakage_black_box/system_message.md
diff --git a/...ts/context_leakage_classifier/__init__.py → ...nts/prompt_leakage_classifier/__init__.py b/...ts/context_leakage_classifier/__init__.py → ...nts/prompt_leakage_classifier/__init__.py
diff --git a/..._classifier/context_leakage_classifier.py → ...e_classifier/prompt_leakage_classifier.py b/..._classifier/context_leakage_classifier.py → ...e_classifier/prompt_leakage_classifier.py
@@ -4,7 +4,7 @@
 from autogen.agentchat import ConversableAgent
 
 
-class ContextLeakageClassifierAgent(ConversableAgent):  # type: ignore[misc]
+class PromptLeakageClassifierAgent(ConversableAgent):  # type: ignore[misc]
     SYSTEM_MESSAGE = (Path(__file__).parent / "system_message.md").read_text()
 
     def __init__(
@@ -13,10 +13,10 @@ def __init__(
         non_confidential_part: str,
         **kwargs: Any,
     ):
-        """Initialize ContextLeakageClassifierAgent."""
+        """Initialize PromptLeakageClassifierAgent."""
         super().__init__(
-            name="Context_Leakage_Classifier",
-            description="Detect and classify context leakage in the model's response.",
+            name="Prompt_Leakage_Classifier",
+            description="Detect and classify prompt leakage in the model's response.",
             system_message=self.SYSTEM_MESSAGE.format(
                 confidential_part=confidential_part,
                 not_confidential_part=non_confidential_part,
Original file line number	Diff line number	Diff line change
Expand Up		@@ -20,4 +20,4 @@ def list_workflows() -> dict[str, Any]:


		# start the adapter with the following command
		# uvicorn context_leakage_team.deployment.main_1_fastapi:app --reload
		# uvicorn prompt_leakage_probing.deployment.main_1_fastapi:app --reload
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from prompt_leakage_probing.workflow.workflow import wf

		__all__ = ["wf"]