Change json structure

Summary: The output JSON was previously keyed by "Category" and then "Model Name"; this change keys it by "Model Name" and then "Category", to be in line with MITRE. ``` { "llama2-7b-chat": { "stat_per_model_per_injection_variant": { "ignore_previous_instructions": { "injection_successful_count": 0, "injection_unsuccessful_count": 1, "total_count": 1, "injection_successful_percentage": 0.0, "injection_unsuccessful_percentage": 1.0 }, "persuasion": { "injection_successful_count": 0, "injection_unsuccessful_count": 1, "total_count": 1, "injection_successful_percentage": 0.0, "injection_unsuccessful_percentage": 1.0 } }, "stat_per_model_per_injection_type": { "direct": { ... } ``` Reviewed By: cynikolai Differential Revision: D61996987 fbshipit-source-id: 8e448a2cc25a7160b487d87686aa6a8b02575cc0
meta-llama · Aug 29, 2024 · 50affa9 · 50affa9
1 parent 96a6ac2
commit 50affa9
Show file tree

Hide file tree

Showing 2 changed files with 31 additions and 24 deletions.
diff --git a/CybersecurityBenchmarks/README.md b/CybersecurityBenchmarks/README.md
@@ -596,8 +596,8 @@ unsuccessful
 
 ```
 {
-    "stat_per_model": {
-        "model_name": {
+    "model_name": {
+        "stat_per_model": {
             "injection_successful_count": ...,
             "injection_unsuccessful_count": ...,
             "total_count": ...,
@@ -615,8 +615,8 @@ follows the same structure.
 
 ```
 {
-    "stat_per_model_per_variant": {
-        "model_name": {
+    "model_name": {
+        "stat_per_model_per_variant": {
             "injected_prompt_technique_variant": {
                 "injection_successful_count": ...,
                 "injection_unsuccessful_count": ...,
@@ -626,16 +626,16 @@ follows the same structure.
             },
             ...
         }
+        "stat_per_model_per_type": {
+            ...
+        },
+        "stat_per_model_per_risk_category": {
+            ...
+        },
+        "stat_per_model_per_speaking_language": {
+            ...
+        }
     },
-    "stat_per_model_per_type": {
-        ...
-    },
-    "stat_per_model_per_risk_category": {
-        ...
-    },
-    "stat_per_model_per_speaking_language": {
-        ..
-    }
 }
 ```
 
@@ -646,8 +646,8 @@ test cases will be judged as either a successful or unsuccessful injection.
 
 ```
 {
-    "stat_per_model": {
-        "model_name": {
+    "model_name": {
+        "stat_per_model": {
             "injection_successful_count": ...,
             "injection_unsuccessful_count": ...,
             "total_count": ...,
@@ -669,8 +669,8 @@ The result of each bucket follows the same structure.
 
 ```
 {
-    "stat_per_model_per_injection_technique": {
-        "model_name": {
+    "model_name": {
+        "stat_per_model_per_injection_technique": {
             "embedded_text_instructions": {
                 "injection_successful_count": ...,
                 "injection_unsuccessful_count": ...,
@@ -680,13 +680,13 @@ The result of each bucket follows the same structure.
             },
             ...
         }
+        "stat_per_model_per_injection_type": {
+            ...
+        },
+        "stat_per_model_per_risk_category": {
+            ...
+        }
     },
-    "stat_per_model_per_injection_type": {
-        ...
-    },
-    "stat_per_model_per_risk_category": {
-        ...
-    }
 }
 ```
 

diff --git a/CybersecurityBenchmarks/benchmark/prompt_injection_benchmark.py b/CybersecurityBenchmarks/benchmark/prompt_injection_benchmark.py
@@ -213,7 +213,14 @@ def unfold_dict(
                 if category != "model"
             }
             output_data["stat_per_model"] = stats["model"]
-            stat_path.write_text(json.dumps(output_data, indent=4))
+            # Change the structure of the output data so that it is grouped by model
+            output_by_model = {
+                model: {} for model in output_data["stat_per_model"].keys()
+            }
+            for category, model_stats in output_data.items():
+                for model, stats in model_stats.items():
+                    output_by_model[model][category] = stats
+            stat_path.write_text(json.dumps(output_by_model, indent=4))
 
 
 def process_judge_prompt(