Commit
Merge pull request #6 from for-ai/feat/rewardbench
Add RewardBench script
Showing 5 changed files with 225 additions and 0 deletions.
@@ -0,0 +1,14 @@
FROM pytorch/pytorch:2.2.2-cuda12.1-cudnn8-runtime

ENV LC_ALL=C.UTF-8
ENV LANG=C.UTF-8

WORKDIR /stage

# Install dependencies
RUN apt-get update && apt-get install -y --no-install-recommends git
COPY requirements.txt /stage
RUN pip install -r requirements.txt

# Copy all files
COPY . /stage
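
For reference, the image can be built and entered with the standard Docker workflow. This is a minimal sketch: it assumes the file above is saved under its conventional name Dockerfile at the repository root, and the image tag is an arbitrary choice.

# Build the image (tag name is arbitrary)
docker build -t multilingual-rewardbench .

# Start an interactive shell in the container with GPU access; WORKDIR is /stage
docker run --gpus all -it multilingual-rewardbench bash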
@@ -0,0 +1,3 @@
rewardbench
datasets
protobuf
@@ -0,0 +1,77 @@
"""Convert multilingual ultrafeedback into a format acceptable for RewardBench | ||
We need to follow the load_preference_dataset setup in RewardBench as | ||
shown here: https://github.com/allenai/reward-bench/blob/main/rewardbench/utils.py#L136 | ||
So we need three columns: | ||
- prompt (str) | ||
- chosen (list[dict[str, str]]), and | ||
- rejected (list[dict[str, str]]) | ||
""" | ||

import argparse
import logging
from pathlib import Path

from datasets import load_dataset

logging.basicConfig(level=logging.INFO)


def get_args():
    parser = argparse.ArgumentParser(
        description="Convert a HuggingFace dataset into the RewardBench format."
    )

    # fmt: off
    parser.add_argument("--dataset", type=str, default="nthakur/multilingual-ultrafeedback-dpo-v0.1", help="Dataset to convert.")
    parser.add_argument("--output_path", type=Path, default="data/multilingual-ultrafeedback-dpo-v0.1.json", help="Path to save converted dataset as JSON file.")
    parser.add_argument("--en", action="store_true", help="Use the English columns.")
    # fmt: on

    return parser.parse_args()


def main():
    args = get_args()
    if args.output_path:
        args.output_path.parents[0].mkdir(parents=True, exist_ok=True)

    dataset = load_dataset(args.dataset, split="test")

    def _convert_to_turn_based(example):
        # Expand the raw strings into single-turn chat transcripts.
        example["chosen"] = [
            {"content": example["prompt"], "role": "user"},
            {"content": example["chosen_raw"], "role": "assistant"},
        ]
        example["rejected"] = [
            {"content": example["prompt"], "role": "user"},
            {"content": example["rejected_raw"], "role": "assistant"},
        ]
        return example

    prefix = "en_" if args.en else ""
    cols = [
        "id",
        "source",
        "language",
        f"{prefix}input",
        f"{prefix}chosen",
        f"{prefix}rejected",
    ]
    rename_map = {
        f"{prefix}input": "prompt",
        f"{prefix}chosen": "chosen_raw",
        f"{prefix}rejected": "rejected_raw",
    }
    dataset = (
        dataset.select_columns(cols)
        .rename_columns(rename_map)
        .map(_convert_to_turn_based)
        .remove_columns(["chosen_raw", "rejected_raw"])
    )
    dataset.to_json(args.output_path)
    logging.info(f"Saved file to {args.output_path}.")


if __name__ == "__main__":
    main()
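
As a usage sketch, the converter can be invoked with its argparse defaults spelled out explicitly. The script's path is not shown in this diff, so the filename below is hypothetical:

# Filename is hypothetical; flags mirror the argparse defaults above
python convert_to_rewardbench.py \
    --dataset nthakur/multilingual-ultrafeedback-dpo-v0.1 \
    --output_path data/multilingual-ultrafeedback-dpo-v0.1.json \
    --en

Passing --en switches to the en_-prefixed (English) columns; omitting it keeps the multilingual ones.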
@@ -0,0 +1,126 @@
#!/bin/bash

export TRANSFORMERS_CACHE="./cache/"
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
export NCCL_P2P_DISABLE=1

# Function to display usage information
usage() {
    echo "Usage: $0 [DATASET] [SPLIT] [OUTDIR]"
    echo "  DATASET - The dataset to use (optional, default is 'ljvmiranda921/ultrafeedback-multilingual-dpo-test')"
    echo "  SPLIT   - The data split to use (optional, default is 'test')"
    echo "  OUTDIR  - The output directory (optional, default is 'output/')"
    exit 1
}

# Default values for arguments
DATASET="ljvmiranda921/ultrafeedback-multilingual-dpo-test"
SPLIT="test"
OUTDIR="output/"

# Check and assign arguments if provided
if [ $# -gt 3 ]; then
    echo "Error: Too many arguments."
    usage
elif [ $# -ge 1 ]; then
    DATASET=$1
fi

if [ $# -ge 2 ]; then
    SPLIT=$2
fi

if [ $# -ge 3 ]; then
    OUTDIR=$3
fi

rewardbench \
    --model openbmb/UltraRM-13b \
    --chat_template openbmb \
    --dataset $DATASET \
    --split $SPLIT \
    --output_dir $OUTDIR \
    --batch_size 8 \
    --trust_remote_code \
    --force_truncation \
    --save_all

rewardbench \
    --model OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5 \
    --chat_template oasst_pythia \
    --dataset $DATASET \
    --split $SPLIT \
    --output_dir $OUTDIR \
    --batch_size 8 \
    --trust_remote_code \
    --force_truncation \
    --save_all

rewardbench \
    --model OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1 \
    --chat_template oasst_pythia \
    --dataset $DATASET \
    --split $SPLIT \
    --output_dir $OUTDIR \
    --batch_size 16 \
    --trust_remote_code \
    --force_truncation \
    --save_all

rewardbench \
    --model OpenAssistant/reward-model-deberta-v3-large-v2 \
    --chat_template raw \
    --dataset $DATASET \
    --split $SPLIT \
    --output_dir $OUTDIR \
    --batch_size 64 \
    --trust_remote_code \
    --force_truncation \
    --save_all

rewardbench \
    --model berkeley-nest/Starling-RM-7B-alpha \
    --tokenizer meta-llama/Llama-2-7b-chat-hf \
    --chat_template llama-2 \
    --dataset $DATASET \
    --split $SPLIT \
    --output_dir $OUTDIR \
    --batch_size 16 \
    --trust_remote_code \
    --force_truncation \
    --save_all

rewardbench \
    --model sfairXC/FsfairX-LLaMA3-RM-v0.1 \
    --tokenizer sfairXC/FsfairX-LLaMA3-RM-v0.1 \
    --dataset $DATASET \
    --split $SPLIT \
    --output_dir $OUTDIR \
    --batch_size 4 \
    --trust_remote_code \
    --force_truncation \
    --save_all

rewardbench \
    --model openbmb/Eurus-RM-7b \
    --tokenizer openbmb/Eurus-RM-7b \
    --chat_template mistral \
    --dataset $DATASET \
    --split $SPLIT \
    --output_dir $OUTDIR \
    --batch_size 16 \
    --trust_remote_code \
    --force_truncation \
    --save_all

rewardbench \
    --model allenai/tulu-v2.5-13b-preference-mix-rm \
    --tokenizer allenai/tulu-v2.5-13b-preference-mix-rm \
    --chat_template mistral \
    --dataset $DATASET \
    --split $SPLIT \
    --output_dir $OUTDIR \
    --batch_size 4 \
    --trust_remote_code \
    --force_truncation \
    --save_all
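
A possible invocation, following the usage() message above; the script's filename is not shown in this diff, so the name below is hypothetical:

# Hypothetical filename; positional arguments are DATASET, SPLIT, and OUTDIR
bash run_rewardbench.sh ljvmiranda921/ultrafeedback-multilingual-dpo-test test output/

All three arguments are optional and fall back to the defaults set at the top of the script.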