CePO #1

Open · wants to merge 21 commits into `main`
3 changes: 3 additions & 0 deletions .gitignore
@@ -164,3 +164,6 @@ cython_debug/
# Ignore Mac DS_Store files
.DS_Store
**/.DS_Store

# VS Code
.vscode/
96 changes: 56 additions & 40 deletions README.md
@@ -196,54 +196,70 @@ response = client.chat.completions.create(

## Implemented techniques

| Approach | Slug | Description |
| ------------------------------------ | ------------------ | ---------------------------------------------------------------------------------------------- |
| Cerebras Planning and Optimization  | `cepo`             | Combines Best of N, Chain-of-Thought, Self-Reflection, Self-Improvement, and various prompting techniques |
| CoT with Reflection                  | `cot_reflection`   | Implements chain-of-thought reasoning with \<thinking\>, \<reflection\> and \<output\> sections |
| PlanSearch | `plansearch` | Implements a search algorithm over candidate plans for solving a problem in natural language |
| ReRead | `re2` | Implements rereading to improve reasoning by processing queries twice |
| Self-Consistency | `self_consistency` | Implements an advanced self-consistency method |
| Z3 Solver | `z3` | Utilizes the Z3 theorem prover for logical reasoning |
| R* Algorithm | `rstar` | Implements the R* algorithm for problem-solving |
| LEAP | `leap` | Learns task-specific principles from few shot examples |
| Round Trip Optimization | `rto` | Optimizes responses through a round-trip process |
| Best of N Sampling | `bon` | Generates multiple responses and selects the best one |
| Mixture of Agents | `moa` | Combines responses from multiple critiques |
| Monte Carlo Tree Search | `mcts` | Uses MCTS for decision-making in chat responses |
| PV Game | `pvg` | Applies a prover-verifier game approach at inference time |
| CoT Decoding | N/A for proxy | Implements chain-of-thought decoding to elicit reasoning without explicit prompting |
| Entropy Decoding | N/A for proxy | Implements adaptive sampling based on the uncertainty of tokens during generation |
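As with the other approaches, the `cepo` slug can be combined with a model name when talking to the proxy. The routing can be sketched as follows — a hypothetical helper for illustration, not the proxy's actual parsing code:

```python
# Hypothetical sketch: route a request by a slug prefix on the model name,
# e.g. "cepo-llama3.3-70b" -> approach "cepo", model "llama3.3-70b".
KNOWN_SLUGS = {"cepo", "cot_reflection", "plansearch", "re2", "self_consistency",
               "z3", "rstar", "leap", "rto", "bon", "moa", "mcts", "pvg"}

def split_approach(model_name: str) -> tuple[str, str]:
    """Split 'slug-model' into (approach, model); fall back to 'none'."""
    slug, sep, rest = model_name.partition("-")
    if sep and slug in KNOWN_SLUGS:
        return slug, rest
    return "none", model_name
```

A model name without a recognized slug passes through unchanged, so plain model names keep working.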

## Implemented plugins

| Plugin | Slug | Description |
| ------------------------------------ | ------------------ | ---------------------------------------------------------------------------------------------- |
| Router | `router` | Uses the [optillm-bert-uncased](https://huggingface.co/codelion/optillm-bert-uncased) model to route requests to different approaches based on the user prompt |
| Chain-of-Code | `coc` | Implements a chain of code approach that combines CoT with code execution and LLM based code simulation |
| Memory | `memory` | Implements a short term memory layer, enables you to use unbounded context length with any LLM |
| Privacy | `privacy` | Anonymize PII data in request and deanonymize it back to original value in response |
| Read URLs | `readurls` | Reads all URLs found in the request, fetches the content at the URL and adds it to the context |
| Execute Code | `executecode` | Enables use of code interpreter to execute python code in requests and LLM generated responses |

## Available parameters

optillm supports various command-line arguments and environment variables for configuration.

| Parameter | Description | Default Value |
|-------------------------------------|-----------------------------------------------------------------|-----------------|
| `--approach` | Inference approach to use | `"auto"` |
| `--simulations` | Number of MCTS simulations | 2 |
| `--exploration` | Exploration weight for MCTS | 0.2 |
| `--depth` | Simulation depth for MCTS | 1 |
| `--best-of-n` | Number of samples for best_of_n approach | 3 |
| `--model` | OpenAI model to use | `"gpt-4o-mini"` |
| `--base-url` | Base URL for OpenAI compatible endpoint | `""` |
| `--rstar-max-depth` | Maximum depth for rStar algorithm | 3 |
| `--rstar-num-rollouts` | Number of rollouts for rStar algorithm | 5 |
| `--rstar-c` | Exploration constant for rStar algorithm | 1.4 |
| `--n` | Number of final responses to be returned | 1 |
| `--return-full-response`            | Return the full response including the CoT with `<thinking>` tags | `False`         |
| `--port` | Specify the port to run the proxy | 8000 |
| `--optillm-api-key` | Optional API key for client authentication to optillm | `""` |
| `--cepo_bestofn_n` | Number of responses to be generated in best of n stage | 3 |
| `--cepo_bestofn_temperature` | Temperature for verifier in best of n stage | 0.1 |
| `--cepo_bestofn_max_tokens` | Maximum number of tokens for verifier in best of n stage | 4096 |
| `--cepo_bestofn_rating_type` | Type of rating in best of n stage ("absolute" or "pairwise") | `"absolute"` |
| `--cepo_planning_n` | Number of plans generated in planning stage | 3 |
| `--cepo_planning_m` | Number of attempts to generate n plans in planning stage | 6 |
| `--cepo_planning_temperature_step1` | Temperature for generator in step 1 of planning stage | 0.55 |
| `--cepo_planning_temperature_step2` | Temperature for generator in step 2 of planning stage | 0.25 |
| `--cepo_planning_temperature_step3` | Temperature for generator in step 3 of planning stage | 0.1 |
| `--cepo_planning_temperature_step4` | Temperature for generator in step 4 of planning stage | 0 |
| `--cepo_planning_max_tokens_step1` | Maximum number of tokens in step 1 of planning stage | 4096 |
| `--cepo_planning_max_tokens_step2` | Maximum number of tokens in step 2 of planning stage | 4096 |
| `--cepo_planning_max_tokens_step3` | Maximum number of tokens in step 3 of planning stage | 4096 |
| `--cepo_planning_max_tokens_step4` | Maximum number of tokens in step 4 of planning stage | 4096 |
| `--cepo_config_file` | Path to CePO configuration file | None |

When using Docker, these can be set as environment variables prefixed with `OPTILLM_`.
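The `OPTILLM_` prefix mapping can be sketched as a small pure function — a hypothetical helper shown for illustration, not optillm's actual code (note the values stay strings, as environment variables always are):

```python
def config_from_env(environ: dict) -> dict:
    """Map OPTILLM_-prefixed environment variables to config keys,
    e.g. OPTILLM_CEPO_BESTOFN_N=3 -> {'cepo_bestofn_n': '3'}."""
    prefix = "OPTILLM_"
    return {k[len(prefix):].lower(): v
            for k, v in environ.items() if k.startswith(prefix)}
```

So `docker run -e OPTILLM_APPROACH=cepo ...` would correspond to passing `--approach cepo` on the command line.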

14 changes: 14 additions & 0 deletions configs/cepo_config.yaml
@@ -0,0 +1,14 @@
bestofn_n: 3
bestofn_temperature: 0.1
bestofn_max_tokens: 4096
bestofn_rating_type: "absolute" # or "pairwise"
planning_n: 3
planning_m: 6
planning_temperature_step1: 0.55
planning_temperature_step2: 0.25
planning_temperature_step3: 0.1
planning_temperature_step4: 0
planning_max_tokens_step1: 4096
planning_max_tokens_step2: 4096
planning_max_tokens_step3: 4096
planning_max_tokens_step4: 4096
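The YAML keys above mirror the `--cepo_*` flags minus the prefix, and `init_cepo_config` presumably overlays such values onto defaults. A minimal sketch of that overlay under an assumed dataclass structure — not the PR's actual implementation:

```python
from dataclasses import dataclass, fields, replace

@dataclass
class CepoConfig:
    # Defaults mirror configs/cepo_config.yaml (abbreviated for the sketch)
    bestofn_n: int = 3
    bestofn_temperature: float = 0.1
    bestofn_rating_type: str = "absolute"
    planning_n: int = 3

def init_cepo_config(overrides: dict) -> CepoConfig:
    """Start from defaults; apply any non-None 'cepo_'-prefixed overrides."""
    names = {f.name for f in fields(CepoConfig)}
    updates = {k.removeprefix("cepo_"): v for k, v in overrides.items()
               if k.startswith("cepo_") and v is not None
               and k.removeprefix("cepo_") in names}
    return replace(CepoConfig(), **updates)
```

Filtering out `None` values lets unset command-line flags fall through to the YAML or built-in defaults.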
30 changes: 28 additions & 2 deletions optillm.py
@@ -3,6 +3,7 @@
import os
import secrets
from flask import Flask, request, jsonify
from cerebras.cloud.sdk import Cerebras
from openai import AzureOpenAI, OpenAI
from flask import Response
import json
@@ -27,6 +28,7 @@
from optillm.plansearch import plansearch
from optillm.leap import leap
from optillm.reread import re2_approach
from optillm.cepo import cepo, CepoConfig, init_cepo_config

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -50,7 +52,14 @@ def get_config():
from optillm.inference import create_inference_client
API_KEY = os.environ.get("OPTILLM_API_KEY")
default_client = create_inference_client()
# OpenAI, Azure, or LiteLLM API configuration
# Cerebras, OpenAI, Azure, or LiteLLM API configuration
elif os.environ.get("CEREBRAS_API_KEY"):
API_KEY = os.environ.get("CEREBRAS_API_KEY")
base_url = server_config['base_url']
if base_url != "":
default_client = Cerebras(api_key=API_KEY, base_url=base_url)
else:
default_client = Cerebras(api_key=API_KEY)
elif os.environ.get("OPENAI_API_KEY"):
API_KEY = os.environ.get("OPENAI_API_KEY")
base_url = server_config['base_url']
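The `elif` chain above means `CEREBRAS_API_KEY` takes precedence over `OPENAI_API_KEY` when both are set. That selection order reduces to a small pure function — a simplified sketch; the real code constructs SDK clients rather than returning names:

```python
def pick_backend(environ: dict) -> str:
    """Mirror the elif chain: Cerebras wins over OpenAI if both keys exist."""
    if environ.get("CEREBRAS_API_KEY"):
        return "cerebras"
    if environ.get("OPENAI_API_KEY"):
        return "openai"
    return "default"
```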
@@ -104,7 +113,7 @@ def get_config():

# List of known approaches
known_approaches = ["none", "mcts", "bon", "moa", "rto", "z3", "self_consistency",
"pvg", "rstar", "cot_reflection", "plansearch", "leap", "re2"]
"pvg", "rstar", "cot_reflection", "plansearch", "leap", "re2", "cepo"]

plugin_approaches = {}

@@ -283,6 +292,10 @@ def execute_single_approach(approach, system_prompt, initial_query, client, mode
return leap(system_prompt, initial_query, client, model)
elif approach == 're2':
return re2_approach(system_prompt, initial_query, client, model, n=server_config['n'])
elif approach == 'cepo':
        # cepo_config is built in main() from command-line arguments and an optional config file
logger.debug(f"Calling with {cepo_config}")
return cepo(system_prompt, initial_query, client, model, cepo_config)
elif approach in plugin_approaches:
return plugin_approaches[approach](system_prompt, initial_query, client, model)
else:
@@ -609,6 +622,13 @@ def parse_args():
parser.add_argument("--base-url", "--base_url", dest="base_url", type=str, default=base_url_default,
help="Base url for OpenAI compatible endpoint")

# Special handling of all the Cepo Configurations
for key, value in CepoConfig.__dict__.items():
if not key.startswith('__'):
parser.add_argument(f"--cepo_{key}", dest=f"cepo_{key}", type=type(value), default=None, help=f"CePO configuration for {key}")

    parser.add_argument("--cepo_config_file", dest="cepo_config_file", type=str, default=None, help="Path to CePO configuration file")

args = parser.parse_args()

# Convert argument names to match server_config keys
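The loop in `parse_args` derives one `--cepo_*` flag per `CepoConfig` attribute, using each default's type for argparse coercion. A standalone sketch of the same pattern, with a simplified two-field stand-in for the config class:

```python
import argparse

class CepoConfig:  # simplified stand-in; the PR's class has more fields
    bestofn_n = 3
    bestofn_temperature = 0.1

parser = argparse.ArgumentParser()
for key, value in CepoConfig.__dict__.items():
    if not key.startswith("__"):  # skip dunder attributes like __module__
        # type(value) makes argparse coerce the string to int/float
        parser.add_argument(f"--cepo_{key}", dest=f"cepo_{key}",
                            type=type(value), default=None)

args = parser.parse_args(["--cepo_bestofn_n", "5"])
```

Defaulting each generated flag to `None` (rather than the class default) lets later code distinguish "not given on the command line" from an explicit value.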
@@ -622,6 +642,7 @@

def main():
global server_config
global cepo_config
# Call this function at the start of main()
load_plugins()
args = parse_args()
@@ -636,6 +657,11 @@
if logging_level in logging_levels.keys():
logger.setLevel(logging_levels[logging_level])

# set and log the cepo configs
cepo_config = init_cepo_config(server_config)
if args.approach == 'cepo':
logger.info(f"CePO Config: {cepo_config}")

logger.info(f"Starting server with approach: {server_config['approach']}")
server_config_clean = server_config.copy()
if server_config_clean['optillm_api_key']: