Merge pull request #4 from YanxinLu/main
support litellm and w/ background setup
mtian8 authored Jul 24, 2024
2 parents 8ab4166 + e5ae4fd commit a722196
Showing 7 changed files with 155 additions and 86 deletions.
2 changes: 2 additions & 0 deletions eval/data/background_comment_template.txt
@@ -21,6 +21,8 @@ RESPONSE GUIDELINES:
5. Ensure your response is formatted as a ```python``` code block and includes the necessary background as a comment at the top.

Example:
```python
# Background: [Here, insert the necessary scientific knowledge required for the next step.]

[Insert the Python code here based on the provided function header and dependencies.]
```
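For illustration, a response that follows the updated template might look like the hypothetical example below; the background text and function are invented for this note, not taken from the benchmark:

```python
# Background: The trapezoidal rule approximates a definite integral by summing
# the areas of trapezoids formed between consecutive sample points.

def trapezoid(y, dx):
    """Integrate uniformly spaced samples y with step size dx."""
    return dx * (sum(y) - 0.5 * (y[0] + y[-1]))
```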
130 changes: 65 additions & 65 deletions eval/data/problems_all.jsonl

Large diffs are not rendered by default.

35 changes: 25 additions & 10 deletions eval/scripts/gencode_json.py
@@ -14,23 +14,27 @@

class Gencode:
def __init__(self, model: str, output_dir: Path,
prompt_dir: Path, temperature: float):
prompt_dir: Path, with_background: bool, temperature: float):
self.model = model
self.output_dir = output_dir
self.prompt_dir = prompt_dir
self.with_background = with_background
self.temperature = temperature
self.previous_llm_code = []

def save_prompt_with_steps(self, prob_data: dict, prompt: str, num_steps: int, tot_steps: int) -> None:
output_dir = Path(self.prompt_dir, self.model)
def _get_background_dir(self):
return "with_background" if self.with_background else "without_background"

def save_prompt_with_steps(self, prob_data: dict, prompt: str, num_steps: int) -> None:
output_dir = Path(self.prompt_dir, Path(self.model).parts[-1], self._get_background_dir())
output_dir.mkdir(parents=True, exist_ok=True)
output_file_path = output_dir / f"{prob_data['problem_id']}.{num_steps}.txt"
output_file_path.write_text(prompt, encoding="utf-8")

def save_response_with_steps(self, prob_data: dict, response: str, previous_code: str,
num_steps: int, model="gpt-4o",) -> None:
def save_response_with_steps(self, prob_data: dict, response: str,
previous_code: str, num_steps: int) -> None:
output_dir = (
self.output_dir / model
self.output_dir / Path(self.model).parts[-1] / self._get_background_dir()
)
output_dir.mkdir(parents=True, exist_ok=True)
prob_id = prob_data["problem_id"]
@@ -78,7 +82,7 @@ def generate_response_with_steps(
raise Exception(f'Generating {prob_id} step {num_steps} ahead of step {prev_step + 1}.')
prompt, previous_code = self.generate_prompt_with_steps(prob_data, num_steps, prompt_template)
if save:
self.save_prompt_with_steps(prob_data, prompt, num_steps, tot_steps)
self.save_prompt_with_steps(prob_data, prompt, num_steps)

model_kwargs = {}
if "claude" in model:
@@ -94,7 +98,7 @@ def generate_response_with_steps(
model_fct = get_model_function(model, **model_kwargs)
response_from_llm = model_fct(prompt)
self.previous_llm_code[num_steps - 1] = extract_python_script(response_from_llm)
self.save_response_with_steps(prob_data, response_from_llm, previous_code, num_steps, model)
self.save_response_with_steps(prob_data, response_from_llm, previous_code, num_steps)

@staticmethod
def process_problem_code(prob_data: dict, num_steps: int) -> str:
@@ -109,11 +113,16 @@ def process_problem_steps(self, problem_data: dict, num_steps: int):
next_step = []
previous_code = []
for i in range(num_steps - 1):
output_lines.append(problem_data["sub_steps"][i]["step_description_prompt"] + '\n' +
problem_data["sub_steps"][i]["step_background"] if self.with_background
else problem_data["sub_steps"][i]["step_description_prompt"])
output_lines.append(self.previous_llm_code[i])
previous_code.append(self.previous_llm_code[i])
output_lines.append("------")

next_step.append(problem_data["sub_steps"][num_steps - 1]["step_description_prompt"])
next_step.append(problem_data["sub_steps"][num_steps - 1]["step_description_prompt"] + '\n' +
problem_data["sub_steps"][num_steps - 1]["step_background"] if self.with_background
else problem_data["sub_steps"][num_steps - 1]["step_description_prompt"])
next_step.append(self.process_problem_code(problem_data, num_steps))
output_str = "\n\n".join(output_lines[:-1]) # Remove the last "------"
next_step_str = "\n\n".join(next_step)
@@ -160,6 +169,11 @@ def get_cli() -> argparse.ArgumentParser:
default=Path("eval_results", "prompt"),
help="Prompt directory",
)
parser.add_argument(
"--with-background",
action="store_true",
help="Include problem background if enabled",
)
parser.add_argument(
"--temperature",
type=float,
@@ -173,11 +187,12 @@ def main(model: str,
output_dir: Path,
input_path: Path,
prompt_dir: Path,
with_background: bool,
temperature: float
) -> None:
gcode = Gencode(
model=model, output_dir=output_dir,
prompt_dir=prompt_dir, temperature=temperature
prompt_dir=prompt_dir, with_background=with_background, temperature=temperature
)
data = read_from_jsonl(input_path)
for problem in data:
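Net effect of the gencode_json.py changes: prompts and generated code are grouped per model and per background mode, and only the final path component of the model name is used, so a name such as `litellm/gpt-4o` does not create nested directories. A minimal sketch of the resulting layout (paths are illustrative, not taken from the diff):

```python
from pathlib import Path

def background_dir(with_background: bool) -> str:
    # Mirrors Gencode._get_background_dir in the diff above.
    return "with_background" if with_background else "without_background"

# A model passed as "litellm/gpt-4o" keeps only its final path component.
model = Path("litellm/gpt-4o").parts[-1]
print(Path("eval_results", "prompt") / model / background_dir(True))
# eval_results/prompt/gpt-4o/with_background
```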
31 changes: 23 additions & 8 deletions eval/scripts/test_generated_code.py
@@ -6,6 +6,7 @@
import numpy as np
import argparse

from scicode.parse.parse import H5PY_FILE
from scicode.parse.parse import read_from_jsonl


@@ -15,7 +16,12 @@
DEV_STEP_NUM = 50


def test_code(model_name, code_dir, log_dir, output_dir, jsonl_path, dev_set=False):
def _get_background_dir(with_background):
return "with_background" if with_background else "without_background"


def test_code(model_name, code_dir, log_dir, output_dir,
jsonl_path, dev_set=False, with_background=False):

jsonl_data = read_from_jsonl(jsonl_path)
json_dct = {}
@@ -26,7 +32,7 @@ def test_code(model_name, code_dir, log_dir, output_dir, jsonl_path, dev_set=Fal
json_idx[prob_data['problem_id']] = jsonl_data.index(prob_data)
start_time = time.time()

code_dir_ = Path(code_dir, model_name)
code_dir_ = Path(code_dir, model_name, _get_background_dir(with_background))
tmp_dir = Path(f'tmp_{start_time}')

tmp_dir.mkdir(parents=True, exist_ok=True)
@@ -82,7 +88,7 @@ def run_script(script_path):
prob_id = func_id.split('.')[0]
print(f'Testing function {func_id} ...')
tot_prob[int(prob_id) - 1] += 1
logs_dir_ = Path(log_dir, model_name)
logs_dir_ = Path(log_dir, model_name, _get_background_dir(with_background))
logs_dir_.mkdir(parents=True, exist_ok=True)
logs_file = Path(logs_dir_, f'{file_path.stem}.txt')
if logs_file.exists():
@@ -116,16 +122,16 @@ def run_script(script_path):
print(f'correct problems: {correct_prob_num}/{DEV_PROB_NUM if dev_set else PROB_NUM - DEV_PROB_NUM}')
print(f'correct steps: {len(correct_step)}/{DEV_STEP_NUM if dev_set else STEP_NUM}')

output_dir.mkdir(parents=True, exist_ok=True)
Path(output_dir).mkdir(parents=True, exist_ok=True)

with open(f'{output_dir}/{model_name}.txt', 'w') as f:
with open(f'{output_dir}/{model_name}_{_get_background_dir(with_background)}.txt', 'w') as f:
f.write(f'correct problems: {correct_prob_num}/{DEV_PROB_NUM if dev_set else PROB_NUM - DEV_PROB_NUM}\n')
f.write(f'correct steps: {len(correct_step)}/{DEV_STEP_NUM if dev_set else STEP_NUM}\n\n')
f.write(f'duration: {test_time} seconds\n')
f.write('\ncorrect problems: ')
f.write(f'\n\n{[i + 1 for i in range(PROB_NUM) if correct_prob[i] == tot_prob[i] and tot_prob[i] != 0]}\n')

with open(f'{output_dir}/{model_name}.json', 'w', encoding='utf-8') as f:
with open(f'{output_dir}/{model_name}_{_get_background_dir(with_background)}.json', 'w', encoding='utf-8') as f:
json.dump(correct_dict, f, indent=4)

shutil.rmtree(tmp_dir)
@@ -166,6 +172,11 @@ def get_cli() -> argparse.ArgumentParser:
"--dev-set",
action='store_true',
help="Test dev set if enabled",
)
parser.add_argument(
"--with-background",
action="store_true",
help="Include problem background if enabled",
)
return parser

@@ -175,9 +186,13 @@ def main(model: str,
log_dir: Path,
output_dir: Path,
jsonl_path: Path,
dev_set: bool
dev_set: bool,
with_background: bool
) -> None:
test_code(model, code_dir, log_dir, output_dir, jsonl_path, dev_set)
if not Path(H5PY_FILE).exists():
raise FileNotFoundError("Please download the numeric test results before testing generated code.")
model = Path(model).parts[-1]
test_code(model, code_dir, log_dir, output_dir, jsonl_path, dev_set, with_background)


if __name__ == "__main__":
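With the background suffix added to the result file names, runs with and without background no longer overwrite each other's summaries. A small illustrative sketch of the naming convention (directory and model names are hypothetical):

```python
def result_paths(output_dir: str, model_name: str, with_background: bool) -> tuple:
    # Mirrors the file naming used in test_code above.
    suffix = "with_background" if with_background else "without_background"
    return (f"{output_dir}/{model_name}_{suffix}.txt",
            f"{output_dir}/{model_name}_{suffix}.json")

print(result_paths("eval_results", "gpt-4o", with_background=False))
# ('eval_results/gpt-4o_without_background.txt', 'eval_results/gpt-4o_without_background.json')
```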
1 change: 1 addition & 0 deletions pyproject.toml
@@ -31,6 +31,7 @@ dependencies = [
"rich",
"pytest",
"pytest-cov",
"litellm",
# requirements for execution
"numpy",
"scipy",
40 changes: 38 additions & 2 deletions src/scicode/gen/models.py
@@ -4,14 +4,43 @@
import google.generativeai as genai
import config
import re
import os
import litellm
from litellm.utils import validate_environment as litellm_validate_environment

from scicode import keys_cfg_path
from scicode.utils.log import get_logger

logger = get_logger("models")


def get_config():
if not keys_cfg_path.exists():
raise FileNotFoundError(f"Config file not found: {keys_cfg_path}")
return config.Config(str(keys_cfg_path))

def generate_litellm_response(prompt: str, *, model: str, **kwargs) -> str:
"""Call the litellm api to generate a response"""
# litellm expects all keys as env variables
config = get_config()
for key, value in config.as_dict().items():
if key in os.environ and os.environ[key] != value:
logger.warning(f"Overwriting {key} from config with environment variable")
else:
os.environ[key] = value
# Let's validate that we have everything needed for this model
env_validation = litellm_validate_environment(model)
if not env_validation.get("keys_in_environment") or env_validation.get("missing_keys", []):
msg = f"Environment validation for litellm failed for model {model}: {env_validation}"
raise ValueError(msg)
response = litellm.completion(
model=model,
messages = [
{"role": "user", "content": prompt},
],
**kwargs,
)
return response.choices[0].message.content
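A minimal usage sketch for the new helper, assuming the relevant provider key is available via the key config or the environment; the prompt and model name are placeholders:

```python
from scicode.gen.models import generate_litellm_response

# Any model identifier that litellm recognizes can be passed here;
# extra keyword arguments are forwarded to litellm.completion.
answer = generate_litellm_response(
    "Explain the trapezoidal rule in one sentence.",
    model="gpt-4o",
    temperature=0,
)
print(answer)
```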

def generate_openai_response(prompt: str, *, model="gpt-4-turbo-2024-04-09",
temperature: float = 0) -> str:
@@ -87,7 +116,10 @@ def generate_google_response(prompt: str, *, model: str = "gemini-pro",

def get_model_function(model: str, **kwargs):
"""Return the appropriate function to generate a response based on the model"""
if "gpt" in model:
if model.startswith("litellm/"):
model = model.removeprefix("litellm/")
fct = generate_litellm_response
elif "gpt" in model:
fct = generate_openai_response
elif "claude" in model:
fct = generate_anthropic_response
@@ -107,7 +139,11 @@ def generate_dummy_response(prompt: str, **kwargs) -> str:

def extract_python_script(response: str):
# We will extract the python script from the response
python_script = response.split("```python")[1].split("```")[0]
if '```' in response:
python_script = response.split("```python")[1].split("```")[0] if '```python' in response else response.split('```')[1].split('```')[0]
else:
print("Fail to extract python code from specific format.")
python_script = response
python_script = re.sub(r'^\s*(import .*|from .*\s+import\s+.*)', '', python_script, flags=re.MULTILINE)
return python_script
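Taken together, the dispatch and extraction changes let callers select any litellm-backed model with a `litellm/` prefix and still recover code when the response is not fenced. A hedged sketch of the intended flow (the model name is only an example):

```python
from scicode.gen.models import get_model_function, extract_python_script

# The "litellm/" prefix routes the call through generate_litellm_response;
# the prefix is stripped before litellm sees the model name.
model_fct = get_model_function("litellm/claude-3-5-sonnet-20240620")
response = model_fct("Write a function that reverses a string.")

# extract_python_script prefers a python-tagged fence, falls back to any fence,
# and returns the raw response if no fence is found; import lines are stripped.
code = extract_python_script(response)
print(code)
```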

2 changes: 1 addition & 1 deletion tests/test_data/first_problem.jsonl

Large diffs are not rendered by default.
