[WIP] Output eval logging #740

Closed · wants to merge 19 commits
8 changes: 5 additions & 3 deletions llmfoundry/utils/builders.py
@@ -11,9 +11,9 @@

import torch
from composer import algorithms
from composer.callbacks import (EarlyStopper, Generate, LRMonitor,
MemoryMonitor, OptimizerMonitor,
RuntimeEstimator, SpeedMonitor)
from composer.callbacks import (EarlyStopper, Generate, LRMonitor, MemoryMonitor,
OptimizerMonitor, RuntimeEstimator, EvalOutputLogging,
SpeedMonitor)
from composer.core import Algorithm, Callback, Evaluator
from composer.datasets.in_context_learning_evaluation import \
get_icl_task_dataloader
@@ -205,6 +205,8 @@ def build_callback(name: str, kwargs: Union[DictConfig, Dict[str,
if isinstance(kwargs, DictConfig):
kwargs = om.to_object(kwargs) # pyright: ignore
return HuggingFaceCheckpointer(**kwargs)
elif name == 'eval_output_logging':
return EvalOutputLogging(**kwargs)
else:
raise ValueError(f'Not sure how to build callback: {name}')

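For context on how this new dispatch branch gets exercised: the eval YAMLs below declare a `callbacks` section whose entries are passed to `build_callback` by name. A minimal usage sketch, assuming `EvalOutputLogging` (from the Composer branch installed in the mcli commands below) accepts the keyword arguments used in those YAMLs:

```python
# Hedged sketch of how a `callbacks` config entry reaches the new branch above.
# `EvalOutputLogging` and its kwargs come from the Composer fork installed in the
# mcli command sections below; the bucket path here is a placeholder.
from llmfoundry.utils.builders import build_callback

eval_output_cfg = {
    'subset_sample': -1,  # assumed: -1 logs every eval sample
    'output_directory': 's3://my-bucket/eval-outputs',
}
callback = build_callback('eval_output_logging', eval_output_cfg)
# The returned composer Callback can then be passed to Trainer(callbacks=[...]).
```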
31 changes: 24 additions & 7 deletions mcli/mcli-hf-eval.yaml
@@ -1,20 +1,30 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
<<<<<<< HEAD
git_branch: output_eval_logging
=======
<<<<<<< HEAD
git_branch: output_eval_logging
=======
git_branch: v0.4.0
>>>>>>> main
>>>>>>> e3861070fb7cf3a555db46bc00dc2aaa955e559b
# git_commit: # OR use your commit hash
pip_install: -e ".[gpu]"
ssh_clone: false # Should be true if using a private repo

command: |
pip uninstall mosaicml -y
pip install git+https://github.com/bmosaicml/composer.git@error_logging_callback
cd llm-foundry/scripts
composer eval/eval.py /mnt/config/parameters.yaml

# Mosaic Cloud will use run_name (with a unique suffix) to populate the env var $RUN_NAME
run_name: mpt-eval
gpu_num: 8
# gpu_type:
# cluster: # replace with your cluster here!
run_name: gsm8k-debug
gpu_num: 32
gpu_type: a100_40gb
cluster: r7z2 # replace with your cluster here!

image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest

@@ -28,16 +38,16 @@ parameters:

models:
-
model_name: mosaicml/mpt-7b-instruct
model_name: mosaicml/mpt-30b
# Tokenizer
tokenizer:
name: EleutherAI/gpt-neox-20b
name: mosaicml/mpt-30b
kwargs:
model_max_length: ${max_seq_len}

model:
name: hf_causal_lm
pretrained_model_name_or_path: mosaicml/mpt-7b-instruct
pretrained_model_name_or_path: mosaicml/mpt-30b
init_device: mixed
pretrained: true
use_auth_token: false
@@ -50,5 +60,12 @@ parameters:
limit_all_gathers: True



callbacks:
eval_output_logging:
subset_sample: -1
output_directory: s3://mosaicml-internal-checkpoints-test/test_icl_output_logger_30b_base_gsm8k


icl_tasks: 'eval/yamls/tasks_v0.2.yaml'
eval_gauntlet: 'eval/yamls/eval_gauntlet_v0.2.yaml'
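
The `callbacks` block above is the piece this PR wires in: per the config keys, an eval-output logger that writes per-sample ICL results to the given S3 prefix. The real `EvalOutputLogging` lives on the `error_logging_callback` branch of the Composer fork installed in the `command` section; as a rough illustration of the Composer `Callback` interface it would plug into (method bodies, attribute names, and output format here are assumptions, not the actual implementation):

```python
from typing import Any, Dict, List

from composer.core import Callback, State
from composer.loggers import Logger


class EvalOutputLoggingSketch(Callback):
    """Illustrative stand-in for EvalOutputLogging; not the real implementation."""

    def __init__(self,
                 subset_sample: int = -1,
                 output_directory: str = './eval_outputs',
                 print_only_incorrect: bool = False) -> None:
        self.subset_sample = subset_sample                # assumed: -1 keeps every sample
        self.output_directory = output_directory          # local path or object-store URI
        self.print_only_incorrect = print_only_incorrect  # assumed: keep only wrong answers
        self.rows: List[Dict[str, Any]] = []

    def eval_batch_end(self, state: State, logger: Logger) -> None:
        # A real implementation would decode state.batch / state.outputs here and
        # accumulate prompt, model output, and correctness per sample.
        self.rows.append({'batch': state.timestamp.batch.value})

    def eval_end(self, state: State, logger: Logger) -> None:
        # A real implementation would serialize self.rows (e.g. JSONL/CSV) and
        # upload the file to self.output_directory.
        pass
```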
68 changes: 68 additions & 0 deletions mcli/mcli-rlhf-eval.yaml
@@ -0,0 +1,68 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
git_branch: output_eval_logging
# git_commit: # OR use your commit hash
pip_install: -e ".[gpu]"
ssh_clone: false # Should be true if using a private repo

command: |
pip uninstall mosaicml -y
pip install git+https://github.com/bmosaicml/composer.git@error_logging_callback
cd llm-foundry/scripts
composer eval/eval.py /mnt/config/parameters.yaml

# Mosaic Cloud will use run_name (with a unique suffix) to populate the env var $RUN_NAME
run_name: output-logger-rlhf-prompts
gpu_num: 8
gpu_type: a100_80gb
cluster: r1z1 # replace with your cluster here!

image: mosaicml/llm-foundry:2.0.1_cu118-latest

# The below is injected as a YAML file: /mnt/config/parameters.yaml
parameters:
dist_timeout: 6000
seed: 1
max_seq_len: 1024
device_eval_batch_size: 1
precision: amp_fp16

models:
-
model_name: mosaicml/mpt-30b-instruct
# Tokenizer
tokenizer:
name: mosaicml/mpt-30b-instruct
kwargs:
model_max_length: ${max_seq_len}

model:
name: hf_causal_lm
pretrained_model_name_or_path: mosaicml/mpt-30b-instruct
init_device: mixed
pretrained: true
use_auth_token: false

# FSDP config for model sharding
fsdp_config:
sharding_strategy: FULL_SHARD
mixed_precision: FULL
forward_prefetch: True
limit_all_gathers: True


icl_tasks:
-
label: rlhf_prompts
dataset_uri: eval/local_data/rlhf_prompts/rlhf_prompts.jsonl # ADD YOUR OWN DATASET URI
num_fewshot: [0]
icl_task_type: question_answering
has_categories: true

callbacks:
eval_output_logging:
print_only_incorrect: false
subset_sample: -1
output_directory: s3://mosaicml-internal-checkpoints-test/30b_instruct_rlhf_prompts
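
The `rlhf_prompts` task points at a local JSONL you supply yourself. For a `question_answering` ICL task, each line is expected to carry a prompt/answer pair, plus a category field when `has_categories` is true; the field names in the sketch below are an assumption based on llm-foundry's other QA task data, so check them against your installed `composer.datasets.in_context_learning_evaluation` version:

```python
# Hedged helper to write a tiny rlhf_prompts.jsonl in the shape the QA ICL
# dataloader expects; field names ("context", "answer", "category") are assumed.
import json

examples = [
    {'context': 'What does the EvalOutputLogging callback write out?',
     'answer': 'Per-sample eval outputs',
     'category': 'helpfulness'},
]

with open('rlhf_prompts.jsonl', 'w') as f:
    for ex in examples:
        f.write(json.dumps(ex) + '\n')
```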

22 changes: 19 additions & 3 deletions scripts/eval/eval.py
@@ -6,6 +6,7 @@
import sys
import time
import warnings
from composer.core.callback import Callback
from typing import Any, Dict, List, Optional, Tuple, Union

import pandas as pd
@@ -21,7 +22,7 @@

from llmfoundry.models import MPTForCausalLM
from llmfoundry.models.model_registry import COMPOSER_MODEL_REGISTRY
from llmfoundry.utils.builders import (add_metrics_to_eval_loaders,
from llmfoundry.utils.builders import (add_metrics_to_eval_loaders, build_callback,
build_evaluators, build_logger,
build_tokenizer)
from llmfoundry.utils.config_utils import pop_config, process_init_device
@@ -114,6 +115,7 @@ def evaluate_model(
precision: str,
eval_gauntlet_df: Optional[pd.DataFrame],
icl_subset_num_batches: Optional[int],
callback_configs: Optional[Dict]
):

print(f'Evaluating model: {model_cfg.model_name}', flush=True)
@@ -135,7 +137,12 @@
icl_subset_num_batches=icl_subset_num_batches,
)

callbacks = []
# Callbacks
callbacks: List[Callback] = [
build_callback(str(name), callback_cfg)
for name, callback_cfg in callback_configs.items()
] if callback_configs else []

if eval_gauntlet_callback is not None:
callbacks.append(eval_gauntlet_callback)

@@ -192,6 +199,7 @@ def evaluate_model(
dist_timeout=dist_timeout,
python_log_level=python_log_level,
)


if torch.cuda.is_available():
torch.cuda.synchronize()
@@ -272,7 +280,11 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]:
default_value=None)
# Pop out interpolation variables.
pop_config(cfg, 'model_name_or_path', must_exist=False, default_value=None)

callback_configs: Optional[DictConfig] = pop_config(cfg,
'callbacks',
must_exist=False,
default_value=None)

# Warn for unused parameters
for key in cfg:
warnings.warn(
@@ -313,6 +325,7 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]:
python_log_level=python_log_level,
precision=precision,
eval_gauntlet_df=eval_gauntlet_df,
callback_configs=callback_configs,
icl_subset_num_batches=icl_subset_num_batches)
trainers.append(trainer)

@@ -356,6 +369,9 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]:
return trainers, eval_gauntlet_df





def calculate_markdown_results(logger_keys: List[str], trainer: Trainer,
benchmark_to_taxonomy: Dict[str, str],
model_name: str):
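Taken together, the eval.py changes thread an optional `callbacks` section from the run config through to the Trainer: `main` pops `'callbacks'` off `cfg` (so it does not trip the unused-parameter warning), forwards it to `evaluate_model` as `callback_configs`, and `evaluate_model` expands it into `Callback` instances alongside the gauntlet callback. A condensed restatement of that flow (the helper name and trimmed argument lists are mine, not the script's):

```python
from typing import Any, Dict, List, Optional

from composer.core.callback import Callback

from llmfoundry.utils.builders import build_callback


def build_eval_callbacks(callback_configs: Optional[Dict[str, Any]]) -> List[Callback]:
    """Mirrors the list comprehension added to evaluate_model."""
    if not callback_configs:
        return []
    return [
        build_callback(str(name), callback_cfg)
        for name, callback_cfg in callback_configs.items()
    ]


# In main(cfg), roughly:
#   callback_configs = pop_config(cfg, 'callbacks', must_exist=False, default_value=None)
#   ...
#   evaluate_model(..., callback_configs=callback_configs, ...)
# and inside evaluate_model the resulting list is handed to Trainer(callbacks=callbacks).
```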