[WIP] Output eval logging #740

Closed · wants to merge 19 commits
8 changes: 5 additions & 3 deletions llmfoundry/utils/builders.py
@@ -11,9 +11,9 @@

import torch
from composer import algorithms
from composer.callbacks import (EarlyStopper, Generate, LRMonitor,
MemoryMonitor, OptimizerMonitor,
RuntimeEstimator, SpeedMonitor)
from composer.callbacks import (EarlyStopper, Generate, LRMonitor, MemoryMonitor,
OptimizerMonitor, RuntimeEstimator, EvalOutputLogging,
SpeedMonitor)
from composer.core import Algorithm, Callback, Evaluator
from composer.datasets.in_context_learning_evaluation import \
get_icl_task_dataloader
@@ -205,6 +205,8 @@ def build_callback(name: str, kwargs: Union[DictConfig, Dict[str,
if isinstance(kwargs, DictConfig):
kwargs = om.to_object(kwargs) # pyright: ignore
return HuggingFaceCheckpointer(**kwargs)
elif name == 'eval_output_logging':
return EvalOutputLogging(**kwargs)
else:
raise ValueError(f'Not sure how to build callback: {name}')

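For context on how this new dispatch branch gets exercised: the eval YAMLs below declare a `callbacks` section whose entries are passed to `build_callback` by name. A minimal usage sketch, assuming `EvalOutputLogging` (from the Composer branch installed in the mcli commands below) accepts the keyword arguments used in those YAMLs:

```python
# Hedged sketch of how a `callbacks` config entry reaches the new branch above.
# `EvalOutputLogging` and its kwargs come from the Composer fork installed in the
# mcli command sections below; the bucket path here is a placeholder.
from llmfoundry.utils.builders import build_callback

eval_output_cfg = {
    'subset_sample': -1,  # assumed: -1 logs every eval sample
    'output_directory': 's3://my-bucket/eval-outputs',
}
callback = build_callback('eval_output_logging', eval_output_cfg)
# The returned composer Callback can then be passed to Trainer(callbacks=[...]).
```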
31 changes: 24 additions & 7 deletions mcli/mcli-hf-eval.yaml
@@ -1,20 +1,30 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
<<<<<<< HEAD
git_branch: output_eval_logging
=======
<<<<<<< HEAD
git_branch: output_eval_logging
=======
git_branch: v0.4.0
>>>>>>> main
>>>>>>> e3861070fb7cf3a555db46bc00dc2aaa955e559b
# git_commit: # OR use your commit hash
pip_install: -e ".[gpu]"
ssh_clone: false # Should be true if using a private repo

command: |
pip uninstall mosaicml -y
pip install git+https://github.com/bmosaicml/composer.git@error_logging_callback
cd llm-foundry/scripts
composer eval/eval.py /mnt/config/parameters.yaml

# Mosaic Cloud will use run_name (with a unique suffix) to populate the env var $RUN_NAME
run_name: mpt-eval
gpu_num: 8
# gpu_type:
# cluster: # replace with your cluster here!
run_name: gsm8k-debug
gpu_num: 32
gpu_type: a100_40gb
cluster: r7z2 # replace with your cluster here!

image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest

@@ -28,16 +38,16 @@ parameters:

models:
-
model_name: mosaicml/mpt-7b-instruct
model_name: mosaicml/mpt-30b
# Tokenizer
tokenizer:
name: EleutherAI/gpt-neox-20b
name: mosaicml/mpt-30b
kwargs:
model_max_length: ${max_seq_len}

model:
name: hf_causal_lm
pretrained_model_name_or_path: mosaicml/mpt-7b-instruct
pretrained_model_name_or_path: mosaicml/mpt-30b
init_device: mixed
pretrained: true
use_auth_token: false
@@ -50,5 +60,12 @@ parameters:
limit_all_gathers: True



callbacks:
eval_output_logging:
subset_sample: -1
output_directory: s3://mosaicml-internal-checkpoints-test/test_icl_output_logger_30b_base_gsm8k


icl_tasks: 'eval/yamls/tasks_v0.2.yaml'
eval_gauntlet: 'eval/yamls/eval_gauntlet_v0.2.yaml'
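
The `callbacks` block above is the piece this PR wires in: per the config keys, an eval-output logger that writes per-sample ICL results to the given S3 prefix. The real `EvalOutputLogging` lives on the `error_logging_callback` branch of the Composer fork installed in the `command` section; as a rough illustration of the Composer `Callback` interface it would plug into (method bodies, attribute names, and output format here are assumptions, not the actual implementation):

```python
from typing import Any, Dict, List

from composer.core import Callback, State
from composer.loggers import Logger


class EvalOutputLoggingSketch(Callback):
    """Illustrative stand-in for EvalOutputLogging; not the real implementation."""

    def __init__(self,
                 subset_sample: int = -1,
                 output_directory: str = './eval_outputs',
                 print_only_incorrect: bool = False) -> None:
        self.subset_sample = subset_sample                # assumed: -1 keeps every sample
        self.output_directory = output_directory          # local path or object-store URI
        self.print_only_incorrect = print_only_incorrect  # assumed: keep only wrong answers
        self.rows: List[Dict[str, Any]] = []

    def eval_batch_end(self, state: State, logger: Logger) -> None:
        # A real implementation would decode state.batch / state.outputs here and
        # accumulate prompt, model output, and correctness per sample.
        self.rows.append({'batch': state.timestamp.batch.value})

    def eval_end(self, state: State, logger: Logger) -> None:
        # A real implementation would serialize self.rows (e.g. JSONL/CSV) and
        # upload the file to self.output_directory.
        pass
```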
68 changes: 68 additions & 0 deletions mcli/mcli-rlhf-eval.yaml
@@ -0,0 +1,68 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
git_branch: output_eval_logging
# git_commit: # OR use your commit hash
pip_install: -e ".[gpu]"
ssh_clone: false # Should be true if using a private repo

command: |
pip uninstall mosaicml -y
pip install git+https://github.com/bmosaicml/composer.git@error_logging_callback
cd llm-foundry/scripts
composer eval/eval.py /mnt/config/parameters.yaml

# Mosaic Cloud will use run_name (with a unique suffix) to populate the env var $RUN_NAME
run_name: output-logger-rlhf-prompts
gpu_num: 8
gpu_type: a100_80gb
cluster: r1z1 # replace with your cluster here!

image: mosaicml/llm-foundry:2.0.1_cu118-latest

# The below is injected as a YAML file: /mnt/config/parameters.yaml
parameters:
dist_timeout: 6000
seed: 1
max_seq_len: 1024
device_eval_batch_size: 1
precision: amp_fp16

models:
-
model_name: mosaicml/mpt-30b-instruct
# Tokenizer
tokenizer:
name: mosaicml/mpt-30b-instruct
kwargs:
model_max_length: ${max_seq_len}

model:
name: hf_causal_lm
pretrained_model_name_or_path: mosaicml/mpt-30b-instruct
init_device: mixed
pretrained: true
use_auth_token: false

# FSDP config for model sharding
fsdp_config:
sharding_strategy: FULL_SHARD
mixed_precision: FULL
forward_prefetch: True
limit_all_gathers: True


icl_tasks:
-
label: rlhf_prompts
dataset_uri: eval/local_data/rlhf_prompts/rlhf_prompts.jsonl # ADD YOUR OWN DATASET URI
num_fewshot: [0]
icl_task_type: question_answering
has_categories: true

callbacks:
eval_output_logging:
print_only_incorrect: false
subset_sample: -1
output_directory: s3://mosaicml-internal-checkpoints-test/30b_instruct_rlhf_prompts
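
The `rlhf_prompts` task points at a local JSONL you supply yourself. For a `question_answering` ICL task, each line is expected to carry a prompt/answer pair, plus a category field when `has_categories` is true; the field names in the sketch below are an assumption based on llm-foundry's other QA task data, so check them against your installed `composer.datasets.in_context_learning_evaluation` version:

```python
# Hedged helper to write a tiny rlhf_prompts.jsonl in the shape the QA ICL
# dataloader expects; field names ("context", "answer", "category") are assumed.
import json

examples = [
    {'context': 'What does the EvalOutputLogging callback write out?',
     'answer': 'Per-sample eval outputs',
     'category': 'helpfulness'},
]

with open('rlhf_prompts.jsonl', 'w') as f:
    for ex in examples:
        f.write(json.dumps(ex) + '\n')
```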

22 changes: 19 additions & 3 deletions scripts/eval/eval.py
@@ -6,6 +6,7 @@
import sys
import time
import warnings
from composer.core.callback import Callback
from typing import Any, Dict, List, Optional, Tuple, Union

import pandas as pd
@@ -21,7 +22,7 @@

from llmfoundry.models import MPTForCausalLM
from llmfoundry.models.model_registry import COMPOSER_MODEL_REGISTRY
from llmfoundry.utils.builders import (add_metrics_to_eval_loaders,
from llmfoundry.utils.builders import (add_metrics_to_eval_loaders, build_callback,
build_evaluators, build_logger,
build_tokenizer)
from llmfoundry.utils.config_utils import pop_config, process_init_device
@@ -114,6 +115,7 @@ def evaluate_model(
precision: str,
eval_gauntlet_df: Optional[pd.DataFrame],
icl_subset_num_batches: Optional[int],
callback_configs: Optional[Dict]
):

print(f'Evaluating model: {model_cfg.model_name}', flush=True)
@@ -135,7 +137,12 @@
icl_subset_num_batches=icl_subset_num_batches,
)

callbacks = []
# Callbacks
callbacks: List[Callback] = [
build_callback(str(name), callback_cfg)
for name, callback_cfg in callback_configs.items()
] if callback_configs else []

if eval_gauntlet_callback is not None:
callbacks.append(eval_gauntlet_callback)

@@ -192,6 +199,7 @@ def evaluate_model(
dist_timeout=dist_timeout,
python_log_level=python_log_level,
)


if torch.cuda.is_available():
torch.cuda.synchronize()
@@ -272,7 +280,11 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]:
default_value=None)
# Pop out interpolation variables.
pop_config(cfg, 'model_name_or_path', must_exist=False, default_value=None)

callback_configs: Optional[DictConfig] = pop_config(cfg,
'callbacks',
must_exist=False,
default_value=None)

# Warn for unused parameters
for key in cfg:
warnings.warn(
@@ -313,6 +325,7 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]:
python_log_level=python_log_level,
precision=precision,
eval_gauntlet_df=eval_gauntlet_df,
callback_configs=callback_configs,
icl_subset_num_batches=icl_subset_num_batches)
trainers.append(trainer)

@@ -356,6 +369,9 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]:
return trainers, eval_gauntlet_df





def calculate_markdown_results(logger_keys: List[str], trainer: Trainer,
benchmark_to_taxonomy: Dict[str, str],
model_name: str):
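Taken together, the eval.py changes thread an optional `callbacks` section from the run config through to the Trainer: `main` pops `'callbacks'` off `cfg` (so it does not trip the unused-parameter warning), forwards it to `evaluate_model` as `callback_configs`, and `evaluate_model` expands it into `Callback` instances alongside the gauntlet callback. A condensed restatement of that flow (the helper name and trimmed argument lists are mine, not the script's):

```python
from typing import Any, Dict, List, Optional

from composer.core.callback import Callback

from llmfoundry.utils.builders import build_callback


def build_eval_callbacks(callback_configs: Optional[Dict[str, Any]]) -> List[Callback]:
    """Mirrors the list comprehension added to evaluate_model."""
    if not callback_configs:
        return []
    return [
        build_callback(str(name), callback_cfg)
        for name, callback_cfg in callback_configs.items()
    ]


# In main(cfg), roughly:
#   callback_configs = pop_config(cfg, 'callbacks', must_exist=False, default_value=None)
#   ...
#   evaluate_model(..., callback_configs=callback_configs, ...)
# and inside evaluate_model the resulting list is handed to Trainer(callbacks=callbacks).
```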