Commit

Merge remote-tracking branch 'foundry-official/output_eval_logging' into long_context_from_hugging_face

maxisawesome committed Nov 30, 2023
2 parents 47972cb + 1e6e923 commit 488e9a5
Showing 67 changed files with 3,615 additions and 1,565 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pr-gpu.yaml
@@ -40,7 +40,7 @@ jobs:
if: github.repository_owner == 'mosaicml'
with:
container: ${{ matrix.container }}
mcloud-timeout: 1200
mcloud-timeout: 1800
name: ${{ matrix.name }}
pytest-command: ${{ matrix.pytest_command }}
pytest-markers: ${{ matrix.markers }}
6 changes: 3 additions & 3 deletions README.md
@@ -181,14 +181,14 @@ source llmfoundry-venv-amd/bin/activate

# installs
pip install cmake packaging torch
pip install -e . # this installs some things which are not needed but they dont hurt
pip install -e . # This installs some things that are not needed but they don't hurt
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.4.2
```
**Lastly**, install the ROCm enabled flash attention (instructions [here](https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm2#amd-gpurocm-support)).

Notes:
1. `attn_impl: triton` does not work.
1. We don't yet have a docker img where everything works perfectly. You might need to up/down grade some packages (in our case, we needed to downgrade to `numpy==1.23.5`) before everything works without issue.
1. We don't yet have a docker img where everything works perfectly. You might need to up/downgrade some packages (in our case, we needed to downgrade to `numpy==1.23.5`) before everything works without issue.

# Quickstart

@@ -228,7 +228,7 @@ python inference/convert_composer_to_hf.py \
# --hf_repo_for_upload user-org/repo-name

# Evaluate the model on a subset of tasks
python eval/eval.py \
composer eval/eval.py \
eval/yamls/hf_eval.yaml \
icl_tasks=eval/yamls/copa.yaml \
model_name_or_path=mpt-125m-hf
73 changes: 50 additions & 23 deletions TUTORIAL.md

Large diffs are not rendered by default.

11 changes: 9 additions & 2 deletions llmfoundry/__init__.py
@@ -4,6 +4,11 @@
import torch

try:
# Before importing any transformers models, we need to disable transformers flash attention if
# we are in an environment with flash attention version <2. Otherwise, transformers raises a hard
# error on an import that is not properly gated.
import transformers

from llmfoundry import optim, utils
from llmfoundry.data import (ConcatTokensDataset,
MixtureOfDenoisersCollator, NoConcatDataset,
@@ -14,8 +19,8 @@
ComposerHFT5)
from llmfoundry.models.layers.attention import (
MultiheadAttention, attn_bias_shape, build_alibi_bias, build_attn_bias,
flash_attn_fn, scaled_multihead_dot_product_attention,
triton_flash_attn_fn)
flash_attn_fn, is_flash_v1_installed,
scaled_multihead_dot_product_attention, triton_flash_attn_fn)
from llmfoundry.models.layers.blocks import MPTBlock
from llmfoundry.models.layers.ffn import (FFN_CLASS_REGISTRY, MPTMLP,
build_ffn)
@@ -24,6 +29,8 @@
MPTForCausalLM, MPTModel,
MPTPreTrainedModel)
from llmfoundry.tokenizers import TiktokenTokenizerWrapper
if is_flash_v1_installed():
transformers.utils.is_flash_attn_available = lambda: False

except ImportError as e:
try:
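
For context, the gate added above depends on `is_flash_v1_installed()`, whose body is not shown in this diff. Below is a minimal sketch of how such a check could be implemented; the version-parsing logic is an assumption for illustration, not the function's actual implementation.

```python
# Hypothetical sketch -- the real is_flash_v1_installed() in llm-foundry may differ.
from packaging import version


def is_flash_v1_installed() -> bool:
    """Return True if flash-attn is installed with a major version below 2."""
    try:
        import flash_attn
    except ImportError:
        return False
    return version.parse(flash_attn.__version__) < version.parse('2.0.0')


# As in the diff above: if flash-attn v1 is present, report flash attention as
# unavailable so transformers skips its v2-only import path.
import transformers

if is_flash_v1_installed():
    transformers.utils.is_flash_attn_available = lambda: False
```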
88 changes: 63 additions & 25 deletions llmfoundry/callbacks/eval_gauntlet_callback.py
@@ -22,6 +22,32 @@ class Weighting(Enum):
LOG_SAMPLE_SZ = 3


def calculate_named_averages(average_names: Dict[str, list],
category_scores: Dict[str, float]):
"""Calculates the named averages based off the raw category scores.
For each named average, take a simple average of all the category scores associated with that named average.
Args:
average_names (dict[str, list]): Contains a mapping from each named average to the list of category names that average should consist of.
category_scores (dict[str, float]): Contains the raw scores corresponding to each category.
"""
average_scores = {}
for avg_name, category_list in average_names.items():
composite_subset = {
category: score
for category, score in category_scores.items()
if category in category_list
}
if len(composite_subset.values()) > 0:
average_scores[avg_name] = sum(composite_subset.values()) / len(
composite_subset.values())
else:
average_scores[avg_name] = 0

return average_scores
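
To make the behavior concrete, here is a small usage sketch of `calculate_named_averages` with made-up category names and scores (none of these values come from the diff):

```python
# Hypothetical inputs, for illustration only.
average_names = {
    'core_average': ['world_knowledge', 'reading_comprehension'],
    'default_average': ['world_knowledge', 'reading_comprehension', 'safety'],
}
category_scores = {
    'world_knowledge': 0.42,
    'reading_comprehension': 0.38,
    'safety': 0.55,
}

print(calculate_named_averages(average_names, category_scores))
# {'core_average': 0.4, 'default_average': 0.45} (up to float rounding)
```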


class EvalGauntlet(Callback):
"""The EvalGauntlet aggregates ICL eval results.
@@ -31,7 +57,7 @@ class EvalGauntlet(Callback):
Args:
logger_keys (list): These are the exact keys that the individual benchmark metrics will be
logged under in the logger after eval
tasks (dict): This contains the list of categories, as well as the subtasks within them, the
categories (dict): This contains the list of categories, as well as the subtasks within them, the
random baseline accuracy of each subtask, and the number of fewshot examples
used for the task. See `llmfoundry/scripts/eval/yamls/eval_gauntlet.yaml` to see the structure.
weighting (Weighting): The weighting scheme used to balance different tasks within each category.
@@ -43,6 +69,7 @@ class EvalGauntlet(Callback):
rescale_accuracy (bool): Flag determining whether to rescale the accuracy on each benchmark
by (1-random_baseline_accuracy) before aggregating. Using this ensures that all benchmarks max out at 1.0.
benchmark_sizes (Optional[dict]): Optional data on benchmark sizes, used when not relying on equal weighting.
averages (Optional[dict]): Optional dictionary specifying a mapping from average names to the lists of categories used to produce each named average.
"""

def __init__(self,
@@ -51,7 +78,8 @@ def __init__(self,
weighting: str = 'EQUAL',
subtract_random_baseline: bool = True,
rescale_accuracy: bool = True,
benchmark_sizes: Optional[dict] = None):
benchmark_sizes: Optional[dict] = None,
averages: Optional[dict] = None):
if isinstance(logger_keys, dict):
raise ValueError(
'logger_keys now requires a list type as input, not a dict')
@@ -66,13 +94,12 @@ def __init__(self,
)

self.categories = categories
self.category_names = [conf.get('name') for conf in self.categories]
self.weighting = Weighting[weighting]
self.subtract_random_baseline = subtract_random_baseline
self.rescale_accuracy = rescale_accuracy
self.logger_keys = logger_keys

for category in self.categories:

for benchmark in category['benchmarks']:
bench_name = f"{benchmark['name']}/{benchmark['num_fewshot']}-shot"

@@ -95,7 +122,20 @@ def __init__(self,
assert weight is not None
benchmark['weighting'] = weight

def compute_averages(self, state: State) -> Dict[str, float]:
self.averages = {}
if averages is not None:
self.averages = averages
else:
# if no averages spec provided, simply average everything
self.averages['default_average'] = self.category_names

for avg_name in self.averages:
if avg_name in self.category_names:
raise ValueError(
f'Found average name `{avg_name}` used as category name. Average names and category names must be non-overlapping.'
)

def extract_metrics_from_state(self, state: State) -> Dict[str, float]:
results = {}

for key in self.logger_keys:
@@ -121,31 +161,30 @@ def compute_averages(self, state: State) -> Dict[str, float]:
return {k: sum(v) / len(v) for k, v in results.items()}

def eval_after_all(self, state: State, logger: Logger) -> Dict[str, float]:
new_metrics = self.compute_averages(state)
if len(new_metrics) == 0:
computed_metrics = self.extract_metrics_from_state(state)
if len(computed_metrics) == 0:
return {}
composite_scores = {}

category_scores = {}
for category in self.categories:
missing_metrics = []
composite_scores[category['name']] = []
category_scores[category['name']] = []
for benchmark in category['benchmarks']:
key = f"{benchmark['name']}/{benchmark['num_fewshot']}-shot"

if key not in new_metrics:
if key not in computed_metrics:
log.warning(
f'Could not find results for benchmark: {benchmark}.')
missing_metrics.append(key)
else:
score = new_metrics[key]
score = computed_metrics[key]

if self.subtract_random_baseline:
score -= benchmark['random_baseline']

if self.rescale_accuracy and self.subtract_random_baseline:
score /= 1.0 - benchmark['random_baseline']

composite_scores[category['name']].append({
category_scores[category['name']].append({
'name': benchmark['name'],
'score': score,
'weighting': benchmark['weighting']
@@ -155,23 +194,22 @@ def eval_after_all(self, state: State, logger: Logger) -> Dict[str, float]:
log.warning(
f"Removing category `{category['name']}` from scores because benchmarks were missing: {missing_metrics}"
)
del composite_scores[category['name']]
del category_scores[category['name']]
continue
total_weight = sum(
k['weighting'] for k in composite_scores[category['name']])
composite_scores[category['name']] = sum(
k['weighting'] for k in category_scores[category['name']])
category_scores[category['name']] = sum(
k['score'] * (k['weighting'] / total_weight)
for k in composite_scores[category['name']])
for k in category_scores[category['name']])

composite_scores = {
named_averages = calculate_named_averages(self.averages,
category_scores)
category_scores.update(named_averages)
category_scores = {
f'icl/metrics/eval_gauntlet/{k}': v
for k, v in composite_scores.items()
for k, v in category_scores.items()
}

composite_scores['icl/metrics/eval_gauntlet/average'] = sum(
composite_scores.values()) / len(composite_scores.values()) if len(
composite_scores.values()) > 0 else 0
if logger is not None:
logger.log_metrics(composite_scores)
logger.log_metrics(category_scores)

return composite_scores
return category_scores
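
The rewritten `eval_after_all` first collapses each category into a single weighted score, then layers the named averages from `calculate_named_averages` on top and prefixes every key with `icl/metrics/eval_gauntlet/`. A worked example of the per-category weighting, with hypothetical numbers:

```python
# Hypothetical benchmark entries for one category (scores already
# baseline-subtracted and rescaled, as in the loop above).
benchmarks = [
    {'name': 'task_a', 'score': 0.30, 'weighting': 1.0},
    {'name': 'task_b', 'score': 0.60, 'weighting': 3.0},
]

total_weight = sum(b['weighting'] for b in benchmarks)  # 4.0
category_score = sum(
    b['score'] * (b['weighting'] / total_weight) for b in benchmarks)

print(category_score)  # ~0.525, i.e. 0.30 * 0.25 + 0.60 * 0.75
```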
62 changes: 40 additions & 22 deletions llmfoundry/callbacks/hf_checkpointer.py
@@ -4,18 +4,20 @@
import contextlib
import copy
import logging
import math
import os
import tempfile
from pathlib import Path
from typing import Optional, Union

import torch
from composer.core import Callback, Event, State, Time
from composer.core import Callback, Event, State, Time, TimeUnit
from composer.core.state import fsdp_state_dict_type_context
from composer.loggers import Logger, MLFlowLogger
from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader
from composer.models import HuggingFaceModel
from composer.utils import dist, format_name_with_dist_and_time, parse_uri
from composer.utils import (dist, format_name_with_dist_and_time,
maybe_create_remote_uploader_downloader_from_uri,
parse_uri)
from composer.utils.misc import create_interval_scheduler
from transformers import PreTrainedModel, PreTrainedTokenizerBase

@@ -52,12 +54,11 @@ def __init__(
save_interval: Union[str, int, Time],
huggingface_folder_name: str = 'ba{batch}',
precision: str = 'float32',
overwrite: bool = False,
overwrite: bool = True,
mlflow_registered_model_name: Optional[str] = None,
mlflow_logging_config: Optional[dict] = None,
):
self.backend, self.bucket_name, self.save_dir_format_str = parse_uri(
save_folder)
_, _, self.save_dir_format_str = parse_uri(save_folder)
self.overwrite = overwrite
self.precision = precision
self.dtype = {
@@ -83,15 +84,20 @@ def __init__(

self.huggingface_folder_name_fstr = os.path.join(
'huggingface', huggingface_folder_name)

if isinstance(save_interval, str):
save_interval = Time.from_timestring(save_interval)
if isinstance(save_interval, int):
save_interval = Time(save_interval, TimeUnit.EPOCH)

self.save_interval = save_interval
self.check_interval = create_interval_scheduler(
save_interval, include_end_of_training=True)
self.upload_to_object_store = (self.backend != '')
if self.upload_to_object_store:
self.remote_ud = RemoteUploaderDownloader(
bucket_uri=f'{self.backend}://{self.bucket_name}',
num_concurrent_uploads=4)
else:
self.remote_ud = None

self.remote_ud = maybe_create_remote_uploader_downloader_from_uri(
save_folder, loggers=[])
if self.remote_ud is not None:
self.remote_ud._num_concurrent_uploads = 4

self.last_checkpoint_batch: Optional[Time] = None
self.mlflow_loggers = []
@@ -107,7 +113,7 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None:
raise ValueError(
f'`HuggingFaceCheckpointer` is only compatible with `HuggingFaceModel`s. '
+ f'Got {type(state.model)} instead.')
if self.upload_to_object_store and self.remote_ud is not None:
if self.remote_ud is not None:
self.remote_ud.init(state, logger)
state.callbacks.append(self.remote_ud)

@@ -128,6 +134,21 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None:
mlflow.environment_variables.MLFLOW_HUGGINGFACE_MODEL_MAX_SHARD_SIZE.set(
'5GB')

def _is_last_batch(self, state: State):
elapsed_duration = state.get_elapsed_duration()
if elapsed_duration is not None and elapsed_duration >= 1.0:
return True

assert state.max_duration is not None # for pyright
# If the save interval is specified as 1dur, and the max duration is in epoch units
# we need a special case to identify we are on the last batch and should write the mlflow checkpoint
if self.save_interval.unit == TimeUnit.DURATION and self.save_interval.value == 1 and state.max_duration.unit == TimeUnit.EPOCH:
assert state.dataloader_len is not None # for pyright
return int(state.timestamp.batch) % math.ceil(
state.max_duration.value * state.dataloader_len) == 0

return False

def _save_checkpoint(self, state: State, logger: Logger):
del logger # unused

@@ -146,7 +167,7 @@ def _save_checkpoint(self, state: State, logger: Logger):
self.huggingface_folder_name_fstr), state.run_name,
state.timestamp)
dir_context_mgr = tempfile.TemporaryDirectory(
) if self.upload_to_object_store else contextlib.nullcontext(
) if self.remote_ud is not None else contextlib.nullcontext(
enter_result=save_dir)

with dir_context_mgr as temp_save_dir:
@@ -210,11 +231,8 @@ def _save_checkpoint(self, state: State, logger: Logger):
log.debug('Editing MPT files for HuggingFace compatibility')
edit_files_for_hf_compatibility(temp_save_dir)

if self.upload_to_object_store:
assert self.remote_ud is not None
log.info(
f'Uploading HuggingFace formatted checkpoint to {self.backend}://{self.bucket_name}/{save_dir}'
)
if self.remote_ud is not None:
log.info(f'Uploading HuggingFace formatted checkpoint')
for filename in os.listdir(temp_save_dir):
self.remote_ud.upload_file(
state=state,
@@ -224,8 +242,8 @@ def _save_checkpoint(self, state: State, logger: Logger):
overwrite=self.overwrite,
)

elapsed_duration = state.get_elapsed_duration()
if self.mlflow_registered_model_name is not None and elapsed_duration is not None and elapsed_duration >= 1.0:
if self.mlflow_registered_model_name and self._is_last_batch(
state):
components = {'model': new_model_instance}
if original_tokenizer is not None:
components['tokenizer'] = original_tokenizer
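
The new `_is_last_batch` helper adds a special case for a save interval of `1dur` when `max_duration` is expressed in epochs. With made-up numbers, the arithmetic behind that branch looks like this (a sketch under assumed values, not the callback's full logic):

```python
import math

# Hypothetical run: max_duration = 2 epochs, 125 batches per epoch.
max_duration_epochs = 2
dataloader_len = 125

# Total batches in the run, mirroring math.ceil(max_duration.value * dataloader_len).
batches_per_run = math.ceil(max_duration_epochs * dataloader_len)  # 250

# The special case flags batch indices that are exact multiples of this total,
# e.g. batch 250 at the very end of training.
current_batch = 250
print(current_batch % batches_per_run == 0)  # True
```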
2 changes: 2 additions & 0 deletions llmfoundry/data/__init__.py
@@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0

from llmfoundry.data.data import ConcatTokensDataset, NoConcatDataset
from llmfoundry.data.dataloader import build_dataloader
from llmfoundry.data.denoising import (MixtureOfDenoisersCollator,
build_text_denoising_dataloader)
from llmfoundry.data.finetuning import (Seq2SeqFinetuningCollator,
@@ -18,4 +19,5 @@
'build_text_dataloader',
'NoConcatDataset',
'ConcatTokensDataset',
'build_dataloader',
]
