
Commit 28d7777
fix(dataset): normalize tokenizer config and change hash from tokenizer class to tokenizer path
NanoCode012 committed Mar 19, 2024
1 parent b1e3e1b commit 28d7777
Showing 3 changed files with 6 additions and 3 deletions.
4 changes: 4 additions & 0 deletions src/axolotl/utils/config/__init__.py
@@ -119,6 +119,10 @@ def normalize_config(cfg):
     model_config = load_model_config(cfg)
     cfg.model_config_type = model_config.model_type

+    cfg.tokenizer_config = (
+        cfg.tokenizer_config or cfg.base_model_config or cfg.base_model
+    )
+
     # figure out if the model is llama
     cfg.is_llama_derived_model = (
         (hasattr(model_config, "model_type") and model_config.model_type == "llama")
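The fallback chain above resolves the tokenizer path once during config normalization, so downstream code can rely on cfg.tokenizer_config being set. A minimal sketch of the same or-chain behavior, using SimpleNamespace in place of Axolotl's dict-like config object (the field names come from the diff; the values are purely illustrative):

    from types import SimpleNamespace

    def resolve_tokenizer_config(cfg):
        # Fall back from tokenizer_config -> base_model_config -> base_model,
        # mirroring the or-chain added in normalize_config above.
        return cfg.tokenizer_config or cfg.base_model_config or cfg.base_model

    # Illustrative values only: no explicit tokenizer_config or base_model_config,
    # so the base model path is used.
    cfg = SimpleNamespace(
        tokenizer_config=None,
        base_model_config=None,
        base_model="meta-llama/Llama-2-7b-hf",
    )
    print(resolve_tokenizer_config(cfg))  # -> "meta-llama/Llama-2-7b-hf"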
2 changes: 1 addition & 1 deletion src/axolotl/utils/data.py
@@ -134,7 +134,7 @@ def load_tokenized_prepared_datasets(
     split="train",
 ) -> Tuple[DatasetDict, List[Prompter]]:
     cfg_datasets = cfg.test_datasets if split == "test" else cfg.datasets
-    tokenizer_name = tokenizer.__class__.__name__
+    tokenizer_name = cfg.tokenizer_config
     ds_hash = str(
         md5(
             (
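With tokenizer_name now taken from cfg.tokenizer_config (a path) rather than the tokenizer class name, prepared-dataset caches are keyed per tokenizer checkpoint instead of per class, so two checkpoints that both use LlamaTokenizer but have different vocabularies no longer collide. A rough sketch of the effect on the hash; the string actually hashed in load_tokenized_prepared_datasets includes more fields than shown here:

    from hashlib import md5

    def ds_hash(tokenizer_name: str, max_len: int = 2048) -> str:
        # Abbreviated stand-in for the string hashed when naming the prepared dataset.
        return md5(f"{tokenizer_name}@{max_len}".encode("utf-8")).hexdigest()

    # Previously both checkpoints hashed as "LlamaTokenizer" and shared a cache entry;
    # hashing the path keeps their prepared datasets separate.
    print(ds_hash("meta-llama/Llama-2-7b-hf"))
    print(ds_hash("NousResearch/Llama-2-7b-hf"))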
3 changes: 1 addition & 2 deletions src/axolotl/utils/models.py
@@ -138,9 +138,8 @@ def load_tokenizer(cfg):
     if cfg.tokenizer_type:
         tokenizer_cls = getattr(transformers, cfg.tokenizer_type)

-    tokenizer_config = cfg.tokenizer_config or cfg.base_model_config or cfg.base_model
     tokenizer = tokenizer_cls.from_pretrained(
-        tokenizer_config,
+        cfg.tokenizer_config,
         trust_remote_code=cfg.trust_remote_code or False,
         use_fast=use_fast,
         **tokenizer_kwargs,
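Since normalize_config now guarantees cfg.tokenizer_config is populated, load_tokenizer can pass it to from_pretrained directly instead of recomputing the fallback locally. A minimal sketch of the equivalent call, assuming AutoTokenizer when no explicit tokenizer_type is configured (the extra tokenizer_kwargs from the real function are omitted):

    from transformers import AutoTokenizer

    def load_tokenizer_sketch(cfg):
        # cfg.tokenizer_config is already resolved by normalize_config above,
        # so the tokenizer is loaded from that path (or hub id) directly.
        return AutoTokenizer.from_pretrained(
            cfg.tokenizer_config,
            trust_remote_code=cfg.trust_remote_code or False,
        )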
