Skip to content

Commit

Permalink
update llama predict
Browse files Browse the repository at this point in the history
  • Loading branch information
DesmonDay committed Jul 18, 2024
1 parent 62a18c4 commit a19f4b6
Show file tree
Hide file tree
Showing 3 changed files with 213 additions and 20 deletions.
13 changes: 0 additions & 13 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,16 +44,3 @@ repos:
entry: python .copyright.hook
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|xpu|kps|py|sh)$
# For Markdown files
- repo: https://github.com/igorshubovych/markdownlint-cli
rev: v0.41.0
hooks:
- id: markdownlint
- repo: local
hooks:
- id: add-spaces-between-chinese-and-english
name: Add spaces between Chinese and English characters
entry: python scripts/codestyle/check_spaces.py
language: python
files: \.(md|markdown)$
pass_filenames: true
56 changes: 49 additions & 7 deletions paddlenlp/experimental/transformers/llama/modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
GenerationBlockInferenceModel,
GenerationInferenceModel,
)
from paddlenlp.experimental.transformers.utils import load_tp_checkpoint
from paddlenlp.transformers import LlamaConfig, LlamaPretrainedModel
from paddlenlp.transformers.conversion_utils import split_param_func
from paddlenlp.transformers.llama.modeling import LlamaLMHead
Expand All @@ -51,9 +52,16 @@
CausalLMOutputWithCrossAttentions,
)
from paddlenlp.transformers.model_utils import (
dtype_guard,
dy2st_nocheck_guard_context,
no_init_weights,
register_base_model,
)
from paddlenlp.transformers.utils import (
ContextManagers,
is_paddle_support_lazy_init,
is_safetensors_available,
)
from paddlenlp.utils.log import logger

__all__ = [
Expand Down Expand Up @@ -1238,9 +1246,47 @@ def __init__(self, config):

@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
# TODO: Support safetensors loading.
kwargs["use_safetensors"] = False
return super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
config = kwargs.pop("config", None)
cache_dir = kwargs.pop("cache_dir", None)
dtype = kwargs.pop("dtype", None)
if dtype is None:
dtype = config.dtype
subfolder = kwargs.pop("subfolder", None)
if subfolder is None:
subfolder = ""
variant = kwargs.pop("variant", None)
use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False)
low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", False)

init_contexts = []
if low_cpu_mem_usage or config.quantization_config.is_weight_quantize():
# Instantiate model.
init_contexts.append(no_init_weights(_enable=True))
if is_paddle_support_lazy_init():
init_contexts.append(paddle.LazyGuard())
if dtype:
init_contexts.append(dtype_guard(dtype))

# init the model
with ContextManagers(init_contexts):
model = cls(config)

resolved_archive_file, resolved_sharded_files, sharded_metadata, is_sharded = cls._resolve_model_file_path(
pretrained_model_name_or_path,
cache_dir=cache_dir,
subfolder=subfolder,
from_hf_hub=False,
from_aistudio=False,
config=config,
convert_from_torch=False,
use_safetensors=use_safetensors,
variant=variant,
)

model_path = os.path.dirname(resolved_archive_file)
state_dict = load_tp_checkpoint(model_path, cls, config)
model.set_state_dict(state_dict)
return model

@classmethod
def get_cache_kvs_shape(
Expand Down Expand Up @@ -1479,10 +1525,6 @@ def get_tensor_parallel_split_mappings(num_layers):
def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
# TODO: Support safetensors loading.
kwargs["use_safetensors"] = False
from paddlenlp.transformers.utils import (
ContextManagers,
is_safetensors_available,
)

from_hf_hub = kwargs.pop("from_hf_hub", False)
config = kwargs.pop("config", None)
Expand Down
164 changes: 164 additions & 0 deletions paddlenlp/experimental/transformers/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations

import json
import os
from functools import partial

import numpy as np
import paddle
from tqdm import tqdm

from paddlenlp.transformers import AutoConfig
from paddlenlp.transformers.model_utils import _add_variant, load_state_dict
from paddlenlp.transformers.utils import paddlenlp_load
from paddlenlp.utils.env import (
PADDLE_WEIGHTS_INDEX_NAME,
SAFE_MASTER_WEIGHTS_INDEX_NAME,
SAFE_PEFT_WEIGHTS_INDEX_NAME,
SAFE_WEIGHTS_INDEX_NAME,
)

try:
from paddlenlp.utils.safetensors import fast_load_file as safe_load_file
from paddlenlp.utils.safetensors import fast_safe_open as safe_open
except:
from safetensors import safe_open
from safetensors.numpy import load_file as safe_load_file


def load_sharded_checkpoint(folder, variant=None, return_numpy=False):
"""
This load is performed efficiently: each checkpoint shard is loaded one by one in RAM and deleted after being
loaded in the model.
Args:
folder (`str` or `os.PathLike`): A path to a folder containing the sharded checkpoint.
variant (`str`): The model variant.
return_numpy (`bool`): Whether to return numpy array instead of paddle tensor.
"""
# Load the index
pdparams_file = os.path.join(folder, _add_variant("model_state.pdparams", variant))
lora_pdparams_file = os.path.join(folder, _add_variant("lora_model_state.pdparams", variant))
safetensors_file = os.path.join(folder, _add_variant("model.safetensors", variant))
if os.path.isfile(pdparams_file):
return paddle.load(pdparams_file, return_numpy=return_numpy)
if os.path.isfile(lora_pdparams_file):
return paddle.load(lora_pdparams_file, return_numpy=return_numpy)
if os.path.isfile(safetensors_file):
state_dict = safe_load_file(safetensors_file)
if not return_numpy:
for key in list(state_dict.keys()):
if isinstance(state_dict[key], np.ndarray):
state_dict[key] = paddle.Tensor(state_dict.pop(key), zero_copy=True)
return state_dict

index_file = os.path.join(folder, _add_variant(PADDLE_WEIGHTS_INDEX_NAME, variant))
safe_index_file = os.path.join(folder, _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant))
safe_master_file = os.path.join(folder, _add_variant(SAFE_MASTER_WEIGHTS_INDEX_NAME, variant))
safe_peft_file = os.path.join(folder, _add_variant(SAFE_PEFT_WEIGHTS_INDEX_NAME, variant))

index_present = os.path.isfile(index_file)
safe_index_present = os.path.isfile(safe_index_file)
safe_master_present = os.path.isfile(safe_master_file)
safe_peft_present = os.path.isfile(safe_peft_file)

load_safe = False
load_index = None
if safe_index_present:
load_safe = True # load safe due to preference
load_index = safe_index_file
elif safe_master_present:
load_safe = True
load_index = safe_master_file
elif index_present:
load_index = index_file
elif safe_peft_present:
load_safe = True
load_index = safe_peft_file
else:
raise ValueError(f"Could not find {index_file} or {safe_index_file} or {safe_peft_file}")

with open(load_index, "r", encoding="utf-8") as f:
index = json.load(f)

shard_files = list(set(index["weight_map"].values()))
loader = safe_load_file if load_safe else partial(paddlenlp_load, map_location="np" if return_numpy else "cpu")

ret = {}
for shard_file in tqdm(shard_files):
state_dict = loader(os.path.join(folder, shard_file))
ret.update(state_dict)

if not return_numpy:
for key in list(ret.keys()):
if isinstance(ret[key], np.ndarray):
ret[key] = paddle.Tensor(ret.pop(key), zero_copy=True)

return ret


def load_tp_checkpoint(folder, cls, config, return_numpy=False):
"""
This load is performed efficiently: Load tp checkpoint only from cpu, no need to init the model.
Args:
folder (`str` or `os.PathLike`): A path to a folder containing the model checkpoint.
cls (`str`): The model class.
config (`AutoConfig`): The model config.
return_numpy (bool): Whether load the tp checkpoint as numpy.
"""

config = AutoConfig.from_pretrained(folder)
if config.tensor_parallel_degree == 1:
return load_sharded_checkpoint(folder, return_numpy=return_numpy)
else:
rank_model_path = os.path.join(folder, f"model_state.tp0{config.tensor_parallel_rank}.pdparams")
model_path = os.path.join(folder, "model_state.pdparams")
safe_model_path = os.path.join(folder, "model.safetensors")
if os.path.exists(rank_model_path):
return paddle.load(rank_model_path, return_numpy=return_numpy)
elif os.path.exists(model_path):
state_dict = cls.convert_tensor_parallel(model_path, config)
elif os.path.exists(safe_model_path):
with safe_open(safe_model_path, framework="np", device="cpu") as f:
loaded_keys = f.keys()
tp_actions = cls.get_tensor_parallel_convert_actions(config, loaded_keys)
state_dict = load_state_dict(safe_model_path, tp_actions)
else: # shard files safetensors
resolved_archive_file, resolved_sharded_files, sharded_metadata, is_sharded = cls._resolve_model_file_path(
pretrained_model_name_or_path=folder,
use_safetensors=True,
)
if len(resolved_sharded_files) > 1:
resolved_sharded_files = tqdm(resolved_sharded_files, desc="Loading checkpoint shards")
loaded_state_dict_keys = sharded_metadata["all_checkpoint_keys"]
tp_actions = cls.get_tensor_parallel_convert_actions(config, loaded_state_dict_keys, ignore_error=True)
state_dict = {}
for shard_file in resolved_sharded_files:
shard_state_dict = load_state_dict(
shard_file,
tp_actions,
loaded_state_dict_keys,
)
state_dict.update(shard_state_dict)
if return_numpy:
for k in list(state_dict.keys()):
if not isinstance(state_dict[k], np.ndarray):
state_dict[k] = state_dict.pop(k).cpu().numpy()
return state_dict

0 comments on commit a19f4b6

Please sign in to comment.