From 5b02459b3ae07d1cd53a0e481b3b8af559340b77 Mon Sep 17 00:00:00 2001 From: Holden Date: Tue, 9 Jul 2024 23:53:46 +0800 Subject: [PATCH] Support converting TurboSparse Mistral models that embed the MLP predictor in PyTorch tensors --- convert.py | 9 +++++---- gguf-py/gguf/tensor_mapping.py | 2 ++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/convert.py b/convert.py index 9103f661..fba6d040 100755 --- a/convert.py +++ b/convert.py @@ -1205,7 +1205,7 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm") parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin, *.safetensors)") - parser.add_argument("sparse_predictor", type=Path, help="predictors for sparse FFN inference") + parser.add_argument("sparse_predictor", type=Path, help="predictors for sparse FFN inference", nargs='?') args = parser.parse_args(args_in) @@ -1230,9 +1230,10 @@ def main(args_in: list[str] | None = None) -> None: if not args.vocab_only: model_plus = load_some_model(args.model) params = Params.load(model_plus) - mlp_predictor_plus = load_predictor_model(args.sparse_predictor) - params.predictor_params = PredictorParams.load(mlp_predictor_plus) - model_plus = merge_multifile_models([model_plus, mlp_predictor_plus]) + if args.sparse_predictor: + mlp_predictor_plus = load_predictor_model(args.sparse_predictor) + params.predictor_params = PredictorParams.load(mlp_predictor_plus) + model_plus = merge_multifile_models([model_plus, mlp_predictor_plus]) else: model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None) params = Params.load(model_plus) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 2c813050..9bf7e1bc 100644 --- 
a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -197,10 +197,12 @@ class TensorNameMap: MODEL_TENSOR.FC_1: ( "model.layers.{bid}.fc1", + "model.layers.{bid}.mlp.predictor.fc1", ), MODEL_TENSOR.FC_2: ( "model.layers.{bid}.fc2", + "model.layers.{bid}.mlp.predictor.fc2", ), }