Skip to content

Commit

Permalink
Merge branch 'main' into tokenizer-again
Browse files Browse the repository at this point in the history
  • Loading branch information
cg123 authored Jun 29, 2024
2 parents eb3fa19 + 21937cd commit cd4d4bf
Show file tree
Hide file tree
Showing 10 changed files with 101 additions and 12 deletions.
62 changes: 62 additions & 0 deletions mergekit/_data/architectures/gemma2.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
{
"model_type": "gemma2",
"architectures": [
"Gemma2ForCausalLM"
],
"pre_weights": [
{
"name": "model.embed_tokens.weight",
"is_embed": true
}
],
"num_layers_config_key": "num_hidden_layers",
"layer_templates": {
"weights": [
{
"name": "model.layers.${layer_index}.input_layernorm.weight"
},
{
"name": "model.layers.${layer_index}.self_attn.q_proj.weight"
},
{
"name": "model.layers.${layer_index}.self_attn.k_proj.weight"
},
{
"name": "model.layers.${layer_index}.self_attn.v_proj.weight"
},
{
"name": "model.layers.${layer_index}.self_attn.o_proj.weight"
},
{
"name": "model.layers.${layer_index}.post_attention_layernorm.weight"
},
{
"name": "model.layers.${layer_index}.pre_feedforward_layernorm.weight"
},
{
"name": "model.layers.${layer_index}.mlp.up_proj.weight"
},
{
"name": "model.layers.${layer_index}.mlp.gate_proj.weight"
},
{
"name": "model.layers.${layer_index}.mlp.down_proj.weight"
},
{
"name": "model.layers.${layer_index}.post_feedforward_layernorm.weight"
}
]
},
"post_weights": [
{
"name": "model.norm.weight"
},
{
"name": "lm_head.weight",
"is_embed": true,
"aliases": [
"model.embed_tokens.weight"
]
}
]
}
5 changes: 4 additions & 1 deletion mergekit/_data/architectures/qwen2.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@
},
{
"name": "lm_head.weight",
"is_embed": true
"is_embed": true,
"aliases": [
"model.embed_tokens.weight"
]
}
],
"num_layers_config_key": "num_hidden_layers",
Expand Down
1 change: 1 addition & 0 deletions mergekit/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ class MergeConfiguration(BaseModel):
Literal["union"], Literal["base"], ModelReference, None
] = None
tokenizer: Optional[TokenizerConfig] = None
out_dtype: Optional[str] = None

def referenced_models(self) -> List[ModelReference]:
models = set()
Expand Down
3 changes: 3 additions & 0 deletions mergekit/io/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@ def execute(self, **kwargs) -> Dict[str, torch.Tensor]:
class ReturnTensor(Task[torch.Tensor]):
weight_info: WeightInfo
tensor_task: Task[torch.Tensor]
dtype: Optional[str] = None

def arguments(self) -> Dict[str, Task]:
return {"tensor": self.tensor_task}
Expand All @@ -220,4 +221,6 @@ def group_label(self) -> Optional[str]:
return self.tensor_task.group_label()

def execute(self, tensor: torch.Tensor) -> torch.Tensor:
if self.dtype and (dtype := dtype_from_name(self.dtype)) != tensor.dtype:
tensor = tensor.to(dtype=dtype)
return tensor
2 changes: 1 addition & 1 deletion mergekit/io/tensor_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def finalize(self):
json.dump(
{
"metadata": {
"mergekit_version": "0.0.4.2",
"mergekit_version": "0.0.4.4",
"total_size": self.total_size,
},
"weight_map": self.weight_map,
Expand Down
4 changes: 3 additions & 1 deletion mergekit/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,9 @@ def _model_out_config(
res = config.base_model.config(trust_remote_code=trust_remote_code)
else:
res = config.referenced_models()[0].config(trust_remote_code=trust_remote_code)
-    if config.dtype:
+    if config.out_dtype:
+        res.torch_dtype = config.out_dtype
+    elif config.dtype:
res.torch_dtype = config.dtype

if config.slices:
Expand Down
11 changes: 9 additions & 2 deletions mergekit/plan.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ def plan_to_disk(self, out_path: str) -> List[Task]:
writer_task=writer_task,
clone=self.options.clone_tensors,
optional=weight.optional,
-                    dtype=weight.force_dtype,
+                    dtype=weight.force_dtype or self.config.out_dtype,
)
)
finalize = FinalizeModel(
Expand All @@ -287,7 +287,14 @@ def plan_to_disk(self, out_path: str) -> List[Task]:
def plan_in_memory(self) -> List[ReturnTensor]:
"""Plan the merge to be performed in memory."""
self._plan()
return [ReturnTensor(weight_info=w, tensor_task=t) for w, t in self._tensors]
return [
ReturnTensor(
weight_info=w,
tensor_task=t,
dtype=w.force_dtype or self.config.out_dtype,
)
for w, t in self._tensors
]

def _plan(self):
self.normalize_config()
Expand Down
16 changes: 15 additions & 1 deletion mergekit/scripts/evolve.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

import logging
import os
import time
from typing import List, Optional

import click
Expand Down Expand Up @@ -113,6 +114,12 @@
default=None,
help="Maximum time to run the optimization in seconds",
)
@click.option(
"--force-population-size",
type=int,
default=None,
help="Force a specific initial population size for CMA-ES",
)
def main(
genome_config_path: str,
max_fevals: int,
Expand All @@ -135,6 +142,7 @@ def main(
save_final_model: bool,
reshard: bool,
timeout: Optional[float],
force_population_size: Optional[int],
):
config = EvolMergeConfiguration.model_validate(
yaml.safe_load(open(genome_config_path, "r", encoding="utf-8"))
Expand Down Expand Up @@ -309,12 +317,15 @@ def parallel_evaluate(x: List[np.ndarray]) -> List[float]:
return [-x["score"] for x in res] # maximize

try:
cma_opts = {"maxfevals": max_fevals, "timeout": timeout}
if force_population_size is not None:
cma_opts["popsize"] = force_population_size
xbest, es = cma.fmin2(
None,
parallel_objective=parallel_evaluate,
x0=x0,
sigma0=sigma0,
options={"maxfevals": max_fevals, "timeout": timeout},
options=cma_opts,
callback=progress_callback,
)
xbest_cost = es.result.fbest
Expand All @@ -325,6 +336,9 @@ def parallel_evaluate(x: List[np.ndarray]) -> List[float]:
print(f"Best cost: {xbest_cost:.4f}")
print()

# pause for a bit to let any CUDA-using processes clean up
time.sleep(1.0)

# save the best merge configuration using original model references
genome_pretty = ModelGenome(config.genome, trust_remote_code=trust_remote_code)
best_config = genome_pretty.genotype_merge_config(xbest)
Expand Down
5 changes: 1 addition & 4 deletions mergekit/sparsify.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,8 @@ def rescale_sum(tensor: torch.Tensor, mask: torch.Tensor):
org_sum = tensor.abs().sum()
new_sum = (tensor * mask).abs().sum()

-    if org_sum >= 1e-8:
+    if org_sum >= 1e-8 and new_sum >= 1e-8:
         tensor *= org_sum / new_sum
-    else:
-        pass

return tensor * mask


Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ name = "mergekit"
description = "Tools for merging pre-trained large language models"
readme = "README.md"
license = { text = "LGPL-3.0-or-later" }
version = "0.0.4.3"
version = "0.0.4.4"
authors = [{ name = "Charles Goddard", email = "[email protected]" }]
dependencies = [
"torch>=2.0.0",
Expand All @@ -17,7 +17,7 @@ dependencies = [
"accelerate~=0.30.1",
"pydantic==2.7.1",
"immutables==0.20",
"transformers>=4.39.3",
"transformers>=4.42.3",
"huggingface_hub",
"peft",
"typing-extensions",
Expand Down

0 comments on commit cd4d4bf

Please sign in to comment.