# openelm-coreml.py

import argparse
import numpy as np
import torch
import torch.nn as nn
import coremltools as ct
from transformers import AutoTokenizer, AutoModelForCausalLM

# When using float16, all predicted logits are 0. To be debugged.


def selector(op):
    """Return ``True`` for every op whose ``op_type`` is not ``"l2_norm"``.

    Used as the ``op_selector`` of ``ct.transform.FP16ComputePrecision``
    in this script, so ``l2_norm`` is the only op type it rejects.
    """
    is_l2_norm = op.op_type == "l2_norm"
    return not is_l2_norm
    

# Precision passed to `ct.convert`: FP16 compute precision restricted by
# `selector`, which rejects only ops of type "l2_norm".
compute_precision = ct.transform.FP16ComputePrecision(op_selector=selector)


# Alternative precision settings, kept for experimentation:
# compute_precision = ct.precision.FLOAT16
# compute_precision = ct.precision.FLOAT32

# Compute units passed to `ct.convert`.
# Alternative: compute_units = ct.ComputeUnit.CPU_ONLY
compute_units = ct.ComputeUnit.ALL


# Fixed input shape used for the example inputs and for the Core ML input
# type; the last entry is the (fixed) sequence length.
shape = (1, 128)

# Command-line interface: both options are mandatory.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--model_id",
    help="OpenELM checkpoint from the Hub. Example: 'apple/OpenELM-1_1B-Instruct'",
    required=True,
)
parser.add_argument(
    "--output_dir",
    help="Parent folder to save the converted Core ML model",
    required=True,
)
args = parser.parse_args()

model_id = args.model_id
# Keep only the part after the last "/" of the Hub id for the file name.
basename = model_id.rsplit("/", 1)[-1]
# The package name encodes the model, the sequence length and the precision.
outpath = f"{args.output_dir}/{basename}-{shape[1]}-{compute_precision}.mlpackage"

print(model_id)
print(outpath)

# OpenELM uses the Llama tokenizer, see https://huggingface.co/apple/OpenELM-270M-Instruct/blob/main/generate_openelm.py#L21.
# It also uses custom code, hence `trust_remote_code=True` when loading the model.

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

# Put the model in evaluation mode before running and tracing it.
model.eval()
# model.half()  # not sure whether this needs to come after eval()

'''
# Configuration details based on config.json
context_length = 2048  # max_context_length from config.json
vocab_size = 32001  # Derived from _anchor_vocab_size with padding token
padding_index = 32000  # _anchor_padding_index
forward_dtype = torch.bfloat16  # torch_dtype from config.json
backward_dtype = torch.float32

# Apply mixed precision
model = model.to(forward_dtype)

# Define the loss function and optimizer
loss_fn = nn.CrossEntropyLoss(ignore_index=padding_index)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0024, betas=(0.9, 0.95), eps=1.e-8, weight_decay=0.1)

# Learning rate scheduler
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=350000, eta_min=0.00024)

# Function to create data
def create_data():
    # Placeholder data creation function
    # Replace with actual data loading and preprocessing
    inputs = torch.randint(0, vocab_size, (16, context_length)).to(torch.long)  # Example input tensor
    labels = torch.randint(0, vocab_size, (16, context_length)).to(torch.long)  # Example label tensor
    return [(inputs, labels)]

data = create_data()

# Prepare model for palettization
config = DKMPalettizerConfig(global_config=ModuleDKMPalettizerConfig(
    n_bits=6,
    weight_threshold=1024,
    quantize_activations=True,
    quant_min=0,
    quant_max=100
))


palettizer = DKMPalettizer(model, config)
prepared_model = palettizer.prepare()

# Fine-tune the model for a few epochs
for epoch in range(1):  # Replace 1 with the number of epochs you want
    for inputs, labels in data:
        optimizer.zero_grad()  # Clear the gradients
        outputs = model(inputs)
        loss = loss_fn(outputs.view(-1, vocab_size), labels.view(-1))
        loss.backward()
        optimizer.step()
        palettizer.step()
        scheduler.step()  # Update learning rate

# Finalize the model
finalized_model = palettizer.finalize(inplace=True)
'''


## NOTE: palettization (see the disabled block above) may need to run after the code below; this ordering is untested.

# Random token ids of the fixed shape; reused as example inputs for the
# reference forward pass, for tracing and for the Core ML comparison.
inputs = {"input_ids": np.random.randint(0, tokenizer.vocab_size, shape)}

# Reference PyTorch outputs, computed without gradient tracking.
with torch.no_grad():
    t_inputs = {
        input_name: torch.tensor(values, dtype=torch.int32)
        for input_name, values in inputs.items()
    }
    outputs = model(**t_inputs, use_cache=False)

class Wrapper(nn.Module):
    """Adapter that lets the wrapped model be called with positional ``input_ids``.

    The wrapped model is always invoked with ``return_dict=False`` and
    ``use_cache=False``; any extra keyword arguments are forwarded unchanged.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, *args, **kwargs):
        """Call the wrapped model with the first positional argument as ``input_ids``.

        Positional arguments after the first one are ignored.
        """
        first_positional = args[0]
        fixed_options = {"return_dict": False, "use_cache": False}
        return self.model(input_ids=first_positional, **fixed_options, **kwargs)


# Trace the wrapped model with the example inputs.
to_jit = Wrapper(model)
jit_inputs = [tensor for tensor in t_inputs.values()]
jitted_model = torch.jit.trace(to_jit, jit_inputs)
jitted_model.eval()

# Sanity check: the traced module must reproduce the reference logits.
with torch.no_grad():
    output_jit = jitted_model(*jit_inputs)

assert torch.allclose(output_jit[0], outputs["logits"])

## Core ML conversion

# Core ML input: a single int32 `input_ids` tensor of the fixed shape.
coreml_input_types = [
    ct.TensorType(name="input_ids", shape=ct.Shape(shape=shape), dtype=np.int32),
]

# Core ML outputs: one float32 tensor per key of the reference PyTorch outputs.
# Alternatives kept for experimentation:
# coreml_output_types = [ct.TensorType(name=name) for name in outputs.keys()]
# coreml_output_types = [ct.TensorType(name=name, dtype=np.float16) for name in outputs.keys()]
coreml_output_types = [
    ct.TensorType(name=output_name, dtype=np.float32)
    for output_name in outputs.keys()
]
# Conversion fails with `Conversion for torch.repeat_interleave with non-zero dim has not been implemented`.
# We hack a special-case shortcut for `dim == 1` when the first dim is `1`.

from coremltools.converters.mil import Builder as mb
from coremltools.converters.mil import register_torch_op
from coremltools.converters.mil.frontend.torch.torch_op_registry import _TORCH_OPS_REGISTRY
from coremltools.converters.mil.frontend.torch.ops import _get_inputs

# Drop the `repeat_interleave` entry from the registry; presumably this lets
# the replacement converter defined below register under the same name.
del _TORCH_OPS_REGISTRY["repeat_interleave"]

@register_torch_op
def repeat_interleave(context, node):
    """
    Convert ``torch.repeat_interleave`` to MIL ops.

    Copy from https://github.com/apple/coremltools/blob/0bef2d6aabd122527cf86cc983c08fb16a4041b5/coremltools/converters/mil/frontend/torch/ops.py#L5174
    plus special case for dim=1 and bs=1

    Supported cases:
      * ``dim`` is None: the input is flattened before repeating.
      * ``dim == 0``.
      * ``dim == 1`` when the first dimension of the input is 1
        (handled by temporarily dropping that leading dimension).

    ``repeats`` must be a scalar, or an array whose entries are all equal.

    Raises:
        NotImplementedError: for non-uniform tensor ``repeats`` or for any
            other non-zero ``dim``.
    """
    x, repeats, dim, _ = _get_inputs(context, node, expected=4)

    # `dim` may be None (flattened case, handled below), so check it before
    # reading `dim.val`.
    special_case = dim is not None and dim.val == 1 and x.shape[0] == 1
    if special_case:
        # Drop the leading size-1 dim so the requested dim becomes dim 0.
        x = mb.reshape(x=x, shape=(x.shape[1:]))

    repeats_val = repeats.val
    if isinstance(repeats_val, np.ndarray):
        repeats_val0 = np.expand_dims(repeats_val, 0).reshape(-1)[0]
        if np.any(repeats_val != repeats_val0):
            raise NotImplementedError(
                "Conversion for torch.repeat_interleave with Tensor repeats has not been implemented"
            )
        repeats_val = repeats_val0

    if dim is None:
        # This would operate on the flattened input tensor
        x = mb.reshape(x=x, shape=(-1,))
    elif dim.val != 0 and not special_case:
        raise NotImplementedError(
            "Conversion for torch.repeat_interleave with non-zero dim has not been implemented"
        )

    # On a high level:
    #      x
    #      | tile in dim 0
    #      v
    #     [x, x, ...]
    #      | reshape to split the repeats
    #      v
    #     [[x],
    #      [x],
    #      ...]
    #      | transpose(1, 0)
    #      v
    #     [x^T, x^T, ...]
    #      | flatten
    #      v
    #     result

    reps = [1] * x.rank
    reps[0] = repeats_val
    x_tiled = mb.tile(x=x, reps=reps)

    split_reps = [repeats_val] + list(x.shape)
    x_reshaped = mb.reshape(x=x_tiled, shape=list(split_reps))

    # Swap the first two axes, leaving the remaining ones in place.
    perm = [*range(x.rank + 1)]
    perm[0] = 1
    perm[1] = 0
    x_transposed = mb.transpose(x=x_reshaped, perm=perm)

    result_shape = list(x.shape)
    result_shape[0] = -1
    if special_case:
        # Restore the leading size-1 dim removed above.
        result_shape = [1] + result_shape
    result = mb.reshape(x=x_transposed, shape=result_shape, name=node.name)

    context.add(result)


eps = 1e-6
###
def stable_l2_norm(x, eps):
    """Divide ``x`` by the L2 norm of its max-scaled copy along the last dim.

    ``x`` is first divided by its largest absolute value along the last
    dimension (clamped to at least ``eps``), so every scaled entry has
    magnitude at most 1 before the norm is taken. This is intended to keep
    the sum of squares small in reduced precision.

    Args:
        x: input tensor; the norm is taken over its last dimension.
        eps: lower bound applied to both the maximum and the scaled norm.

    Returns:
        A tuple ``(y, max_val)`` where ``max_val`` is the clamped maximum
        absolute value (last dim kept with size 1) and
        ``y = x / max(||x / max_val||_2, eps)``. Consequently ``y / max_val``
        has unit L2 norm along the last dimension for non-degenerate input.
    """
    max_val = x.abs().max(axis=-1, keepdim=True).values
    max_val = torch.clamp(max_val, min=eps)
    xscaled = x / max_val
    # L2 norm over the last dimension of the scaled input.
    scaled_norm = torch.linalg.norm(xscaled, dim=-1, keepdim=True)
    return x / torch.clamp(scaled_norm, min=eps), max_val
###
class CustomRMSNorm(nn.Module):
    """Norm layer built on ``stable_l2_norm``.

    ``forward`` multiplies the first output of ``stable_l2_norm`` by
    ``sqrt(last_dim_size) / max_val`` and by ``weight``. When
    ``stable_l2_norm`` returns ``x / ||x / max_val||``, this amounts to
    ``x * sqrt(n) / ||x|| * weight``.

    Args:
        weight: scale applied to the normalised input (stored as given).
        eps: lower bound forwarded to ``stable_l2_norm``.
    """

    def __init__(self, weight, eps):
        super().__init__()
        self.eps = eps
        self.weight = weight

    def forward(self, x):
        normalized, max_val = stable_l2_norm(x, self.eps)
        rescale = normalized.size(-1) ** 0.5 / max_val
        return normalized * rescale * self.weight

###
# Replace the final norm and every per-layer norm with CustomRMSNorm, reusing
# each original module's weight and eps.
model.transformer.norm = CustomRMSNorm(model.transformer.norm.weight, model.transformer.norm.eps)

for layer in model.transformer.layers:
    layer.attn.q_norm = CustomRMSNorm(layer.attn.q_norm.weight, layer.attn.q_norm.eps)
    layer.attn.k_norm = CustomRMSNorm(layer.attn.k_norm.weight, layer.attn.k_norm.eps)
    layer.ffn_norm = CustomRMSNorm(layer.ffn_norm.weight, layer.ffn_norm.eps)
    layer.attn_norm = CustomRMSNorm(layer.attn_norm.weight, layer.attn_norm.eps)

# `jitted_model` was traced before the norms were swapped, so it still holds
# the original norm modules. Trace again so that the graph handed to
# `ct.convert` actually contains the CustomRMSNorm replacements.
jitted_model = torch.jit.trace(to_jit, jit_inputs)
jitted_model.eval()

# Report how far the model with replaced norms is from the reference logits
# computed before the swap.
with torch.no_grad():
    custom_norm_logits = jitted_model(*jit_inputs)[0]
print(f"Max diff after replacing norms: {(custom_norm_logits - outputs['logits']).abs().max().item()}")


coreml_model = ct.convert(
    jitted_model,
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.macOS14,
    inputs=coreml_input_types,
    outputs=coreml_output_types,
    compute_precision=compute_precision,
    compute_units=compute_units
    #pass_pipeline=ct.PassPipeline.DEFAULT_PALETTIZATION, # palettization
)


import sys

# The Core ML prediction check only runs when the platform is macOS ("darwin").
if sys.platform == "darwin":
    coreml_outputs = coreml_model.predict(t_inputs)
    # Largest absolute deviation between Core ML and reference PyTorch logits.
    max_diff = abs(coreml_outputs['logits'] - outputs['logits'].numpy()).max()
    print(f"Converted, max diff for random inputs: {max_diff}")

# Override tokenizer: this repository id is stored in the model metadata
# under "co.huggingface.exporters.name" instead of `model_id`.
# NOTE(review): presumably consumers use it to locate the tokenizer — confirm.
model_name = "pcuenq/test-llama-tokenizer"

# Architecture string taken from the loaded model's config; also stored in
# the metadata.
architecture = model.config.model_type
'''
user_defined_metadata = {
    "co.huggingface.exporters.name": model_name,
    "co.huggingface.exporters.task": "text-generation",
    "co.huggingface.exporters.architecture": architecture,
    "co.huggingface.exporters.framework": "pytorch",
    "co.huggingface.exporters.precision": compute_precision,
}
'''

# Human-readable description of the precision setup (FP16 selected by
# `selector`); used in the metadata and in the README.
precision_description = "FP16 for all ops except l2_norm"

# Metadata entries attached to the Core ML model.
user_defined_metadata = {}
user_defined_metadata["co.huggingface.exporters.name"] = model_name
user_defined_metadata["co.huggingface.exporters.task"] = "text-generation"
user_defined_metadata["co.huggingface.exporters.architecture"] = architecture
user_defined_metadata["co.huggingface.exporters.framework"] = "pytorch"
user_defined_metadata["co.huggingface.exporters.precision"] = precision_description


# Write the entries into the model spec, then save the package.
spec = coreml_model._spec
user_defined = spec.description.metadata.userDefined
user_defined.update(user_defined_metadata)

coreml_model.save(outpath)
# Minimal model card describing the conversion; written next to the package.
card = f"""
This repository contains a Core ML conversion of [{model_id}](https://hf.co/{model_id}) with the following characteristics:

    - Sequence length: {shape[-1]}, fixed.
    - Precision: {precision_description}.

Please, check the [original model card](https://hf.co/{model_id}) for additional details on the model.
"""
# Write with an explicit encoding so the README does not depend on the
# platform's default text encoding.
with open(f"{args.output_dir}/README.md", "w", encoding="utf-8") as f:
    f.write(card)