Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#6344: Update RoBERTa QA demo #8896

Merged
merged 1 commit into from
Jan 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions models/demos/bert/tt/ttnn_optimized_bert.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

import ttnn

from models.utility_functions import is_grayskull
from models.experimental.functional_common.attention_mask_functions import get_extended_attention_mask


Expand All @@ -13,7 +13,7 @@ def bert_attention(
attention_mask,
*,
parameters,
num_cores_x=12,
num_cores_x=12 if is_grayskull() else 8,
):
num_heads = config.num_attention_heads
batch_size, _, hidden_size = hidden_states.shape
Expand Down Expand Up @@ -43,7 +43,7 @@ def bert_attention(
query,
key,
memory_config=ttnn.L1_MEMORY_CONFIG,
dtype=ttnn.bfloat16,
dtype=ttnn.bfloat8_b,
core_grid=ttnn.CoreGrid(y=batch_size, x=num_cores_x),
uaydonat marked this conversation as resolved.
Show resolved Hide resolved
)
ttnn.deallocate(query)
Expand Down Expand Up @@ -95,7 +95,7 @@ def bert_intermediate(
hidden_states,
*,
parameters,
num_cores_x=12,
num_cores_x=12 if is_grayskull() else 8,
):
batch_size, *_ = hidden_states.shape

Expand All @@ -107,6 +107,11 @@ def bert_intermediate(
dtype=ttnn.bfloat8_b,
core_grid=ttnn.CoreGrid(y=batch_size, x=num_cores_x),
activation="gelu",
compute_kernel_config=ttnn.WormholeComputeKernelConfig(
math_fidelity=ttnn.MathFidelity.HiFi2,
math_approx_mode=False,
packer_l1_acc=False,
),
)
return output

Expand All @@ -117,7 +122,7 @@ def bert_output(
residual,
*,
parameters,
num_cores_x=12,
num_cores_x=12 if is_grayskull() else 8,
):
batch_size, *_ = hidden_states.shape

Expand Down
19 changes: 19 additions & 0 deletions models/demos/roberta/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
## functional_roberta Demo
## How to Run

To run the demo for ttnn_optimized_functional_roberta, use `pytest --disable-warnings models/demos/roberta/demo/demo.py::test_demo[models.demos.bert.tt.ttnn_optimized_bert-8-384-deepset/roberta-large-squad2-models/demos/roberta/demo/input_data.json]`.

If you wish to run the demo with a different input use `pytest --disable-warnings models/demos/roberta/demo/demo.py::test_demo[models.demos.bert.tt.ttnn_optimized_bert-8-384-deepset/roberta-large-squad2-<address_to_your_json_file>]`. This file is expected to have exactly 8 inputs.

Our second demo is designed to run the SQuAD v2 dataset; run it with `pytest --disable-warnings models/demos/roberta/demo/demo.py::test_demo_squadv2[models.demos.bert.tt.ttnn_optimized_bert-8-384-3-deepset/roberta-large-squad2]`.

If you wish to run for `n_iterations` samples, use `pytest --disable-warnings models/demos/roberta/demo/demo.py::test_demo_squadv2[models.demos.bert.tt.ttnn_optimized_bert-8-384-<n_iterations>-deepset/roberta-large-squad2]`


# Inputs
By default, inputs are read from `input_data.json`. If you wish to change the inputs, provide a different path to `test_demo`.

We do not recommend modifying the `input_data.json` file.

# Details
The entry point to the functional_roberta model is `bert_for_question_answering` in `models/demos/bert/tt/ttnn_bert.py` (`models/demos/bert/tt/ttnn_optimized_bert.py` for the optimized version). The model picks up certain configs and weights from the Hugging Face pretrained model. We use the `deepset/roberta-large-squad2` checkpoint from Hugging Face as our reference.
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
disable_persistent_kernel_cache,
profiler,
)
from models.demos.bert.tt import ttnn_bert
from models.demos.bert.tt import ttnn_optimized_bert

from models.datasets.dataset_squadv2 import squadv2_1K_samples_input, squadv2_answer_decode_batch
Expand Down Expand Up @@ -42,6 +41,12 @@ def load_inputs(input_path, batch):
return context, question


def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """Build position ids from token ids, skipping padding tokens.

    Non-padding positions are numbered consecutively starting at
    ``padding_idx + 1`` (shifted further by ``past_key_values_length``);
    every padding position receives ``padding_idx``.

    Args:
        input_ids: torch tensor of token ids. The cumulative sum runs along
            dim 1, so the tensor needs at least two dimensions.
        padding_idx: token id that marks padding.
        past_key_values_length: offset added to every non-padding position.

    Returns:
        An int64 tensor with the same shape as ``input_ids``.
    """
    # 1 where the token is real, 0 where it is padding.
    mask = input_ids.ne(padding_idx).int()
    # Running count of real tokens; multiplying by the mask zeroes the padding
    # slots so that they end up at exactly padding_idx after the final shift.
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx


def run_roberta_question_and_answering_inference(
device,
use_program_cache,
Expand All @@ -60,13 +65,9 @@ def run_roberta_question_and_answering_inference(
tokenizer = RobertaTokenizer.from_pretrained(model_name)
config = hugging_face_reference_model.config
nlp = pipeline("question-answering", model=hugging_face_reference_model, tokenizer=tokenizer)
config.use_dram = True

if bert == ttnn_bert:
tt_model_name = f"ttnn_{model_name}"
elif bert == ttnn_optimized_bert:
tt_model_name = f"ttnn_{model_name}_optimized"
else:
raise ValueError(f"Unknown bert: {bert}")
tt_model_name = f"ttnn_{model_name}_optimized"

profiler.start(f"preprocessing_parameter")
parameters = preprocess_model_parameters(
Expand Down Expand Up @@ -105,10 +106,14 @@ def run_roberta_question_and_answering_inference(

profiler.start(f"preprocessing_input")

position_ids = create_position_ids_from_input_ids(
input_ids=roberta_input.input_ids, padding_idx=config.pad_token_id
)
ttnn_roberta_inputs = bert.preprocess_inputs(
roberta_input["input_ids"],
roberta_input["token_type_ids"],
torch.zeros(1, sequence_size) if bert == ttnn_optimized_bert else None,
position_ids,
roberta_input["attention_mask"],
device=device,
)
profiler.end(f"preprocessing_input")
Expand Down Expand Up @@ -139,7 +144,8 @@ def run_roberta_question_and_answering_inference(

tt_answer = nlp.postprocess([tt_res], **postprocess_params)

logger.info(f"answer: {tt_answer['answer']}\n")
logger.info(f"Question: {question[i]}")
logger.info(f"Answer: {tt_answer['answer']}\n")
model_answers[i] = tt_answer["answer"]

profiler.end("post_processing_output_to_string")
Expand Down Expand Up @@ -175,13 +181,9 @@ def run_roberta_question_and_answering_inference_squad_v2(
# set up tokenizer
tokenizer = RobertaTokenizer.from_pretrained(model_name)
config = hugging_face_reference_model.config
config.use_dram = True

if bert == ttnn_bert:
tt_model_name = f"ttnn_{model_name}"
elif bert == ttnn_optimized_bert:
tt_model_name = f"ttnn_{model_name}_optimized"
else:
raise ValueError(f"Unknown bert: {bert}")
tt_model_name = f"ttnn_{model_name}_optimized"

parameters = preprocess_model_parameters(
model_name=tt_model_name,
Expand All @@ -208,10 +210,14 @@ def run_roberta_question_and_answering_inference_squad_v2(
if i < n_iterations:
batch_data = batch[0]
curr_batch_size = batch_data["input_ids"].shape[0]
position_ids = create_position_ids_from_input_ids(
input_ids=batch_data.input_ids, padding_idx=config.pad_token_id
)
ttnn_roberta_inputs = bert.preprocess_inputs(
batch_data["input_ids"],
batch_data["token_type_ids"],
torch.zeros(1, sequence_size) if bert == ttnn_optimized_bert else None,
position_ids,
batch_data["attention_mask"],
device=device,
)

Expand Down Expand Up @@ -250,44 +256,51 @@ def run_roberta_question_and_answering_inference_squad_v2(
i += 1
eval_score = squad_metric.compute(predictions=pred_labels, references=true_labels)
cpu_eval_score = squad_metric.compute(predictions=cpu_pred_labels, references=true_labels)
logger.info(f"\tTT_Eval: exact: {eval_score['exact']} -- F1: {eval_score['f1']}")
logger.info(f"TT_Eval: exact: {eval_score['exact']} -- F1: {eval_score['f1']}")
uaydonat marked this conversation as resolved.
Show resolved Hide resolved
logger.info(f"CPU_Eval: exact: {cpu_eval_score['exact']} -- F1: {cpu_eval_score['f1']}")

assert eval_score["exact"] >= cpu_eval_score["exact"] and eval_score["f1"] >= cpu_eval_score["f1"], (
f"Expected Exact Match: {cpu_eval_score['exact']}, Actual Exact Match: {eval_score['exact']}; "
f"Expected F1 Score: {cpu_eval_score['f1']}, Actual F1 Score: {eval_score['f1']}"
)

@pytest.mark.parametrize("model_name", ["deepset/roberta-large-squad2"])
@pytest.mark.parametrize("bert", [ttnn_bert, ttnn_optimized_bert])
def test_demo(
input_path,
model_name,
bert,
device,
use_program_cache,
):

@pytest.mark.parametrize(
"model_name, input_loc",
((["deepset/roberta-large-squad2", "models/demos/roberta/demo/input_data.json"]),),
)
@pytest.mark.parametrize(
("bert", "batch_size", "sequence_size"),
((ttnn_optimized_bert, 8, 384),),
)
def test_demo(device, use_program_cache, model_name, input_loc, bert, batch_size, sequence_size):
esmalTT marked this conversation as resolved.
Show resolved Hide resolved
disable_persistent_kernel_cache()
disable_compilation_reports()

return run_roberta_question_and_answering_inference(
device=device,
use_program_cache=use_program_cache,
model_name=model_name,
batch_size=8,
sequence_size=384,
batch_size=batch_size,
sequence_size=sequence_size,
bert=bert,
input_path=input_path,
input_path=input_loc,
)


@pytest.mark.parametrize("model_name", ["deepset/roberta-large-squad2"])
@pytest.mark.parametrize("bert", [ttnn_bert, ttnn_optimized_bert])
@pytest.mark.parametrize(
"n_iterations",
((3),),
("bert", "batch_size", "sequence_size", "n_iterations"),
((ttnn_optimized_bert, 8, 384, 3),),
)
def test_demo_squadv2(
device,
use_program_cache,
model_name,
bert,
batch_size,
sequence_size,
n_iterations,
device,
use_program_cache,
):
disable_persistent_kernel_cache()
disable_compilation_reports()
Expand All @@ -296,8 +309,8 @@ def test_demo_squadv2(
device=device,
use_program_cache=use_program_cache,
model_name=model_name,
batch_size=8,
sequence_size=384,
batch_size=batch_size,
sequence_size=sequence_size,
bert=bert,
n_iterations=n_iterations,
)
37 changes: 37 additions & 0 deletions models/demos/roberta/tests/test_perf_device_roberta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0

import pytest
from models.utility_functions import is_grayskull
from models.perf.device_perf_utils import run_device_perf, check_device_perf, prep_device_perf_report


@pytest.mark.models_device_performance_bare_metal
@pytest.mark.parametrize(
    "batch_size, test",
    [
        [8, "sequence_size=384-batch_size=8-model_name=deepset/roberta-large-squad2"],
    ],
)
def test_perf_device_bare_metal(batch_size, test):
    """Run the optimized RoBERTa QA integration test under the device profiler.

    Builds the pytest command for the parametrized integration test, hands it
    to ``run_device_perf``, compares the result against the expected
    samples-per-second figure with ``check_device_perf``, and writes a report
    via ``prep_device_perf_report``.

    Args:
        batch_size: batch size forwarded to the perf helpers and the report.
        test: pytest parametrization id of the integration test to profile.
    """
    subdir = "ttnn_roberta"
    num_iterations = 1
    # NOTE(review): presumably the relative tolerance applied by
    # check_device_perf -- confirm against that helper.
    margin = 0.03
    # Expected throughput differs between Grayskull and the other architecture(s).
    expected_perf = 154.94 if is_grayskull() else 153.70

    command = f"pytest tests/ttnn/integration_tests/roberta/test_ttnn_optimized_roberta.py::test_roberta_for_question_answering[{test}]"
    cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"]

    inference_time_key = "AVG DEVICE KERNEL SAMPLES/S"
    expected_perf_cols = {inference_time_key: expected_perf}

    post_processed_results = run_device_perf(command, subdir, num_iterations, cols, batch_size)
    expected_results = check_device_perf(post_processed_results, margin, expected_perf_cols)
    prep_device_perf_report(
        model_name=f"ttnn_roberta_{batch_size}",
        batch_size=batch_size,
        post_processed_results=post_processed_results,
        expected_results=expected_results,
        # "/" in the model name is replaced with "_" before it goes into the report comments.
        comments=test.replace("/", "_"),
    )
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,19 @@

# SPDX-License-Identifier: Apache-2.0

import time

import pytest

from loguru import logger
import ttnn
import time
import torch
import pytest
import transformers


import ttnn

from models.demos.bert.tt import ttnn_bert
from loguru import logger
from models.demos.bert.tt import ttnn_optimized_bert

from ttnn.model_preprocessing import preprocess_model_parameters

from models.utility_functions import (
is_wormhole_b0,
is_blackhole,
is_grayskull,
enable_persistent_kernel_cache,
disable_persistent_kernel_cache,
)
Expand All @@ -29,34 +23,34 @@

def get_expected_times(bert):
return {
ttnn_bert: (13, 32),
ttnn_optimized_bert: (12, 0.092),
ttnn_optimized_bert: (8.7, 0.15) if is_grayskull() else (12.5, 0.14),
}[bert]


@pytest.mark.skipif(is_wormhole_b0() or is_blackhole(), reason="Unsupported on WH and BH")
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """Derive position ids from token ids so that padding is ignored.

    Real (non-padding) tokens get consecutive positions beginning at
    ``padding_idx + 1`` plus ``past_key_values_length``; padding tokens are
    all mapped to ``padding_idx``.

    Args:
        input_ids: torch tensor of token ids. The cumulative sum is taken
            along dim 1, so the tensor needs at least two dimensions.
        padding_idx: token id used for padding.
        past_key_values_length: offset added to every non-padding position.

    Returns:
        An int64 tensor shaped like ``input_ids``.
    """
    # 1 for real tokens, 0 for padding.
    mask = input_ids.ne(padding_idx).int()
    # Count real tokens left-to-right, then zero out padding slots so they
    # land on padding_idx once the final offset is added.
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx


@pytest.mark.models_performance_bare_metal
@pytest.mark.models_performance_virtual_machine
@pytest.mark.parametrize("model_name", ["deepset/roberta-large-squad2"])
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize("sequence_size", [384])
@pytest.mark.parametrize("bert", [ttnn_bert, ttnn_optimized_bert])
@pytest.mark.parametrize("bert", [ttnn_optimized_bert])
def test_performance(device, use_program_cache, model_name, batch_size, sequence_size, bert):
disable_persistent_kernel_cache()

config = transformers.RobertaConfig.from_pretrained(model_name)
config.use_dram = True

input_ids = torch.randint(0, config.vocab_size, (batch_size, sequence_size)).to(torch.int32)
torch_token_type_ids = torch.zeros((batch_size, sequence_size), dtype=torch.int32)
torch_position_ids = torch.zeros((batch_size, sequence_size), dtype=torch.int32)
torch_attention_mask = torch.zeros(1, sequence_size) if bert == ttnn_optimized_bert else None
torch_position_ids = create_position_ids_from_input_ids(input_ids=input_ids, padding_idx=config.pad_token_id)

if bert == ttnn_bert:
tt_model_name = f"ttnn_{model_name}"
elif bert == ttnn_optimized_bert:
tt_model_name = f"ttnn_{model_name}_optimized"
else:
raise ValueError(f"Unknown functional_roberta: {bert}")
tt_model_name = f"ttnn_{model_name}_optimized"

parameters = preprocess_model_parameters(
model_name=tt_model_name,
Expand Down Expand Up @@ -106,3 +100,7 @@ def test_performance(device, use_program_cache, model_name, batch_size, sequence
logger.info(f"Compile time: {inference_and_compile_time - inference_time}")
logger.info(f"Inference time: {inference_time}")
logger.info(f"Samples per second: {1 / inference_time * batch_size}")

assert (
inference_time < expected_inference_time
), f"Expected inference time: {expected_inference_time} Actual inference time: {inference_time}"
21 changes: 0 additions & 21 deletions models/experimental/functional_roberta/README.md

This file was deleted.

Loading
Loading