From bf9002f0158c0d346559b951268fd1dd308459e4 Mon Sep 17 00:00:00 2001 From: Sudharsan-V Date: Thu, 12 Sep 2024 12:46:42 +0000 Subject: [PATCH] #13396: Add data parallel support for distilbert model --- models/demos/wormhole/distilbert/README.md | 35 ++ models/demos/wormhole/distilbert/demo/demo.py | 341 +++++++++++++++ .../wormhole/distilbert/demo/input_data.json | 50 +++ .../wormhole/distilbert/distilbert_utils.py | 178 ++++++++ .../distilbert/tests/test_perf_distilbert.py | 197 +++++++++ .../tt/ttnn_optimized_distilbert.py | 406 ++++++++++++++++++ tests/scripts/run_performance.sh | 4 + .../single_card/run_single_card_demo_tests.sh | 2 + .../distilbert/test_ttnn_distilbert_wh.py | 113 +++++ 9 files changed, 1326 insertions(+) create mode 100644 models/demos/wormhole/distilbert/README.md create mode 100644 models/demos/wormhole/distilbert/demo/demo.py create mode 100644 models/demos/wormhole/distilbert/demo/input_data.json create mode 100644 models/demos/wormhole/distilbert/distilbert_utils.py create mode 100644 models/demos/wormhole/distilbert/tests/test_perf_distilbert.py create mode 100644 models/demos/wormhole/distilbert/tt/ttnn_optimized_distilbert.py create mode 100644 tests/ttnn/integration_tests/distilbert/test_ttnn_distilbert_wh.py diff --git a/models/demos/wormhole/distilbert/README.md b/models/demos/wormhole/distilbert/README.md new file mode 100644 index 000000000000..b57b77779f2a --- /dev/null +++ b/models/demos/wormhole/distilbert/README.md @@ -0,0 +1,35 @@ +## Distilbert Model + +# Platforms: + WH N300, N150 + +## Introduction +DistilBERT is a transformers model, smaller and faster than BERT, which was pretrained on the same corpus in a self-supervised fashion, using the BERT base model as a teacher. The DistilBERT Question Answering model is fine-tuned specifically for the task of extracting answers from a given context, making it highly efficient for question-answering applications. 
+ +# Details +The entry point to the DistilBERT model is `distilbert_for_question_answering` in `models/demos/wormhole/distilbert/tt/ttnn_optimized_distilbert.py`. The model picks up certain configs and weights from the Hugging Face pretrained model. We have used the `distilbert-base-uncased-distilled-squad` version from Hugging Face as our reference. + +This model, located in `models/demos/wormhole`, supports functionality on both N150 and N300 devices, depending on availability. If the device is N300, the weights are replicated on each device and the input batch is sharded across the devices, allowing the model to run in parallel. + +## Sequence Size: 384 + +Sequence size determines the maximum length of input sequences processed by the model, optimizing performance and compatibility. It's recommended to set the `sequence_size` to 384. + +## Batch size: 8 + +Batch size determines the number of input sequences processed simultaneously during training or inference, impacting computational efficiency and memory usage. It's recommended to set the `batch_size` to 8. + +Use `pytest --disable-warnings models/demos/wormhole/distilbert/demo/demo.py::test_demo[wormhole_b0-True-models.demos.wormhole.distilbert.tt.ttnn_optimized_distilbert-distilbert-base-uncased-distilled-squad-models/demos/distilbert/demo/input_data.json]` to run the ttnn_optimized_distilbert demo. + + +If you wish to run the demo with a different input, change the parametrized `input_loc` argument of `test_demo` to the desired location and use `pytest --disable-warnings models/demos/wormhole/distilbert/demo/demo.py::test_demo[wormhole_b0-True-models.demos.wormhole.distilbert.tt.ttnn_optimized_distilbert-distilbert-base-uncased-distilled-squad-models/demos/distilbert/demo/input_data.json]` (the path at the end of the test ID follows `input_loc`). This file is expected to have at least 8 inputs; only the first 8 are used. 
+ +Our second demo is designed to run the SQuADv2 dataset; run it with `pytest --disable-warnings models/demos/wormhole/distilbert/demo/demo.py::test_demo_squadv2[wormhole_b0-True-3-models.demos.wormhole.distilbert.tt.ttnn_optimized_distilbert-distilbert-base-uncased-distilled-squad]`. + +If you wish to run for `n_iterations` batches (8 samples each), set the `n_iterations` parameter of `test_demo_squadv2` and use `pytest --disable-warnings models/demos/wormhole/distilbert/demo/demo.py::test_demo_squadv2[wormhole_b0-True-<n_iterations>-models.demos.wormhole.distilbert.tt.ttnn_optimized_distilbert-distilbert-base-uncased-distilled-squad]`. + +## Inputs + +The demo receives inputs from the respective input_data.json by default. To modify the inputs or specify a different path, adjust the `input_loc` parameter of `test_demo` accordingly. It's recommended to avoid direct modifications to the input_data.json file. + +# Owner Sudharsan Vijayaraghavan diff --git a/models/demos/wormhole/distilbert/demo/demo.py b/models/demos/wormhole/distilbert/demo/demo.py new file mode 100644 index 000000000000..f0d39b6b294b --- /dev/null +++ b/models/demos/wormhole/distilbert/demo/demo.py @@ -0,0 +1,341 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+# SPDX-License-Identifier: Apache-2.0 +import json +import pytest +import torch +from loguru import logger +import ttnn +from models.utility_functions import ( + disable_compilation_reports, + disable_persistent_kernel_cache, + profiler, +) +from models.demos.wormhole.distilbert.tt import ttnn_optimized_distilbert +from models.demos.wormhole.distilbert.distilbert_utils import ( + squadv2_1K_samples_input, + squadv2_answer_decode_batch, +) +from ttnn.model_preprocessing import ( + preprocess_model_parameters, +) +from models.utility_functions import is_wormhole_b0, skip_for_grayskull +from transformers import DistilBertForQuestionAnswering, AutoTokenizer, pipeline +import evaluate + + +def load_inputs(input_path, batch): + with open(input_path) as f: + input_data = json.load(f) + assert len(input_data) >= batch, f"Input data needs to have at least {batch} (batch size) entries." + context = [] + question = [] + for i in range(batch): + context.append(input_data[i]["context"]) + question.append(input_data[i]["question"]) + return context, question + + +def run_distilbert_question_and_answering_inference( + model_name, + batch_size, + sequence_size, + distilbert, + model_location_generator, + input_path, + mesh_device, +): + disable_persistent_kernel_cache() + + HF_model = DistilBertForQuestionAnswering.from_pretrained(model_name) + HF_model.eval() + tt_model_name = f"ttnn_{model_name}_optimized" + + inputs_mesh_mapper = ttnn.ShardTensorToMesh(mesh_device, dim=0) + weights_mesh_mapper = ttnn.ReplicateTensorToMesh(mesh_device) + output_mesh_composer = ttnn.ConcatMeshToTensor(mesh_device, dim=0) + + profiler.start(f"preprocessing_parameter") + + with ttnn.distribute(ttnn.ReplicateTensorToMesh(mesh_device)): + parameters = preprocess_model_parameters( + model_name=tt_model_name, + initialize_model=lambda: HF_model, + custom_preprocessor=ttnn_optimized_distilbert.custom_preprocessor, + device=mesh_device, + ) + profiler.end(f"preprocessing_parameter") + + # set up 
tokenizer + tokenizer = AutoTokenizer.from_pretrained(model_name) + config = HF_model.config + nlp = pipeline("question-answering", model=HF_model, tokenizer=tokenizer) + + context, question = load_inputs(input_path, batch_size) + preprocess_params, _, postprocess_params = nlp._sanitize_parameters(max_seq_len=sequence_size, padding="max_length") + inputs = nlp._args_parser({"question": question, "context": context}) + preprocessed_inputs = [] + for i in range(batch_size): + model_input = next(nlp.preprocess(inputs[0][i], **preprocess_params)) + single_input = { + "example": model_input["example"], + "inputs": model_input, + } + preprocessed_inputs.append(single_input) + + distilbert_input = tokenizer( + question, + context, + max_length=sequence_size, + padding="max_length", + truncation=True, + return_attention_mask=True, + return_tensors="pt", + ) + + profiler.start(f"preprocessing_input") + position_ids = torch.arange(config.max_position_embeddings).expand((1, -1)) + position_ids = torch.cat([position_ids] * batch_size, dim=0) + input_ids, position_ids, attention_mask = distilbert.preprocess_inputs( + distilbert_input["input_ids"], + position_ids, + distilbert_input["attention_mask"], + device=mesh_device, + mesh_mapper=inputs_mesh_mapper, + ) + profiler.end(f"preprocessing_input") + + mask_reshp = (batch_size, 1, 1, attention_mask.shape[1]) + score_shape = (batch_size, 12, 384, 384) + + mask = (distilbert_input["attention_mask"] == 0).view(mask_reshp).expand(score_shape) + min_val = torch.zeros(score_shape) + min_val_tensor = min_val.masked_fill(mask, torch.tensor(torch.finfo(torch.bfloat16).min)) + negative_val = torch.zeros(score_shape) + negative_val_tensor = negative_val.masked_fill(mask, -1) + + min_val_tensor = ttnn.from_torch( + min_val_tensor, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, mesh_mapper=inputs_mesh_mapper, device=mesh_device + ) + + negative_val_tensor = ttnn.from_torch( + negative_val_tensor, + dtype=ttnn.bfloat16, + 
layout=ttnn.TILE_LAYOUT, + mesh_mapper=inputs_mesh_mapper, + device=mesh_device, + ) + + profiler.start(f"inference_time") + tt_output = ttnn_optimized_distilbert.distilbert_for_question_answering( + config, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + parameters=parameters, + device=mesh_device, + min_val_tensor=min_val_tensor, + negative_val_tensor=negative_val_tensor, + mesh_mapper=weights_mesh_mapper, + ip_mesh_mapper=inputs_mesh_mapper, + ) + profiler.end(f"inference_time") + + tt_output = ( + ttnn.to_torch(ttnn.from_device(tt_output), mesh_composer=output_mesh_composer) + .reshape(batch_size, 1, sequence_size, -1) + .to(torch.float32) + ) + tt_start_logits = tt_output[..., :, 0].squeeze(1) + tt_end_logits = tt_output[..., :, 1].squeeze(1) + model_answers = {} + + profiler.start("post_processing_output_to_string") + for i in range(batch_size): + tt_res = { + "start": tt_start_logits[i], + "end": tt_end_logits[i], + "example": preprocessed_inputs[i]["example"], + **preprocessed_inputs[i]["inputs"], + } + tt_answer = nlp.postprocess([tt_res], **postprocess_params) + logger.info(f"answer: {tt_answer['answer']}\n") + model_answers[i] = tt_answer["answer"] + profiler.end("post_processing_output_to_string") + + measurements = { + "preprocessing_parameter": profiler.get("preprocessing_parameter"), + "preprocessing_input": profiler.get("preprocessing_input"), + "inference_time": profiler.get("inference_time"), + "post_processing": profiler.get("post_processing_output_to_string"), + } + logger.info(f"preprocessing_parameter: {measurements['preprocessing_parameter']} s") + logger.info(f"preprocessing_input: {measurements['preprocessing_input']} s") + logger.info(f"inference_time: {measurements['inference_time']} s") + logger.info(f"post_processing : {measurements['post_processing']} s") + return measurements + + +def run_distilbert_question_and_answering_inference_squad_v2( + use_program_cache, + model_name, + batch_size, + 
sequence_size, + distilbert, + model_location_generator, + n_iterations, + mesh_device, +): + disable_persistent_kernel_cache() + HF_model = DistilBertForQuestionAnswering.from_pretrained(model_name) + HF_model.eval() + + tt_model_name = f"ttnn_{model_name}_optimized" + + inputs_mesh_mapper = ttnn.ShardTensorToMesh(mesh_device, dim=0) + weights_mesh_mapper = ttnn.ReplicateTensorToMesh(mesh_device) + output_mesh_composer = ttnn.ConcatMeshToTensor(mesh_device, dim=0) + mesh_device_flag = True + + with ttnn.distribute(ttnn.ReplicateTensorToMesh(mesh_device)): + parameters = preprocess_model_parameters( + model_name=tt_model_name, + initialize_model=lambda: HF_model, + custom_preprocessor=ttnn_optimized_distilbert.custom_preprocessor, + device=mesh_device, + ) + + # set up tokenizer + tokenizer = AutoTokenizer.from_pretrained(model_name) + config = HF_model.config + + nlp = pipeline("question-answering", model=HF_model, tokenizer=tokenizer) + attention_mask = True + token_type_ids = False + inputs_squadv2 = squadv2_1K_samples_input(tokenizer, sequence_size, attention_mask, token_type_ids, batch_size) + squad_metric = evaluate.load("squad_v2") + position_ids = torch.arange(config.max_position_embeddings).expand((1, -1)) + position_ids = torch.cat([position_ids] * batch_size, dim=0) + + with torch.no_grad(): + pred_labels = [] + cpu_pred_labels = [] + true_labels = [] + i = 0 + for batch in inputs_squadv2: + if i < n_iterations: + batch_data = batch[0] + curr_batch_size = batch_data["input_ids"].shape[0] + ttnn_distilbert_inputs = distilbert.preprocess_inputs( + batch_data["input_ids"], + position_ids, + batch_data["attention_mask"], + device=mesh_device, + mesh_mapper=inputs_mesh_mapper, + ) + mask_reshp = (batch_size, 1, 1, batch_data["attention_mask"].shape[1]) + score_shape = (batch_size, 12, 384, 384) + + mask = (batch_data["attention_mask"] == 0).view(mask_reshp).expand(score_shape) + min_val = torch.zeros(score_shape) + min_val_tensor = min_val.masked_fill(mask, 
torch.tensor(torch.finfo(torch.bfloat16).min)) + negative_val = torch.zeros(score_shape) + negative_val_tensor = negative_val.masked_fill(mask, -1) + min_val_tensor = ttnn.from_torch( + min_val_tensor, + dtype=ttnn.bfloat16, + layout=ttnn.TILE_LAYOUT, + mesh_mapper=inputs_mesh_mapper, + device=mesh_device, + ) + + negative_val_tensor = ttnn.from_torch( + negative_val_tensor, + dtype=ttnn.bfloat16, + layout=ttnn.TILE_LAYOUT, + mesh_mapper=inputs_mesh_mapper, + device=mesh_device, + ) + + tt_output = ttnn_optimized_distilbert.distilbert_for_question_answering( + config, + input_ids=ttnn_distilbert_inputs[0], + attention_mask=ttnn_distilbert_inputs[2], + position_ids=ttnn_distilbert_inputs[1], + parameters=parameters, + device=mesh_device, + min_val_tensor=min_val_tensor, + negative_val_tensor=negative_val_tensor, + mesh_mapper=weights_mesh_mapper, + ip_mesh_mapper=inputs_mesh_mapper, + ) + tt_output = ( + ttnn.to_torch(tt_output, mesh_composer=output_mesh_composer) + .reshape(batch_size, 1, sequence_size, -1) + .to(torch.float32) + ) + cpu_output = HF_model(**batch_data) + references = batch[1] + question = batch[2] + context = batch[3] + cpu_predictions, tt_predictions = squadv2_answer_decode_batch( + HF_model, + tokenizer, + nlp, + references, + cpu_output, + tt_output, + curr_batch_size, + question, + context, + ) + pred_labels.extend(tt_predictions) + cpu_pred_labels.extend(cpu_predictions) + true_labels.extend(references) + del tt_output + i += 1 + eval_score = squad_metric.compute(predictions=pred_labels, references=true_labels) + cpu_eval_score = squad_metric.compute(predictions=cpu_pred_labels, references=true_labels) + logger.info(f"\tTT_Eval: exact: {eval_score['exact']} -- F1: {eval_score['f1']}") + logger.info(f"\tCPU_Eval: exact: {cpu_eval_score['exact']} -- F1: {cpu_eval_score['f1']}") + + +@skip_for_grayskull() +@pytest.mark.parametrize( + "model_name, input_loc", + ((["distilbert-base-uncased-distilled-squad", 
"models/demos/distilbert/demo/input_data.json"]),), +) +@pytest.mark.parametrize("distilbert", [ttnn_optimized_distilbert]) +def test_demo(input_loc, model_name, distilbert, model_location_generator, mesh_device): + disable_persistent_kernel_cache() + disable_compilation_reports() + return run_distilbert_question_and_answering_inference( + model_name=model_name, + batch_size=8, + sequence_size=384, + distilbert=distilbert, + model_location_generator=model_location_generator, + input_path=input_loc, + mesh_device=mesh_device, + ) + + +@skip_for_grayskull() +@pytest.mark.parametrize("model_name", ["distilbert-base-uncased-distilled-squad"]) +@pytest.mark.parametrize("distilbert", [ttnn_optimized_distilbert]) +@pytest.mark.parametrize( + "n_iterations", + ((3),), +) +def test_demo_squadv2(model_name, distilbert, n_iterations, model_location_generator, use_program_cache, mesh_device): + disable_persistent_kernel_cache() + disable_compilation_reports() + return run_distilbert_question_and_answering_inference_squad_v2( + use_program_cache=use_program_cache, + model_name=model_name, + batch_size=8, + sequence_size=384, + distilbert=distilbert, + model_location_generator=model_location_generator, + n_iterations=n_iterations, + mesh_device=mesh_device, + ) diff --git a/models/demos/wormhole/distilbert/demo/input_data.json b/models/demos/wormhole/distilbert/demo/input_data.json new file mode 100644 index 000000000000..950b8d36323b --- /dev/null +++ b/models/demos/wormhole/distilbert/demo/input_data.json @@ -0,0 +1,50 @@ +[ + { + "context" : "Johann Joachim Winckelmann was a German art historian and archaeologist. He was a pioneering Hellenist who first articulated the difference between Greek, Greco-Roman and Roman art. 
The prophet and founding hero of modern archaeology, Winckelmann was one of the founders of scientific archaeology and first applied the categories of style on a large, systematic basis to the history of art.", + "question" : "What discipline did Winkelmann create?" + }, + { + "context" : "The Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East. The Normans were famed for their martial spirit and eventually for their Christian piety, becoming exponents of the Catholic orthodoxy into which they assimilated. They adopted the Gallo-Romance language of the Frankish land they settled, their dialect becoming known as Norman, Normaund or Norman French, an important literary language. The Duchy of Normandy, which they formed by treaty with the French crown, was a great fief of medieval France, and under Richard I of Normandy was forged into a cohesive and formidable principality in feudal tenure. The Normans are noted both for their culture, such as their unique Romanesque architecture and musical traditions, and for their significant military accomplishments and innovations. Norman adventurers founded the Kingdom of Sicily under Roger II after conquering southern Italy on the Saracens and Byzantines, and an expedition on behalf of their duke, William the Conqueror, led to the Norman conquest of England at the Battle of Hastings in 1066. Norman cultural and military influence spread from these new European centres to the Crusader states of the Near East, where their prince Bohemond I founded the Principality of Antioch in the Levant, to Scotland and Wales in Great Britain, to Ireland, and to the coasts of north Africa and the Canary Islands.", + "question" : "Who ruled the duchy of Normandy" + }, + { + "context" : "In many countries, there is a Gender pay gap in favor of males in the labor market. Several factors other than discrimination may contribute to this gap. 
On average, women are more likely than men to consider factors other than pay when looking for work, and may be less willing to travel or relocate. Thomas Sowell, in his book Knowledge and Decisions, claims that this difference is due to women not taking jobs due to marriage or pregnancy, but income studies show that that does not explain the entire difference. A U.S. Census's report stated that in US once other factors are accounted for there is still a difference in earnings between women and men. The income gap in other countries ranges from 53% in Botswana to -40% in Bahrain.", + "question" : "Who does a gender pay gap tend to favor?" + }, + { + "context" : "Most of the Huguenot congregations (or individuals) in North America eventually affiliated with other Protestant denominations with more numerous members. The Huguenots adapted quickly and often married outside their immediate French communities, which led to their assimilation. Their descendants in many families continued to use French first names and surnames for their children well into the nineteenth century. Assimilated, the French made numerous contributions to United States economic life, especially as merchants and artisans in the late Colonial and early Federal periods. For example, E.I. du Pont, a former student of Lavoisier, established the Eleutherian gunpowder mills.", + "question" : "How were Huguenot settlers assimilated into North American society at large?" + }, + { + "context" : "In the laboratory, biostratigraphers analyze rock samples from outcrop and drill cores for the fossils found in them. These fossils help scientists to date the core and to understand the depositional environment in which the rock units formed. Geochronologists precisely date rocks within the stratigraphic section in order to provide better absolute bounds on the timing and rates of deposition. Magnetic stratigraphers look for signs of magnetic reversals in igneous rock units within the drill cores. 
Other scientists perform stable isotope studies on the rocks to gain information about past climate.", + "question" : "Who analyzes rock samples from drill cores in the lab?" + }, + { + "context" : "Neutrophils and macrophages are phagocytes that travel throughout the body in pursuit of invading pathogens. Neutrophils are normally found in the bloodstream and are the most abundant type of phagocyte, normally representing 50% to 60% of the total circulating leukocytes. During the acute phase of inflammation, particularly as a result of bacterial infection, neutrophils migrate toward the site of inflammation in a process called chemotaxis, and are usually the first cells to arrive at the scene of infection. Macrophages are versatile cells that reside within tissues and produce a wide array of chemicals including enzymes, complement proteins, and regulatory factors such as interleukin 1. Macrophages also act as scavengers, ridding the body of worn-out cells and other debris, and as antigen-presenting cells that activate the adaptive immune system.", + "question" : "What is the process in which neutrophils move towards the site of inflammation called?" + }, + { + "context" : "In Afghanistan, the mujahideen's victory against the Soviet Union in the 1980s did not lead to justice and prosperity, due to a vicious and destructive civil war between political and tribal warlords, making Afghanistan one of the poorest countries on earth. In 1992, the Democratic Republic of Afghanistan ruled by communist forces collapsed, and democratic Islamist elements of mujahdeen founded the Islamic State of Afghanistan. In 1996, a more conservative and anti-democratic Islamist movement known as the Taliban rose to power, defeated most of the warlords and took over roughly 80% of Afghanistan.", + "question" : "When did the Democratic Republic of Afghanistan collapse?" + }, + { + "context" : "The largest single sensory feature is the aboral organ (at the opposite end from the mouth). 
Its main component is a statocyst, a balance sensor consisting of a statolith, a solid particle supported on four bundles of cilia, called \"balancers\", that sense its orientation. The statocyst is protected by a transparent dome made of long, immobile cilia. A ctenophore does not automatically try to keep the statolith resting equally on all the balancers. Instead its response is determined by the animal's \"mood\", in other words the overall state of the nervous system. For example, if a ctenophore with trailing tentacles captures prey, it will often put some comb rows into reverse, spinning the mouth towards the prey.", + "question" : "What is the main component of the aboral organ?" + }, + { + "context": "Mark Rothko was a Latvian-born American abstract painter. He is best known for his color field paintings that depicted irregular and painterly rectangular regions of color, which he produced from 1949 to 1970. Although Rothko did not personally subscribe to any one school, he is associated with the American Abstract Expressionist movement of modern art. Originally emigrating to Portland, Oregon, from Russian Empire (Latvia) with his family, Rothko later moved to New York City where his youthful period of artistic production dealt primarily with urban scenery.", + "question": "what is Rothko best known for?" + }, + { + "context": "Malignant narcissism is a psychological syndrome that could include aspects of narcissistic personality disorder (NPD) alongside a mix of antisocial, paranoid and sadistic personality disorder traits. The importance of malignant narcissism and of projection as a defense mechanism has been confirmed in paranoia, as well as the patient's vulnerability to malignant narcissistic regression. A person with malignant narcissism exhibits paranoia in addition to the symptoms of a Narcissistic Personality Disorder. 
Because a malignant narcissist's personality cannot tolerate any criticism, being mocked typically causes paranoia.", + "question": "What symptoms a malignant narcissist might exhibit in addition to the symptoms of a NPD patient?" + }, + { + "context": "The 14 July Revolution, also known as the 1958 Iraqi military coup, was a coup d'état that took place on 14 July 1958 in Iraq which resulted in the toppling of King Faisal II and the overthrow of the Hashemite-led Kingdom of Iraq. The Iraqi Republic established in its wake ended the Hashemite Arab Federation between Iraq and Jordan that had been established just six months earlier. In July 1958, units of the Royal Iraqi Army were dispatched to Jordan in support of King Hussein. A group of Iraqi Free Officers, led by Brigadier Abd al-Karim Qasim and Colonel Abdul Salam Arif, took advantage of the opportunity and instead marched on Baghdad. On 14 July, revolutionary forces seized control of the capital and proclaimed a new republic, headed by a Revolutionary Council.", + "question": "When was the Hashemite Arab Federation formed?" + }, + { + "context": "The Tasmanian devil is a carnivorous marsupial of the family Dasyuridae. It was formerly present across mainland Australia, but became extinct there around 3,500 years ago. The size of a small dog, the Tasmanian devil became the largest carnivorous marsupial in the world following the extinction of the thylacine in 1936. It is related to quolls, and distantly related to the thylacine. It is characterised by its stocky and muscular build, black fur, pungent odour, extremely loud and disturbing screech, keen sense of smell, and ferocity when feeding. The Tasmanian devil's large head and neck allow it to generate among the strongest bites per unit body mass of any extant predatory land mammal. It hunts prey and scavenges on carrion.", + "question": "What allows Tasmanian devil to generate strong bites?" 
+ } +] diff --git a/models/demos/wormhole/distilbert/distilbert_utils.py b/models/demos/wormhole/distilbert/distilbert_utils.py new file mode 100644 index 000000000000..7075eed9b360 --- /dev/null +++ b/models/demos/wormhole/distilbert/distilbert_utils.py @@ -0,0 +1,178 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +# SPDX-License-Identifier: Apache-2.0 +from torch.utils.data import Dataset +from typing import Any +from datasets import load_dataset +from loguru import logger + + +class SQUADV2Dataset(Dataset): + """Configurable SQuad-V2 Dataset.""" + + def __init__( + self, + dataset_question: Any, + dataset_context: Any, + dataset_reference: Any, + tokenizer: Any, + seq_len: int, + attention_mask: bool, + token_type_ids: bool, + ): + """Init and preprocess SST-2 dataset. + Parameters + ---------- + dataset : Any + SQUAD-v2 dataset + tokenizer : Any + tokenizer object from HuggingFace + split : str + Which split to use i.e. ["train", "validation", "test"] + seq_len : int + Sequence length + attention_mask : bool + token_type_ids : bool + """ + self.data = [] + for i in range(len(dataset_question)): + self.data.append( + ( + tokenizer( + dataset_question[i], + dataset_context[i], + max_length=seq_len, + padding="max_length", + truncation=True, + return_attention_mask=attention_mask, + # return_token_type_ids=token_type_ids, + return_tensors="pt", + ), + dataset_reference[i], + dataset_question[i], + dataset_context[i], + ) + ) + + def __len__(self): + """Return length of dataset. + Returns + ------- + int + Length of dataset + """ + return len(self.data) + + def __getitem__(self, index: int): + """Return sample from dataset. 
+ Parameters + ---------- + index : int + Index of sample + Returns + ------- + Tuple + Data sample in format of X, y + """ + X = self.data[index] + return X + + +def squad_divide_chunks(dataset_question, dataset_context, dataset_reference, batch): + dataset_question_b = [] + dataset_context_b = [] + dataset_reference_b = [] + for i in range(0, len(dataset_question), batch): + dataset_question_b.append(dataset_question[i : i + batch]) + dataset_context_b.append(dataset_context[i : i + batch]) + dataset_reference_b.append(dataset_reference[i : i + batch]) + return dataset_question_b, dataset_context_b, dataset_reference_b + + +def squadv2_1K_samples_input(tokenizer, seq_len, attention_mask, token_type_ids, microbatch=8): + squadv2_dataset = load_dataset("squad_v2", use_auth_token=False, streaming=True)["validation"] + dataset_iter = iter(squadv2_dataset) + dataset_question = [] + dataset_context = [] + dataset_reference = [] + for _ in range(2048): + dataset_sgl = next(dataset_iter) + if len(dataset_sgl["answers"]["text"]) > 0: + dataset_question.append(dataset_sgl["question"]) + dataset_context.append(dataset_sgl["context"]) + dataset_reference.append({"answers": dataset_sgl["answers"], "id": dataset_sgl["id"]}) + if len(dataset_question) == 1024: + logger.info("SQuADv2 1024 samples load ..done") + break + dataset_question, dataset_context, dataset_reference = squad_divide_chunks( + dataset_question, dataset_context, dataset_reference, microbatch + ) + dataset_processed = SQUADV2Dataset( + dataset_question, + dataset_context, + dataset_reference, + tokenizer=tokenizer, + seq_len=seq_len, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + ) + return dataset_processed + + +def squadv2_answer_decode_batch( + HF_model, + tokenizer, + nlp, + references, + cpu_out, + tt_untilized_output, + BATCH_SIZE, + question, + context, + seq_len=384, + padding=None, +): + tt_predictions = [] + cpu_predictions = [] + preprocess_params, _, postprocess_params = 
nlp._sanitize_parameters(max_seq_len=seq_len, padding="max_length") + input_q = {"context": context, "question": question} + examples = nlp._args_parser(input_q) + for i in range(BATCH_SIZE): + logger.info(f"--REF-- {references[i]['answers']['text']}") + answer_start_scores = cpu_out["start_logits"][i] + answer_end_scores = cpu_out["end_logits"][i] + tt_start_logits = tt_untilized_output[..., :, 0].squeeze(1)[i] + tt_end_logits = tt_untilized_output[..., :, 1].squeeze(1)[i] + model_input = next(nlp.preprocess(examples[0][i], **preprocess_params)) + single_input = { + "data": ( + model_input["input_ids"], + model_input["attention_mask"], + model_input["token_type_ids"], + ), + "example": model_input["example"], + "inputs": model_input, + } + pt_res = { + "start": answer_start_scores, + "end": answer_end_scores, + "example": single_input["example"], + **single_input["inputs"], + } + cpu_answer_nlp = nlp.postprocess([pt_res], **postprocess_params)["answer"] + tt_res = { + "start": tt_start_logits, + "end": tt_end_logits, + "example": single_input["example"], + **single_input["inputs"], + } + tt_answer_nlp = nlp.postprocess([tt_res], **postprocess_params)["answer"] + logger.info(f"--CPU-- {cpu_answer_nlp}") + logger.info(f"--TT--- {tt_answer_nlp}") + logger.info(f"=======") + cpu_predictions.append( + {"prediction_text": cpu_answer_nlp, "id": references[i]["id"], "no_answer_probability": 0.0} + ) + tt_predictions.append( + {"prediction_text": tt_answer_nlp, "id": references[i]["id"], "no_answer_probability": 0.0} + ) + return cpu_predictions, tt_predictions diff --git a/models/demos/wormhole/distilbert/tests/test_perf_distilbert.py b/models/demos/wormhole/distilbert/tests/test_perf_distilbert.py new file mode 100644 index 000000000000..8985b02fc2e7 --- /dev/null +++ b/models/demos/wormhole/distilbert/tests/test_perf_distilbert.py @@ -0,0 +1,197 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 + +import torch +import pytest +import ttnn +from loguru import logger +import time + +from models.demos.wormhole.distilbert.tt import ttnn_optimized_distilbert +from models.utility_functions import ( + enable_persistent_kernel_cache, + disable_persistent_kernel_cache, + profiler, +) +from ttnn.model_preprocessing import ( + preprocess_model_parameters, +) +from models.perf.perf_utils import prep_perf_report +from transformers import DistilBertForQuestionAnswering, AutoTokenizer +from models.perf.device_perf_utils import run_device_perf, check_device_perf, prep_device_perf_report +from models.utility_functions import is_grayskull, is_wormhole_b0, skip_for_grayskull + + +@skip_for_grayskull() +@pytest.mark.models_performance_bare_metal +@pytest.mark.parametrize("model_name", ["distilbert-base-uncased-distilled-squad"]) +@pytest.mark.parametrize( + "batch_size, seq_len, expected_inference_time, expected_compile_time", + ([8, 384, 15.00, 16.00],), +) +def test_performance_distilbert_for_qa( + mesh_device, + batch_size, + model_name, + seq_len, + expected_inference_time, + expected_compile_time, +): + HF_model = DistilBertForQuestionAnswering.from_pretrained(model_name) + HF_model.eval() + + # set up tokenizer + tokenizer = AutoTokenizer.from_pretrained(model_name) + config = HF_model.config + + disable_persistent_kernel_cache() + + cpu_key = "ref_key" + + context = batch_size * [ + "Johann Joachim Winckelmann was a German art historian and archaeologist. He was a pioneering Hellenist who first articulated the difference between Greek, Greco-Roman and Roman art. The prophet and founding hero of modern archaeology, Winckelmann was one of the founders of scientific archaeology and first applied the categories of style on a large, systematic basis to the history of art." 
+ ] + question = batch_size * ["What discipline did Winkelmann create?"] + inputs = tokenizer( + question, + context, + max_length=seq_len, + padding="max_length", + truncation=True, + return_attention_mask=True, + return_tensors="pt", + ) + tt_model_name = f"ttnn_{model_name}_optimized" + + inputs_mesh_mapper = ttnn.ShardTensorToMesh(mesh_device, dim=0) + weights_mesh_mapper = ttnn.ReplicateTensorToMesh(mesh_device) + + profiler.start(f"preprocessing_parameter") + with ttnn.distribute(ttnn.ReplicateTensorToMesh(mesh_device)): + parameters = preprocess_model_parameters( + model_name=tt_model_name, + initialize_model=lambda: HF_model, + custom_preprocessor=ttnn_optimized_distilbert.custom_preprocessor, + device=mesh_device, + ) + profiler.end(f"preprocessing_parameter") + + mask_reshp = (batch_size, 1, 1, inputs["attention_mask"].shape[1]) + score_shape = (batch_size, 12, 384, 384) + + mask = (inputs["attention_mask"] == 0).view(mask_reshp).expand(score_shape) + min_val = torch.zeros(score_shape) + min_val_tensor = min_val.masked_fill(mask, torch.tensor(torch.finfo(torch.bfloat16).min)) + + negative_val = torch.zeros(score_shape) + negative_val_tensor = negative_val.masked_fill(mask, -1) + min_val_tensor = ttnn.from_torch( + min_val_tensor, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, mesh_mapper=inputs_mesh_mapper, device=mesh_device + ) + + negative_val_tensor = ttnn.from_torch( + negative_val_tensor, + dtype=ttnn.bfloat16, + layout=ttnn.TILE_LAYOUT, + mesh_mapper=inputs_mesh_mapper, + device=mesh_device, + ) + + with torch.no_grad(): + profiler.start(cpu_key) + torch_out = HF_model(**inputs) + profiler.end(cpu_key) + + durations = [] + for _ in range(2): + position_ids = torch.arange(config.max_position_embeddings).expand((1, -1)) + position_ids = torch.cat([position_ids] * batch_size, dim=0) + profiler.start(f"preprocessing_input") + input_ids, position_ids, attention_mask = ttnn_optimized_distilbert.preprocess_inputs( + inputs["input_ids"], + position_ids, + 
inputs["attention_mask"], + device=mesh_device, + mesh_mapper=inputs_mesh_mapper, + ) + profiler.end(f"preprocessing_input") + + start = time.time() + tt_output = ttnn_optimized_distilbert.distilbert_for_question_answering( + config, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + parameters=parameters, + device=mesh_device, + min_val_tensor=min_val_tensor, + negative_val_tensor=negative_val_tensor, + mesh_mapper=weights_mesh_mapper, + ip_mesh_mapper=inputs_mesh_mapper, + ) + tt_output = ttnn.from_device(tt_output) + end = time.time() + + durations.append(end - start) + enable_persistent_kernel_cache() + + inference_and_compile_time, inference_time, *_ = durations + + prep_perf_report( + model_name=f"ttnn_{model_name}_optimized", + batch_size=batch_size, + inference_and_compile_time=inference_and_compile_time, + inference_time=inference_time, + expected_compile_time=expected_compile_time, + expected_inference_time=expected_inference_time, + comments="", + inference_time_cpu=0.0, + ) + + logger.info(f"Compile time: {inference_and_compile_time - inference_time}") + logger.info(f"Inference time: {inference_time}") + logger.info(f"Samples per second: {1 / inference_time * batch_size}") + + assert ( + inference_time < expected_inference_time + ), f"Expected inference time: {expected_inference_time} Actual inference time: {inference_time}" + logger.info("Exit Distilbert perf test") + + +@skip_for_grayskull() +@pytest.mark.models_device_performance_bare_metal +@pytest.mark.parametrize( + "batch_size, test", + [ + [8, "distilbert-base-uncased-distilled-squad"], + ], +) +def test_distilbert_perf_device(batch_size, test, reset_seeds): + subdir = "ttnn_distilbert" + margin = 0.03 + num_iterations = 1 + + if is_grayskull(): + expected_perf = 10.01 + elif is_wormhole_b0(): + if ttnn.GetNumAvailableDevices() == 2: + expected_perf = 192 + else: + expected_perf = 212 + + command = f"pytest 
tests/ttnn/integration_tests/distilbert/test_ttnn_distilbert_wh.py::test_distilbert_for_question_answering[silicon_arch_name=wormhole_b0-silicon_arch_wormhole_b0=True-sequence_size=768-batch_size=8-model_name=distilbert-base-uncased-distilled-squad]" + + cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"] + inference_time_key = "AVG DEVICE KERNEL SAMPLES/S" + expected_perf_cols = {inference_time_key: expected_perf} + + post_processed_results = run_device_perf(command, subdir, num_iterations, cols, batch_size) + expected_results = check_device_perf(post_processed_results, margin, expected_perf_cols) + prep_device_perf_report( + model_name=f"ttnn_distilbert{batch_size}", + batch_size=batch_size, + post_processed_results=post_processed_results, + expected_results=expected_results, + comments=test.replace("/", "_"), + ) diff --git a/models/demos/wormhole/distilbert/tt/ttnn_optimized_distilbert.py b/models/demos/wormhole/distilbert/tt/ttnn_optimized_distilbert.py new file mode 100644 index 000000000000..6b45b7e3e62a --- /dev/null +++ b/models/demos/wormhole/distilbert/tt/ttnn_optimized_distilbert.py @@ -0,0 +1,406 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 + +import ttnn +from typing import Optional +import torch +from ttnn.model_preprocessing import ( + preprocess_linear_bias, + preprocess_linear_weight, +) +import ttnn.torch_tracer + + +def get_head_mask( + head_mask: Optional[ttnn.Tensor], + num_hidden_layers: int, + is_attention_chunked: bool = False, +): + head_mask = [ + None, + ] * num_hidden_layers + return head_mask + + +def attention( + config, + hidden_states, + mask, + head_mask=None, + output_attentions=None, + device=None, + base_address=None, + parameters=None, + num_cores_x=12, + min_val_tensor=None, + negative_val_tensor=None, + mesh_mapper=None, +): + batch_size, q_length, dim = hidden_states.shape + k_length = hidden_states.shape[1] + dim_per_head = config.dim // config.n_heads + + query_key_value_output = ttnn.linear( + hidden_states, + parameters.query_key_value.weight, + bias=parameters.query_key_value.bias, + memory_config=ttnn.L1_MEMORY_CONFIG, + dtype=ttnn.bfloat8_b, + core_grid=ttnn.CoreGrid(y=device.core_grid.y, x=device.core_grid.x), + ) + + ( + query, + key, + value, + ) = ttnn.transformer.split_query_key_value_and_split_heads( + query_key_value_output, + memory_config=ttnn.L1_MEMORY_CONFIG, + num_heads=config.n_heads, + ) + ttnn.deallocate(query_key_value_output) + + query = query * (1 / (dim_per_head) ** 0.5) + + attention_scores = ttnn.matmul( + query, + key, + memory_config=ttnn.L1_MEMORY_CONFIG, + dtype=ttnn.bfloat16, + core_grid=ttnn.CoreGrid(y=device.core_grid.y, x=device.core_grid.x), + ) + ttnn.deallocate(query) + ttnn.deallocate(key) + score_list = [] + + if batch_size <= 2: + inter_scores = attention_scores * negative_val_tensor + inter_scores = inter_scores + attention_scores + scores = inter_scores + min_val_tensor + else: + for i in range(2, batch_size + 1, 2): + inter_scores = attention_scores[i - 2 : i, :, :, :] * negative_val_tensor[i - 2 : i, :, :, :] + inter_scores = inter_scores + attention_scores[i - 2 : i, :, :, :] + score = 
inter_scores + min_val_tensor[i - 2 : i, :, :, :] + score = ttnn.permute(score, (1, 0, 2, 3)) + + score_list.append(score) + ttnn.deallocate(inter_scores) + + scores = ttnn.concat(score_list, dim=1) + scores = ttnn.permute(scores, (1, 0, 2, 3)) + + weights = ttnn.transformer.attention_softmax(scores, head_size=1) + ttnn.deallocate(scores) + context_layer = ttnn.matmul( + weights, + value, + memory_config=ttnn.L1_MEMORY_CONFIG, + dtype=ttnn.bfloat16, + core_grid=ttnn.CoreGrid(y=device.core_grid.y, x=device.core_grid.x), + ) + + ttnn.deallocate(weights) + ttnn.deallocate(value) + + context_layer = ttnn.permute(context_layer, [0, 1, 3, 2]) + context_layer = ttnn.reshape(context_layer, (batch_size, config.n_heads * dim_per_head, -1)) + + context_layer = ttnn.permute(context_layer, (0, 2, 1)) + + self_output = ttnn.linear( + context_layer, + parameters.out_lin.weight, + bias=parameters.out_lin.bias, + memory_config=ttnn.DRAM_MEMORY_CONFIG, + dtype=ttnn.bfloat16, + ) + ttnn.deallocate(context_layer) + return self_output + + +def ffn(configs, hidden_state, device, base_address, parameters, num_cores_x=12, mesh_mapper=None): + batch_size, *_ = hidden_state.shape + + output = ttnn.linear( + hidden_state, + parameters.lin1.weight, + bias=parameters.lin1.bias, + memory_config=ttnn.L1_MEMORY_CONFIG, + dtype=ttnn.bfloat16, + activation="gelu", + core_grid=ttnn.CoreGrid(y=device.core_grid.y, x=device.core_grid.x), + ) + + output = ttnn.linear( + output, + parameters.lin2.weight, + bias=parameters.lin2.bias, + memory_config=ttnn.L1_MEMORY_CONFIG, + dtype=ttnn.bfloat16, + core_grid=ttnn.CoreGrid(y=device.core_grid.y, x=device.core_grid.x), + ) + return output + + +def transformer_block( + config, + x, + attention_mask=None, + head_mask=None, + output_attentions: bool = False, + base_address=None, + parameters=None, + device=None, + min_val_tensor=None, + negative_val_tensor=None, + mesh_mapper=None, +): + sa_output = attention( + config, + x, + attention_mask, + head_mask, + 
output_attentions, + device=device, + base_address=base_address, + parameters=parameters.attention, + min_val_tensor=min_val_tensor, + negative_val_tensor=negative_val_tensor, + ) + + sa_output = ttnn.layer_norm( + x + sa_output, + weight=parameters.sa_layer_norm.weight, + bias=parameters.sa_layer_norm.bias, + epsilon=1e-12, + memory_config=ttnn.L1_MEMORY_CONFIG, + ) + ttnn.deallocate(x) + + ffn_output = ffn(config, sa_output, device=device, base_address=base_address, parameters=parameters.ffn) + + ffn_output = ttnn.layer_norm( + ffn_output + sa_output, + weight=parameters.output_layer_norm.weight, + bias=parameters.output_layer_norm.bias, + epsilon=1e-12, + memory_config=ttnn.L1_MEMORY_CONFIG, + ) + + ttnn.deallocate(sa_output) + return ffn_output + + +def transformer( + config, + x, + attention_mask=None, + head_mask=None, + output_attentions: bool = False, + output_hidden_states: bool = False, + base_address=None, + parameters=None, + device=None, + min_val_tensor=None, + negative_val_tensor=None, + mesh_mapper=None, +): + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + hidden_state = x + + for params in parameters.layer: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_state,) + + layer_outputs = transformer_block( + config=config, + x=hidden_state, + attention_mask=attention_mask, + head_mask=None, + output_attentions=output_attentions, + base_address=f"{base_address}.layer", + parameters=params, + device=device, + min_val_tensor=min_val_tensor, + negative_val_tensor=negative_val_tensor, + ) + hidden_state = layer_outputs + + return hidden_state + + +def distilbert( + config, + input_ids=None, + attention_mask=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + position_ids=None, + min_val_tensor=None, + negative_val_tensor=None, + *, + base_address, + parameters, + device, + mesh_mapper=None, + ip_mesh_mapper=None, +): 
+ output_attentions = output_attentions if output_attentions is not None else config.output_attentions + output_hidden_states = output_hidden_states if output_hidden_states is not None else config.output_hidden_states + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + head_mask = get_head_mask(head_mask, config.num_hidden_layers) + + if input_ids is not None: + word_embeddings = ttnn.embedding( + input_ids, + parameters.distilbert.embeddings.word_embeddings.weight, + layout=ttnn.TILE_LAYOUT, + memory_config=ttnn.DRAM_MEMORY_CONFIG, + ) + seq_length = word_embeddings.shape[1] + + if position_ids is not None: + position_ids = position_ids[:, :seq_length] + + position_embeddings = ttnn.embedding( + position_ids, + parameters.distilbert.embeddings.position_embeddings.weight, + layout=ttnn.TILE_LAYOUT, + memory_config=ttnn.DRAM_MEMORY_CONFIG, + ) + + transpose = False + if word_embeddings.shape[0] > 1: + word_embeddings = ttnn.permute(word_embeddings, (1, 2, 0)) + position_embeddings = ttnn.permute(position_embeddings, (1, 2, 0)) + transpose = True + + embeddings = word_embeddings + position_embeddings + + ttnn.deallocate(word_embeddings) + + if transpose: + embeddings = ttnn.permute(embeddings, (2, 0, 1)) + + embeddings = ttnn.layer_norm( + embeddings, + epsilon=1e-12, + weight=parameters.distilbert.embeddings.LayerNorm.weight, + bias=parameters.distilbert.embeddings.LayerNorm.bias, + memory_config=ttnn.L1_MEMORY_CONFIG, + ) + + return transformer( + config, + embeddings, + attention_mask, + head_mask, + output_attentions, + output_hidden_states, + base_address=f"distilbert.transformer", + parameters=parameters.distilbert.transformer, + device=device, 
+ min_val_tensor=min_val_tensor, + negative_val_tensor=negative_val_tensor, + ) + + +def distilbert_for_question_answering( + config, + input_ids, + attention_mask, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + min_val_tensor=None, + negative_val_tensor=None, + *, + parameters, + device, + base_address="", + mesh_mapper=None, + ip_mesh_mapper=None, +): + distilbert_output = distilbert( + config, + input_ids, + attention_mask, + head_mask, + inputs_embeds, + output_attentions, + output_hidden_states, + position_ids=position_ids, + device=device, + base_address=f"", + parameters=parameters, + min_val_tensor=min_val_tensor, + negative_val_tensor=negative_val_tensor, + ip_mesh_mapper=ip_mesh_mapper, + ) + + qa_outputs = ttnn.linear( + distilbert_output, + parameters.qa_outputs.weight, + bias=parameters.qa_outputs.bias, + memory_config=ttnn.L1_MEMORY_CONFIG, + core_grid=ttnn.CoreGrid(y=device.core_grid.y, x=device.core_grid.x), + ) + + return qa_outputs + + +def preprocess_inputs( + input_ids, + position_ids, + attention_mask, + device, + mesh_mapper, +): + input_ids = ttnn.from_torch(input_ids, mesh_mapper=mesh_mapper, device=device) + if position_ids is not None: + position_ids = ttnn.from_torch(position_ids, mesh_mapper=mesh_mapper, device=device) + attention_mask = ttnn.from_torch(attention_mask, mesh_mapper=mesh_mapper, device=device) + return (input_ids, position_ids, attention_mask) + + +def custom_preprocessor(torch_model, name): + parameters = {} + + if hasattr(torch_model, "q_lin") and hasattr(torch_model, "k_lin") and hasattr(torch_model, "v_lin"): + qkv_weight = torch.cat( + [ + torch_model.q_lin.weight, + torch_model.k_lin.weight, + torch_model.v_lin.weight, + ], + dim=0, + ) + qkv_bias = torch.cat( + [torch_model.q_lin.bias, torch_model.k_lin.bias, torch_model.v_lin.bias], + dim=0, + ) + output_weight = torch_model.out_lin.weight + 
output_bias = torch_model.out_lin.bias + parameters = {"query_key_value": {}, "out_lin": {}} + parameters["query_key_value"]["weight"] = preprocess_linear_weight(qkv_weight, dtype=ttnn.bfloat16) + parameters["query_key_value"]["bias"] = preprocess_linear_bias(qkv_bias, dtype=ttnn.bfloat16) + parameters["out_lin"]["weight"] = preprocess_linear_weight(output_weight, dtype=ttnn.bfloat16) + parameters["out_lin"]["bias"] = preprocess_linear_bias(output_bias, dtype=ttnn.bfloat16) + return parameters diff --git a/tests/scripts/run_performance.sh b/tests/scripts/run_performance.sh index 6c41b5ae3367..d6e4adb30a42 100755 --- a/tests/scripts/run_performance.sh +++ b/tests/scripts/run_performance.sh @@ -17,6 +17,8 @@ run_perf_models_other() { if [ "$tt_arch" == "wormhole_b0" ]; then env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/wormhole/resnet50/tests/test_perf_e2e_resnet50.py -m $test_marker + + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/wormhole/distilbert/tests/test_perf_distilbert.py -m $test_marker fi env pytest -n auto tests/ttnn/integration_tests/bert/test_performance.py -m $test_marker @@ -120,6 +122,8 @@ run_device_perf_models() { env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/metal_BERT_large_11/tests -m $test_marker env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/falcon7b_common/tests -m $test_marker + + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/wormhole/distilbert/tests -m $test_marker fi ## Merge all the generated reports diff --git a/tests/scripts/single_card/run_single_card_demo_tests.sh b/tests/scripts/single_card/run_single_card_demo_tests.sh index ee1df0e55675..d9a89fc32a32 100755 --- a/tests/scripts/single_card/run_single_card_demo_tests.sh +++ b/tests/scripts/single_card/run_single_card_demo_tests.sh @@ -31,6 +31,8 @@ run_common_func_tests() { # ConvNet Mnist pytest --disable-warnings 
models/demos/convnet_mnist/demo/demo.py --timeout 600; fail+=$? + pytest --disable-warnings models/demos/wormhole/distilbert/demo/demo.py --timeout 600; fail+=$? + return $fail } diff --git a/tests/ttnn/integration_tests/distilbert/test_ttnn_distilbert_wh.py b/tests/ttnn/integration_tests/distilbert/test_ttnn_distilbert_wh.py new file mode 100644 index 000000000000..a2958c66aa7d --- /dev/null +++ b/tests/ttnn/integration_tests/distilbert/test_ttnn_distilbert_wh.py @@ -0,0 +1,113 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +import torch + +import ttnn +from ttnn.model_preprocessing import preprocess_model_parameters +from transformers import ( + DistilBertForQuestionAnswering as HF_DistilBertForQuestionAnswering, +) +from transformers import AutoTokenizer +from models.demos.wormhole.distilbert.tt import ttnn_optimized_distilbert +from tests.ttnn.utils_for_testing import assert_with_pcc +from models.utility_functions import is_wormhole_b0, skip_for_grayskull + + +@skip_for_grayskull() +@pytest.mark.parametrize("model_name", ["distilbert-base-uncased-distilled-squad"]) +@pytest.mark.parametrize("batch_size", [8]) +@pytest.mark.parametrize("sequence_size", [768]) +def test_distilbert_for_question_answering(mesh_device, model_name, batch_size, sequence_size, reset_seeds): + tokenizer = AutoTokenizer.from_pretrained(model_name) + HF_model = HF_DistilBertForQuestionAnswering.from_pretrained(model_name) + HF_model.eval() + + tt_model_name = f"ttnn_{model_name}_optimized" + + inputs_mesh_mapper = ttnn.ShardTensorToMesh(mesh_device, dim=0) + weights_mesh_mapper = ttnn.ReplicateTensorToMesh(mesh_device) + output_mesh_composer = ttnn.ConcatMeshToTensor(mesh_device, dim=0) + + with ttnn.distribute(ttnn.ReplicateTensorToMesh(mesh_device)): + parameters = preprocess_model_parameters( + model_name=tt_model_name, + initialize_model=lambda: HF_model, + 
custom_preprocessor=ttnn_optimized_distilbert.custom_preprocessor, + device=mesh_device, + ) + + model = HF_model.eval() + config = HF_model.config + + question = batch_size * ["Where do I live?"] + context = batch_size * ["My name is Merve and I live in İstanbul."] + inputs = tokenizer( + question, + context, + return_tensors="pt", + padding="max_length", + max_length=384, + truncation=True, + return_attention_mask=True, + ) + input_ids = inputs.input_ids + attention_mask = inputs.attention_mask + position_ids = torch.arange(config.max_position_embeddings).expand((1, -1)) + position_ids = torch.cat([position_ids] * batch_size, dim=0) + mask_reshp = (batch_size, 1, 1, attention_mask.shape[1]) + score_shape = (batch_size, 12, 384, 384) + + mask = (attention_mask == 0).view(mask_reshp).expand(score_shape) + min_val = torch.zeros(score_shape) + min_val_tensor = min_val.masked_fill(mask, torch.tensor(torch.finfo(torch.bfloat16).min)) + + negative_val = torch.zeros(score_shape) + negative_val_tensor = negative_val.masked_fill(mask, -1) + torch_output = model(input_ids, attention_mask) + + tt_model_name = f"ttnn_{model_name}_optimized" + + input_ids, position_ids, attention_mask = ttnn_optimized_distilbert.preprocess_inputs( + input_ids, position_ids, attention_mask, device=mesh_device, mesh_mapper=inputs_mesh_mapper + ) + + min_val_tensor = ttnn.from_torch( + min_val_tensor, + dtype=ttnn.bfloat16, + layout=ttnn.TILE_LAYOUT, + mesh_mapper=inputs_mesh_mapper, + device=mesh_device, + ) + + negative_val_tensor = ttnn.from_torch( + negative_val_tensor, + dtype=ttnn.bfloat16, + layout=ttnn.TILE_LAYOUT, + mesh_mapper=inputs_mesh_mapper, + device=mesh_device, + ) + + tt_output = ttnn_optimized_distilbert.distilbert_for_question_answering( + config, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + parameters=parameters, + device=mesh_device, + min_val_tensor=min_val_tensor, + negative_val_tensor=negative_val_tensor, + 
mesh_mapper=weights_mesh_mapper, + ip_mesh_mapper=inputs_mesh_mapper, + ) + + tt_output = ttnn.to_torch(tt_output, mesh_composer=output_mesh_composer) + start_logits, end_logits = tt_output.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + assert_with_pcc(torch_output.start_logits, start_logits, 0.99) + assert_with_pcc(torch_output.end_logits, end_logits, 0.99)