diff --git a/models/demos/wormhole/llama31_8b/demo/demo_trace.py b/models/demos/wormhole/llama31_8b/demo/demo_trace.py
index cc42ebab512..c789cd354c6 100644
--- a/models/demos/wormhole/llama31_8b/demo/demo_trace.py
+++ b/models/demos/wormhole/llama31_8b/demo/demo_trace.py
@@ -124,10 +124,10 @@ def run_llama_demo(user_input, batch_size, device, instruct_mode, is_ci_env, num
     output_filename = f"{output_directory}/demo_user_output_{timestamp}.txt"
 
     # Set Llama flags for CI
-    # if is_ci_env and instruct_mode:  # Update paths for instruct mode, otherwise use default paths for general weights
-    os.environ["LLAMA_CKPT_DIR"] = "/proj_sw/user_dev/hf_data/llama/Meta-Llama-3.1-8B-Instruct/"
-    os.environ["LLAMA_TOKENIZER_PATH"] = "/proj_sw/user_dev/hf_data/llama/Meta-Llama-3.1-8B-Instruct/"
-    os.environ["LLAMA_CACHE_PATH"] = "/proj_sw/user_dev/hf_data/llama/Meta-Llama-3.1-8B-Instruct/"
+    if is_ci_env and instruct_mode:  # Update paths for instruct mode, otherwise use default paths for general weights
+        os.environ["LLAMA_CKPT_DIR"] = "/proj_sw/user_dev/hf_data/llama/Meta-Llama-3.1-8B-Instruct/"
+        os.environ["LLAMA_TOKENIZER_PATH"] = "/proj_sw/user_dev/hf_data/llama/Meta-Llama-3.1-8B-Instruct/"
+        os.environ["LLAMA_CACHE_PATH"] = "/proj_sw/user_dev/hf_data/llama/Meta-Llama-3.1-8B-Instruct/"
 
     # This module requires the env paths above for CI runs
     from models.demos.wormhole.llama31_8b.tt.model_config import TtModelArgs
diff --git a/models/demos/wormhole/llama31_8b/tests/test_llama_perf.py b/models/demos/wormhole/llama31_8b/tests/test_llama_perf.py
index dce87212d11..3afd3b088ae 100644
--- a/models/demos/wormhole/llama31_8b/tests/test_llama_perf.py
+++ b/models/demos/wormhole/llama31_8b/tests/test_llama_perf.py
@@ -9,9 +9,8 @@ import ttnn
 from models.demos.wormhole.llama31_8b.tt.llama_common import (
     prepare_inputs_ttnn,
-    sample,
-    HostEmbedding,
     get_single_rot_mat,
+    HostEmbedding,
 )
 from models.demos.wormhole.llama31_8b.tt.llama_model import TtTransformer
 from models.demos.wormhole.llama31_8b.tt.llama_embedding import TtLlamaEmbedding
 
@@ -137,35 +136,38 @@ def run_inference(device, tt_model, tt_embd, embd, encoded_prompts, generation_s
     # Select the first token from the prompts for initial decoding
     encoded_prompts_tensor = torch.tensor(encoded_prompts)  # [:,0]
 
-    for i in range(generation_length):
-        current_pos = generation_start_pos + i
-        pt_decode_input = embd(encoded_prompts_tensor[:, 0]).view(batch, seqlen, -1)
-        tt_decode_input = pt_decode_input
-        decode_input = prepare_inputs_ttnn(
-            tt_decode_input,
-            tt_model.args.dim,
-            tt_model.device,
-        )
+    # Initialize tt_out_tok with the first token
+    tt_out_tok = ttnn.from_torch(
+        torch.nn.functional.pad(
+            encoded_prompts_tensor[:, 0].unsqueeze(0).unsqueeze(0).unsqueeze(0), (0, 31), "constant", 0
+        ),
+        device=device,
+        dtype=ttnn.uint32,
+    )
 
-        current_pos_tensor = ttnn.from_torch(torch.tensor([current_pos] * batch), device=device, dtype=ttnn.int32)
-        current_pos_attn_tensor = ttnn.from_torch(
-            torch.tensor([current_pos] * batch * 8), device=device, dtype=ttnn.int32
-        )
+    current_pos = ttnn.from_torch(torch.tensor([generation_start_pos] * batch), device=device, dtype=ttnn.int32)
+    current_pos_attn = ttnn.from_torch(
+        torch.tensor([generation_start_pos] * batch * 8), device=device, dtype=ttnn.int32
+    )
 
+    for i in range(generation_length):
         # Run TT model
         profiler.start(f"model_run_for_inference_{i}")
-        tt_out = tt_model(decode_input, current_pos_tensor, current_pos_attn_tensor, rot_mat=current_rot_mat)
-
-        # Convert ttnn tensor to torch tensor
-        profiler.start(f"result_wait_for_inference_{i}")
-        tt_out = ttnn.untilize(tt_out, use_multicore=True)
-        tt_output_torch = ttnn.to_torch(tt_out).permute(2, 1, 0, 3).squeeze(1)  # [seq, batch, hidden_dim]
+        decode_input = ttnn.unsqueeze_to_4D(tt_embd(tt_out_tok))
+        tt_out = tt_model(decode_input, current_pos, current_pos_attn, rot_mat=current_rot_mat)
+        tt_out_rm = ttnn.untilize(tt_out, use_multicore=True)
+        ttnn.deallocate(tt_out)
+        tt_out_tok = ttnn.argmax(tt_out_rm, dim=3, use_multicore=True, output_tensor=tt_out_tok)
+        ttnn.deallocate(tt_out_rm)
 
-        profiler.end(f"model_run_for_inference_{i}")
-        profiler.end(f"result_wait_for_inference_{i}")
+        # Update the rotation matrix for the next iteration
+        new_rot_mat = ttnn.linear(rot_matrix, current_rot_mat)
+        current_rot_mat = ttnn.copy(new_rot_mat, current_rot_mat)
+        ttnn.plus_one(current_pos)
+        ttnn.plus_one(current_pos_attn)
 
-        # Greedy decode the generated token and pass it back in, this is just a perf test
-        tt_out_tok = sample(tt_output_torch, temperature=0, top_p=1)
+        profiler.end(f"model_run_for_inference_{i}")
 
-        # Update the rotation matrix for the next iteration
-        current_rot_mat = ttnn.linear(rot_matrix, current_rot_mat)
+    # Synchronize device to ensure all operations are complete
+    ttnn.synchronize_device(device)