Skip to content

Commit

Permalink
13398: Update batch_size to 256 for data parallel MNIST
Browse files Browse the repository at this point in the history
  • Loading branch information
sabira-mcw committed Nov 21, 2024
1 parent 0712b23 commit cd8ea6c
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 20 deletions.
4 changes: 2 additions & 2 deletions models/demos/wormhole/mnist/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ WH N150, WH N300

The MNIST model uses only fully connected linear layers to classify handwritten digits from the MNIST dataset. Despite the absence of convolutional layers, the model efficiently processes the 28x28 pixel images by flattening them into a 1D vector and passing them through multiple linear layers to predict the corresponding digit (0-9). This approach demonstrates how even simpler architectures can be applied for image classification tasks.

### Batch size: 512
### Batch size: 256

Batch Size determines the number of input sequences processed simultaneously during training or inference, impacting computational efficiency and memory usage. It's recommended to set the batch_size to 512
Batch size determines the number of input images processed simultaneously during training or inference, impacting computational efficiency and memory usage. It's recommended to set the batch_size to 256.

## How to Run

Expand Down
8 changes: 5 additions & 3 deletions models/demos/wormhole/mnist/demo/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from torch.utils.data import DataLoader
from models.demos.wormhole.mnist.reference.mnist import MnistModel
from models.demos.wormhole.mnist.tt import tt_mnist

from models.utility_functions import disable_persistent_kernel_cache
from ttnn.model_preprocessing import preprocess_model_parameters
from models.utility_functions import is_wormhole_b0, skip_for_grayskull

Expand All @@ -25,7 +25,8 @@ def run_demo_dataset(batch_size, iterations, model_location_generator, mesh_devi
state_dict = torch.load(model_location_generator("mnist_model.pt", model_subdir="mnist"))
model = MnistModel(state_dict)
model = model.eval()

mesh_device_flag = is_wormhole_b0() and ttnn.GetNumAvailableDevices() == 2
batch_size = batch_size if mesh_device_flag else batch_size // 2
inputs_mesh_mapper = ttnn.ShardTensorToMesh(mesh_device, dim=0)
output_mesh_composer = ttnn.ConcatMeshToTensor(mesh_device, dim=0)
with ttnn.distribute(ttnn.ReplicateTensorToMesh(mesh_device)):
Expand Down Expand Up @@ -68,14 +69,15 @@ def run_demo_dataset(batch_size, iterations, model_location_generator, mesh_devi


@skip_for_grayskull()
@pytest.mark.parametrize("batch_size", [512])
@pytest.mark.parametrize("batch_size", [256])
@pytest.mark.parametrize("iterations", [1])
def test_demo_dataset(
batch_size,
iterations,
model_location_generator,
mesh_device,
):
disable_persistent_kernel_cache()
return run_demo_dataset(
batch_size=batch_size,
iterations=iterations,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,15 @@
def get_expected_times(tt_mnist):
if is_wormhole_b0():
return {
tt_mnist: (10.460, 0.0139),
tt_mnist: (10.89, 0.017),
}[tt_mnist]


@pytest.mark.models_performance_bare_metal
@pytest.mark.models_performance_virtual_machine
@pytest.mark.parametrize(
"batch_size",
[512],
[256],
)
@pytest.mark.parametrize(
"tt_mnist",
Expand All @@ -52,7 +52,8 @@ def test_performance_mnist(mesh_device, batch_size, tt_mnist, model_location_gen
test_dataset = datasets.MNIST(root="./data", train=False, transform=transform, download=True)
dataloader = DataLoader(test_dataset, batch_size=batch_size)
x, labels = next(iter(dataloader))

mesh_device_flag = is_wormhole_b0() and ttnn.GetNumAvailableDevices() == 2
batch_size = batch_size if mesh_device_flag else batch_size // 2
inputs_mesh_mapper = ttnn.ShardTensorToMesh(mesh_device, dim=0)
weights_mesh_mapper = ttnn.ReplicateTensorToMesh(mesh_device)
output_mesh_composer = ttnn.ConcatMeshToTensor(mesh_device, dim=0)
Expand All @@ -71,34 +72,38 @@ def test_performance_mnist(mesh_device, batch_size, tt_mnist, model_location_gen
ttnn_output = tt_mnist.mnist(mesh_device, batch_size, x, parameters)
end = time.time()
durations.append(end - start)
# enable_persistent_kernel_cache()
enable_persistent_kernel_cache()

inference_and_compile_time, *inference_times = durations
average_inference_time = sum(inference_times) / len(inference_times)
inference_time = sum(inference_times) / len(inference_times)
expected_compile_time, expected_inference_time = get_expected_times(tt_mnist)

prep_perf_report(
model_name="MNIST",
batch_size=batch_size,
inference_and_compile_time=inference_and_compile_time,
inference_time=average_inference_time,
inference_time=inference_time,
expected_compile_time=expected_compile_time,
expected_inference_time=expected_inference_time,
comments="",
inference_time_cpu=0.0,
)

logger.info(f"Compile time: {inference_and_compile_time - average_inference_time}")
logger.info(f"Inference time: {average_inference_time}")
logger.info(f"Compile time: {inference_and_compile_time - inference_time}")
logger.info(f"Inference time: {inference_time}")
logger.info(f"Inference times: {inference_times}")
logger.info(f"Sample(s) per second: {1 / average_inference_time * batch_size}")
logger.info(f"Sample(s) per second: {1 / inference_time * batch_size}")
assert (
inference_time < expected_inference_time
), f"Expected inference time: {expected_inference_time} Actual inference time: {inference_time}"
logger.info("Exit MNIST perf test")


@skip_for_grayskull()
@pytest.mark.parametrize(
"batch_size, expected_perf",
[
[512, 2899420.682],
[256, 1520045.60],
],
)
@pytest.mark.models_device_performance_bare_metal
Expand All @@ -107,7 +112,7 @@ def test_perf_device_bare_metal(batch_size, expected_perf):
num_iterations = 1
margin = 0.03

command = f"pytest tests/ttnn/integration_tests/mnist/test_mnist.py"
command = f"pytest tests/ttnn/integration_tests/mnist/test_mnist_wh.py"
cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"]

inference_time_key = "AVG DEVICE KERNEL SAMPLES/S"
Expand Down
5 changes: 3 additions & 2 deletions tests/scripts/run_performance.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ run_perf_models_other() {
if [ "$tt_arch" == "wormhole_b0" ]; then
env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/wormhole/resnet50/tests/test_perf_e2e_resnet50.py -m $test_marker

env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/wormhole/mnist/tests/test_perf_mnist.py -m $test_marker
env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/wormhole/mnist/tests/test_perf_mnist_wh.py -m $test_marker
fi

env pytest -n auto tests/ttnn/integration_tests/bert/test_performance.py -m $test_marker
Expand Down Expand Up @@ -113,6 +113,8 @@ run_device_perf_models() {
fi

if [ "$tt_arch" == "wormhole_b0" ]; then
env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/wormhole/mnist/tests -m $test_marker

env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/wormhole/resnet50/tests -m $test_marker

env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/experimental/functional_unet/tests/test_unet_perf.py -m $test_marker
Expand All @@ -123,7 +125,6 @@ run_device_perf_models() {

env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/falcon7b_common/tests -m $test_marker

env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yam pytets models/demos/wormhole/mnist/tests/test_perf_mnist.py::test_performance_mnist -m $test_marker
fi

## Merge all the generated reports
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
@skip_for_grayskull()
@pytest.mark.parametrize(
"batch_size",
[512],
[256],
)
def test_mnist(mesh_device, reset_seeds, batch_size, model_location_generator):
state_dict = torch.load(model_location_generator("mnist_model.pt", model_subdir="mnist"))
Expand All @@ -26,10 +26,12 @@ def test_mnist(mesh_device, reset_seeds, batch_size, model_location_generator):
dataloader = DataLoader(test_dataset, batch_size=batch_size)
x, labels = next(iter(dataloader))
torch_output = model(x)
mesh_device_flag = is_wormhole_b0() and ttnn.GetNumAvailableDevices() == 2
batch_size = batch_size if mesh_device_flag else batch_size // 2
inputs_mesh_mapper = ttnn.ShardTensorToMesh(mesh_device, dim=0)
weights_mesh_mapper = ttnn.ReplicateTensorToMesh(mesh_device)
output_mesh_composer = ttnn.ConcatMeshToTensor(mesh_device, dim=0)
mesh_device_flag = True

with ttnn.distribute(ttnn.ReplicateTensorToMesh(mesh_device)):
parameters = preprocess_model_parameters(initialize_model=lambda: model, device=mesh_device)

Expand Down

0 comments on commit cd8ea6c

Please sign in to comment.