Skip to content

Commit

Permalink
#8865: For host dispatch time measuring infra: Change column names; …
Browse files Browse the repository at this point in the history
…reduce num_repeats for long running ops; Move prints out of measuring loop; run tracy with --no-device
  • Loading branch information
nemanjagrujic committed Jun 13, 2024
1 parent bf70636 commit e12d00e
Show file tree
Hide file tree
Showing 6 changed files with 54 additions and 37 deletions.
16 changes: 8 additions & 8 deletions tests/tt_eager/profiling/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ pytest tests/tt_eager/profiling/profile_host_overhead.py --input-method cli --cl
After the script is finished profiling results are in the designated output folder in file `host_overhead_profiler_output.csv`. Content of output csv might look like:

```
op,count,overhead min(ms),overhead mean(ms),total mean(ms)
op,count,python min dispatch time (ms),python mean dispatch time(ms),python mean dispatch + sync time (ms)
tt_lib.tensor.add,8,0.87,0.89,1.08
tt_lib.tensor.sub,8,0.94,0.94,1.1
tt_lib.tensor.mul,8,0.82,0.85,1.0
Expand All @@ -36,11 +36,11 @@ tt_lib.tensor.atan2,8,40.93,43.69,43.35
```

Columns:
* op: Op being profiled
* count: Number of profile runs.
* overhead min(ms): Minimum measured dispatch time (overhead).
* overhead mean(ms): Mean of measured dispatch times.
* total mean(ms): Total time needed to run the op (both dispatch and kernel time).
* `op`: Op being profiled
* `count`: Number of profile runs.
* `python min dispatch time (ms)`: Minimum measured dispatch time (overhead).
* `python mean dispatch time(ms)`: Mean of measured dispatch times.
* `python mean dispatch + sync time (ms)`: Total time needed to run the op (both dispatch and kernel time). Measured after synchronize.


## profile_host_overhead_with_tracy.py
Expand All @@ -56,7 +56,7 @@ python tests/tt_eager/profiling/profile_host_overhead_with_tracy.py -o host_over
It profiles all tt_lib ops and saves measurement results to `host_overhead_profile/final.csv`. Output might look like:

```
op,count,overhead min(ms),overhead mean(ms),total mean(ms),HOST DURATION [ms]
op,count,python min dispatch time (ms),python mean dispatch time(ms),python mean dispatch + sync time (ms),C++ mean dispatch time (ms)
tt_lib.tensor.add,40,0.85,0.93,1.21,0.96
tt_lib.tensor.atan2,40,37.16,40.51,42.17,40.36
tt_lib.tensor.div,40,0.87,0.92,1.18,0.96
Expand All @@ -69,7 +69,7 @@ tt_lib.tensor.squared_difference,40,0.91,1.1,1.39,1.08
tt_lib.tensor.sub,40,0.88,0.92,1.21,0.98
```

Added column is `HOST DURATION [ms]` which is host time parsed from `Tracy` output.
Added column is `C++ mean dispatch time (ms)`, which is the host time parsed from the `Tracy` output, with the mean taken across those measurements.


## test_host_overhead_ci.py
Expand Down
19 changes: 19 additions & 0 deletions tests/tt_eager/profiling/ops_for_profiling.py
Original file line number Diff line number Diff line change
Expand Up @@ -774,6 +774,7 @@ def angle_bw(x, y):
{
"op": polygamma_bw,
"name": "tt_lib.tensor.polygamma_bw",
"num_repeats": 3,
},
{
"op": tt_lib.tensor.atan_bw,
Expand All @@ -794,6 +795,7 @@ def angle_bw(x, y):
{
"op": tt_lib.tensor.cosh_bw,
"name": "tt_lib.tensor.cosh_bw",
"num_repeats": 3,
},
{
"op": tt_lib.tensor.cos_bw,
Expand Down Expand Up @@ -862,6 +864,7 @@ def angle_bw(x, y):
{
"op": tt_lib.tensor.digamma_bw,
"name": "tt_lib.tensor.digamma_bw",
"num_repeats": 2,
},
{
"op": tt_lib.tensor.deg2rad_bw,
Expand Down Expand Up @@ -1874,6 +1877,7 @@ def primary_moreh_norm_3(x):
{
"op": tt_lib.tensor.lgamma,
"name": "tt_lib.tensor.lgamma",
"num_repeats": 3,
},
{
"op": logical_andi,
Expand Down Expand Up @@ -2031,10 +2035,12 @@ def primary_moreh_norm_3(x):
{
"op": tril,
"name": "tt_lib.tensor.tril",
"num_repeats": 3,
},
{
"op": triu,
"name": "tt_lib.tensor.triu",
"num_repeats": 3,
},
{
"op": reduce_sum_h,
Expand Down Expand Up @@ -2123,6 +2129,7 @@ def primary_moreh_norm_3(x):
{
"op": sum_0,
"name": "tt_lib.tensor.sum_dim_0",
"num_repeats": 2,
},
{
"op": sum_1,
Expand Down Expand Up @@ -2259,10 +2266,12 @@ def primary_moreh_norm_3(x):
{
"op": repeat_interleave_1,
"name": "tt_lib.tensor.repeat_interleave_dim_1",
"num_repeats": 2,
},
{
"op": repeat_interleave_2,
"name": "tt_lib.tensor.repeat_interleave_dim_2",
"num_repeats": 2,
},
{
"op": pow_int,
Expand All @@ -2279,34 +2288,42 @@ def primary_moreh_norm_3(x):
{
"op": argmax_1,
"name": "tt_lib.tensor.argmax_dim_3",
"num_repeats": 2,
},
{
"op": argmax_2,
"name": "tt_lib.tensor.argmax_dim_2",
"num_repeats": 2,
},
{
"op": argmax_3,
"name": "tt_lib.tensor.argmax_dim_1",
"num_repeats": 2,
},
{
"op": argmax_all,
"name": "tt_lib.tensor.argmax_all",
"num_repeats": 2,
},
{
"op": argmin_1,
"name": "tt_lib.tensor.argmin_dim_3",
"num_repeats": 2,
},
{
"op": argmin_2,
"name": "tt_lib.tensor.argmin_dim_2",
"num_repeats": 2,
},
{
"op": argmin_3,
"name": "tt_lib.tensor.argmin_dim_1",
"num_repeats": 2,
},
{
"op": argmin_all,
"name": "tt_lib.tensor.argmin_all",
"num_repeats": 2,
},
{
"op": tt_lib.tensor.fill_zero_bw,
Expand Down Expand Up @@ -2646,6 +2663,7 @@ def fused_linear_shape_func(input_shape):
{
"op": lamb_optimizer,
"name": "tt_lib.tensor.lamb_optimizer",
"num_repeats": 2,
},
{
"op": addalpha_bw,
Expand All @@ -2666,6 +2684,7 @@ def fused_linear_shape_func(input_shape):
{
"op": tt_lib.tensor.div_bw,
"name": "tt_lib.tensor.div_bw",
"num_repeats": 3,
},
{
"op": tt_lib.tensor.mul_bw,
Expand Down
44 changes: 23 additions & 21 deletions tests/tt_eager/profiling/profile_host_overhead.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
]

all_num_call_to_stack = [1, 3] # For 10 and more test execution spills to dispatch
num_repeats = 5
NUM_REPEATS = 5


def measure_host_overhead(op_func, op_name, device, num_call_to_stack, is_warmup):
Expand All @@ -46,34 +46,29 @@ def measure_host_overhead(op_func, op_name, device, num_call_to_stack, is_warmup
tt_lib.device.Synchronize(device)

duration = 1000 * (time.time() - start_time)
total_op_time = duration / num_call_to_stack
logger.info(f"{num_call_to_stack} calls and Synchronize after {duration:.2f}ms ({total_op_time:.2f}ms per call)")
avg_op_time = duration / num_call_to_stack
logger.info(f"{num_call_to_stack} calls and Synchronize after {duration:.2f}ms ({avg_op_time:.2f}ms per call)")

start_time = time.time()
for _ in range(num_call_to_stack):
op_func()

duration = 1000 * (time.time() - start_time)
overhead_ms = duration / num_call_to_stack
logger.info(f"{num_call_to_stack} calls without Synchronize {duration:.2f}ms ({overhead_ms:.2f}ms per call)")

start_time = time.time()
dispatch_end_time = time.time()
tt_lib.device.Synchronize(device)
duration = 1000 * (time.time() - start_time)
duration_per_call = duration / num_call_to_stack
logger.info(f"Synchronize {duration:.2f}ms ({duration_per_call:.2f}ms per call)")

sync_time = 1000 * (time.time() - dispatch_end_time)
dispatch_time = 1000 * (dispatch_end_time - start_time)
avg_dispatch_time = dispatch_time / num_call_to_stack

logger.info(
f"{num_call_to_stack} calls without Synchronize {dispatch_time:.2f}ms ({avg_dispatch_time:.2f}ms per call)"
)
logger.info(f"Synchronize {sync_time:.2f}ms ({(sync_time/num_call_to_stack):.2f}ms per call)")

if not is_warmup:
signpost(header=f"end {op_name}")

try:
# Dumping profile info
logger.info(f"Dumping device profiler data")
tt_lib.device.DumpDeviceProfiler(device)
except Exception as e:
logger.warning(f"DumpDeviceProfiler {e}")

return overhead_ms, total_op_time
return avg_dispatch_time, avg_op_time


def measure_host_overhead_binary(
Expand Down Expand Up @@ -199,6 +194,7 @@ def run_measure_host_overhead(op, device, text_file, measuring_func):
if "layout" in op and op["layout"] == "ROW_MAJOR":
dlayout = tt_lib.tensor.Layout.ROW_MAJOR

num_repeats = op["num_repeats"] if "num_repeats" in op else NUM_REPEATS
shape_func = None if "shape_func" not in op else op["shape_func"]

# Warmup
Expand Down Expand Up @@ -253,7 +249,7 @@ def test_host_overhead(device, user_input):
pytest tests/tt_eager/profiling/profile_host_overhead.py --input-method cli --cli-input host_overhead_profile::tt_lib.tensor.isclose
Run with tracy:
python -m tracy -v -r -p -o host_overhead_profile -m "pytest tests/tt_eager/profiling/profile_host_overhead.py --input-method cli --cli-input host_overhead_profile"
python -m tracy -v -r -p -o host_overhead_profile --no-device -m "pytest tests/tt_eager/profiling/profile_host_overhead.py --input-method cli --cli-input host_overhead_profile"
"""

if "::" in user_input[0]:
Expand All @@ -270,7 +266,10 @@ def test_host_overhead(device, user_input):
os.makedirs(out_directory)

with open(out_file_path, "w") as text_file:
text_file.write(f"op,count,overhead min(ms),overhead mean(ms),total mean(ms)\n")
start_time = time.time()
text_file.write(
f"op,count,python min dispatch time (ms),python mean dispatch time(ms),python mean dispatch + sync time (ms)\n"
)

for op in ops_for_profiling.all_binary_ops:
if op_name != "":
Expand All @@ -292,3 +291,6 @@ def test_host_overhead(device, user_input):
continue

run_measure_host_overhead(op, device, text_file, measure_host_overhead_ternary)

duration = (time.time() - start_time) / 60
logger.info(f"Profiling finished in {duration:.2f}min")
6 changes: 3 additions & 3 deletions tests/tt_eager/profiling/profile_host_overhead_with_tracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def profile_host_overhead(output_directory, output_csv):

for op_name in all_ops:
op_id = f"{i:03d}"
command = f'python -m tracy -v -r -p -o {output_directory} -n {op_id}_{op_name} -m "pytest tests/tt_eager/profiling/profile_host_overhead.py --input-method cli --cli-input {output_directory}::{op_name}"'
command = f'python -m tracy -v -r -p -o {output_directory} -n {op_id}_{op_name} --no-device -m "pytest tests/tt_eager/profiling/profile_host_overhead.py --input-method cli --cli-input {output_directory}::{op_name}"'
subprocess.run([command], shell=True, check=False, env=currentEnvs, timeout=3000)
i += 1

Expand Down Expand Up @@ -99,10 +99,10 @@ def profile_host_overhead(output_directory, output_csv):
# Add the average value to the final dataframe
op_count = row["count"]
host_duration = round(host_duration / (op_count * 1000 * 1000), 2)
final_df.loc[index, "HOST DURATION [ms]"] = host_duration
final_df.loc[index, "C++ mean dispatch time (ms)"] = host_duration

duration = (time.time() - start_time) / 60
logger.info(f"Total ops profiled {len(all_ops)} in {duration:.2f}min")
logger.info(f"{len(all_ops)} ops profiled in {duration:.2f}min")

# Sort output
final_df = final_df.sort_values("op")
Expand Down
2 changes: 1 addition & 1 deletion tests/tt_eager/profiling/reference.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
op,count,overhead min(ms),overhead mean(ms),total mean(ms),HOST DURATION [ms]
op,count,python min dispatch time (ms),python mean dispatch time(ms),python mean dispatch + sync time (ms),C++ mean dispatch time (ms)
tt_lib.fused_ops.linear.Linear,40,2.56,3.9,4.2,3.49
tt_lib.fused_ops.softmax.softmax,40,3.22,3.36,3.74,3.37
tt_lib.operations.primary.add_layernorm,40,0.99,1.19,1.65,1.26
Expand Down
4 changes: 0 additions & 4 deletions tests/tt_eager/profiling/test_host_overhead_ci.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,7 @@ def test_host_overhead_ci():
reference_filename = "tests/tt_eager/profiling/reference.txt"
measuring_tolerance = 1.1

start_time = time.time()
profile_host_overhead(profile_output_folder, profile_output_filename)
duration = (time.time() - start_time) / 60

logger.info(f"Profiled host overhead for all ops in {duration:.2f}min")

with open(measured_filename, mode="r") as infile:
reader = csv.reader(infile)
Expand Down

0 comments on commit e12d00e

Please sign in to comment.