Skip to content

Commit

Permalink
#8865: For host dispatch time measuring infra: Change column names; …
Browse files Browse the repository at this point in the history
…reduce num_repeats for long running ops; Move prints out of measuring loop; run tracy with --no-device
  • Loading branch information
nemanjagrujic committed Jun 13, 2024
1 parent bf70636 commit e12d00e
Show file tree
Hide file tree
Showing 6 changed files with 54 additions and 37 deletions.
16 changes: 8 additions & 8 deletions tests/tt_eager/profiling/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ pytest tests/tt_eager/profiling/profile_host_overhead.py --input-method cli --cl
After the script is finished profiling results are in the designated output folder in file `host_overhead_profiler_output.csv`. Content of output csv might look like:

```
op,count,overhead min(ms),overhead mean(ms),total mean(ms)
op,count,python min dispatch time (ms),python mean dispatch time(ms),python mean dispatch + sync time (ms)
tt_lib.tensor.add,8,0.87,0.89,1.08
tt_lib.tensor.sub,8,0.94,0.94,1.1
tt_lib.tensor.mul,8,0.82,0.85,1.0
Expand All @@ -36,11 +36,11 @@ tt_lib.tensor.atan2,8,40.93,43.69,43.35
```

Columns:
* op: Op being profiled
* count: Number of profile runs.
* overhead min(ms): Minimum measured dispatch time (overhead).
* overhead mean(ms): Mean of measured dispatch times.
* total mean(ms): Total time needed to run the op (both dispatch and kernel time).
* `op`: Op being profiled
* `count`: Number of profile runs.
* `python min dispatch time (ms)`: Minimum measured dispatch time (overhead).
* `python mean dispatch time(ms)`: Mean of measured dispatch times.
* `python mean dispatch + sync time (ms)`: Total time needed to run the op (both dispatch and kernel time). Measured after synchronize.


## profile_host_overhead_with_tracy.py
Expand All @@ -56,7 +56,7 @@ python tests/tt_eager/profiling/profile_host_overhead_with_tracy.py -o host_over
It profiles all tt_lib ops and saves measurement results to `host_overhead_profile/final.csv`. Output might look like:

```
op,count,overhead min(ms),overhead mean(ms),total mean(ms),HOST DURATION [ms]
op,count,python min dispatch time (ms),python mean dispatch time(ms),python mean dispatch + sync time (ms),C++ mean dispatch time (ms)
tt_lib.tensor.add,40,0.85,0.93,1.21,0.96
tt_lib.tensor.atan2,40,37.16,40.51,42.17,40.36
tt_lib.tensor.div,40,0.87,0.92,1.18,0.96
Expand All @@ -69,7 +69,7 @@ tt_lib.tensor.squared_difference,40,0.91,1.1,1.39,1.08
tt_lib.tensor.sub,40,0.88,0.92,1.21,0.98
```

Added column is `HOST DURATION [ms]` which is host time parsed from `Tracy` output.
Added column is `C++ mean dispatch time (ms)`, which is the host time parsed from the `Tracy` output, with the mean taken across those measurements.


## test_host_overhead_ci.py
Expand Down
19 changes: 19 additions & 0 deletions tests/tt_eager/profiling/ops_for_profiling.py
Original file line number Diff line number Diff line change
Expand Up @@ -774,6 +774,7 @@ def angle_bw(x, y):
{
"op": polygamma_bw,
"name": "tt_lib.tensor.polygamma_bw",
"num_repeats": 3,
},
{
"op": tt_lib.tensor.atan_bw,
Expand All @@ -794,6 +795,7 @@ def angle_bw(x, y):
{
"op": tt_lib.tensor.cosh_bw,
"name": "tt_lib.tensor.cosh_bw",
"num_repeats": 3,
},
{
"op": tt_lib.tensor.cos_bw,
Expand Down Expand Up @@ -862,6 +864,7 @@ def angle_bw(x, y):
{
"op": tt_lib.tensor.digamma_bw,
"name": "tt_lib.tensor.digamma_bw",
"num_repeats": 2,
},
{
"op": tt_lib.tensor.deg2rad_bw,
Expand Down Expand Up @@ -1874,6 +1877,7 @@ def primary_moreh_norm_3(x):
{
"op": tt_lib.tensor.lgamma,
"name": "tt_lib.tensor.lgamma",
"num_repeats": 3,
},
{
"op": logical_andi,
Expand Down Expand Up @@ -2031,10 +2035,12 @@ def primary_moreh_norm_3(x):
{
"op": tril,
"name": "tt_lib.tensor.tril",
"num_repeats": 3,
},
{
"op": triu,
"name": "tt_lib.tensor.triu",
"num_repeats": 3,
},
{
"op": reduce_sum_h,
Expand Down Expand Up @@ -2123,6 +2129,7 @@ def primary_moreh_norm_3(x):
{
"op": sum_0,
"name": "tt_lib.tensor.sum_dim_0",
"num_repeats": 2,
},
{
"op": sum_1,
Expand Down Expand Up @@ -2259,10 +2266,12 @@ def primary_moreh_norm_3(x):
{
"op": repeat_interleave_1,
"name": "tt_lib.tensor.repeat_interleave_dim_1",
"num_repeats": 2,
},
{
"op": repeat_interleave_2,
"name": "tt_lib.tensor.repeat_interleave_dim_2",
"num_repeats": 2,
},
{
"op": pow_int,
Expand All @@ -2279,34 +2288,42 @@ def primary_moreh_norm_3(x):
{
"op": argmax_1,
"name": "tt_lib.tensor.argmax_dim_3",
"num_repeats": 2,
},
{
"op": argmax_2,
"name": "tt_lib.tensor.argmax_dim_2",
"num_repeats": 2,
},
{
"op": argmax_3,
"name": "tt_lib.tensor.argmax_dim_1",
"num_repeats": 2,
},
{
"op": argmax_all,
"name": "tt_lib.tensor.argmax_all",
"num_repeats": 2,
},
{
"op": argmin_1,
"name": "tt_lib.tensor.argmin_dim_3",
"num_repeats": 2,
},
{
"op": argmin_2,
"name": "tt_lib.tensor.argmin_dim_2",
"num_repeats": 2,
},
{
"op": argmin_3,
"name": "tt_lib.tensor.argmin_dim_1",
"num_repeats": 2,
},
{
"op": argmin_all,
"name": "tt_lib.tensor.argmin_all",
"num_repeats": 2,
},
{
"op": tt_lib.tensor.fill_zero_bw,
Expand Down Expand Up @@ -2646,6 +2663,7 @@ def fused_linear_shape_func(input_shape):
{
"op": lamb_optimizer,
"name": "tt_lib.tensor.lamb_optimizer",
"num_repeats": 2,
},
{
"op": addalpha_bw,
Expand All @@ -2666,6 +2684,7 @@ def fused_linear_shape_func(input_shape):
{
"op": tt_lib.tensor.div_bw,
"name": "tt_lib.tensor.div_bw",
"num_repeats": 3,
},
{
"op": tt_lib.tensor.mul_bw,
Expand Down
44 changes: 23 additions & 21 deletions tests/tt_eager/profiling/profile_host_overhead.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
]

all_num_call_to_stack = [1, 3] # For 10 and more test execution spills to dispatch
num_repeats = 5
NUM_REPEATS = 5


def measure_host_overhead(op_func, op_name, device, num_call_to_stack, is_warmup):
Expand All @@ -46,34 +46,29 @@ def measure_host_overhead(op_func, op_name, device, num_call_to_stack, is_warmup
tt_lib.device.Synchronize(device)

duration = 1000 * (time.time() - start_time)
total_op_time = duration / num_call_to_stack
logger.info(f"{num_call_to_stack} calls and Synchronize after {duration:.2f}ms ({total_op_time:.2f}ms per call)")
avg_op_time = duration / num_call_to_stack
logger.info(f"{num_call_to_stack} calls and Synchronize after {duration:.2f}ms ({avg_op_time:.2f}ms per call)")

start_time = time.time()
for _ in range(num_call_to_stack):
op_func()

duration = 1000 * (time.time() - start_time)
overhead_ms = duration / num_call_to_stack
logger.info(f"{num_call_to_stack} calls without Synchronize {duration:.2f}ms ({overhead_ms:.2f}ms per call)")

start_time = time.time()
dispatch_end_time = time.time()
tt_lib.device.Synchronize(device)
duration = 1000 * (time.time() - start_time)
duration_per_call = duration / num_call_to_stack
logger.info(f"Synchronize {duration:.2f}ms ({duration_per_call:.2f}ms per call)")

sync_time = 1000 * (time.time() - dispatch_end_time)
dispatch_time = 1000 * (dispatch_end_time - start_time)
avg_dispatch_time = dispatch_time / num_call_to_stack

logger.info(
f"{num_call_to_stack} calls without Synchronize {dispatch_time:.2f}ms ({avg_dispatch_time:.2f}ms per call)"
)
logger.info(f"Synchronize {sync_time:.2f}ms ({(sync_time/num_call_to_stack):.2f}ms per call)")

if not is_warmup:
signpost(header=f"end {op_name}")

try:
# Dumping profile info
logger.info(f"Dumping device profiler data")
tt_lib.device.DumpDeviceProfiler(device)
except Exception as e:
logger.warning(f"DumpDeviceProfiler {e}")

return overhead_ms, total_op_time
return avg_dispatch_time, avg_op_time


def measure_host_overhead_binary(
Expand Down Expand Up @@ -199,6 +194,7 @@ def run_measure_host_overhead(op, device, text_file, measuring_func):
if "layout" in op and op["layout"] == "ROW_MAJOR":
dlayout = tt_lib.tensor.Layout.ROW_MAJOR

num_repeats = op["num_repeats"] if "num_repeats" in op else NUM_REPEATS
shape_func = None if "shape_func" not in op else op["shape_func"]

# Warmup
Expand Down Expand Up @@ -253,7 +249,7 @@ def test_host_overhead(device, user_input):
pytest tests/tt_eager/profiling/profile_host_overhead.py --input-method cli --cli-input host_overhead_profile::tt_lib.tensor.isclose
Run with tracy:
python -m tracy -v -r -p -o host_overhead_profile -m "pytest tests/tt_eager/profiling/profile_host_overhead.py --input-method cli --cli-input host_overhead_profile"
python -m tracy -v -r -p -o host_overhead_profile --no-device -m "pytest tests/tt_eager/profiling/profile_host_overhead.py --input-method cli --cli-input host_overhead_profile"
"""

if "::" in user_input[0]:
Expand All @@ -270,7 +266,10 @@ def test_host_overhead(device, user_input):
os.makedirs(out_directory)

with open(out_file_path, "w") as text_file:
text_file.write(f"op,count,overhead min(ms),overhead mean(ms),total mean(ms)\n")
start_time = time.time()
text_file.write(
f"op,count,python min dispatch time (ms),python mean dispatch time(ms),python mean dispatch + sync time (ms)\n"
)

for op in ops_for_profiling.all_binary_ops:
if op_name != "":
Expand All @@ -292,3 +291,6 @@ def test_host_overhead(device, user_input):
continue

run_measure_host_overhead(op, device, text_file, measure_host_overhead_ternary)

duration = (time.time() - start_time) / 60
logger.info(f"Profiling finished in {duration:.2f}min")
6 changes: 3 additions & 3 deletions tests/tt_eager/profiling/profile_host_overhead_with_tracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def profile_host_overhead(output_directory, output_csv):

for op_name in all_ops:
op_id = f"{i:03d}"
command = f'python -m tracy -v -r -p -o {output_directory} -n {op_id}_{op_name} -m "pytest tests/tt_eager/profiling/profile_host_overhead.py --input-method cli --cli-input {output_directory}::{op_name}"'
command = f'python -m tracy -v -r -p -o {output_directory} -n {op_id}_{op_name} --no-device -m "pytest tests/tt_eager/profiling/profile_host_overhead.py --input-method cli --cli-input {output_directory}::{op_name}"'
subprocess.run([command], shell=True, check=False, env=currentEnvs, timeout=3000)
i += 1

Expand Down Expand Up @@ -99,10 +99,10 @@ def profile_host_overhead(output_directory, output_csv):
# Add the average value to the final dataframe
op_count = row["count"]
host_duration = round(host_duration / (op_count * 1000 * 1000), 2)
final_df.loc[index, "HOST DURATION [ms]"] = host_duration
final_df.loc[index, "C++ mean dispatch time (ms)"] = host_duration

duration = (time.time() - start_time) / 60
logger.info(f"Total ops profiled {len(all_ops)} in {duration:.2f}min")
logger.info(f"{len(all_ops)} ops profiled in {duration:.2f}min")

# Sort output
final_df = final_df.sort_values("op")
Expand Down
2 changes: 1 addition & 1 deletion tests/tt_eager/profiling/reference.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
op,count,overhead min(ms),overhead mean(ms),total mean(ms),HOST DURATION [ms]
op,count,python min dispatch time (ms),python mean dispatch time(ms),python mean dispatch + sync time (ms),C++ mean dispatch time (ms)
tt_lib.fused_ops.linear.Linear,40,2.56,3.9,4.2,3.49
tt_lib.fused_ops.softmax.softmax,40,3.22,3.36,3.74,3.37
tt_lib.operations.primary.add_layernorm,40,0.99,1.19,1.65,1.26
Expand Down
4 changes: 0 additions & 4 deletions tests/tt_eager/profiling/test_host_overhead_ci.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,7 @@ def test_host_overhead_ci():
reference_filename = "tests/tt_eager/profiling/reference.txt"
measuring_tolerance = 1.1

start_time = time.time()
profile_host_overhead(profile_output_folder, profile_output_filename)
duration = (time.time() - start_time) / 60

logger.info(f"Profiled host overhead for all ops in {duration:.2f}min")

with open(measured_filename, mode="r") as infile:
reader = csv.reader(infile)
Expand Down

0 comments on commit e12d00e

Please sign in to comment.