Merge pull request #309 from opencompl/sasha/big-pipeline

bigger matrix size for pipeline
opencompl · Oct 3, 2024 · 975633a · 975633a
2 parents 4a3b885 + 59d4cbe
commit 975633a
Show file tree

Hide file tree

Showing 9 changed files with 78 additions and 37 deletions.
diff --git a/Makefile b/Makefile
@@ -15,7 +15,10 @@ all: maybe_update_xdsl_commit
 low_level_representation: maybe_update_xdsl_commit
 	snakemake --cores $(JOBS) --rerun-incomplete low_level_representation
 
+pipeline: maybe_update_xdsl_commit
+	snakemake --cores $(JOBS) --rerun-incomplete pipeline
+
 clean:
-	snakemake --delete-all-output --rerun-incomplete fast all
+	snakemake --delete-all-output --rerun-incomplete fast all pipeline low_level_representation
 
 include ./Makefile.xdsl
diff --git a/Snakefile b/Snakefile
@@ -85,10 +85,22 @@ MANUAL_KERNELS = [
 # Test sets
 ###########################################################
 
+PIPELINE_PARAMS_FAST = "1x20x5xf64"
+PIPELINE_PARAMS_FULL = "1x200x5xf64"
+
+TESTSET_PIPELINE_FAST = [
+    *expand("matmul/" + PIPELINE_PARAMS_FAST + "/{phase}", phase=XDSL_LINALG_OPT_VARIANTS),
+]
+
+TESTSET_PIPELINE = [
+    *expand("matmul/" + PIPELINE_PARAMS_FULL + "/{phase}", phase=XDSL_LINALG_OPT_VARIANTS),
+]
+
 # Minimum set of tests to be used as a meaningful smoke test,
 # runs as fast as possible to save CI time
 TESTSET_FAST = [
     *MANUAL_KERNELS,
+    *TESTSET_PIPELINE_FAST,
     # 3d templated kernels
     *expand(
         "matmul_transb/4x16x16xf32/{variant}",
@@ -118,7 +130,6 @@ TESTSET_FAST = [
         "sum/4x8xf32/{variant}", variant=["baseline", "snrt", "linalg", "linalg_xdsl"]
     ),
     *expand("sum/8x8xf16/{variant}", variant=["baseline", "linalg_xdsl"]),
-    *expand("matmul/1x20x5xf64/{phase}", phase=XDSL_LINALG_OPT_VARIANTS),
 ]
 
 TESTSET_LOW_LEVEL_REPRESENTATION = [
@@ -176,6 +187,7 @@ TESTSET_LOW_LEVEL_REPRESENTATION = [
 TESTSET_ALL = [
     *MANUAL_KERNELS,
     *TESTSET_LOW_LEVEL_REPRESENTATION,
+    *TESTSET_PIPELINE,
     # 3d templated kernels: baseline + linalg_xdsl
     *expand(
         "matmul/{M}x{K}x{N}xf64/{variant}",
@@ -185,9 +197,7 @@ TESTSET_ALL = [
         variant=["baseline", "linalg_xdsl"],
     ),
     # Passes contributions
-    "matmul/1x400x25xf64/linalg_xdsl",
-    "matmul/1x400x25xf64/linalg_full_xdsl",
-    *expand("matmul/1x400x25xf64/{phase}", phase=XDSL_LINALG_OPT_VARIANTS),
+    "matmul/" + PIPELINE_PARAMS_FULL + "/linalg_xdsl",
     # 2d templated kernels: baseline + linalg_xdsl
     *expand(
         "{kernel}/{M}x{N}xf64/{variant}",
@@ -224,7 +234,8 @@ def select_test_set_profiles(wildcards) -> list[str]:
     sets = {
         "fast": sorted(set(TESTSET_FAST)),
         "all": sorted(set(TESTSET_ALL)),
-        "low_level_representation": sorted(set(TESTSET_LOW_LEVEL_REPRESENTATION))
+        "low_level_representation": sorted(set(TESTSET_LOW_LEVEL_REPRESENTATION)),
+        "pipeline": sorted(set(TESTSET_PIPELINE)),
     }
     name = wildcards.testset
     if name not in sets:
@@ -233,6 +244,19 @@ def select_test_set_profiles(wildcards) -> list[str]:
         )
     return expand("kernels/{test}.profile.json", test=sets[name])
 
+def select_test_set_regalloc_jsons(wildcards) -> list[str]:
+    sets = {
+        "fast": sorted(set(TESTSET_FAST)),
+        "all": sorted(set(TESTSET_ALL)),
+        "low_level_representation": sorted(set(TESTSET_LOW_LEVEL_REPRESENTATION)),
+        "pipeline": sorted(set(TESTSET_PIPELINE)),
+    }
+    name = wildcards.testset
+    if name not in sets:
+        raise ValueError(
+            f"unknown test set name '{name}', valid values are: {sets.keys()}"
+        )
+    return expand("kernels/{test}.regalloc.json", test=sets[name])
 
 ###########################################################
 # Target rules
@@ -246,7 +270,6 @@ rule fast:
         "results/pivoted_fpu.fast.csv",
         "results/pivoted_ipc.fast.csv",
         "results/regalloc.fast.csv",
-        "results/pipeline.fast.csv",
     # This is the default rule taking over former result
     # file names:
     output:
@@ -266,6 +289,17 @@ rule low_level_representation:
     input:
         "results/kernels.low_level_representation.csv"
 
+rule pipeline:
+    input:
+        kernels="results/kernels.pipeline.csv",
+        regalloc="kernels/regalloc.pipeline.jsonl",
+        frep_count="results/frep_count.csv",
+        pipeline_py="scripts/pipeline.py",
+    output:
+        "results/pipeline.csv",
+    shell:
+        "python {input.pipeline_py} {input.kernels} {input.regalloc} {input.frep_count} -o {output}"
+
 rule all:
     input:
         "results/kernels.all.csv",
@@ -384,18 +418,18 @@ rule assembly_to_regalloc_stats:
 
 rule combine_regalloc_stats:
     input:
-        *expand("kernels/{test}.regalloc.json", test=TESTSET_FAST),
+        select_test_set_regalloc_jsons
     output:
-        "kernels/regalloc.fast.jsonl",
+        "kernels/regalloc.{testset}.jsonl",
     shell:
         "cat {input} > {output}"
 
 
 rule count_frep_instructions:
     input:
-        expand("kernels/matmul/1x20x5xf64/{test}.S", test=XDSL_LINALG_OPT_VARIANTS)
+        expand("kernels/matmul/" + PIPELINE_PARAMS_FULL + "/{phase}.S", phase=XDSL_LINALG_OPT_VARIANTS),
     output:
-        "results/frep_count.fast.csv"
+        "results/frep_count.csv"
     shell:
         """
         echo "variant,frep_count" > {output}
@@ -411,7 +445,7 @@ rule regalloc_stats_to_csv:
     input:
         "kernels/regalloc.fast.jsonl",
     output:
-        "results/regalloc.fast.csv",
+        "results/regalloc.{testset}.csv",
     run:
         import pandas as pd
 
@@ -424,18 +458,6 @@ rule regalloc_stats_to_csv:
         df.to_csv(output[0], index=True)
 
 
-rule pipeline:
-    input:
-        kernels="results/kernels.fast.csv",
-        regalloc="kernels/regalloc.fast.jsonl",
-        frep_count="results/frep_count.fast.csv",
-        pipeline_py="scripts/pipeline.py",
-    output:
-        "results/pipeline.fast.csv",
-    shell:
-        "python {input.pipeline_py} {input.kernels} {input.regalloc} {input.frep_count} -o {output}"
-
-
 rule optimization_pipelines:
     input:
         passes = "kernels/optimization_passes.txt",

diff --git a/results/frep_count.fast.csv → results/frep_count.csv b/results/frep_count.fast.csv → results/frep_count.csv
diff --git a/results/kernels.pipeline.csv b/results/kernels.pipeline.csv
@@ -0,0 +1,7 @@
+test,params,impl,cycles,end,end_fpss,fpss_avg_fpu_latency,fpss_avg_load_latency,fpss_fpu_fmadd_issues,fpss_fpu_issues,fpss_fpu_latency,fpss_fpu_occupancy,fpss_fpu_rel_occupancy,fpss_issues,fpss_load_latency,fpss_loads,fpss_occupancy,fpss_section_latency,fpss_stores,fseq_fpu_yield,fseq_yield,section,snitch_avg_load_latency,snitch_fseq_offloads,snitch_fseq_rel_offloads,snitch_issues,snitch_load_latency,snitch_loads,snitch_occupancy,snitch_stores,start,tend,total_ipc,tstart
+matmul,1x200x5xf64,linalg_0_xdsl,40161,41098,41095,2.9950099800399204,1.0,1000,1002,3001,0.024949577948756255,0.20011983223487118,5007,3000,3000,0.12467319040860536,0,1005,1.0,1.0,1,0,5007,0.19972874865371573,20062,0,0,0.49953935409974853,0,938,0.0,0.624212544508354,0.0
+matmul,1x200x5xf64,linalg_1_xdsl,19165,20098,20095,2.985104270109235,1.0,1000,1007,3006,0.05254369945212627,0.33444038525406844,3011,1000,1000,0.15710931385337856,0,1000,1.0,1.0,1,0,3011,0.23019877675840977,10069,0,0,0.5253848160709627,0,934,0.0,0.6824941299243412,0.0
+matmul,1x200x5xf64,linalg_2_xdsl,4147,5080,5077,2.985104270109235,1.0,1000,1007,3006,0.24282613937786351,0.9862879529872673,1021,5,5,0.24620207378828068,0,5,1.0,1.0,1,0,1021,0.32619808306709264,2109,0,0,0.5085604051121293,0,934,0.0,0.7547624789004099,0.0
+matmul,1x200x5xf64,linalg_3_xdsl,4124,5039,5036,2.985104270109235,1.0,1000,1007,3006,0.244180407371484,0.9862879529872673,1021,5,5,0.24757516973811833,0,5,36.464285714285715,36.464285714285715,1,0,28,0.224,97,0,0,0.02352085354025218,0,916,0.0,0.2710960232783705,0.0
+matmul,1x200x5xf64,linalg_4_xdsl,4130,5069,5066,2.975296442687747,0.0,1000,1012,3011,0.24503631961259079,0.9980276134122288,1014,0,0,0.24552058111380146,0,0,42.25,42.25,1,0,24,0.3,56,0,0,0.013559322033898305,0,940,0.0,0.25907990314769974,0.0
+matmul,1x200x5xf64,linalg_5_xdsl,1115,2030,2027,2.980217606330366,0.0,1000,1011,3013,0.9067264573991032,0.998025666337611,1013,0,0,0.9085201793721973,0,0,53.31578947368421,53.31578947368421,1,0,19,0.37254901960784315,32,0,0,0.028699551569506727,0,916,0.0,0.9372197309417041,0.0
diff --git a/results/pipeline.csv b/results/pipeline.csv
@@ -0,0 +1,7 @@
+variant,F Registers,X Registers,Cycles,FPU Occupancy [%],F Loads,F Stores,FMAdd Issues,FRep Count
+Baseline,3,14,40161,2.49,3000,1005,1000,0
++ Streams,3,12,19165,5.25,1000,1000,1000,0
++ Scalar Replacement,3,11,4147,24.28,5,5,1000,0
++ FRep,3,10,4124,24.42,5,5,1000,2
++ Fuse Fill,5,9,4130,24.50,0,0,1000,1
++ Unroll and Jam,8,8,1115,90.67,0,0,1000,1
diff --git a/results/pipeline.fast.csv b/results/pipeline.fast.csv
diff --git a/results/regalloc.fast.csv b/results/regalloc.fast.csv
@@ -1,10 +1,10 @@
 impl,params,allocated_float,allocated_int
-matmul,4x16x8xf64,8,9
-sum,4x4xf64,3,8
+conv2d_d1_s1_3x3,4x4xf64,8,9
 fill,4x4xf64,3,4
-relu,4x4xf64,3,6
-pooling_nchw_sum_d1_s2_3x3,4x4xf64,7,7
+matmul,4x16x8xf64,8,9
 pooling_nchw_max_d1_s2_3x3,4x4xf64,7,7
-conv2d_d1_s1_3x3,4x4xf64,8,9
+pooling_nchw_sum_d1_s2_3x3,4x4xf64,7,7
+relu,4x4xf64,3,6
+sum,4x4xf64,3,8
 sum,4x8xf32,3,8
 sum,8x8xf16,3,8
diff --git a/results/regalloc.pipeline.csv b/results/regalloc.pipeline.csv
@@ -0,0 +1,10 @@
+impl,params,allocated_float,allocated_int
+conv2d_d1_s1_3x3,4x4xf64,8,9
+fill,4x4xf64,3,4
+matmul,4x16x8xf64,8,9
+pooling_nchw_max_d1_s2_3x3,4x4xf64,7,7
+pooling_nchw_sum_d1_s2_3x3,4x4xf64,7,7
+relu,4x4xf64,3,6
+sum,4x4xf64,3,8
+sum,4x8xf32,3,8
+sum,8x8xf16,3,8
diff --git a/scripts/pipeline.py b/scripts/pipeline.py
@@ -58,7 +58,6 @@ def main():
     regalloc_df = pd.read_json(regalloc_stats, lines=True)
     regalloc_df = regalloc_df[regalloc_df.impl == "matmul"]
     del regalloc_df["impl"]
-    regalloc_df = regalloc_df[regalloc_df.params == "1x20x5xf64"]
     del regalloc_df["params"]
 
     regalloc_df = regalloc_df.set_index("variant")