Xilinx · jamesroxbypb · Apr 4, 2024 · Apr 4, 2024 · Apr 4, 2024 · Apr 4, 2024
diff --git a/reference_designs/ipu-xrt/vector_softmax/Makefile b/reference_designs/ipu-xrt/vector_softmax/Makefile
@@ -53,6 +53,9 @@ endif
 run: ${targetname}.exe build/final.xclbin build/insts.txt 
 	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
 
+profile: ${targetname}.exe build/final.xclbin build/insts.txt 
+	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE -p results.csv
+
 trace:
 	../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > parse_eventIR_vs.json
 

diff --git a/reference_designs/ipu-xrt/vector_softmax/aie2.py b/reference_designs/ipu-xrt/vector_softmax/aie2.py
@@ -16,7 +16,7 @@
 def my_eltwise_add():
 
     word_size_in = 2
-    N = 65536  # *1024
+    N = 262144  # *1024
     N_in_bytes = N * word_size_in
 
     A_sz_in_i32s = N_in_bytes // 4

diff --git a/reference_designs/ipu-xrt/vector_softmax/aie2.py.orig b/reference_designs/ipu-xrt/vector_softmax/aie2.py.orig
@@ -0,0 +1,121 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 AMD Inc.
+
+import sys
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.context import mlir_mod_ctx
+
+
+def my_eltwise_add():
+
+    word_size_in = 2
+    N = 65536 #*1024
+    N_in_bytes = N * word_size_in
+
+    A_sz_in_i32s = N_in_bytes // 4
+    C_sz_in_i32s = N_in_bytes // 4
+
+    # Tile sizes
+    n = 1024
+    N_div_n = N // n
+
+    n_cores = 4
+    tiles = N_div_n // n_cores
+    buffer_depth = 2
+
+    with mlir_mod_ctx() as ctx:
+
+        @device(AIEDevice.ipu)
+        def device_body():
+            memRef_ty = T.memref(n, T.bf16())
+
+            # Type used in the tile memory
+            memRef_A_ty = T.memref(n, T.bf16())
+            memRef_C_ty = T.memref(n, T.bf16())
+
+            # Type used in the memory tile which aggregates across the 4 cores
+            memRef_A_MT_ty = T.memref(n * n_cores, T.bf16())
+            memRef_C_MT_ty = T.memref(n * n_cores, T.bf16())
+
+            # AIE Core Function declarations
+
+            exp_bf16_vector = external_func("exp_bf16_vector", inputs=[memRef_ty, memRef_ty])
+
+            # Tile declarations
+            ShimTile = tile(0, 0)
+
+            MemTile = tile(0, 1)
+            cores = [tile(0, 2 + i) for i in range(n_cores)]
+
+            inA_fifo_names = [f"memA{i}" for i in range(n_cores)]
+            outC_fifo_names = [f"memC{i}" for i in range(n_cores)]
+
+            inA_fifos = {}
+            outC_fifos = {}
+
+            # AIE-array data movement with object fifos
+            # Input A
+            inA = object_fifo("inA", ShimTile, MemTile, buffer_depth, memRef_A_MT_ty)
+            for i in range(n_cores):
+                inA_fifos[inA_fifo_names[i]] = object_fifo(
+                    inA_fifo_names[i], MemTile, cores[i], buffer_depth, memRef_A_ty
+                )
+            object_fifo_link(inA, inA_fifo_names)
+
+            # Output C
+            for i in range(n_cores):
+                outC_fifos[outC_fifo_names[i]] = object_fifo(
+                    outC_fifo_names[i], cores[i], MemTile, buffer_depth, memRef_C_ty
+                )
+            outC = object_fifo("outC", MemTile, ShimTile, buffer_depth, memRef_C_MT_ty)
+            object_fifo_link(outC_fifo_names[0:n_cores], outC)
+
+            # Set up compute tiles
+            for i in range(n_cores):
+                # Compute tile i
+                @core(cores[i], "kernels.a")
+                def core_body():
+                    for _ in for_(0xFFFFFFFF):
+                        for _ in for_(tiles):
+                            elem_out = outC_fifos[outC_fifo_names[i]].acquire(
+                                ObjectFifoPort.Produce, 1
+                            )
+                            elem_in_a = inA_fifos[inA_fifo_names[i]].acquire(
+                                ObjectFifoPort.Consume, 1
+                            )
+
+                            call(exp_bf16_vector,[elem_in_a, elem_out])
+
+                            inA_fifos[inA_fifo_names[i]].release(
+                                ObjectFifoPort.Consume, 1
+                            )
+                            outC_fifos[outC_fifo_names[i]].release(
+                                ObjectFifoPort.Produce, 1
+                            )
+                            yield_([])
+                        yield_([])
+
+            # To/from AIE-array data movement
+            tensor_ty = T.memref(N, T.i32())
+
+            @FuncOp.from_py_func(tensor_ty, tensor_ty)
+            def sequence(A, C):
+                ipu_dma_memcpy_nd(
+                    metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s]
+                )
+                ipu_dma_memcpy_nd(
+                    metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
+                )
+                ipu_sync(column=0, row=0, direction=0, channel=0)
+
+    print(ctx.module)
+
+
+my_eltwise_add()
diff --git a/reference_designs/ipu-xrt/vector_softmax/bf16_softmax.mlir.orig b/reference_designs/ipu-xrt/vector_softmax/bf16_softmax.mlir.orig
@@ -0,0 +1,34 @@
+module {
+  func.func @dut(%arg0: memref<1024xbf16>, %arg1: memref<1024xbf16>) {
+    %cst = arith.constant 0.000000e+00 : f32
+    %cst_0 = arith.constant 1.000000e+00 : f32
+    %cst_1 = arith.constant 0.000000e+00 : bf16
+    %cst_2 = arith.constant dense<0xFF80> : vector<32xbf16>
+    %0 = affine.for %arg2 = 0 to 1024 step 32 iter_args(%arg3 = %cst_2) -> (vector<32xbf16>) {
+      %5 = vector.transfer_read %arg0[%arg2], %cst_1 : memref<1024xbf16>, vector<32xbf16>
+      %6 = arith.maximumf %arg3, %5 : vector<32xbf16>
+      affine.yield %6 : vector<32xbf16>
+    }
+    %1 = vector.reduction <maximumf>, %0 : vector<32xbf16> into bf16
+    affine.for %arg2 = 0 to 1024 {
+      %5 = affine.load %arg0[%arg2] : memref<1024xbf16>
+      %6 = arith.subf %5, %1 : bf16
+      %7 = math.exp %6 : bf16
+      affine.store %7, %arg0[%arg2] : memref<1024xbf16>
+    }
+    %2 = affine.for %arg2 = 0 to 1024 iter_args(%arg3 = %cst) -> (f32) {
+      %5 = affine.load %arg0[%arg2] : memref<1024xbf16>
+      %6 = arith.extf %5 : bf16 to f32
+      %7 = arith.addf %arg3, %6 : f32
+      affine.yield %7 : f32
+    }
+    %3 = arith.divf %cst_0, %2 : f32
+    %4 = arith.truncf %3 : f32 to bf16
+    affine.for %arg2 = 0 to 1024 {
+      %5 = affine.load %arg0[%arg2] : memref<1024xbf16>
+      %6 = arith.mulf %5, %4 : bf16
+      affine.store %6, %arg1[%arg2] : memref<1024xbf16>
+    }
+    return
+  }
+}
diff --git a/reference_designs/ipu-xrt/vector_softmax/sweep.py b/reference_designs/ipu-xrt/vector_softmax/sweep.py
@@ -0,0 +1,29 @@
+"""Sweep vector_softmax over input sizes and tile sizes, profiling each build.
+
+Each source is regenerated from its ``.orig`` template by substituting the
+literals 1024 and 65536, then the design is rebuilt and ``make profile`` runs.
+"""
+
+import os
+
+# Start every sweep from an empty results.csv.
+for action in ["rm -f", "touch"]:
+    cmd = f"{action} results.csv"
+    os.system(cmd)
+
+
+for s in [16384, 32768, 65536, 131072, 262144]:
+    for i in [64, 128, 256, 512, 1024]:
+        for f in ["bf16_softmax.mlir", "test.cpp", "aie2.py"]:
+            # Use "/" as the sed delimiter: a backslash delimiter is not
+            # portable and "\g" is an invalid Python escape sequence.
+            sed = f"sed 's/1024/{i}/g' {f}.orig > {f}.first"
+            os.system(sed)
+            sed = f"sed 's/65536/{s}/g' {f}.first > {f}"
+            os.system(sed)
+        make_clean = "make clean > /dev/null"
+        os.system(make_clean)
+        make_all = "make all"
+        os.system(make_all)
+        make_profile = "make profile"
+        os.system(make_profile)
diff --git a/reference_designs/ipu-xrt/vector_softmax/test.cpp b/reference_designs/ipu-xrt/vector_softmax/test.cpp
@@ -16,19 +16,19 @@
 #include <ctime>
 #include <fstream>
 #include <iostream>
 #include <math.h>
 #include <sstream>
 #include <stdfloat>
 #include <string>
 #include <vector>
 
 #include "xrt/xrt_bo.h"
 #include "xrt/xrt_device.h"
 #include "xrt/xrt_kernel.h"
 
 constexpr bool VERIFY = true;
 
-constexpr int IN_SIZE = 65536; //*1024;
+constexpr int IN_SIZE = 262144; //*1024;
 constexpr int TILE_SIZE = 1024;
 constexpr int OUT_SIZE = IN_SIZE;
 
@@ -88,6 +88,7 @@ int main(int argc, const char *argv[]) {
       "the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
       "verbosity,v", po::value<int>()->default_value(0),
       "the verbosity of the output")(
+      "profile,p", po::value<std::string>()->default_value(""), "CSV profile")(
       "instr,i", po::value<std::string>()->required(),
       "path of file containing userspace instructions to be sent to the LX6");
   po::variables_map vm;
@@ -180,7 +181,7 @@ int main(int argc, const char *argv[]) {
 
   int sticky_errors = 0;
 
-  unsigned num_iter = 256;
+  unsigned num_iter = 64;
   float npu_time_total = 0;
   float npu_time_min = 9999999;
   float npu_time_max = 0;
@@ -215,21 +216,22 @@ int main(int argc, const char *argv[]) {
       std::vector<std::bfloat16_t> RefVec(IN_SIZE);
       auto cpu_start = std::chrono::high_resolution_clock::now();
 
       for (uint32_t t = 0; t < IN_SIZE; t += TILE_SIZE) {
         float running = 0.0;
         for (uint32_t i = 0; i < TILE_SIZE; i++) {
           float ez = (float)(exp(AVec[t + i]));
           running += ez;
           RefVec[t + i] = exp(AVec[t + i]);
         }
+
         for (uint32_t i = 0; i < TILE_SIZE; i++) {
           RefVec[t + i] /= running;
         }
       }
       auto cpu_stop = std::chrono::high_resolution_clock::now();
       float cpu_time = std::chrono::duration_cast<std::chrono::microseconds>(
                            cpu_stop - cpu_start)
                            .count();
 
       cpu_time_total += cpu_time;
       cpu_time_min = (cpu_time < cpu_time_min) ? cpu_time : cpu_time_min;
@@ -264,6 +266,13 @@ int main(int argc, const char *argv[]) {
     npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
     npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
 
+    std::string profile = vm["profile"].as<std::string>();
+    if (profile.length()) {
+      std::ofstream of;
+      of.open(profile, std::ios::app); // Append
+      of << IN_SIZE << "," << TILE_SIZE << "," << npu_time << std::endl;
+    }
+
     if (VERIFY) {
       if (!errors) {
         std::cout << iter << ": pass! in " << npu_time << "us" << std::endl;