Skip to content

Commit

Permalink
Vector softmax (Xilinx#1172)
Browse files Browse the repository at this point in the history
Co-authored-by: pjr <[email protected]>
  • Loading branch information
2 people authored and fifield committed Apr 10, 2024
1 parent 8f44d4e commit d032536
Show file tree
Hide file tree
Showing 8 changed files with 566 additions and 10 deletions.
3 changes: 3 additions & 0 deletions reference_designs/ipu-xrt/vector_softmax/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ endif
run: ${targetname}.exe build/final.xclbin build/insts.txt
${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE

# Same invocation as `run`, with the extra `-p results.csv` argument passed to
# the host executable.
profile: ${targetname}.exe build/final.xclbin build/insts.txt
${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE -p results.csv

trace:
../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > parse_eventIR_vs.json

Expand Down
2 changes: 1 addition & 1 deletion reference_designs/ipu-xrt/vector_softmax/aie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
def my_eltwise_add():

word_size_in = 2
N = 65536 # *1024
N = 262144 #*1024
N_in_bytes = N * word_size_in

A_sz_in_i32s = N_in_bytes // 4
Expand Down
121 changes: 121 additions & 0 deletions reference_designs/ipu-xrt/vector_softmax/aie2.py.orig
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2023 AMD Inc.

import sys

from aie.dialects.aie import *
from aie.dialects.aiex import *
from aie.dialects.scf import *
from aie.extras.context import mlir_mod_ctx


# Build the AIE design inside an MLIR module context and print the resulting
# module to stdout.
#
# Layout created below: one shim tile (0, 0), one memory tile (0, 1) and
# n_cores compute tiles at (0, 2 + i). Input flows shim -> memory tile ->
# cores, output flows cores -> memory tile -> shim, all through object fifos.
# Every core repeatedly calls the external function "exp_bf16_vector" (linked
# from "kernels.a") on one n-element buffer at a time.
#
# NOTE(review): despite the name, nothing here performs an element-wise add;
# the only kernel called is exp_bf16_vector.
# NOTE(review): sweep.py rewrites the tile-size and problem-size literals in
# this template with sed, so those two literals must stay spelled as they are.
def my_eltwise_add():

# Bytes per input element; the buffers declared below use bf16.
word_size_in = 2
# Total number of elements processed per invocation.
N = 65536 #*1024
N_in_bytes = N * word_size_in

# Transfer lengths expressed in 32-bit words (used as the DMA sizes in the
# runtime sequence at the bottom).
A_sz_in_i32s = N_in_bytes // 4
C_sz_in_i32s = N_in_bytes // 4

# Tile sizes
# n is the number of elements handed to one kernel call.
n = 1024
N_div_n = N // n

n_cores = 4
# Number of n-element buffers each core handles in one pass of the inner loop.
tiles = N_div_n // n_cores
# Depth passed to every object_fifo below (presumably double buffering --
# confirm against the object_fifo documentation).
buffer_depth = 2

with mlir_mod_ctx() as ctx:

@device(AIEDevice.ipu)
def device_body():
# Argument type of the external kernel (input and output buffer).
memRef_ty = T.memref(n, T.bf16())

# Type used in the tile memory
memRef_A_ty = T.memref(n, T.bf16())
memRef_C_ty = T.memref(n, T.bf16())

# Type used in the memory tile which aggregates across the 4 cores
memRef_A_MT_ty = T.memref(n * n_cores, T.bf16())
memRef_C_MT_ty = T.memref(n * n_cores, T.bf16())

# AIE Core Function declarations

exp_bf16_vector = external_func("exp_bf16_vector", inputs=[memRef_ty, memRef_ty])

# Tile declarations
ShimTile = tile(0, 0)

MemTile = tile(0, 1)
cores = [tile(0, 2 + i) for i in range(n_cores)]

# Per-core fifo names; the same strings key the dictionaries below.
inA_fifo_names = [f"memA{i}" for i in range(n_cores)]
outC_fifo_names = [f"memC{i}" for i in range(n_cores)]

inA_fifos = {}
outC_fifos = {}

# AIE-array data movement with object fifos
# Input A
# "inA" carries n * n_cores elements from the shim to the memory tile and is
# linked to the n_cores per-core fifos of n elements each.
inA = object_fifo("inA", ShimTile, MemTile, buffer_depth, memRef_A_MT_ty)
for i in range(n_cores):
inA_fifos[inA_fifo_names[i]] = object_fifo(
inA_fifo_names[i], MemTile, cores[i], buffer_depth, memRef_A_ty
)
object_fifo_link(inA, inA_fifo_names)

# Output C
# Mirror image of the input path: per-core fifos are linked into "outC",
# which goes from the memory tile back to the shim.
for i in range(n_cores):
outC_fifos[outC_fifo_names[i]] = object_fifo(
outC_fifo_names[i], cores[i], MemTile, buffer_depth, memRef_C_ty
)
outC = object_fifo("outC", MemTile, ShimTile, buffer_depth, memRef_C_MT_ty)
object_fifo_link(outC_fifo_names[0:n_cores], outC)

# Set up compute tiles
for i in range(n_cores):
# Compute tile i
@core(cores[i], "kernels.a")
def core_body():
# Outer loop bound is 0xFFFFFFFF (presumably "run forever" so the core
# can be re-invoked without reloading -- confirm).
for _ in for_(0xFFFFFFFF):
for _ in for_(tiles):
# Acquire one output buffer and one input buffer, run the kernel,
# then release both so the fifos can advance.
elem_out = outC_fifos[outC_fifo_names[i]].acquire(
ObjectFifoPort.Produce, 1
)
elem_in_a = inA_fifos[inA_fifo_names[i]].acquire(
ObjectFifoPort.Consume, 1
)

call(exp_bf16_vector,[elem_in_a, elem_out])

inA_fifos[inA_fifo_names[i]].release(
ObjectFifoPort.Consume, 1
)
outC_fifos[outC_fifo_names[i]].release(
ObjectFifoPort.Produce, 1
)
yield_([])
yield_([])

# To/from AIE-array data movement
# NOTE(review): the host-side tensor type is declared with N i32 elements,
# while the transfers below move N_in_bytes // 4 i32 words -- confirm the
# declared length is intentional.
tensor_ty = T.memref(N, T.i32())

@FuncOp.from_py_func(tensor_ty, tensor_ty)
def sequence(A, C):
# Transfer for the "outC" fifo using buffer C and for the "inA" fifo using
# buffer A, followed by an ipu_sync on column 0, row 0.
ipu_dma_memcpy_nd(
metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s]
)
ipu_dma_memcpy_nd(
metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
)
ipu_sync(column=0, row=0, direction=0, channel=0)

print(ctx.module)


my_eltwise_add()
34 changes: 34 additions & 0 deletions reference_designs/ipu-xrt/vector_softmax/bf16_softmax.mlir.orig
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Softmax over one bf16 tile: %arg0 is the input tile, %arg1 receives the
// result. Note that %arg0 is also used as scratch storage and is overwritten
// with the intermediate exponentials.
module {
func.func @dut(%arg0: memref<1024xbf16>, %arg1: memref<1024xbf16>) {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.000000e+00 : f32
%cst_1 = arith.constant 0.000000e+00 : bf16
// 0xFF80 is the bf16 bit pattern for -infinity: the identity for a max.
%cst_2 = arith.constant dense<0xFF80> : vector<32xbf16>
// Pass 1: lane-wise running maximum over the tile, 32 elements at a time.
%0 = affine.for %arg2 = 0 to 1024 step 32 iter_args(%arg3 = %cst_2) -> (vector<32xbf16>) {
%5 = vector.transfer_read %arg0[%arg2], %cst_1 : memref<1024xbf16>, vector<32xbf16>
%6 = arith.maximumf %arg3, %5 : vector<32xbf16>
affine.yield %6 : vector<32xbf16>
}
// Collapse the 32 lanes into the scalar maximum of the whole tile.
%1 = vector.reduction <maximumf>, %0 : vector<32xbf16> into bf16
// Pass 2: exp(x - max), written back into the input buffer in place.
affine.for %arg2 = 0 to 1024 {
%5 = affine.load %arg0[%arg2] : memref<1024xbf16>
%6 = arith.subf %5, %1 : bf16
%7 = math.exp %6 : bf16
affine.store %7, %arg0[%arg2] : memref<1024xbf16>
}
// Pass 3: sum of the exponentials, accumulated in f32.
%2 = affine.for %arg2 = 0 to 1024 iter_args(%arg3 = %cst) -> (f32) {
%5 = affine.load %arg0[%arg2] : memref<1024xbf16>
%6 = arith.extf %5 : bf16 to f32
%7 = arith.addf %arg3, %6 : f32
affine.yield %7 : f32
}
// Reciprocal of the sum, computed once in f32 and truncated to bf16.
%3 = arith.divf %cst_0, %2 : f32
%4 = arith.truncf %3 : f32 to bf16
// Pass 4: scale every exponential by the reciprocal and store to the output.
affine.for %arg2 = 0 to 1024 {
%5 = affine.load %arg0[%arg2] : memref<1024xbf16>
%6 = arith.mulf %5, %4 : bf16
affine.store %6, %arg1[%arg2] : memref<1024xbf16>
}
return
}
}
30 changes: 30 additions & 0 deletions reference_designs/ipu-xrt/vector_softmax/exp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,33 @@ extern "C" {
void exp_bf16_vector(bfloat16 *a_in, bfloat16 *c_out) { dut(a_in, c_out); }

} // extern "C"
//===- scale.cc -------------------------------------------------*- C++ -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// Copyright (C) 2023, Advanced Micro Devices, Inc.
//
//===----------------------------------------------------------------------===//

#define __AIENGINE__ 2
#define NOCPP
#define __AIEARCH__ 20

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <type_traits>

#include <aie_api/aie.hpp>

extern void dut(bfloat16 *a_in, bfloat16 *cout);

extern "C" {

// exp_bf16_vector is already defined earlier in this file (the one-line
// wrapper that forwards to dut()). A second definition in the same translation
// unit is a redefinition error, so this repeated copy only re-declares the
// symbol; the earlier definition remains the single implementation.
void exp_bf16_vector(bfloat16 *a_in, bfloat16 *c_out);

} // extern "C"
20 changes: 20 additions & 0 deletions reference_designs/ipu-xrt/vector_softmax/sweep.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
"""Sweep the vector_softmax design over problem sizes and tile sizes.

For every (problem size, tile size) pair the ``*.orig`` templates are
specialised into the real source files, the design is rebuilt from scratch and
``make profile`` is run so that timing rows accumulate in ``results.csv``.
"""
import subprocess
from pathlib import Path

RESULTS_CSV = "results.csv"
PROBLEM_SIZES = [16384, 32768, 65536, 131072, 262144]
TILE_SIZES = [64, 128, 256, 512, 1024]
TEMPLATED_FILES = ["bf16_softmax.mlir", "test.cpp", "aie2.py"]

# Literals the ``*.orig`` templates are written with; every occurrence is
# replaced textually, exactly as the previous sed-based implementation did.
TEMPLATE_TILE_SIZE = "1024"
TEMPLATE_PROBLEM_SIZE = "65536"


def specialize_template(text, tile_size, problem_size):
    """Return ``text`` with the template tile/problem sizes substituted.

    The tile size is substituted first and the problem size second, which is
    the same order the original two-step sed pipeline used.
    """
    with_tile = text.replace(TEMPLATE_TILE_SIZE, str(tile_size))
    return with_tile.replace(TEMPLATE_PROBLEM_SIZE, str(problem_size))


def main():
    """Run the full sweep, starting from an empty results file."""
    # Equivalent of `rm -f results.csv; touch results.csv`.
    Path(RESULTS_CSV).write_text("")

    for problem_size in PROBLEM_SIZES:
        for tile_size in TILE_SIZES:
            # Substitution is done in Python rather than by shelling out to
            # sed: the old command used a backslash as the sed delimiter and
            # contained the invalid Python escape "\g". Unlike the sed version,
            # no intermediate "<file>.first" is left behind.
            for name in TEMPLATED_FILES:
                template = Path(f"{name}.orig").read_text()
                Path(name).write_text(
                    specialize_template(template, tile_size, problem_size)
                )

            subprocess.run(["make", "clean"], stdout=subprocess.DEVNULL)

            # Do not profile a configuration that failed to build; report it
            # and carry on with the rest of the sweep.
            if subprocess.run(["make", "all"]).returncode != 0:
                print(
                    f"sweep: build failed for size={problem_size} "
                    f"tile={tile_size}; skipping profile"
                )
                continue

            subprocess.run(["make", "profile"])


if __name__ == "__main__":
    main()
41 changes: 32 additions & 9 deletions reference_designs/ipu-xrt/vector_softmax/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,30 @@
#include <ctime>
#include <fstream>
#include <iostream>
#include <math.h>
#include <sstream>
#include <stdfloat>
#include <string>
#include <vector>

#include "xrt/xrt_bo.h"
#include "xrt/xrt_device.h"
#include "xrt/xrt_kernel.h"

// When true, the device output is checked against the host reference.
constexpr bool VERIFY = true;

// Total number of elements processed per run. The committed merge-conflict
// markers are resolved in favour of 262144 so that this agrees with N in
// aie2.py, which this same change sets to 262144.
constexpr int IN_SIZE = 262144; //*1024;
// Number of elements in one softmax tile; the host reference normalises each
// TILE_SIZE-element chunk independently.
constexpr int TILE_SIZE = 1024;
constexpr int OUT_SIZE = IN_SIZE;

// The host reference walks the input in whole tiles and indexes [t + i]
// without a bounds check, so a partial trailing tile would read out of range.
static_assert(IN_SIZE % TILE_SIZE == 0,
              "IN_SIZE must be a multiple of TILE_SIZE");

Expand Down Expand Up @@ -88,6 +99,10 @@ int main(int argc, const char *argv[]) {
"the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
"verbosity,v", po::value<int>()->default_value(0),
"the verbosity of the output")(
<<<<<<< HEAD
"profile,p", po::value<std::string>()->default_value(""),"CSV profile")(
=======
>>>>>>> asplos
"instr,i", po::value<std::string>()->required(),
"path of file containing userspace instructions to be sent to the LX6");
po::variables_map vm;
Expand Down Expand Up @@ -180,7 +195,7 @@ int main(int argc, const char *argv[]) {

int sticky_errors = 0;

unsigned num_iter = 256;
unsigned num_iter = 64;
float npu_time_total = 0;
float npu_time_min = 9999999;
float npu_time_max = 0;
Expand Down Expand Up @@ -215,21 +230,22 @@ int main(int argc, const char *argv[]) {
std::vector<std::bfloat16_t> RefVec(IN_SIZE);
auto cpu_start = std::chrono::high_resolution_clock::now();

for (uint32_t t = 0; t < IN_SIZE; t += TILE_SIZE) {
for (uint32_t t = 0; t < IN_SIZE; t+=TILE_SIZE) {
float running = 0.0;
for (uint32_t i = 0; i < TILE_SIZE; i++) {
float ez = (float)(exp(AVec[t + i]));
float ez = (float)(exp(AVec[t+i]));
running += ez;
RefVec[t + i] = exp(AVec[t + i]);
RefVec[t+i] = exp(AVec[t+i]);
}

for (uint32_t i = 0; i < TILE_SIZE; i++) {
RefVec[t + i] /= running;
RefVec[t+i] /= running;
}
}
}
auto cpu_stop = std::chrono::high_resolution_clock::now();
float cpu_time = std::chrono::duration_cast<std::chrono::microseconds>(
cpu_stop - cpu_start)
.count();
float cpu_time =
std::chrono::duration_cast<std::chrono::microseconds>(cpu_stop - cpu_start)
.count();

cpu_time_total += cpu_time;
cpu_time_min = (cpu_time < cpu_time_min) ? cpu_time : cpu_time_min;
Expand Down Expand Up @@ -264,6 +280,13 @@ int main(int argc, const char *argv[]) {
npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;

std::string profile = vm["profile"].as<std::string>();
if (profile.length()) {
std::ofstream of;
of.open(profile, std::ios::app); // Append
of << IN_SIZE << "," << TILE_SIZE << "," << npu_time << std::endl;
}

if (VERIFY) {
if (!errors) {
std::cout << iter << ": pass! in " << npu_time << "us" << std::endl;
Expand Down
Loading

0 comments on commit d032536

Please sign in to comment.