Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vector softmax #1172

Merged
merged 3 commits into from
Apr 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions reference_designs/ipu-xrt/vector_softmax/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ endif
# Run the host executable ($< expands to the first prerequisite, ${targetname}.exe)
# with the built xclbin, the instruction file and the kernel name MLIR_AIE.
run: ${targetname}.exe build/final.xclbin build/insts.txt
${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE

# Same invocation as `run`, plus `-p results.csv`: the host program's
# "profile" option, which names the CSV file it appends its timing rows to.
profile: ${targetname}.exe build/final.xclbin build/insts.txt
${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE -p results.csv

# Post-process trace.txt with parse_eventIR.py and redirect its output to
# parse_eventIR_vs.json. This target lists no prerequisites, so trace.txt and
# build/aie.mlir must already exist when it is invoked.
trace:
../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > parse_eventIR_vs.json

Expand Down
2 changes: 1 addition & 1 deletion reference_designs/ipu-xrt/vector_softmax/aie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
def my_eltwise_add():

word_size_in = 2
N = 65536 # *1024
N = 262144 #*1024
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[black] reported by reviewdog 🐶

Suggested change
N = 262144 #*1024
N = 262144 # *1024

N_in_bytes = N * word_size_in

A_sz_in_i32s = N_in_bytes // 4
Expand Down
121 changes: 121 additions & 0 deletions reference_designs/ipu-xrt/vector_softmax/aie2.py.orig
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2023 AMD Inc.

import sys

from aie.dialects.aie import *
from aie.dialects.aiex import *
from aie.dialects.scf import *
from aie.extras.context import mlir_mod_ctx


def my_eltwise_add():
    """Build the AIE design for the bf16 vector kernel and print it as MLIR.

    The design streams ``N`` bf16 elements from the shim tile through the
    memory tile, which distributes them in chunks of ``n`` elements to
    ``n_cores`` compute tiles. Every compute tile calls the external
    ``exp_bf16_vector`` kernel once per chunk and the results travel back
    along the mirrored path. The generated module is written to stdout.

    Despite its name, this function does not emit an element-wise add; the
    only kernel it references is ``exp_bf16_vector``.

    Raises:
        ValueError: if ``N`` is not a whole multiple of ``n * n_cores``.
    """
    word_size_in = 2  # bytes per bf16 element
    N = 65536  # *1024
    N_in_bytes = N * word_size_in

    # The runtime DMA sizes below are expressed in 32-bit words.
    A_sz_in_i32s = N_in_bytes // 4
    C_sz_in_i32s = N_in_bytes // 4

    # Tile sizes
    n = 1024
    N_div_n = N // n

    n_cores = 4
    tiles = N_div_n // n_cores
    buffer_depth = 2

    # Both divisions above floor silently; without this check a size that is
    # not a whole number of per-core chunks would leave tail elements
    # unprocessed by the cores.
    if N % (n * n_cores) != 0:
        raise ValueError(
            f"N ({N}) must be a multiple of n * n_cores ({n} * {n_cores})"
        )

    with mlir_mod_ctx() as ctx:

        @device(AIEDevice.ipu)
        def device_body():
            memRef_ty = T.memref(n, T.bf16())

            # Type used in the tile memory
            memRef_A_ty = T.memref(n, T.bf16())
            memRef_C_ty = T.memref(n, T.bf16())

            # Type used in the memory tile which aggregates across the cores
            memRef_A_MT_ty = T.memref(n * n_cores, T.bf16())
            memRef_C_MT_ty = T.memref(n * n_cores, T.bf16())

            # AIE Core Function declarations
            exp_bf16_vector = external_func(
                "exp_bf16_vector", inputs=[memRef_ty, memRef_ty]
            )

            # Tile declarations: shim at row 0, memory tile at row 1 and the
            # compute tiles stacked above them in the same column.
            ShimTile = tile(0, 0)

            MemTile = tile(0, 1)
            cores = [tile(0, 2 + i) for i in range(n_cores)]

            inA_fifo_names = [f"memA{i}" for i in range(n_cores)]
            outC_fifo_names = [f"memC{i}" for i in range(n_cores)]

            inA_fifos = {}
            outC_fifos = {}

            # AIE-array data movement with object fifos
            # Input A: shim -> memory tile, then one fifo per compute tile.
            inA = object_fifo("inA", ShimTile, MemTile, buffer_depth, memRef_A_MT_ty)
            for i in range(n_cores):
                inA_fifos[inA_fifo_names[i]] = object_fifo(
                    inA_fifo_names[i], MemTile, cores[i], buffer_depth, memRef_A_ty
                )
            object_fifo_link(inA, inA_fifo_names)

            # Output C: one fifo per compute tile -> memory tile, then to shim.
            for i in range(n_cores):
                outC_fifos[outC_fifo_names[i]] = object_fifo(
                    outC_fifo_names[i], cores[i], MemTile, buffer_depth, memRef_C_ty
                )
            outC = object_fifo("outC", MemTile, ShimTile, buffer_depth, memRef_C_MT_ty)
            object_fifo_link(outC_fifo_names[0:n_cores], outC)

            # Set up compute tiles
            for i in range(n_cores):
                # Compute tile i
                @core(cores[i], "kernels.a")
                def core_body():
                    # Outer loop with a very large trip count, presumably so
                    # the core keeps serving repeated host invocations.
                    for _ in for_(0xFFFFFFFF):
                        for _ in for_(tiles):
                            elem_out = outC_fifos[outC_fifo_names[i]].acquire(
                                ObjectFifoPort.Produce, 1
                            )
                            elem_in_a = inA_fifos[inA_fifo_names[i]].acquire(
                                ObjectFifoPort.Consume, 1
                            )

                            call(exp_bf16_vector, [elem_in_a, elem_out])

                            inA_fifos[inA_fifo_names[i]].release(
                                ObjectFifoPort.Consume, 1
                            )
                            outC_fifos[outC_fifo_names[i]].release(
                                ObjectFifoPort.Produce, 1
                            )
                            yield_([])
                        yield_([])

            # To/from AIE-array data movement
            # NOTE(review): the host buffers are typed as N i32 elements while
            # only N_in_bytes // 4 words are transferred below - confirm the
            # declared size is intentional.
            tensor_ty = T.memref(N, T.i32())

            @FuncOp.from_py_func(tensor_ty, tensor_ty)
            def sequence(A, C):
                ipu_dma_memcpy_nd(
                    metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s]
                )
                ipu_dma_memcpy_nd(
                    metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
                )
                ipu_sync(column=0, row=0, direction=0, channel=0)

    print(ctx.module)


my_eltwise_add()
34 changes: 34 additions & 0 deletions reference_designs/ipu-xrt/vector_softmax/bf16_softmax.mlir.orig
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Softmax over one bf16 tile.
//   %arg0: input tile; it is also used as scratch and is overwritten with the
//          un-normalised exponentials.
//   %arg1: output tile receiving the normalised result.
module {
func.func @dut(%arg0: memref<1024xbf16>, %arg1: memref<1024xbf16>) {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.000000e+00 : f32
%cst_1 = arith.constant 0.000000e+00 : bf16
// 0xFF80 is the bf16 bit pattern of negative infinity: the identity element
// for the running maximum below.
%cst_2 = arith.constant dense<0xFF80> : vector<32xbf16>
// Pass 1: lane-wise running maximum over the input, 32 elements at a time.
%0 = affine.for %arg2 = 0 to 1024 step 32 iter_args(%arg3 = %cst_2) -> (vector<32xbf16>) {
%5 = vector.transfer_read %arg0[%arg2], %cst_1 : memref<1024xbf16>, vector<32xbf16>
%6 = arith.maximumf %arg3, %5 : vector<32xbf16>
affine.yield %6 : vector<32xbf16>
}
// Collapse the 32 lanes into the scalar maximum of the whole tile.
%1 = vector.reduction <maximumf>, %0 : vector<32xbf16> into bf16
// Pass 2: exp(x - max), stored back into %arg0 (the input is overwritten).
affine.for %arg2 = 0 to 1024 {
%5 = affine.load %arg0[%arg2] : memref<1024xbf16>
%6 = arith.subf %5, %1 : bf16
%7 = math.exp %6 : bf16
affine.store %7, %arg0[%arg2] : memref<1024xbf16>
}
// Pass 3: sum the exponentials, accumulating in f32.
%2 = affine.for %arg2 = 0 to 1024 iter_args(%arg3 = %cst) -> (f32) {
%5 = affine.load %arg0[%arg2] : memref<1024xbf16>
%6 = arith.extf %5 : bf16 to f32
%7 = arith.addf %arg3, %6 : f32
affine.yield %7 : f32
}
// Reciprocal of the sum, computed in f32 and then truncated to bf16.
%3 = arith.divf %cst_0, %2 : f32
%4 = arith.truncf %3 : f32 to bf16
// Pass 4: scale every exponential by the reciprocal and write it to %arg1.
affine.for %arg2 = 0 to 1024 {
%5 = affine.load %arg0[%arg2] : memref<1024xbf16>
%6 = arith.mulf %5, %4 : bf16
affine.store %6, %arg1[%arg2] : memref<1024xbf16>
}
return
}
}
30 changes: 30 additions & 0 deletions reference_designs/ipu-xrt/vector_softmax/exp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,33 @@ extern "C" {
void exp_bf16_vector(bfloat16 *a_in, bfloat16 *c_out) { dut(a_in, c_out); }

} // extern "C"
//===- exp.cc ---------------------------------------------------*- C++ -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// Copyright (C) 2023, Advanced Micro Devices, Inc.
//
//===----------------------------------------------------------------------===//

#define __AIENGINE__ 2
#define NOCPP
#define __AIEARCH__ 20

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <type_traits>

#include <aie_api/aie.hpp>

extern void dut(bfloat16 *a_in, bfloat16 *cout);

extern "C" {

// C-linkage entry point: forwards both buffers, unchanged, to dut().
// NOTE(review): exp_bf16_vector is already defined earlier in this same file,
// immediately before this appended copy of the file contents; a second
// definition in one translation unit is a redefinition error - confirm the
// duplicated tail is intended.
void exp_bf16_vector(bfloat16 *a_in, bfloat16 *c_out) { dut(a_in, c_out); }
Comment on lines +54 to +56
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[clang-format] reported by reviewdog 🐶

Suggested change
void exp_bf16_vector(bfloat16 *a_in, bfloat16 *c_out) {
dut(a_in, c_out);
}
void exp_bf16_vector(bfloat16 *a_in, bfloat16 *c_out) { dut(a_in, c_out); }


} // extern "C"
20 changes: 20 additions & 0 deletions reference_designs/ipu-xrt/vector_softmax/sweep.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import os;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[black] reported by reviewdog 🐶

Suggested change
import os;
import os


for action in ["rm -f","touch"]:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[black] reported by reviewdog 🐶

Suggested change
for action in ["rm -f","touch"]:
for action in ["rm -f", "touch"]:

cmd = f"{action} results.csv"
os.system(cmd)


for s in [16384,32768,65536,131072,262144]:
for i in [64,128,256,512,1024]:
Comment on lines +8 to +9
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[black] reported by reviewdog 🐶

Suggested change
for s in [16384,32768,65536,131072,262144]:
for i in [64,128,256,512,1024]:
for s in [16384, 32768, 65536, 131072, 262144]:
for i in [64, 128, 256, 512, 1024]:

# Regenerate every design source from its ".orig" template for the current
# tile size `i` and problem size `s`, then rebuild and profile the design.
for f in ["bf16_softmax.mlir", "test.cpp", "aie2.py"]:
    # Pass 1 substitutes the template tile size, pass 2 the template problem
    # size. "/" is used as the sed delimiter: a backslash delimiter is not
    # portable, and "\g" is an invalid escape sequence in a Python string.
    sed = f"sed 's/1024/{i}/g' {f}.orig > {f}.first"
    os.system(sed)
    sed = f"sed 's/65536/{s}/g' {f}.first > {f}"
    os.system(sed)
# Build from scratch so that no artefact of the previous configuration is
# reused, then run the profiling target.
make_clean = "make clean > /dev/null"
os.system(make_clean)
make_all = "make all"
os.system(make_all)
make_profile = "make profile"
os.system(make_profile)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[black] reported by reviewdog 🐶

Suggested change
os.system(make_profile)
os.system(make_profile)

41 changes: 32 additions & 9 deletions reference_designs/ipu-xrt/vector_softmax/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,30 @@
#include <ctime>
#include <fstream>
#include <iostream>
<<<<<<< HEAD
=======
#include <math.h>
>>>>>>> asplos
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[clang-format] reported by reviewdog 🐶

Suggested change
>>>>>>> asplos
>>>>>>> asplos

#include <sstream>
#include <stdfloat>
#include <string>
#include <vector>
<<<<<<< HEAD
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[clang-format] reported by reviewdog 🐶

Suggested change
<<<<<<< HEAD
<<<<<<< HEAD

#include <math.h>
=======
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[clang-format] reported by reviewdog 🐶

Suggested change
=======
=======

>>>>>>> asplos

#include "xrt/xrt_bo.h"
#include "xrt/xrt_device.h"
#include "xrt/xrt_kernel.h"

constexpr bool VERIFY = true;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[clang-format] reported by reviewdog 🐶

Suggested change
constexpr bool VERIFY = true;
constexpr bool VERIFY = true;


// Problem dimensions, in bf16 elements. The unresolved merge-conflict markers
// that surrounded IN_SIZE are resolved in favour of 262144, the value that
// matches N in aie2.py.
constexpr int IN_SIZE = 262144;
constexpr int TILE_SIZE = 1024;
constexpr int OUT_SIZE = IN_SIZE;

// The CPU reference loop walks IN_SIZE in steps of TILE_SIZE and indexes
// t + i for every i < TILE_SIZE, so a partial trailing tile would read and
// write out of bounds.
static_assert(IN_SIZE % TILE_SIZE == 0,
              "IN_SIZE must be a whole number of tiles");

Expand Down Expand Up @@ -88,6 +99,10 @@ int main(int argc, const char *argv[]) {
"the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
"verbosity,v", po::value<int>()->default_value(0),
"the verbosity of the output")(
<<<<<<< HEAD
"profile,p", po::value<std::string>()->default_value(""),"CSV profile")(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[clang-format] reported by reviewdog 🐶

Suggested change
"profile,p", po::value<std::string>()->default_value(""),"CSV profile")(
"profile,p", po::value<std::string>()->default_value(""), "CSV profile")(

=======
>>>>>>> asplos
"instr,i", po::value<std::string>()->required(),
"path of file containing userspace instructions to be sent to the LX6");
po::variables_map vm;
Expand Down Expand Up @@ -180,7 +195,7 @@ int main(int argc, const char *argv[]) {

int sticky_errors = 0;

unsigned num_iter = 256;
unsigned num_iter = 64;
float npu_time_total = 0;
float npu_time_min = 9999999;
float npu_time_max = 0;
Expand Down Expand Up @@ -215,21 +230,22 @@ int main(int argc, const char *argv[]) {
std::vector<std::bfloat16_t> RefVec(IN_SIZE);
auto cpu_start = std::chrono::high_resolution_clock::now();

for (uint32_t t = 0; t < IN_SIZE; t += TILE_SIZE) {
for (uint32_t t = 0; t < IN_SIZE; t+=TILE_SIZE) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[clang-format] reported by reviewdog 🐶

Suggested change
for (uint32_t t = 0; t < IN_SIZE; t+=TILE_SIZE) {
for (uint32_t t = 0; t < IN_SIZE; t += TILE_SIZE) {

float running = 0.0;
for (uint32_t i = 0; i < TILE_SIZE; i++) {
float ez = (float)(exp(AVec[t + i]));
float ez = (float)(exp(AVec[t+i]));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[clang-format] reported by reviewdog 🐶

Suggested change
float ez = (float)(exp(AVec[t+i]));
float ez = (float)(exp(AVec[t + i]));

running += ez;
RefVec[t + i] = exp(AVec[t + i]);
RefVec[t+i] = exp(AVec[t+i]);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[clang-format] reported by reviewdog 🐶

Suggested change
RefVec[t+i] = exp(AVec[t+i]);
RefVec[t + i] = exp(AVec[t + i]);

}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[clang-format] reported by reviewdog 🐶

Suggested change

for (uint32_t i = 0; i < TILE_SIZE; i++) {
RefVec[t + i] /= running;
RefVec[t+i] /= running;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[clang-format] reported by reviewdog 🐶

Suggested change
RefVec[t+i] /= running;
RefVec[t + i] /= running;

}
}
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[clang-format] reported by reviewdog 🐶

Suggested change
}
}

auto cpu_stop = std::chrono::high_resolution_clock::now();
float cpu_time = std::chrono::duration_cast<std::chrono::microseconds>(
cpu_stop - cpu_start)
.count();
float cpu_time =
std::chrono::duration_cast<std::chrono::microseconds>(cpu_stop - cpu_start)
.count();
Comment on lines +246 to +248
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[clang-format] reported by reviewdog 🐶

Suggested change
float cpu_time =
std::chrono::duration_cast<std::chrono::microseconds>(cpu_stop - cpu_start)
.count();
float cpu_time = std::chrono::duration_cast<std::chrono::microseconds>(
cpu_stop - cpu_start)
.count();


cpu_time_total += cpu_time;
cpu_time_min = (cpu_time < cpu_time_min) ? cpu_time : cpu_time_min;
Expand Down Expand Up @@ -264,6 +280,13 @@ int main(int argc, const char *argv[]) {
npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;

std::string profile = vm["profile"].as<std::string>();
if (profile.length()) {
std::ofstream of;
of.open(profile, std::ios::app); // Append
of << IN_SIZE << "," << TILE_SIZE << "," << npu_time << std::endl;
}

if (VERIFY) {
if (!errors) {
std::cout << iter << ": pass! in " << npu_time << "us" << std::endl;
Expand Down
Loading
Loading