diff --git a/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp b/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp index 6e77665372..3716344a49 100644 --- a/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp +++ b/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp @@ -87,34 +87,78 @@ extractMACOperandsFromAddOperands(Value addLhs, Value addRhs) { return {}; } -// Create MulElemOp for i8 and bf16 types in aie-ml. The corresponding intrinsic -// is mul_elem_16_2, which indicates that we need to concatenate zero vectors -// for both mul operands before creating MulElemOp. -static aievec::MulElemOp createMulElemAieML(ConversionPatternRewriter &rewriter, - Value lval, Value rval, - VectorType srcType, - unsigned bitWidth, Location loc) { - Type accType = getVectorOpDestType(srcType, /*AIEML =*/true); - VectorType vecType = - createVectorType(512 / bitWidth, srcType.getElementType()); - - arith::ConstantOp zeroConstOp = nullptr; - zeroConstOp = rewriter.create( - loc, srcType.getElementType(), - rewriter.getZeroAttr(srcType.getElementType())); - auto broadcastZeroOp = rewriter.create( - loc, vecType, zeroConstOp->getResult(0)); - auto extOp = rewriter.create(loc, srcType, - broadcastZeroOp.getResult(), 0); - - SmallVector lSources = {lval, extOp->getResult(0)}; - SmallVector rSources = {rval, extOp->getResult(0)}; - auto lConcatOp = rewriter.create(loc, vecType, lSources); - auto rConcatOp = rewriter.create(loc, vecType, rSources); - - auto mulElemOp = rewriter.create( - loc, accType, lConcatOp->getResult(0), rConcatOp->getResult(0)); - return mulElemOp; +// Convert a input value to a target vector type. This function can insert +// multiple aievec ops depending on the combination of input and output vector +// types. +static std::optional +convertValueToTargetTypeAieML(ConversionPatternRewriter &rewriter, Location loc, + Value inputVal, VectorType tgtType) { + VectorType srcType = cast(inputVal.getType()); + auto srcElemType = srcType.getElementType(); + unsigned srcBitWidth = srcElemType.getIntOrFloatBitWidth(); + unsigned srcLaneSize = getVectorLaneSize(srcType); + + auto tgtElemType = tgtType.getElementType(); + unsigned tgtBitWidth = tgtElemType.getIntOrFloatBitWidth(); + unsigned tgtLaneSize = getVectorLaneSize(tgtType); + + if (srcType == tgtType) + return inputVal; + + if ((srcElemType == tgtElemType) && (srcLaneSize != tgtLaneSize)) { + // TODO: relax the condition below? + if ((srcLaneSize == 16 && tgtLaneSize == 32 && + isa(srcElemType)) || + (srcLaneSize == 32 && tgtLaneSize == 64 && + isa(srcElemType))) { + auto zeroConstOp = rewriter.create( + loc, srcType.getElementType(), + rewriter.getZeroAttr(srcType.getElementType())); + auto broadcastZeroOp = rewriter.create( + loc, tgtType, zeroConstOp->getResult(0)); + auto extOp = rewriter.create( + loc, srcType, broadcastZeroOp->getResult(0), 0); + + SmallVector inputSources = {inputVal, extOp->getResult(0)}; + aievec::ConcatOp concatOp = + rewriter.create(loc, tgtType, inputSources); + + return concatOp.getResult(); + } + } else if ((srcElemType != tgtElemType) && (srcLaneSize == tgtLaneSize) && + isa(srcElemType) && isa(tgtElemType)) { + if (srcBitWidth == 16 && tgtBitWidth == 32 && srcLaneSize == 16) { + // Case 1: vector<16xi16> to vector<16xi32> conversion by aievec.ups + + // aievec.cast + auto accType = getVectorOpDestType(srcType, /*AIEML =*/true); + auto upsOp = rewriter.create(loc, accType, inputVal); + auto castOp = rewriter.create( + loc, tgtType, upsOp.getResult(), /*isResAcc*/ false); + return castOp.getResult(); + } else if (srcBitWidth == 8 && tgtBitWidth == 32 && srcLaneSize == 16) { + // Case 2: vector<16xi8> to vector<16xi32> conversion by aievec.concat + + // aievec.ups + aievec.cast + aievec.ext + // FIXME: Should use undef_xxx() for the second input of concat + auto concatOutType = createVectorType(32, srcElemType); + auto concatOp = rewriter.create( + loc, concatOutType, SmallVector({inputVal, inputVal})); + auto accType = getVectorOpDestType(concatOutType, /*AIEML =*/true); + auto upsOp = + rewriter.create(loc, accType, concatOp.getResult()); + auto castType = createVectorType(32, tgtElemType); + auto castOp = rewriter.create( + loc, castType, upsOp.getResult(), /*isResAcc*/ false); + auto extOp = + rewriter.create(loc, tgtType, castOp.getResult(), 0); + return extOp.getResult(); + } else if (srcBitWidth == 8 && tgtBitWidth == 16 && srcLaneSize == 32) { + // Case 3: vector<32xi8> to vector<32xi16> conversion by aievec.unpack + auto unpackOp = rewriter.create(loc, tgtType, inputVal); + return unpackOp.getResult(); + } + } + + return std::nullopt; } // Return the list of attributes that configure an `aievec.select` op to @@ -546,8 +590,8 @@ struct ConvertMulFToAIEVecMulElemOpPattern if (!resultType) return failure(); + // FIXME: Verify it is not a part of FMA auto isAddOp = [&](Operation *op) { return isa(op); }; - // Verify it is not a part of FMA if (mulOp->hasOneUse() && llvm::any_of(mulOp->getUsers(), isAddOp)) return failure(); @@ -560,42 +604,68 @@ struct ConvertMulFToAIEVecMulElemOpPattern if (laneSize != 16 || (resultElWidth != 16 && resultElWidth != 32)) return failure(); - aievec::MulElemOp mulElemOp = nullptr; - - if (resultElWidth == 16) { - mulElemOp = - createMulElemAieML(rewriter, adaptor.getLhs(), adaptor.getRhs(), - resultType, resultElWidth, mulOp.getLoc()); - rewriter.replaceOpWithNewOp( - mulOp, resultType, mulElemOp.getResult(), shiftParam); + // Decide the accType for aievec.mul_elem based on mulOp's lhs & rhs + auto lval = adaptor.getLhs(); + auto rval = adaptor.getRhs(); + if (auto lvalExtOp = lval.getDefiningOp()) { + lval = lvalExtOp->getOperand(0); + } + if (auto rvalExtOp = rval.getDefiningOp()) { + rval = rvalExtOp->getOperand(0); + } + VectorType lSrcType = cast(lval.getType()); + VectorType rSrcType = cast(rval.getType()); + unsigned lBitWidth = lSrcType.getElementType().getIntOrFloatBitWidth(); + unsigned rBitWidth = rSrcType.getElementType().getIntOrFloatBitWidth(); + Type accType = getVectorOpDestType(lSrcType, /*AIEML =*/true); + if (rBitWidth > lBitWidth) { + accType = getVectorOpDestType(rSrcType, /*AIEML =*/true); + } + // Only support the same lhs/rhs type at the moment + if (lSrcType != rSrcType) { + return failure(); + } + // Only support two bfloat16 inputs at the moment + if (lBitWidth != 16 || rBitWidth != 16) { + return failure(); } - // float type - else { - auto lhs = dyn_cast(adaptor.getLhs().getDefiningOp()); - auto rhs = dyn_cast(adaptor.getRhs().getDefiningOp()); - - if (!lhs || !rhs) - return failure(); - - auto lval = lhs->getOperand(0); - auto rval = rhs->getOperand(0); - VectorType lSrcType = cast(lval.getType()); - VectorType rSrcType = cast(rval.getType()); + // Prepare lhr/rhs for the aievec.mul_elem op + VectorType targetInputType = + createVectorType(512 / lBitWidth, lSrcType.getElementType()); + if (rBitWidth > lBitWidth) { + targetInputType = + createVectorType(512 / rBitWidth, rSrcType.getElementType()); + } + auto lValConverted = convertValueToTargetTypeAieML(rewriter, mulOp.getLoc(), + lval, targetInputType); + auto rValConverted = convertValueToTargetTypeAieML(rewriter, mulOp.getLoc(), + rval, targetInputType); + if (!lValConverted || !rValConverted) + return failure(); - unsigned lBitWidth = lSrcType.getElementType().getIntOrFloatBitWidth(); - unsigned rBitWidth = rSrcType.getElementType().getIntOrFloatBitWidth(); + // Create an aievec.mul_elem op + aievec::MulElemOp mulElemOp = rewriter.create( + mulOp.getLoc(), accType, *lValConverted, *rValConverted); - if (lBitWidth != 16 || rBitWidth != 16) - return failure(); + // Create an aievec.cast or an aievec.srs op + auto mulElemResultType = mulElemOp.getType(); + auto mulElemResultElWidth = + mulElemResultType.getElementType().getIntOrFloatBitWidth(); - mulElemOp = createMulElemAieML(rewriter, lval, rval, lSrcType, lBitWidth, - mulOp.getLoc()); + if (mulElemResultElWidth == resultElWidth) { rewriter.replaceOpWithNewOp( mulOp, resultType, mulElemOp.getResult(), /*isResAcc*/ false); + } else if (mulElemResultElWidth > resultElWidth) { + rewriter.replaceOpWithNewOp( + mulOp, resultType, mulElemOp.getResult(), shiftParam); + } else { + return failure(); } + return success(); } + unsigned shiftParam; }; @@ -617,8 +687,8 @@ struct ConvertMulIToAIEVecMulElemOpPattern if (!resultType) return failure(); + // FIXME: Verify it is not a part of MAC auto isAddOp = [&](Operation *op) { return isa(op); }; - // Verify it is not a part of MAC if (mulOp->hasOneUse() && llvm::any_of(mulOp->getUsers(), isAddOp)) return failure(); @@ -631,79 +701,57 @@ struct ConvertMulIToAIEVecMulElemOpPattern ((laneSize != 16 && laneSize != 32) || resultElWidth != 32)) return failure(); - // Deal with the case with sext op for i8 and i16: - // Case 1: - // Transfer - - // %1 = arith.extsi %a : vector<32xi8> to vector<32xi32> - // %2 = arith.extsi %b : vector<32xi8> to vector<32xi32> - // %3 = arith.muli %1, %2 : vector<32xi32> - // to - - // aievec.mul_elem(%a, %b) : vector<64xi8>, vector<64xi8>, vector<32xi32> - // - // Case 2: - // Transfer - - // %1 = arith.extsi %a : vector<32xi16> to vector<32xi32> - // %2 = arith.extsi %b : vector<32xi16> to vector<32xi32> - // %3 = arith.muli %1, %2 : vector<32xi32> - // to - - // aievec.mul_elem(%a, %b) : vector<32xi16>, vector<32xi16>, vector<32xi32> - if (laneSize == 32 && (resultElWidth == 32 || resultElWidth == 8)) { - if (resultElWidth == 32) { - auto lhs = dyn_cast(adaptor.getLhs().getDefiningOp()); - auto rhs = dyn_cast(adaptor.getRhs().getDefiningOp()); - - if (!lhs || !rhs) - return failure(); - - auto lval = lhs->getOperand(0); - auto rval = rhs->getOperand(0); - - VectorType lSrcType = cast(lval.getType()); - VectorType rSrcType = cast(rval.getType()); + // Decide the accType for aievec.mul_elem based on mulOp's lhs & rhs + auto lval = adaptor.getLhs(); + auto rval = adaptor.getRhs(); + if (auto lvalExtOp = lval.getDefiningOp()) { + lval = lvalExtOp->getOperand(0); + } + if (auto rvalExtOp = rval.getDefiningOp()) { + rval = rvalExtOp->getOperand(0); + } + VectorType lSrcType = cast(lval.getType()); + VectorType rSrcType = cast(rval.getType()); + unsigned lBitWidth = lSrcType.getElementType().getIntOrFloatBitWidth(); + unsigned rBitWidth = rSrcType.getElementType().getIntOrFloatBitWidth(); + Type accType = getVectorOpDestType(lSrcType, /*AIEML =*/true); + if (rBitWidth > lBitWidth) { + accType = getVectorOpDestType(rSrcType, /*AIEML =*/true); + } - unsigned lBitWidth = lSrcType.getElementType().getIntOrFloatBitWidth(); - unsigned rBitWidth = rSrcType.getElementType().getIntOrFloatBitWidth(); + // Prepare lhr/rhs for the aievec.mul_elem op + VectorType targetInputType = + createVectorType(512 / lBitWidth, lSrcType.getElementType()); + if (rBitWidth > lBitWidth) { + targetInputType = + createVectorType(512 / rBitWidth, rSrcType.getElementType()); + } + auto lValConverted = convertValueToTargetTypeAieML(rewriter, mulOp.getLoc(), + lval, targetInputType); + auto rValConverted = convertValueToTargetTypeAieML(rewriter, mulOp.getLoc(), + rval, targetInputType); + if (!lValConverted || !rValConverted) + return failure(); - if ((lBitWidth != 8 || rBitWidth != 8) && - (lBitWidth != 16 || rBitWidth != 16)) - return failure(); + // Create an aievec.mul_elem op + aievec::MulElemOp mulElemOp = rewriter.create( + mulOp.getLoc(), accType, *lValConverted, *rValConverted); - aievec::MulElemOp mulElemOp = nullptr; - if (lBitWidth == 8) { - mulElemOp = createMulElemAieML(rewriter, lval, rval, lSrcType, - lBitWidth, mulOp.getLoc()); - } else { - Type accType = getVectorOpDestType(lSrcType, /*AIEML =*/true); - mulElemOp = rewriter.create(mulOp.getLoc(), - accType, lval, rval); - } - rewriter.replaceOpWithNewOp( - mulOp, resultType, mulElemOp.getResult(), /*isResAcc*/ false); - // Case 3: - // Transfer - - // %1 = arith muli %a, %b : vector<32xi8> - // to - - // aievec.mul_elem(%a, %b) : vector<64xi8>, vector<64xi8>, - // vector<32xi32> - } else { - auto lval = adaptor.getLhs(); - auto rval = adaptor.getRhs(); - VectorType srcType = cast(lval.getType()); - unsigned bitWidth = srcType.getElementType().getIntOrFloatBitWidth(); - auto mulElemOp = createMulElemAieML(rewriter, lval, rval, srcType, - bitWidth, mulOp.getLoc()); - rewriter.replaceOpWithNewOp( - mulOp, srcType, mulElemOp.getResult(), shiftParam); - } - } else { - Type accType = getVectorOpDestType(cast(mulOp.getType()), - /*AIEML =*/true); + // Create an aievec.cast or an aievec.srs op + auto mulElemResultType = mulElemOp.getType(); + auto mulElemResultElWidth = + mulElemResultType.getElementType().getIntOrFloatBitWidth(); - auto mulElemOp = rewriter.create( - mulOp.getLoc(), accType, adaptor.getLhs(), adaptor.getRhs()); + if (mulElemResultElWidth == resultElWidth) { + rewriter.replaceOpWithNewOp( + mulOp, resultType, mulElemOp.getResult(), /*isResAcc*/ false); + } else if (mulElemResultElWidth > resultElWidth) { rewriter.replaceOpWithNewOp( mulOp, resultType, mulElemOp.getResult(), shiftParam); + } else { + return failure(); } + return success(); } diff --git a/lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp b/lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp index 2da75ca84c..a4524b9fa4 100644 --- a/lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp +++ b/lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp @@ -679,6 +679,27 @@ static LogicalResult printOperation(CppEmitter &emitter, return success(); } +// Generate the unpack intrinsic for AIE-ML +static LogicalResult printOperation(CppEmitter &emitter, + aievec::UnpackOp unpackOp) { + + // The source should have already been emitted + Value source = unpackOp.getSource(); + if (!emitter.hasValueInScope(source)) + return failure(); + + // Generate the initialization for the vector + if (failed(emitter.emitAssignPrefix(*unpackOp, /*isAcc=*/false))) + return failure(); + + raw_indented_ostream &os = emitter.ostream(); + + os << "unpack("; + os << emitter.getOrCreateName(source); + os << ")"; + return success(); +} + // Generate the srs intrinsic static LogicalResult printOperation(CppEmitter &emitter, aievec::SRSOp srsOp) { Value source = srsOp.getSource(); @@ -1499,7 +1520,16 @@ static LogicalResult printOperation(CppEmitter &emitter, raw_indented_ostream &os = emitter.ostream(); // Generate the initialization for the result - if (failed(emitter.emitAssignPrefix(*add_elemOp, true))) + // FIXME: move the logic to the op creation and add isAcc to the op attribute + bool isAcc = false; + VectorType resType = cast(add_elemOp.getResult().getType()); + auto resElemType = resType.getElementType(); + unsigned resBitWidth = resElemType.getIntOrFloatBitWidth(); + unsigned resLaneSize = getVectorLaneSize(resType); + if (isa(resElemType) || (resBitWidth * resLaneSize == 1024)) + isAcc = true; + + if (failed(emitter.emitAssignPrefix(*add_elemOp, /*isAcc=*/isAcc))) return failure(); os << "add("; @@ -1527,7 +1557,16 @@ static LogicalResult printOperation(CppEmitter &emitter, raw_indented_ostream &os = emitter.ostream(); // Generate the initialization for the result - if (failed(emitter.emitAssignPrefix(*sub_elemOp, true))) + // FIXME: move the logic to the op creation and add isAcc to the op attribute + bool isAcc = false; + VectorType resType = cast(sub_elemOp.getResult().getType()); + auto resElemType = resType.getElementType(); + unsigned resBitWidth = resElemType.getIntOrFloatBitWidth(); + unsigned resLaneSize = getVectorLaneSize(resType); + if (isa(resElemType) || (resBitWidth * resLaneSize == 1024)) + isAcc = true; + + if (failed(emitter.emitAssignPrefix(*sub_elemOp, /*isAcc=*/isAcc))) return failure(); os << "sub("; @@ -2914,7 +2953,7 @@ LogicalResult CppEmitter::emitOperation(Operation &op, bool trailingSemicolon) { aievec::BroadcastScalarOp, aievec::MulConvOp, aievec::FMAConvOp, aievec::ShiftOp, aievec::ShuffleOp, aievec::CastOp, aievec::MinOp, aievec::MaxOp, aievec::CmpOp, aievec::SelOp, - aievec::ExtElemOp>( + aievec::ExtElemOp, aievec::UnpackOp>( [&](auto op) { return printOperation(*this, op); }) .Default([&](Operation *) { return op.emitOpError("unable to find printer for op"); @@ -3001,34 +3040,22 @@ LogicalResult CppEmitter::emitType(Location loc, Type type, bool stdintType, return failure(); unsigned dimSize = tType.getDimSize(tType.getRank() - 1); - - if (eltType.isa()) { - os << "v" << std::to_string(dimSize); - auto iType = eltType.cast(); - unsigned width = iType.getWidth(); - if ((dimSize == 16 && width == 64) || (dimSize == 32 && width == 32)) { - if (isAcc) { + os << "v" << std::to_string(dimSize); + + if (AIEML && isAcc) { + if (eltType.isa()) { + // AIE-ML has `ups_to_v16acc32`, `ups_to_v16acc64`, `ups_to_v32acc32` + // intrinsics + unsigned width = eltType.cast().getWidth(); + if ((dimSize == 16 && width == 64) || (dimSize == 32 && width == 32) || + (dimSize == 16 && width == 32)) { return (os << "acc" << width), success(); } else { - return (os << "int" << width), success(); - } - } - } else if (eltType.isa()) { - if (AIEML) { - if (isAcc) { - return (os << "v16accfloat"), success(); - } else { - auto fType = eltType.cast(); - unsigned width = fType.getWidth(); - if (width == 16) { - return (os << "v" << std::to_string(dimSize) << "bfloat16"), - success(); - } else { - return (os << "v" << std::to_string(dimSize) << "float"), success(); - } + return failure(); } - } else { - os << "v" << std::to_string(dimSize); + } else if (eltType.isa()) { + // AIE-ML only has a `ups_to_v16accfloat` intrinsic + return (os << "accfloat"), success(); } } diff --git a/test/Conversion/VectorToAIEVec/test_mul_elem.mlir b/test/Conversion/VectorToAIEVec/test_mul_elem.mlir index 115e8a3b24..5cefc46251 100644 --- a/test/Conversion/VectorToAIEVec/test_mul_elem.mlir +++ b/test/Conversion/VectorToAIEVec/test_mul_elem.mlir @@ -97,3 +97,44 @@ func.func @test_mul_elem_bf16_float(%a : vector<16xbf16>, %3 = arith.mulf %1, %2 : vector<16xf32> return %3 : vector<16xf32> } + +// CHECK-LABEL: func @test_i8_i16_mul_elem +// CHECK-SAME: %[[A:[A-Za-z0-9]+]]: vector<32xi8> +// CHECK-SAME: %[[B:[A-Za-z0-9]+]]: vector<32xi16> +func.func @test_i8_i16_mul_elem(%a : vector<32xi8>, %b : vector<32xi16>) -> vector<32xi32> { + // CHECK: %[[UNPACK:.*]] = aievec.unpack %arg0 : vector<32xi8>, vector<32xi16> + // CHECK: %[[ME:.*]] = aievec.mul_elem %arg1, %[[UNPACK:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> + // CHECK: %[[CAST:.*]] = aievec.cast %[[ME]] {isResAcc = false} : vector<32xi32>, vector<32xi32> + %1 = arith.extsi %b : vector<32xi16> to vector<32xi32> + %2 = arith.extsi %a : vector<32xi8> to vector<32xi32> + %3 = arith.muli %1, %2 : vector<32xi32> + return %3 : vector<32xi32> +} + +// CHECK-LABEL: func @test_i8_i32_mul_elem +// CHECK-SAME: %[[A:[A-Za-z0-9]+]]: vector<16xi8> +// CHECK-SAME: %[[B:[A-Za-z0-9]+]]: vector<16xi32> +func.func @test_i8_i32_mul_elem(%a : vector<16xi8>, %b : vector<16xi32>) -> vector<16xi32> { + // CHECK: %[[CC:.*]] = aievec.concat %arg0, %arg0 : vector<16xi8>, vector<32xi8> + // CHECK: %[[UPS:.*]] = aievec.ups %[[CC]] {shift = 0 : i8} : vector<32xi8>, vector<32xi32> + // CHECK: %[[CAST:.*]] = aievec.cast %[[UPS]] {isResAcc = false} : vector<32xi32>, vector<32xi32> + // CHECK: %[[EXT:.*]] = aievec.ext %[[CAST]] {index = 0 : i8} : vector<32xi32>, vector<16xi32> + // CHECK: %[[ME:.*]] = aievec.mul_elem %[[EXT]], %arg1 : vector<16xi32>, vector<16xi32>, vector<16xi64> + // CHECK: %[[SRS:.*]] = aievec.srs %[[ME]] {shift = 0 : i8} : vector<16xi64>, vector<16xi32> + %1 = arith.extsi %a : vector<16xi8> to vector<16xi32> + %2 = arith.muli %1, %b : vector<16xi32> + return %2 : vector<16xi32> +} + +// CHECK-LABEL: func @test_i16_i32_mul_elem +// CHECK-SAME: %[[A:[A-Za-z0-9]+]]: vector<16xi16> +// CHECK-SAME: %[[B:[A-Za-z0-9]+]]: vector<16xi32> +func.func @test_i16_i32_mul_elem(%a : vector<16xi16>, %b : vector<16xi32>) -> vector<16xi32> { + // CHECK: %[[UPS:.*]] = aievec.ups %arg0 {shift = 0 : i8} : vector<16xi16>, vector<16xi32> + // CHECK: %[[CAST:.*]] = aievec.cast %[[UPS]] {isResAcc = false} : vector<16xi32>, vector<16xi32> + // CHECK: %[[ME:.*]] = aievec.mul_elem %[[CAST]], %arg1 : vector<16xi32>, vector<16xi32>, vector<16xi64> + // CHECK: %[[SRS:.*]] = aievec.srs %[[ME]] {shift = 0 : i8} : vector<16xi64>, vector<16xi32> + %1 = arith.extsi %a : vector<16xi16> to vector<16xi32> + %2 = arith.muli %1, %b : vector<16xi32> + return %2 : vector<16xi32> +} diff --git a/test/Integration/Dialect/TOSA/i16xi32_mul_elem/defines.h b/test/Integration/Dialect/TOSA/i16xi32_mul_elem/defines.h new file mode 100644 index 0000000000..b0366ff425 --- /dev/null +++ b/test/Integration/Dialect/TOSA/i16xi32_mul_elem/defines.h @@ -0,0 +1,4 @@ +#pragma once +constexpr unsigned const IN0_SIZE = 1024; +constexpr unsigned const IN1_SIZE = 1024; +constexpr unsigned const OUT0_SIZE = 1024; diff --git a/test/Integration/Dialect/TOSA/i16xi32_mul_elem/dut.cc b/test/Integration/Dialect/TOSA/i16xi32_mul_elem/dut.cc new file mode 100644 index 0000000000..6fb5f26d73 --- /dev/null +++ b/test/Integration/Dialect/TOSA/i16xi32_mul_elem/dut.cc @@ -0,0 +1,20 @@ +// clang-format off +void dut(int16_t * restrict v1, int32_t * restrict v2, int32_t * restrict v3) { + size_t v4 = 0; + size_t v5 = 1024; + size_t v6 = 16; + for (size_t v7 = v4; v7 < v5; v7 += v6) + chess_prepare_for_pipelining + chess_loop_range(64, 64) + { + v16int16 v8 = *(v16int16 *)(v1 + v7); + v16int32 v9 = *(v16int32 *)(v2 + v7); + v16acc32 v10 = ups_to_v16acc32(v8, 0); + v16int32 v11 = v16int32(v10); + v16acc64 v12 = mul_elem_16_2(v9, broadcast_zero_s32(), v11, undef_v16int32()); + v16int32 v13 = srs_to_v16int32(v12, 0); + *(v16int32 *)(v3 + v7) = v13; + } + return; +} +// clang-format on diff --git a/test/Integration/Dialect/TOSA/i16xi32_mul_elem/i16xi32_mul_elem.mlir b/test/Integration/Dialect/TOSA/i16xi32_mul_elem/i16xi32_mul_elem.mlir new file mode 100644 index 0000000000..fee6d160d8 --- /dev/null +++ b/test/Integration/Dialect/TOSA/i16xi32_mul_elem/i16xi32_mul_elem.mlir @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// Copyright (C) 2023, Advanced Micro Devices, Inc. + +// REQUIRES: valid_xchess_license +// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir +// RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=16" -o affine.mlir +// RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine --debug-only=lower-vector-to-aievec -o aievec.mlir +// RUN: aie-translate aievec.mlir -aieml=true --aievec-to-cpp --debug-only=aievec-to-cpp -o dut.cc +// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc dut.cc +// RUN: mkdir -p data +// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout +// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s +// CHECK: TEST PASSED + +module { + func.func @dut(%arg0: tensor<1024xi16>, %arg1: tensor<1024xi32>) -> (tensor<1024xi32>) { + %0 = "tosa.cast"(%arg0) : (tensor<1024xi16>) -> tensor<1024xi32> + %2 = "tosa.mul"(%0,%arg1) {shift = 0 : i32} : (tensor<1024xi32>, tensor<1024xi32>) -> (tensor<1024xi32>) + return %2 : tensor<1024xi32> + } +} + diff --git a/test/Integration/Dialect/TOSA/i16xi32_mul_elem/testbench.cc b/test/Integration/Dialect/TOSA/i16xi32_mul_elem/testbench.cc new file mode 100644 index 0000000000..28b260b46d --- /dev/null +++ b/test/Integration/Dialect/TOSA/i16xi32_mul_elem/testbench.cc @@ -0,0 +1,57 @@ +#include "../common/testbench.h" +#include "defines.h" +#include +#include +#include +#include +void dut(int16_t *restrict in0, int32_t *restrict in1, int32_t *restrict out0); +void dut_ref(int16_t *in0, int32_t *in1, int32_t *out0); + +alignas(32) int16_t g_in0[IN0_SIZE]; +alignas(32) int32_t g_in1[IN1_SIZE]; +alignas(32) int32_t g_out0[OUT0_SIZE]; +alignas(32) int32_t g_out0Ref[OUT0_SIZE]; + +int main(int argc, char *argv[]) { + // XXX Figure out how to use argv with xme_ca_udm_dbg -A + std::string dataDir(TO_STR(DATA_DIR)); + srand(10); + std::generate(g_in0, g_in0 + IN0_SIZE, + [&]() { return random_integer(); }); + std::generate(g_in1, g_in1 + IN1_SIZE, + [&]() { return random_integer(); }); + + writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt"); + writeData(g_in1, IN1_SIZE, dataDir + "/in1.txt"); + + chess_memory_fence(); + auto cyclesBegin = chess_cycle_count(); + dut(g_in0, g_in1, g_out0); + auto cyclesEnd = chess_cycle_count(); + chess_memory_fence(); + + auto cycleCount = (int)(cyclesEnd - cyclesBegin); + reportCycleCount(cycleCount, dataDir + "/cycle_count.txt"); + + writeData(g_out0, OUT0_SIZE, dataDir + "/out0.txt"); + + dut_ref(g_in0, g_in1, g_out0Ref); + writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt"); + + bool ok = true; + ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE); + + if (ok) + printf("TEST PASSED\n"); + else + printf("TEST FAILED\n"); + + return ok ? 0 : 1; +} + +// in0, in1, out0 are in C4 layout. +void dut_ref(int16_t *in0, int32_t *in1, int32_t *out0) { + for (unsigned k = 0; k < OUT0_SIZE; k += 1) { + out0[k] = in0[k] * in1[k]; + } +} diff --git a/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v16/defines.h b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v16/defines.h new file mode 100644 index 0000000000..b0366ff425 --- /dev/null +++ b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v16/defines.h @@ -0,0 +1,4 @@ +#pragma once +constexpr unsigned const IN0_SIZE = 1024; +constexpr unsigned const IN1_SIZE = 1024; +constexpr unsigned const OUT0_SIZE = 1024; diff --git a/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v16/i8xi16_mul_elem_v16.mlir b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v16/i8xi16_mul_elem_v16.mlir new file mode 100644 index 0000000000..b96fa459f3 --- /dev/null +++ b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v16/i8xi16_mul_elem_v16.mlir @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// Copyright (C) 2023, Advanced Micro Devices, Inc. + +// XFAIL: * +// REQUIRES: valid_xchess_license +// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir +// RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=16" -o affine.mlir +// RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine -o aievec.mlir --mlir-print-ir-after-all >& aie-opt.stdout +// RUN: aie-translate aievec_new.mlir -aieml=true --aievec-to-cpp -o dut.cc >& aie-translate.stdout +// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc dut_new.cc >& xchesscc.stdout +// RUN: mkdir -p data +// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout +// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s +// CHECK: TEST PASSED + +module { + func.func @dut(%arg0: tensor<1024xi8>, %arg1: tensor<1024xi16>) -> (tensor<1024xi32>) { + %0 = "tosa.cast"(%arg0) : (tensor<1024xi8>) -> tensor<1024xi32> + %1 = "tosa.cast"(%arg1) : (tensor<1024xi16>) -> tensor<1024xi32> + %2 = "tosa.mul"(%0,%1) {shift = 0 : i32} : (tensor<1024xi32>, tensor<1024xi32>) -> (tensor<1024xi32>) + return %2 : tensor<1024xi32> + } +} + diff --git a/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v16/testbench.cc b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v16/testbench.cc new file mode 100644 index 0000000000..9006f2395a --- /dev/null +++ b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v16/testbench.cc @@ -0,0 +1,57 @@ +#include "../common/testbench.h" +#include "defines.h" +#include +#include +#include +#include +void dut(int8_t *restrict in0, int16_t *restrict in1, int32_t *restrict out0); +void dut_ref(int8_t *in0, int16_t *in1, int32_t *out0); + +alignas(32) int8_t g_in0[IN0_SIZE]; +alignas(32) int16_t g_in1[IN1_SIZE]; +alignas(32) int32_t g_out0[OUT0_SIZE]; +alignas(32) int32_t g_out0Ref[OUT0_SIZE]; + +int main(int argc, char *argv[]) { + // XXX Figure out how to use argv with xme_ca_udm_dbg -A + std::string dataDir(TO_STR(DATA_DIR)); + srand(10); + std::generate(g_in0, g_in0 + IN0_SIZE, + [&]() { return random_integer(); }); + std::generate(g_in1, g_in1 + IN1_SIZE, + [&]() { return random_integer(); }); + + writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt"); + writeData(g_in1, IN1_SIZE, dataDir + "/in1.txt"); + + chess_memory_fence(); + auto cyclesBegin = chess_cycle_count(); + dut(g_in0, g_in1, g_out0); + auto cyclesEnd = chess_cycle_count(); + chess_memory_fence(); + + auto cycleCount = (int)(cyclesEnd - cyclesBegin); + reportCycleCount(cycleCount, dataDir + "/cycle_count.txt"); + + writeData(g_out0, OUT0_SIZE, dataDir + "/out0.txt"); + + dut_ref(g_in0, g_in1, g_out0Ref); + writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt"); + + bool ok = true; + ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE); + + if (ok) + printf("TEST PASSED\n"); + else + printf("TEST FAILED\n"); + + return ok ? 0 : 1; +} + +// in0, in1, out0 are in C4 layout. +void dut_ref(int8_t *in0, int16_t *in1, int32_t *out0) { + for (unsigned k = 0; k < OUT0_SIZE; k += 1) { + out0[k] = in0[k] * in1[k]; + } +} diff --git a/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v32/defines.h b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v32/defines.h new file mode 100644 index 0000000000..b0366ff425 --- /dev/null +++ b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v32/defines.h @@ -0,0 +1,4 @@ +#pragma once +constexpr unsigned const IN0_SIZE = 1024; +constexpr unsigned const IN1_SIZE = 1024; +constexpr unsigned const OUT0_SIZE = 1024; diff --git a/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v32/dut.cc b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v32/dut.cc new file mode 100644 index 0000000000..2fd8ad1fc7 --- /dev/null +++ b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v32/dut.cc @@ -0,0 +1,19 @@ +// clang-format off +void dut(int8_t * restrict v1, int16_t * restrict v2, int32_t * restrict v3) { + size_t v4 = 0; + size_t v5 = 1024; + size_t v6 = 32; + for (size_t v7 = v4; v7 < v5; v7 += v6) + chess_prepare_for_pipelining + chess_loop_range(32, 32) + { + v32int8 v8 = *(v32int8 *)(v1 + v7); + v32int16 v9 = *(v32int16 *)(v2 + v7); + v32int16 v10 = unpack(v8); + v32acc32 v11 = mul_elem_32(v9, v10); + v32int32 v12 = v32int32(v11); + *(v32int32 *)(v3 + v7) = v12; + } + return; +} +// clang-format on diff --git a/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v32/i8xi16_mul_elem_v32.mlir b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v32/i8xi16_mul_elem_v32.mlir new file mode 100644 index 0000000000..f519c04ae5 --- /dev/null +++ b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v32/i8xi16_mul_elem_v32.mlir @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// Copyright (C) 2023, Advanced Micro Devices, Inc. + +// REQUIRES: valid_xchess_license +// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir +// RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=32" +// RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine -o aievec.mlir --mlir-print-ir-after-all +// RUN: aie-translate aievec.mlir -aieml=true --aievec-to-cpp -o dut.cc +// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc dut.cc +// RUN: mkdir -p data +// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout +// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s +// CHECK: TEST PASSED + +module { + func.func @dut(%arg0: tensor<1024xi8>, %arg1: tensor<1024xi16>) -> (tensor<1024xi32>) { + %0 = "tosa.cast"(%arg0) : (tensor<1024xi8>) -> tensor<1024xi32> + %1 = "tosa.cast"(%arg1) : (tensor<1024xi16>) -> tensor<1024xi32> + %2 = "tosa.mul"(%0,%1) {shift = 0 : i32} : (tensor<1024xi32>, tensor<1024xi32>) -> (tensor<1024xi32>) + return %2 : tensor<1024xi32> + } +} + diff --git a/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v32/testbench.cc b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v32/testbench.cc new file mode 100644 index 0000000000..9006f2395a --- /dev/null +++ b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v32/testbench.cc @@ -0,0 +1,57 @@ +#include "../common/testbench.h" +#include "defines.h" +#include +#include +#include +#include +void dut(int8_t *restrict in0, int16_t *restrict in1, int32_t *restrict out0); +void dut_ref(int8_t *in0, int16_t *in1, int32_t *out0); + +alignas(32) int8_t g_in0[IN0_SIZE]; +alignas(32) int16_t g_in1[IN1_SIZE]; +alignas(32) int32_t g_out0[OUT0_SIZE]; +alignas(32) int32_t g_out0Ref[OUT0_SIZE]; + +int main(int argc, char *argv[]) { + // XXX Figure out how to use argv with xme_ca_udm_dbg -A + std::string dataDir(TO_STR(DATA_DIR)); + srand(10); + std::generate(g_in0, g_in0 + IN0_SIZE, + [&]() { return random_integer(); }); + std::generate(g_in1, g_in1 + IN1_SIZE, + [&]() { return random_integer(); }); + + writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt"); + writeData(g_in1, IN1_SIZE, dataDir + "/in1.txt"); + + chess_memory_fence(); + auto cyclesBegin = chess_cycle_count(); + dut(g_in0, g_in1, g_out0); + auto cyclesEnd = chess_cycle_count(); + chess_memory_fence(); + + auto cycleCount = (int)(cyclesEnd - cyclesBegin); + reportCycleCount(cycleCount, dataDir + "/cycle_count.txt"); + + writeData(g_out0, OUT0_SIZE, dataDir + "/out0.txt"); + + dut_ref(g_in0, g_in1, g_out0Ref); + writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt"); + + bool ok = true; + ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE); + + if (ok) + printf("TEST PASSED\n"); + else + printf("TEST FAILED\n"); + + return ok ? 0 : 1; +} + +// in0, in1, out0 are in C4 layout. +void dut_ref(int8_t *in0, int16_t *in1, int32_t *out0) { + for (unsigned k = 0; k < OUT0_SIZE; k += 1) { + out0[k] = in0[k] * in1[k]; + } +} diff --git a/test/Integration/Dialect/TOSA/i8xi32_mul_elem/defines.h b/test/Integration/Dialect/TOSA/i8xi32_mul_elem/defines.h new file mode 100644 index 0000000000..b0366ff425 --- /dev/null +++ b/test/Integration/Dialect/TOSA/i8xi32_mul_elem/defines.h @@ -0,0 +1,4 @@ +#pragma once +constexpr unsigned const IN0_SIZE = 1024; +constexpr unsigned const IN1_SIZE = 1024; +constexpr unsigned const OUT0_SIZE = 1024; diff --git a/test/Integration/Dialect/TOSA/i8xi32_mul_elem/dut.cc b/test/Integration/Dialect/TOSA/i8xi32_mul_elem/dut.cc new file mode 100644 index 0000000000..0d75998d74 --- /dev/null +++ b/test/Integration/Dialect/TOSA/i8xi32_mul_elem/dut.cc @@ -0,0 +1,22 @@ +// clang-format off +void dut(int8_t * restrict v1, int32_t * restrict v2, int32_t * restrict v3) { + size_t v4 = 0; + size_t v5 = 1024; + size_t v6 = 16; + for (size_t v7 = v4; v7 < v5; v7 += v6) + chess_prepare_for_pipelining + chess_loop_range(64, 64) + { + v16int8 v8 = *(v16int8 *)(v1 + v7); + v16int32 v9 = *(v16int32 *)(v2 + v7); + v32int8 v10 = concat(v8, v8); + v32acc32 v11 = ups_to_v32acc32(v10, 0); + v32int32 v12 = v32int32(v11); + v16int32 v13 = extract_v16int32(v12, 0); + v16acc64 v14 = mul_elem_16_2(v9, broadcast_zero_s32(), v13, undef_v16int32()); + v16int32 v15 = srs_to_v16int32(v14, 0); + *(v16int32 *)(v3 + v7) = v15; + } + return; +} +// clang-format on diff --git a/test/Integration/Dialect/TOSA/i8xi32_mul_elem/i8xi32_mul_elem.mlir b/test/Integration/Dialect/TOSA/i8xi32_mul_elem/i8xi32_mul_elem.mlir new file mode 100644 index 0000000000..3369a60633 --- /dev/null +++ b/test/Integration/Dialect/TOSA/i8xi32_mul_elem/i8xi32_mul_elem.mlir @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// Copyright (C) 2023, Advanced Micro Devices, Inc. + +// REQUIRES: valid_xchess_license +// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir +// RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=16" -o affine.mlir +// RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine -o aievec.mlir --mlir-print-ir-after-all +// RUN: aie-translate aievec.mlir -aieml=true --aievec-to-cpp -o dut.cc +// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc dut.cc +// RUN: mkdir -p data +// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout +// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s +// CHECK: TEST PASSED + +module { + func.func @dut(%arg0: tensor<1024xi8>, %arg1: tensor<1024xi32>) -> (tensor<1024xi32>) { + %0 = "tosa.cast"(%arg0) : (tensor<1024xi8>) -> tensor<1024xi32> + %2 = "tosa.mul"(%0,%arg1) {shift = 0 : i32} : (tensor<1024xi32>, tensor<1024xi32>) -> (tensor<1024xi32>) + return %2 : tensor<1024xi32> + } +} + diff --git a/test/Integration/Dialect/TOSA/i8xi32_mul_elem/testbench.cc b/test/Integration/Dialect/TOSA/i8xi32_mul_elem/testbench.cc new file mode 100644 index 0000000000..8b5d031fd2 --- /dev/null +++ b/test/Integration/Dialect/TOSA/i8xi32_mul_elem/testbench.cc @@ -0,0 +1,57 @@ +#include "../common/testbench.h" +#include "defines.h" +#include +#include +#include +#include +void dut(int8_t *restrict in0, int32_t *restrict in1, int32_t *restrict out0); +void dut_ref(int8_t *in0, int32_t *in1, int32_t *out0); + +alignas(32) int8_t g_in0[IN0_SIZE]; +alignas(32) int32_t g_in1[IN1_SIZE]; +alignas(32) int32_t g_out0[OUT0_SIZE]; +alignas(32) int32_t g_out0Ref[OUT0_SIZE]; + +int main(int argc, char *argv[]) { + // XXX Figure out how to use argv with xme_ca_udm_dbg -A + std::string dataDir(TO_STR(DATA_DIR)); + srand(10); + std::generate(g_in0, g_in0 + IN0_SIZE, + [&]() { return random_integer(); }); + std::generate(g_in1, g_in1 + IN1_SIZE, + [&]() { return random_integer(); }); + + writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt"); + writeData(g_in1, IN1_SIZE, dataDir + "/in1.txt"); + + chess_memory_fence(); + auto cyclesBegin = chess_cycle_count(); + dut(g_in0, g_in1, g_out0); + auto cyclesEnd = chess_cycle_count(); + chess_memory_fence(); + + auto cycleCount = (int)(cyclesEnd - cyclesBegin); + reportCycleCount(cycleCount, dataDir + "/cycle_count.txt"); + + writeData(g_out0, OUT0_SIZE, dataDir + "/out0.txt"); + + dut_ref(g_in0, g_in1, g_out0Ref); + writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt"); + + bool ok = true; + ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE); + + if (ok) + printf("TEST PASSED\n"); + else + printf("TEST FAILED\n"); + + return ok ? 0 : 1; +} + +// in0, in1, out0 are in C4 layout. +void dut_ref(int8_t *in0, int32_t *in1, int32_t *out0) { + for (unsigned k = 0; k < OUT0_SIZE; k += 1) { + out0[k] = in0[k] * in1[k]; + } +} diff --git a/test/unit_tests/aievec_tests/bf16xbf16_mul_elem/dut.cc b/test/unit_tests/aievec_tests/bf16xbf16_mul_elem/dut.cc index 47b18d0054..dc11faa000 100644 --- a/test/unit_tests/aievec_tests/bf16xbf16_mul_elem/dut.cc +++ b/test/unit_tests/aievec_tests/bf16xbf16_mul_elem/dut.cc @@ -1,19 +1,23 @@ -void dut(bfloat16 *restrict v1, bfloat16 *restrict v2, bfloat16 *restrict v3) { - size_t v4 = 16; - size_t v5 = 1024; - size_t v6 = 0; - bfloat16 v7 = 0.0e+00; - v32bfloat16 v8 = broadcast_to_v32bfloat16(v7); - v16bfloat16 v9 = extract_v16bfloat16(v8, 0); - for (size_t v10 = v6; v10 < v5; v10 += v4) - chess_prepare_for_pipelining chess_loop_range(64, 64) { - v16bfloat16 v11 = *(v16bfloat16 *)(v1 + v10); - v16bfloat16 v12 = *(v16bfloat16 *)(v2 + v10); - v32bfloat16 v13 = concat(v11, v9); - v32bfloat16 v14 = concat(v12, v9); - v16accfloat v15 = mul_elem_16_2(v14, v13); - v16bfloat16 v16 = to_v16bfloat16(v15); - *(v16bfloat16 *)(v3 + v10) = v16; - } +// clang-format off +void dut(bfloat16 * restrict v1, bfloat16 * restrict v2, bfloat16 * restrict v3) { + bfloat16 v4 = 0.0e+00; + v32bfloat16 v5 = broadcast_to_v32bfloat16(v4); + v16bfloat16 v6 = extract_v16bfloat16(v5, 0); + size_t v7 = 0; + size_t v8 = 1024; + size_t v9 = 16; + for (size_t v10 = v7; v10 < v8; v10 += v9) + chess_prepare_for_pipelining + chess_loop_range(64, 64) + { + v16bfloat16 v11 = *(v16bfloat16 *)(v1 + v10); + v16bfloat16 v12 = *(v16bfloat16 *)(v2 + v10); + v32bfloat16 v13 = concat(v11, v6); + v32bfloat16 v14 = concat(v12, v6); + v16accfloat v15 = mul_elem_16_2(v14, v13); + v16bfloat16 v16 = to_v16bfloat16(v15); + *(v16bfloat16 *)(v3 + v10) = v16; + } return; } +// clang-format on diff --git a/test/unit_tests/aievec_tests/bf16xbf16_mul_elem_2/dut.cc b/test/unit_tests/aievec_tests/bf16xbf16_mul_elem_2/dut.cc index 7bfd773ebf..b42f925387 100644 --- a/test/unit_tests/aievec_tests/bf16xbf16_mul_elem_2/dut.cc +++ b/test/unit_tests/aievec_tests/bf16xbf16_mul_elem_2/dut.cc @@ -1,4 +1,5 @@ -void dut(bfloat16 *restrict v1, bfloat16 *restrict v2, float *restrict v3) { +// clang-format off +void dut(bfloat16 * restrict v1, bfloat16 * restrict v2, float * restrict v3) { bfloat16 v4 = 0.0e+00; v32bfloat16 v5 = broadcast_to_v32bfloat16(v4); v16bfloat16 v6 = extract_v16bfloat16(v5, 0); @@ -6,14 +7,17 @@ void dut(bfloat16 *restrict v1, bfloat16 *restrict v2, float *restrict v3) { size_t v8 = 1024; size_t v9 = 16; for (size_t v10 = v7; v10 < v8; v10 += v9) - chess_prepare_for_pipelining chess_loop_range(64, 64) { - v16bfloat16 v11 = *(v16bfloat16 *)(v1 + v10); - v16bfloat16 v12 = *(v16bfloat16 *)(v2 + v10); - v32bfloat16 v13 = concat(v11, v6); - v32bfloat16 v14 = concat(v12, v6); - v16accfloat v15 = mul_elem_16_2(v14, v13); - v16float v16 = v16float(v15); - *(v16float *)(v3 + v10) = v16; - } + chess_prepare_for_pipelining + chess_loop_range(64, 64) + { + v16bfloat16 v11 = *(v16bfloat16 *)(v1 + v10); + v16bfloat16 v12 = *(v16bfloat16 *)(v2 + v10); + v32bfloat16 v13 = concat(v11, v6); + v32bfloat16 v14 = concat(v12, v6); + v16accfloat v15 = mul_elem_16_2(v14, v13); + v16float v16 = v16float(v15); + *(v16float *)(v3 + v10) = v16; + } return; } +// clang-format on