diff --git a/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp b/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp
index 6e77665372..3716344a49 100644
--- a/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp
+++ b/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp
@@ -87,34 +87,78 @@ extractMACOperandsFromAddOperands(Value addLhs, Value addRhs) {
   return {};
 }
 
-// Create MulElemOp for i8 and bf16 types in aie-ml. The corresponding intrinsic
-// is mul_elem_16_2, which indicates that we need to concatenate zero vectors
-// for both mul operands before creating MulElemOp.
-static aievec::MulElemOp createMulElemAieML(ConversionPatternRewriter &rewriter,
-                                            Value lval, Value rval,
-                                            VectorType srcType,
-                                            unsigned bitWidth, Location loc) {
-  Type accType = getVectorOpDestType(srcType, /*AIEML =*/true);
-  VectorType vecType =
-      createVectorType(512 / bitWidth, srcType.getElementType());
-
-  arith::ConstantOp zeroConstOp = nullptr;
-  zeroConstOp = rewriter.create<arith::ConstantOp>(
-      loc, srcType.getElementType(),
-      rewriter.getZeroAttr(srcType.getElementType()));
-  auto broadcastZeroOp = rewriter.create<aievec::BroadcastScalarOp>(
-      loc, vecType, zeroConstOp->getResult(0));
-  auto extOp = rewriter.create<aievec::ExtOp>(loc, srcType,
-                                              broadcastZeroOp.getResult(), 0);
-
-  SmallVector<Value> lSources = {lval, extOp->getResult(0)};
-  SmallVector<Value> rSources = {rval, extOp->getResult(0)};
-  auto lConcatOp = rewriter.create<aievec::ConcatOp>(loc, vecType, lSources);
-  auto rConcatOp = rewriter.create<aievec::ConcatOp>(loc, vecType, rSources);
-
-  auto mulElemOp = rewriter.create<aievec::MulElemOp>(
-      loc, accType, lConcatOp->getResult(0), rConcatOp->getResult(0));
-  return mulElemOp;
+// Convert an input value to a target vector type. This function may insert
+// multiple aievec ops depending on the combination of input and output vector
+// types, and returns std::nullopt when the combination is not supported.
+static std::optional<Value>
+convertValueToTargetTypeAieML(ConversionPatternRewriter &rewriter, Location loc,
+                              Value inputVal, VectorType tgtType) {
+  VectorType srcType = cast<VectorType>(inputVal.getType());
+  auto srcElemType = srcType.getElementType();
+  unsigned srcBitWidth = srcElemType.getIntOrFloatBitWidth();
+  unsigned srcLaneSize = getVectorLaneSize(srcType);
+
+  auto tgtElemType = tgtType.getElementType();
+  unsigned tgtBitWidth = tgtElemType.getIntOrFloatBitWidth();
+  unsigned tgtLaneSize = getVectorLaneSize(tgtType);
+
+  if (srcType == tgtType)
+    return inputVal;
+
+  if ((srcElemType == tgtElemType) && (srcLaneSize != tgtLaneSize)) {
+    // TODO: relax the condition below?
+    if ((srcLaneSize == 16 && tgtLaneSize == 32 &&
+         isa<FloatType>(srcElemType)) ||
+        (srcLaneSize == 32 && tgtLaneSize == 64 &&
+         isa<IntegerType>(srcElemType))) {
+      auto zeroConstOp = rewriter.create<arith::ConstantOp>(
+          loc, srcType.getElementType(),
+          rewriter.getZeroAttr(srcType.getElementType()));
+      auto broadcastZeroOp = rewriter.create<aievec::BroadcastScalarOp>(
+          loc, tgtType, zeroConstOp->getResult(0));
+      auto extOp = rewriter.create<aievec::ExtOp>(
+          loc, srcType, broadcastZeroOp->getResult(0), 0);
+
+      SmallVector<Value> inputSources = {inputVal, extOp->getResult(0)};
+      aievec::ConcatOp concatOp =
+          rewriter.create<aievec::ConcatOp>(loc, tgtType, inputSources);
+
+      return concatOp.getResult();
+    }
+  } else if ((srcElemType != tgtElemType) && (srcLaneSize == tgtLaneSize) &&
+             isa<IntegerType>(srcElemType) && isa<IntegerType>(tgtElemType)) {
+    if (srcBitWidth == 16 && tgtBitWidth == 32 && srcLaneSize == 16) {
+      // Case 1: vector<16xi16> to vector<16xi32> conversion by aievec.ups +
+      // aievec.cast
+      auto accType = getVectorOpDestType(srcType, /*AIEML =*/true);
+      auto upsOp = rewriter.create<aievec::UPSOp>(loc, accType, inputVal);
+      auto castOp = rewriter.create<aievec::CastOp>(
+          loc, tgtType, upsOp.getResult(), /*isResAcc*/ false);
+      return castOp.getResult();
+    } else if (srcBitWidth == 8 && tgtBitWidth == 32 && srcLaneSize == 16) {
+      // Case 2: vector<16xi8> to vector<16xi32> conversion by aievec.concat +
+      // aievec.ups + aievec.cast + aievec.ext
+      // FIXME: Should use undef_xxx() for the second input of concat
+      auto concatOutType = createVectorType(32, srcElemType);
+      auto concatOp = rewriter.create<aievec::ConcatOp>(
+          loc, concatOutType, SmallVector<Value>({inputVal, inputVal}));
+      auto accType = getVectorOpDestType(concatOutType, /*AIEML =*/true);
+      auto upsOp =
+          rewriter.create<aievec::UPSOp>(loc, accType, concatOp.getResult());
+      auto castType = createVectorType(32, tgtElemType);
+      auto castOp = rewriter.create<aievec::CastOp>(
+          loc, castType, upsOp.getResult(), /*isResAcc*/ false);
+      auto extOp =
+          rewriter.create<aievec::ExtOp>(loc, tgtType, castOp.getResult(), 0);
+      return extOp.getResult();
+    } else if (srcBitWidth == 8 && tgtBitWidth == 16 && srcLaneSize == 32) {
+      // Case 3: vector<32xi8> to vector<32xi16> conversion by aievec.unpack
+      auto unpackOp = rewriter.create<aievec::UnpackOp>(loc, tgtType, inputVal);
+      return unpackOp.getResult();
+    }
+  }
+
+  return std::nullopt;
 }
 
 // Return the list of attributes that configure an `aievec.select` op to
@@ -546,8 +590,8 @@ struct ConvertMulFToAIEVecMulElemOpPattern
     if (!resultType)
       return failure();
 
+    // FIXME: Verify it is not a part of FMA
     auto isAddOp = [&](Operation *op) { return isa<arith::AddFOp>(op); };
-    // Verify it is not a part of FMA
     if (mulOp->hasOneUse() && llvm::any_of(mulOp->getUsers(), isAddOp))
       return failure();
 
@@ -560,42 +604,68 @@ struct ConvertMulFToAIEVecMulElemOpPattern
     if (laneSize != 16 || (resultElWidth != 16 && resultElWidth != 32))
       return failure();
 
-    aievec::MulElemOp mulElemOp = nullptr;
-
-    if (resultElWidth == 16) {
-      mulElemOp =
-          createMulElemAieML(rewriter, adaptor.getLhs(), adaptor.getRhs(),
-                             resultType, resultElWidth, mulOp.getLoc());
-      rewriter.replaceOpWithNewOp<aievec::SRSOp>(
-          mulOp, resultType, mulElemOp.getResult(), shiftParam);
+    // Decide the accType for aievec.mul_elem based on mulOp's lhs & rhs
+    auto lval = adaptor.getLhs();
+    auto rval = adaptor.getRhs();
+    if (auto lvalExtOp = lval.getDefiningOp<arith::ExtFOp>()) {
+      lval = lvalExtOp->getOperand(0);
+    }
+    if (auto rvalExtOp = rval.getDefiningOp<arith::ExtFOp>()) {
+      rval = rvalExtOp->getOperand(0);
+    }
+    VectorType lSrcType = cast<VectorType>(lval.getType());
+    VectorType rSrcType = cast<VectorType>(rval.getType());
+    unsigned lBitWidth = lSrcType.getElementType().getIntOrFloatBitWidth();
+    unsigned rBitWidth = rSrcType.getElementType().getIntOrFloatBitWidth();
+    Type accType = getVectorOpDestType(lSrcType, /*AIEML =*/true);
+    if (rBitWidth > lBitWidth) {
+      accType = getVectorOpDestType(rSrcType, /*AIEML =*/true);
+    }
+    // Only support the same lhs/rhs type at the moment
+    if (lSrcType != rSrcType) {
+      return failure();
+    }
+    // Only support two bfloat16 inputs at the moment
+    if (lBitWidth != 16 || rBitWidth != 16) {
+      return failure();
     }
-    // float type
-    else {
-      auto lhs = dyn_cast<arith::ExtFOp>(adaptor.getLhs().getDefiningOp());
-      auto rhs = dyn_cast<arith::ExtFOp>(adaptor.getRhs().getDefiningOp());
-
-      if (!lhs || !rhs)
-        return failure();
-
-      auto lval = lhs->getOperand(0);
-      auto rval = rhs->getOperand(0);
 
-      VectorType lSrcType = cast<VectorType>(lval.getType());
-      VectorType rSrcType = cast<VectorType>(rval.getType());
+    // Prepare lhs/rhs for the aievec.mul_elem op
+    VectorType targetInputType =
+        createVectorType(512 / lBitWidth, lSrcType.getElementType());
+    if (rBitWidth > lBitWidth) {
+      targetInputType =
+          createVectorType(512 / rBitWidth, rSrcType.getElementType());
+    }
+    auto lValConverted = convertValueToTargetTypeAieML(rewriter, mulOp.getLoc(),
+                                                       lval, targetInputType);
+    auto rValConverted = convertValueToTargetTypeAieML(rewriter, mulOp.getLoc(),
+                                                       rval, targetInputType);
+    if (!lValConverted || !rValConverted)
+      return failure();
 
-      unsigned lBitWidth = lSrcType.getElementType().getIntOrFloatBitWidth();
-      unsigned rBitWidth = rSrcType.getElementType().getIntOrFloatBitWidth();
+    // Create an aievec.mul_elem op
+    aievec::MulElemOp mulElemOp = rewriter.create<aievec::MulElemOp>(
+        mulOp.getLoc(), accType, *lValConverted, *rValConverted);
 
-      if (lBitWidth != 16 || rBitWidth != 16)
-        return failure();
+    // Create an aievec.cast or an aievec.srs op
+    auto mulElemResultType = mulElemOp.getType();
+    auto mulElemResultElWidth =
+        mulElemResultType.getElementType().getIntOrFloatBitWidth();
 
-      mulElemOp = createMulElemAieML(rewriter, lval, rval, lSrcType, lBitWidth,
-                                     mulOp.getLoc());
+    if (mulElemResultElWidth == resultElWidth) {
       rewriter.replaceOpWithNewOp<aievec::CastOp>(
           mulOp, resultType, mulElemOp.getResult(), /*isResAcc*/ false);
+    } else if (mulElemResultElWidth > resultElWidth) {
+      rewriter.replaceOpWithNewOp<aievec::SRSOp>(
+          mulOp, resultType, mulElemOp.getResult(), shiftParam);
+    } else {
+      return failure();
     }
+
     return success();
   }
+
   unsigned shiftParam;
 };
 
@@ -617,8 +687,8 @@ struct ConvertMulIToAIEVecMulElemOpPattern
     if (!resultType)
       return failure();
 
+    // FIXME: Verify it is not a part of MAC
     auto isAddOp = [&](Operation *op) { return isa<arith::AddIOp>(op); };
-    // Verify it is not a part of MAC
     if (mulOp->hasOneUse() && llvm::any_of(mulOp->getUsers(), isAddOp))
       return failure();
 
@@ -631,79 +701,57 @@ struct ConvertMulIToAIEVecMulElemOpPattern
         ((laneSize != 16 && laneSize != 32) || resultElWidth != 32))
       return failure();
 
-    // Deal with the case with sext op for i8 and i16:
-    // Case 1:
-    // Transfer -
-    // %1 = arith.extsi %a : vector<32xi8> to vector<32xi32>
-    // %2 = arith.extsi %b : vector<32xi8> to vector<32xi32>
-    // %3 = arith.muli %1, %2 : vector<32xi32>
-    // to -
-    // aievec.mul_elem(%a, %b) : vector<64xi8>, vector<64xi8>, vector<32xi32>
-    //
-    // Case 2:
-    // Transfer -
-    // %1 = arith.extsi %a : vector<32xi16> to vector<32xi32>
-    // %2 = arith.extsi %b : vector<32xi16> to vector<32xi32>
-    // %3 = arith.muli %1, %2 : vector<32xi32>
-    // to -
-    // aievec.mul_elem(%a, %b) : vector<32xi16>, vector<32xi16>, vector<32xi32>
-    if (laneSize == 32 && (resultElWidth == 32 || resultElWidth == 8)) {
-      if (resultElWidth == 32) {
-        auto lhs = dyn_cast<arith::ExtSIOp>(adaptor.getLhs().getDefiningOp());
-        auto rhs = dyn_cast<arith::ExtSIOp>(adaptor.getRhs().getDefiningOp());
-
-        if (!lhs || !rhs)
-          return failure();
-
-        auto lval = lhs->getOperand(0);
-        auto rval = rhs->getOperand(0);
-
-        VectorType lSrcType = cast<VectorType>(lval.getType());
-        VectorType rSrcType = cast<VectorType>(rval.getType());
+    // Decide the accType for aievec.mul_elem based on mulOp's lhs & rhs
+    auto lval = adaptor.getLhs();
+    auto rval = adaptor.getRhs();
+    if (auto lvalExtOp = lval.getDefiningOp<arith::ExtSIOp>()) {
+      lval = lvalExtOp->getOperand(0);
+    }
+    if (auto rvalExtOp = rval.getDefiningOp<arith::ExtSIOp>()) {
+      rval = rvalExtOp->getOperand(0);
+    }
+    VectorType lSrcType = cast<VectorType>(lval.getType());
+    VectorType rSrcType = cast<VectorType>(rval.getType());
+    unsigned lBitWidth = lSrcType.getElementType().getIntOrFloatBitWidth();
+    unsigned rBitWidth = rSrcType.getElementType().getIntOrFloatBitWidth();
+    Type accType = getVectorOpDestType(lSrcType, /*AIEML =*/true);
+    if (rBitWidth > lBitWidth) {
+      accType = getVectorOpDestType(rSrcType, /*AIEML =*/true);
+    }
 
-        unsigned lBitWidth = lSrcType.getElementType().getIntOrFloatBitWidth();
-        unsigned rBitWidth = rSrcType.getElementType().getIntOrFloatBitWidth();
+    // Prepare lhs/rhs for the aievec.mul_elem op
+    VectorType targetInputType =
+        createVectorType(512 / lBitWidth, lSrcType.getElementType());
+    if (rBitWidth > lBitWidth) {
+      targetInputType =
+          createVectorType(512 / rBitWidth, rSrcType.getElementType());
+    }
+    auto lValConverted = convertValueToTargetTypeAieML(rewriter, mulOp.getLoc(),
+                                                       lval, targetInputType);
+    auto rValConverted = convertValueToTargetTypeAieML(rewriter, mulOp.getLoc(),
+                                                       rval, targetInputType);
+    if (!lValConverted || !rValConverted)
+      return failure();
 
-        if ((lBitWidth != 8 || rBitWidth != 8) &&
-            (lBitWidth != 16 || rBitWidth != 16))
-          return failure();
+    // Create an aievec.mul_elem op
+    aievec::MulElemOp mulElemOp = rewriter.create<aievec::MulElemOp>(
+        mulOp.getLoc(), accType, *lValConverted, *rValConverted);
 
-        aievec::MulElemOp mulElemOp = nullptr;
-        if (lBitWidth == 8) {
-          mulElemOp = createMulElemAieML(rewriter, lval, rval, lSrcType,
-                                         lBitWidth, mulOp.getLoc());
-        } else {
-          Type accType = getVectorOpDestType(lSrcType, /*AIEML =*/true);
-          mulElemOp = rewriter.create<aievec::MulElemOp>(mulOp.getLoc(),
-                                                         accType, lval, rval);
-        }
-        rewriter.replaceOpWithNewOp<aievec::CastOp>(
-            mulOp, resultType, mulElemOp.getResult(), /*isResAcc*/ false);
-        // Case 3:
-        // Transfer -
-        // %1 = arith muli %a, %b : vector<32xi8>
-        // to -
-        // aievec.mul_elem(%a, %b) : vector<64xi8>, vector<64xi8>,
-        // vector<32xi32>
-      } else {
-        auto lval = adaptor.getLhs();
-        auto rval = adaptor.getRhs();
-        VectorType srcType = cast<VectorType>(lval.getType());
-        unsigned bitWidth = srcType.getElementType().getIntOrFloatBitWidth();
-        auto mulElemOp = createMulElemAieML(rewriter, lval, rval, srcType,
-                                            bitWidth, mulOp.getLoc());
-        rewriter.replaceOpWithNewOp<aievec::SRSOp>(
-            mulOp, srcType, mulElemOp.getResult(), shiftParam);
-      }
-    } else {
-      Type accType = getVectorOpDestType(cast<VectorType>(mulOp.getType()),
-                                         /*AIEML =*/true);
+    // Create an aievec.cast or an aievec.srs op
+    auto mulElemResultType = mulElemOp.getType();
+    auto mulElemResultElWidth =
+        mulElemResultType.getElementType().getIntOrFloatBitWidth();
 
-      auto mulElemOp = rewriter.create<aievec::MulElemOp>(
-          mulOp.getLoc(), accType, adaptor.getLhs(), adaptor.getRhs());
+    if (mulElemResultElWidth == resultElWidth) {
+      rewriter.replaceOpWithNewOp<aievec::CastOp>(
+          mulOp, resultType, mulElemOp.getResult(), /*isResAcc*/ false);
+    } else if (mulElemResultElWidth > resultElWidth) {
       rewriter.replaceOpWithNewOp<aievec::SRSOp>(
           mulOp, resultType, mulElemOp.getResult(), shiftParam);
+    } else {
+      return failure();
     }
+
     return success();
   }
 
diff --git a/lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp b/lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp
index 2da75ca84c..a4524b9fa4 100644
--- a/lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp
+++ b/lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp
@@ -679,6 +679,27 @@ static LogicalResult printOperation(CppEmitter &emitter,
   return success();
 }
 
+// Generate the unpack intrinsic for AIE-ML
+static LogicalResult printOperation(CppEmitter &emitter,
+                                    aievec::UnpackOp unpackOp) {
+
+  // The source should have already been emitted
+  Value source = unpackOp.getSource();
+  if (!emitter.hasValueInScope(source))
+    return failure();
+
+  // Generate the initialization for the vector
+  if (failed(emitter.emitAssignPrefix(*unpackOp, /*isAcc=*/false)))
+    return failure();
+
+  raw_indented_ostream &os = emitter.ostream();
+
+  os << "unpack(";
+  os << emitter.getOrCreateName(source);
+  os << ")";
+  return success();
+}
+
 // Generate the srs intrinsic
 static LogicalResult printOperation(CppEmitter &emitter, aievec::SRSOp srsOp) {
   Value source = srsOp.getSource();
@@ -1499,7 +1520,16 @@ static LogicalResult printOperation(CppEmitter &emitter,
   raw_indented_ostream &os = emitter.ostream();
 
   // Generate the initialization for the result
-  if (failed(emitter.emitAssignPrefix(*add_elemOp, true)))
+  // FIXME: move the logic to the op creation and add isAcc to the op attribute
+  bool isAcc = false;
+  VectorType resType = cast<VectorType>(add_elemOp.getResult().getType());
+  auto resElemType = resType.getElementType();
+  unsigned resBitWidth = resElemType.getIntOrFloatBitWidth();
+  unsigned resLaneSize = getVectorLaneSize(resType);
+  if (isa<FloatType>(resElemType) || (resBitWidth * resLaneSize == 1024))
+    isAcc = true;
+
+  if (failed(emitter.emitAssignPrefix(*add_elemOp, /*isAcc=*/isAcc)))
     return failure();
 
   os << "add(";
@@ -1527,7 +1557,16 @@ static LogicalResult printOperation(CppEmitter &emitter,
   raw_indented_ostream &os = emitter.ostream();
 
   // Generate the initialization for the result
-  if (failed(emitter.emitAssignPrefix(*sub_elemOp, true)))
+  // FIXME: move the logic to the op creation and add isAcc to the op attribute
+  bool isAcc = false;
+  VectorType resType = cast<VectorType>(sub_elemOp.getResult().getType());
+  auto resElemType = resType.getElementType();
+  unsigned resBitWidth = resElemType.getIntOrFloatBitWidth();
+  unsigned resLaneSize = getVectorLaneSize(resType);
+  if (isa<FloatType>(resElemType) || (resBitWidth * resLaneSize == 1024))
+    isAcc = true;
+
+  if (failed(emitter.emitAssignPrefix(*sub_elemOp, /*isAcc=*/isAcc)))
     return failure();
 
   os << "sub(";
@@ -2914,7 +2953,7 @@ LogicalResult CppEmitter::emitOperation(Operation &op, bool trailingSemicolon) {
                 aievec::BroadcastScalarOp, aievec::MulConvOp, aievec::FMAConvOp,
                 aievec::ShiftOp, aievec::ShuffleOp, aievec::CastOp,
                 aievec::MinOp, aievec::MaxOp, aievec::CmpOp, aievec::SelOp,
-                aievec::ExtElemOp>(
+                aievec::ExtElemOp, aievec::UnpackOp>(
               [&](auto op) { return printOperation(*this, op); })
           .Default([&](Operation *) {
             return op.emitOpError("unable to find printer for op");
@@ -3001,34 +3040,22 @@ LogicalResult CppEmitter::emitType(Location loc, Type type, bool stdintType,
       return failure();
 
     unsigned dimSize = tType.getDimSize(tType.getRank() - 1);
-
-    if (eltType.isa<IntegerType>()) {
-      os << "v" << std::to_string(dimSize);
-      auto iType = eltType.cast<IntegerType>();
-      unsigned width = iType.getWidth();
-      if ((dimSize == 16 && width == 64) || (dimSize == 32 && width == 32)) {
-        if (isAcc) {
+    os << "v" << std::to_string(dimSize);
+
+    if (AIEML && isAcc) {
+      if (eltType.isa<IntegerType>()) {
+        // AIE-ML has `ups_to_v16acc32`, `ups_to_v16acc64`, `ups_to_v32acc32`
+        // intrinsics
+        unsigned width = eltType.cast<IntegerType>().getWidth();
+        if ((dimSize == 16 && width == 64) || (dimSize == 32 && width == 32) ||
+            (dimSize == 16 && width == 32)) {
           return (os << "acc" << width), success();
         } else {
-          return (os << "int" << width), success();
-        }
-      }
-    } else if (eltType.isa<FloatType>()) {
-      if (AIEML) {
-        if (isAcc) {
-          return (os << "v16accfloat"), success();
-        } else {
-          auto fType = eltType.cast<FloatType>();
-          unsigned width = fType.getWidth();
-          if (width == 16) {
-            return (os << "v" << std::to_string(dimSize) << "bfloat16"),
-                   success();
-          } else {
-            return (os << "v" << std::to_string(dimSize) << "float"), success();
-          }
+          return failure();
         }
-      } else {
-        os << "v" << std::to_string(dimSize);
+      } else if (eltType.isa<FloatType>()) {
+        // AIE-ML only has a `ups_to_v16accfloat` intrinsic
+        return (os << "accfloat"), success();
       }
     }
 
diff --git a/test/Conversion/VectorToAIEVec/test_mul_elem.mlir b/test/Conversion/VectorToAIEVec/test_mul_elem.mlir
index 115e8a3b24..5cefc46251 100644
--- a/test/Conversion/VectorToAIEVec/test_mul_elem.mlir
+++ b/test/Conversion/VectorToAIEVec/test_mul_elem.mlir
@@ -97,3 +97,44 @@ func.func @test_mul_elem_bf16_float(%a : vector<16xbf16>,
   %3 = arith.mulf %1, %2 : vector<16xf32>
   return %3 : vector<16xf32>
 }
+
+// CHECK-LABEL: func @test_i8_i16_mul_elem
+// CHECK-SAME: %[[A:[A-Za-z0-9]+]]: vector<32xi8>
+// CHECK-SAME: %[[B:[A-Za-z0-9]+]]: vector<32xi16>
+func.func @test_i8_i16_mul_elem(%a : vector<32xi8>, %b : vector<32xi16>) -> vector<32xi32> {
+  // CHECK: %[[UNPACK:.*]] = aievec.unpack %arg0 : vector<32xi8>, vector<32xi16>
+  // CHECK: %[[ME:.*]] = aievec.mul_elem %arg1, %[[UNPACK]] : vector<32xi16>, vector<32xi16>, vector<32xi32>
+  // CHECK: %[[CAST:.*]] = aievec.cast %[[ME]] {isResAcc = false} : vector<32xi32>, vector<32xi32>
+  %1 = arith.extsi %b : vector<32xi16> to vector<32xi32>
+  %2 = arith.extsi %a : vector<32xi8> to vector<32xi32>
+  %3 = arith.muli %1, %2 : vector<32xi32>
+  return %3 : vector<32xi32>
+}
+
+// CHECK-LABEL: func @test_i8_i32_mul_elem
+// CHECK-SAME: %[[A:[A-Za-z0-9]+]]: vector<16xi8>
+// CHECK-SAME: %[[B:[A-Za-z0-9]+]]: vector<16xi32>
+func.func @test_i8_i32_mul_elem(%a : vector<16xi8>, %b : vector<16xi32>) -> vector<16xi32> {
+  // CHECK: %[[CC:.*]] = aievec.concat %arg0, %arg0 : vector<16xi8>, vector<32xi8>
+  // CHECK: %[[UPS:.*]] = aievec.ups %[[CC]] {shift = 0 : i8} : vector<32xi8>, vector<32xi32>
+  // CHECK: %[[CAST:.*]] = aievec.cast %[[UPS]] {isResAcc = false} : vector<32xi32>, vector<32xi32>
+  // CHECK: %[[EXT:.*]] = aievec.ext %[[CAST]] {index = 0 : i8} : vector<32xi32>, vector<16xi32>
+  // CHECK: %[[ME:.*]] = aievec.mul_elem %[[EXT]], %arg1 : vector<16xi32>, vector<16xi32>, vector<16xi64>
+  // CHECK: %[[SRS:.*]] = aievec.srs %[[ME]] {shift = 0 : i8} : vector<16xi64>, vector<16xi32>
+  %1 = arith.extsi %a : vector<16xi8> to vector<16xi32>
+  %2 = arith.muli %1, %b : vector<16xi32>
+  return %2 : vector<16xi32>
+}
+
+// CHECK-LABEL: func @test_i16_i32_mul_elem
+// CHECK-SAME: %[[A:[A-Za-z0-9]+]]: vector<16xi16>
+// CHECK-SAME: %[[B:[A-Za-z0-9]+]]: vector<16xi32>
+func.func @test_i16_i32_mul_elem(%a : vector<16xi16>, %b : vector<16xi32>) -> vector<16xi32> {
+  // CHECK: %[[UPS:.*]] = aievec.ups %arg0 {shift = 0 : i8} : vector<16xi16>, vector<16xi32>
+  // CHECK: %[[CAST:.*]] = aievec.cast %[[UPS]] {isResAcc = false} : vector<16xi32>, vector<16xi32>
+  // CHECK: %[[ME:.*]] = aievec.mul_elem %[[CAST]], %arg1 : vector<16xi32>, vector<16xi32>, vector<16xi64>
+  // CHECK: %[[SRS:.*]] = aievec.srs %[[ME]] {shift = 0 : i8} : vector<16xi64>, vector<16xi32>
+  %1 = arith.extsi %a : vector<16xi16> to vector<16xi32>
+  %2 = arith.muli %1, %b : vector<16xi32>
+  return %2 : vector<16xi32>
+}
diff --git a/test/Integration/Dialect/TOSA/i16xi32_mul_elem/defines.h b/test/Integration/Dialect/TOSA/i16xi32_mul_elem/defines.h
new file mode 100644
index 0000000000..b0366ff425
--- /dev/null
+++ b/test/Integration/Dialect/TOSA/i16xi32_mul_elem/defines.h
@@ -0,0 +1,4 @@
+#pragma once
+constexpr unsigned const IN0_SIZE = 1024;
+constexpr unsigned const IN1_SIZE = 1024;
+constexpr unsigned const OUT0_SIZE = 1024;
diff --git a/test/Integration/Dialect/TOSA/i16xi32_mul_elem/dut.cc b/test/Integration/Dialect/TOSA/i16xi32_mul_elem/dut.cc
new file mode 100644
index 0000000000..6fb5f26d73
--- /dev/null
+++ b/test/Integration/Dialect/TOSA/i16xi32_mul_elem/dut.cc
@@ -0,0 +1,20 @@
+// clang-format off
+void dut(int16_t * restrict v1, int32_t * restrict v2, int32_t * restrict v3) {
+  size_t v4 = 0;
+  size_t v5 = 1024;
+  size_t v6 = 16;
+  for (size_t v7 = v4; v7 < v5; v7 += v6)
+  chess_prepare_for_pipelining
+  chess_loop_range(64, 64)
+  {
+    v16int16 v8 = *(v16int16 *)(v1 + v7);
+    v16int32 v9 = *(v16int32 *)(v2 + v7);
+    v16acc32 v10 = ups_to_v16acc32(v8, 0);
+    v16int32 v11 = v16int32(v10);
+    v16acc64 v12 = mul_elem_16_2(v9, broadcast_zero_s32(), v11, undef_v16int32());
+    v16int32 v13 = srs_to_v16int32(v12, 0);
+    *(v16int32 *)(v3 + v7) = v13;
+  }
+  return;
+}
+// clang-format on
diff --git a/test/Integration/Dialect/TOSA/i16xi32_mul_elem/i16xi32_mul_elem.mlir b/test/Integration/Dialect/TOSA/i16xi32_mul_elem/i16xi32_mul_elem.mlir
new file mode 100644
index 0000000000..fee6d160d8
--- /dev/null
+++ b/test/Integration/Dialect/TOSA/i16xi32_mul_elem/i16xi32_mul_elem.mlir
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+
+// REQUIRES: valid_xchess_license
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir
+// RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=16" -o affine.mlir 
+// RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine --debug-only=lower-vector-to-aievec -o aievec.mlir
+// RUN: aie-translate aievec.mlir -aieml=true --aievec-to-cpp --debug-only=aievec-to-cpp -o dut.cc
+// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc dut.cc
+// RUN: mkdir -p data
+// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
+// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
+// CHECK: TEST PASSED
+
+module {
+  func.func @dut(%arg0: tensor<1024xi16>, %arg1: tensor<1024xi32>) -> (tensor<1024xi32>) {
+    %0 = "tosa.cast"(%arg0) : (tensor<1024xi16>) -> tensor<1024xi32>
+    %2 = "tosa.mul"(%0,%arg1) {shift = 0 : i32} : (tensor<1024xi32>, tensor<1024xi32>)  -> (tensor<1024xi32>)
+    return %2 : tensor<1024xi32>
+  }
+}
+
diff --git a/test/Integration/Dialect/TOSA/i16xi32_mul_elem/testbench.cc b/test/Integration/Dialect/TOSA/i16xi32_mul_elem/testbench.cc
new file mode 100644
index 0000000000..28b260b46d
--- /dev/null
+++ b/test/Integration/Dialect/TOSA/i16xi32_mul_elem/testbench.cc
@@ -0,0 +1,57 @@
+#include "../common/testbench.h"
+#include "defines.h"
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+void dut(int16_t *restrict in0, int32_t *restrict in1, int32_t *restrict out0);
+void dut_ref(int16_t *in0, int32_t *in1, int32_t *out0);
+
+alignas(32) int16_t g_in0[IN0_SIZE];
+alignas(32) int32_t g_in1[IN1_SIZE];
+alignas(32) int32_t g_out0[OUT0_SIZE];
+alignas(32) int32_t g_out0Ref[OUT0_SIZE];
+
+int main(int argc, char *argv[]) {
+  // XXX Figure out how to use argv with xme_ca_udm_dbg -A
+  std::string dataDir(TO_STR(DATA_DIR));
+  srand(10);
+  std::generate(g_in0, g_in0 + IN0_SIZE,
+                [&]() { return random_integer<int16_t>(); });
+  std::generate(g_in1, g_in1 + IN1_SIZE,
+                [&]() { return random_integer<int32_t>(); });
+
+  writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt");
+  writeData(g_in1, IN1_SIZE, dataDir + "/in1.txt");
+
+  chess_memory_fence();
+  auto cyclesBegin = chess_cycle_count();
+  dut(g_in0, g_in1, g_out0);
+  auto cyclesEnd = chess_cycle_count();
+  chess_memory_fence();
+
+  auto cycleCount = (int)(cyclesEnd - cyclesBegin);
+  reportCycleCount(cycleCount, dataDir + "/cycle_count.txt");
+
+  writeData(g_out0, OUT0_SIZE, dataDir + "/out0.txt");
+
+  dut_ref(g_in0, g_in1, g_out0Ref);
+  writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt");
+
+  bool ok = true;
+  ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE);
+
+  if (ok)
+    printf("TEST PASSED\n");
+  else
+    printf("TEST FAILED\n");
+
+  return ok ? 0 : 1;
+}
+
+// in0, in1, out0 are in C4 layout.
+void dut_ref(int16_t *in0, int32_t *in1, int32_t *out0) {
+  for (unsigned k = 0; k < OUT0_SIZE; k += 1) {
+    out0[k] = in0[k] * in1[k];
+  }
+}
diff --git a/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v16/defines.h b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v16/defines.h
new file mode 100644
index 0000000000..b0366ff425
--- /dev/null
+++ b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v16/defines.h
@@ -0,0 +1,4 @@
+#pragma once
+constexpr unsigned const IN0_SIZE = 1024;
+constexpr unsigned const IN1_SIZE = 1024;
+constexpr unsigned const OUT0_SIZE = 1024;
diff --git a/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v16/i8xi16_mul_elem_v16.mlir b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v16/i8xi16_mul_elem_v16.mlir
new file mode 100644
index 0000000000..b96fa459f3
--- /dev/null
+++ b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v16/i8xi16_mul_elem_v16.mlir
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+
+// XFAIL: *
+// REQUIRES: valid_xchess_license
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir
+// RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=16" -o affine.mlir 
+// RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine -o aievec.mlir --mlir-print-ir-after-all >& aie-opt.stdout
+// RUN: aie-translate aievec_new.mlir -aieml=true --aievec-to-cpp -o dut.cc >& aie-translate.stdout
+// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc dut_new.cc >& xchesscc.stdout
+// RUN: mkdir -p data
+// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
+// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
+// CHECK: TEST PASSED
+
+module {
+  func.func @dut(%arg0: tensor<1024xi8>, %arg1: tensor<1024xi16>) -> (tensor<1024xi32>) {
+    %0 = "tosa.cast"(%arg0) : (tensor<1024xi8>) -> tensor<1024xi32>
+    %1 = "tosa.cast"(%arg1) : (tensor<1024xi16>) -> tensor<1024xi32>
+    %2 = "tosa.mul"(%0,%1) {shift = 0 : i32} : (tensor<1024xi32>, tensor<1024xi32>)  -> (tensor<1024xi32>)
+    return %2 : tensor<1024xi32>
+  }
+}
+
diff --git a/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v16/testbench.cc b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v16/testbench.cc
new file mode 100644
index 0000000000..9006f2395a
--- /dev/null
+++ b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v16/testbench.cc
@@ -0,0 +1,57 @@
+#include "../common/testbench.h"
+#include "defines.h"
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+void dut(int8_t *restrict in0, int16_t *restrict in1, int32_t *restrict out0);
+void dut_ref(int8_t *in0, int16_t *in1, int32_t *out0); // scalar reference
+// dut is the kernel under test; the buffers below are 32-byte aligned.
+alignas(32) int8_t g_in0[IN0_SIZE];       // filled by random_integer<int8_t>
+alignas(32) int16_t g_in1[IN1_SIZE];      // filled by random_integer<int16_t>
+alignas(32) int32_t g_out0[OUT0_SIZE];    // written by dut
+alignas(32) int32_t g_out0Ref[OUT0_SIZE]; // written by dut_ref
+
+int main(int argc, char *argv[]) {
+  // TODO: work out how to pass argv through xme_ca_udm_dbg -A; the data
+  // directory currently comes from the DATA_DIR macro.
+  const std::string outDir(TO_STR(DATA_DIR));
+
+  // Seed the C RNG with a constant, then fill and dump both operand buffers.
+  srand(10);
+  auto nextIn0 = [] { return random_integer<int8_t>(); };
+  auto nextIn1 = [] { return random_integer<int16_t>(); };
+  std::generate(g_in0, g_in0 + IN0_SIZE, nextIn0);
+  std::generate(g_in1, g_in1 + IN1_SIZE, nextIn1);
+  writeData(g_in0, IN0_SIZE, outDir + "/in0.txt");
+  writeData(g_in1, IN1_SIZE, outDir + "/in1.txt");
+
+  // Time only the kernel call; the fences bracket the measured region.
+  chess_memory_fence();
+  const auto startCycles = chess_cycle_count();
+  dut(g_in0, g_in1, g_out0);
+  const auto stopCycles = chess_cycle_count();
+  chess_memory_fence();
+  int elapsed = static_cast<int>(stopCycles - startCycles);
+  reportCycleCount(elapsed, outDir + "/cycle_count.txt");
+
+  // Dump the kernel result, then compute and dump the scalar reference.
+  writeData(g_out0, OUT0_SIZE, outDir + "/out0.txt");
+  dut_ref(g_in0, g_in1, g_out0Ref);
+  writeData(g_out0Ref, OUT0_SIZE, outDir + "/out0_ref.txt");
+
+  bool ok = true;
+  ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE);
+
+  // FileCheck in the accompanying .mlir test looks for the PASSED line.
+  printf(ok ? "TEST PASSED\n" : "TEST FAILED\n");
+
+  return ok ? 0 : 1;
+}
+
+// Scalar reference model. in0, in1, out0 are in C4 layout.
+void dut_ref(int8_t *in0, int16_t *in1, int32_t *out0) {
+  // Each output lane is the product of the matching input lanes, as int.
+  for (unsigned idx = 0; idx != OUT0_SIZE; ++idx)
+    out0[idx] = static_cast<int>(in0[idx]) * static_cast<int>(in1[idx]);
+}
diff --git a/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v32/defines.h b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v32/defines.h
new file mode 100644
index 0000000000..b0366ff425
--- /dev/null
+++ b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v32/defines.h
@@ -0,0 +1,4 @@
+#pragma once
+constexpr unsigned IN0_SIZE = 1024;  // length of the testbench's g_in0
+constexpr unsigned IN1_SIZE = 1024;  // length of the testbench's g_in1
+constexpr unsigned OUT0_SIZE = 1024; // length of g_out0 and g_out0Ref
diff --git a/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v32/dut.cc b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v32/dut.cc
new file mode 100644
index 0000000000..2fd8ad1fc7
--- /dev/null
+++ b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v32/dut.cc
@@ -0,0 +1,19 @@
+// clang-format off
+void dut(int8_t * restrict v1, int16_t * restrict v2, int32_t * restrict v3) { // v1, v2 are read; v3 is written
+  size_t v4 = 0;    // loop start
+  size_t v5 = 1024; // loop end (exclusive)
+  size_t v6 = 32;   // elements consumed per iteration
+  for (size_t v7 = v4; v7 < v5; v7 += v6)
+  chess_prepare_for_pipelining
+  chess_loop_range(32, 32)
+  {
+    v32int8 v8 = *(v32int8 *)(v1 + v7);   // load 32 x i8 from v1
+    v32int16 v9 = *(v32int16 *)(v2 + v7); // load 32 x i16 from v2
+    v32int16 v10 = unpack(v8);            // i8 vector turned into a v32int16
+    v32acc32 v11 = mul_elem_32(v9, v10);  // product lands in a v32acc32 accumulator
+    v32int32 v12 = v32int32(v11);         // accumulator converted to v32int32
+    *(v32int32 *)(v3 + v7) = v12;         // store 32 x i32 to v3
+  }
+  return;
+}
+// clang-format on
diff --git a/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v32/i8xi16_mul_elem_v32.mlir b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v32/i8xi16_mul_elem_v32.mlir
new file mode 100644
index 0000000000..f519c04ae5
--- /dev/null
+++ b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v32/i8xi16_mul_elem_v32.mlir
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+
+// REQUIRES: valid_xchess_license
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir
+// RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=32" -o affine.mlir
+// RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine -o aievec.mlir --mlir-print-ir-after-all
+// RUN: aie-translate aievec.mlir -aieml=true --aievec-to-cpp -o dut.cc
+// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc dut.cc
+// RUN: mkdir -p data
+// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
+// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
+// CHECK: TEST PASSED
+
+module {
+  func.func @dut(%arg0: tensor<1024xi8>, %arg1: tensor<1024xi16>) -> (tensor<1024xi32>) { // element-wise i8 x i16 -> i32 over 1024 elements
+    %0 = "tosa.cast"(%arg0) : (tensor<1024xi8>) -> tensor<1024xi32> // bring the i8 operand to i32
+    %1 = "tosa.cast"(%arg1) : (tensor<1024xi16>) -> tensor<1024xi32> // bring the i16 operand to i32
+    %2 = "tosa.mul"(%0,%1) {shift = 0 : i32} : (tensor<1024xi32>, tensor<1024xi32>)  -> (tensor<1024xi32>) // multiply the two i32 tensors with shift = 0
+    return %2 : tensor<1024xi32>
+  }
+}
+
diff --git a/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v32/testbench.cc b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v32/testbench.cc
new file mode 100644
index 0000000000..9006f2395a
--- /dev/null
+++ b/test/Integration/Dialect/TOSA/i8xi16_mul_elem_v32/testbench.cc
@@ -0,0 +1,57 @@
+#include "../common/testbench.h"
+#include "defines.h"
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+void dut(int8_t *restrict in0, int16_t *restrict in1, int32_t *restrict out0);
+void dut_ref(int8_t *in0, int16_t *in1, int32_t *out0); // scalar reference
+// dut is the kernel under test; the buffers below are 32-byte aligned.
+alignas(32) int8_t g_in0[IN0_SIZE];       // filled by random_integer<int8_t>
+alignas(32) int16_t g_in1[IN1_SIZE];      // filled by random_integer<int16_t>
+alignas(32) int32_t g_out0[OUT0_SIZE];    // written by dut
+alignas(32) int32_t g_out0Ref[OUT0_SIZE]; // written by dut_ref
+
+int main(int argc, char *argv[]) {
+  // TODO: work out how to pass argv through xme_ca_udm_dbg -A; the data
+  // directory currently comes from the DATA_DIR macro.
+  const std::string outDir(TO_STR(DATA_DIR));
+
+  // Seed the C RNG with a constant, then fill and dump both operand buffers.
+  srand(10);
+  auto nextIn0 = [] { return random_integer<int8_t>(); };
+  auto nextIn1 = [] { return random_integer<int16_t>(); };
+  std::generate(g_in0, g_in0 + IN0_SIZE, nextIn0);
+  std::generate(g_in1, g_in1 + IN1_SIZE, nextIn1);
+  writeData(g_in0, IN0_SIZE, outDir + "/in0.txt");
+  writeData(g_in1, IN1_SIZE, outDir + "/in1.txt");
+
+  // Time only the kernel call; the fences bracket the measured region.
+  chess_memory_fence();
+  const auto startCycles = chess_cycle_count();
+  dut(g_in0, g_in1, g_out0);
+  const auto stopCycles = chess_cycle_count();
+  chess_memory_fence();
+  int elapsed = static_cast<int>(stopCycles - startCycles);
+  reportCycleCount(elapsed, outDir + "/cycle_count.txt");
+
+  // Dump the kernel result, then compute and dump the scalar reference.
+  writeData(g_out0, OUT0_SIZE, outDir + "/out0.txt");
+  dut_ref(g_in0, g_in1, g_out0Ref);
+  writeData(g_out0Ref, OUT0_SIZE, outDir + "/out0_ref.txt");
+
+  bool ok = true;
+  ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE);
+
+  // FileCheck in the accompanying .mlir test looks for the PASSED line.
+  printf(ok ? "TEST PASSED\n" : "TEST FAILED\n");
+
+  return ok ? 0 : 1;
+}
+
+// Scalar reference model. in0, in1, out0 are in C4 layout.
+void dut_ref(int8_t *in0, int16_t *in1, int32_t *out0) {
+  // Each output lane is the product of the matching input lanes, as int.
+  for (unsigned idx = 0; idx != OUT0_SIZE; ++idx)
+    out0[idx] = static_cast<int>(in0[idx]) * static_cast<int>(in1[idx]);
+}
diff --git a/test/Integration/Dialect/TOSA/i8xi32_mul_elem/defines.h b/test/Integration/Dialect/TOSA/i8xi32_mul_elem/defines.h
new file mode 100644
index 0000000000..b0366ff425
--- /dev/null
+++ b/test/Integration/Dialect/TOSA/i8xi32_mul_elem/defines.h
@@ -0,0 +1,4 @@
+#pragma once
+constexpr unsigned IN0_SIZE = 1024;  // length of the testbench's g_in0
+constexpr unsigned IN1_SIZE = 1024;  // length of the testbench's g_in1
+constexpr unsigned OUT0_SIZE = 1024; // length of g_out0 and g_out0Ref
diff --git a/test/Integration/Dialect/TOSA/i8xi32_mul_elem/dut.cc b/test/Integration/Dialect/TOSA/i8xi32_mul_elem/dut.cc
new file mode 100644
index 0000000000..0d75998d74
--- /dev/null
+++ b/test/Integration/Dialect/TOSA/i8xi32_mul_elem/dut.cc
@@ -0,0 +1,22 @@
+// clang-format off
+void dut(int8_t * restrict v1, int32_t * restrict v2, int32_t * restrict v3) { // v1, v2 are read; v3 is written
+  size_t v4 = 0;    // loop start
+  size_t v5 = 1024; // loop end (exclusive)
+  size_t v6 = 16;   // elements consumed per iteration
+  for (size_t v7 = v4; v7 < v5; v7 += v6)
+  chess_prepare_for_pipelining
+  chess_loop_range(64, 64)
+  {
+    v16int8 v8 = *(v16int8 *)(v1 + v7);      // load 16 x i8 from v1
+    v16int32 v9 = *(v16int32 *)(v2 + v7);    // load 16 x i32 from v2
+    v32int8 v10 = concat(v8, v8);            // build a 32-lane i8 vector from two copies of v8
+    v32acc32 v11 = ups_to_v32acc32(v10, 0);  // move the i8 lanes into a v32acc32, shift 0
+    v32int32 v12 = v32int32(v11);            // accumulator converted to v32int32
+    v16int32 v13 = extract_v16int32(v12, 0); // take 16 lanes at index 0 (presumably the low half - confirm)
+    v16acc64 v14 = mul_elem_16_2(v9, broadcast_zero_s32(), v13, undef_v16int32()); // multiply the i32 operand by the widened i8 operand into a v16acc64
+    v16int32 v15 = srs_to_v16int32(v14, 0);  // accumulator back to v16int32, shift 0
+    *(v16int32 *)(v3 + v7) = v15;            // store 16 x i32 to v3
+  }
+  return;
+}
+// clang-format on
diff --git a/test/Integration/Dialect/TOSA/i8xi32_mul_elem/i8xi32_mul_elem.mlir b/test/Integration/Dialect/TOSA/i8xi32_mul_elem/i8xi32_mul_elem.mlir
new file mode 100644
index 0000000000..3369a60633
--- /dev/null
+++ b/test/Integration/Dialect/TOSA/i8xi32_mul_elem/i8xi32_mul_elem.mlir
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+
+// REQUIRES: valid_xchess_license
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir
+// RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=16" -o affine.mlir 
+// RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine -o aievec.mlir --mlir-print-ir-after-all
+// RUN: aie-translate aievec.mlir -aieml=true --aievec-to-cpp -o dut.cc
+// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc dut.cc
+// RUN: mkdir -p data
+// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
+// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
+// CHECK: TEST PASSED
+
+module {
+  func.func @dut(%arg0: tensor<1024xi8>, %arg1: tensor<1024xi32>) -> (tensor<1024xi32>) { // element-wise i8 x i32 -> i32 over 1024 elements
+    %0 = "tosa.cast"(%arg0) : (tensor<1024xi8>) -> tensor<1024xi32> // bring the i8 operand to i32; %arg1 is already i32
+    %2 = "tosa.mul"(%0,%arg1) {shift = 0 : i32} : (tensor<1024xi32>, tensor<1024xi32>)  -> (tensor<1024xi32>) // multiply the two i32 tensors with shift = 0
+    return %2 : tensor<1024xi32>
+  }
+}
+
diff --git a/test/Integration/Dialect/TOSA/i8xi32_mul_elem/testbench.cc b/test/Integration/Dialect/TOSA/i8xi32_mul_elem/testbench.cc
new file mode 100644
index 0000000000..8b5d031fd2
--- /dev/null
+++ b/test/Integration/Dialect/TOSA/i8xi32_mul_elem/testbench.cc
@@ -0,0 +1,57 @@
+#include "../common/testbench.h"
+#include "defines.h"
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+void dut(int8_t *restrict in0, int32_t *restrict in1, int32_t *restrict out0);
+void dut_ref(int8_t *in0, int32_t *in1, int32_t *out0); // scalar reference
+// dut is the kernel under test; the buffers below are 32-byte aligned.
+alignas(32) int8_t g_in0[IN0_SIZE];       // filled by random_integer<int8_t>
+alignas(32) int32_t g_in1[IN1_SIZE];      // filled by random_integer<int32_t>
+alignas(32) int32_t g_out0[OUT0_SIZE];    // written by dut
+alignas(32) int32_t g_out0Ref[OUT0_SIZE]; // written by dut_ref
+
+int main(int argc, char *argv[]) {
+  // TODO: work out how to pass argv through xme_ca_udm_dbg -A; the data
+  // directory currently comes from the DATA_DIR macro.
+  const std::string outDir(TO_STR(DATA_DIR));
+
+  // Seed the C RNG with a constant, then fill and dump both operand buffers.
+  srand(10);
+  auto nextIn0 = [] { return random_integer<int8_t>(); };
+  auto nextIn1 = [] { return random_integer<int32_t>(); };
+  std::generate(g_in0, g_in0 + IN0_SIZE, nextIn0);
+  std::generate(g_in1, g_in1 + IN1_SIZE, nextIn1);
+  writeData(g_in0, IN0_SIZE, outDir + "/in0.txt");
+  writeData(g_in1, IN1_SIZE, outDir + "/in1.txt");
+
+  // Time only the kernel call; the fences bracket the measured region.
+  chess_memory_fence();
+  const auto startCycles = chess_cycle_count();
+  dut(g_in0, g_in1, g_out0);
+  const auto stopCycles = chess_cycle_count();
+  chess_memory_fence();
+  int elapsed = static_cast<int>(stopCycles - startCycles);
+  reportCycleCount(elapsed, outDir + "/cycle_count.txt");
+
+  // Dump the kernel result, then compute and dump the scalar reference.
+  writeData(g_out0, OUT0_SIZE, outDir + "/out0.txt");
+  dut_ref(g_in0, g_in1, g_out0Ref);
+  writeData(g_out0Ref, OUT0_SIZE, outDir + "/out0_ref.txt");
+
+  bool ok = true;
+  ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE);
+
+  // FileCheck in the accompanying .mlir test looks for the PASSED line.
+  printf(ok ? "TEST PASSED\n" : "TEST FAILED\n");
+
+  return ok ? 0 : 1;
+}
+
+// Scalar reference (wraps to 32 bits). in0, in1, out0 are in C4 layout.
+void dut_ref(int8_t *in0, int32_t *in1, int32_t *out0) {
+  // Multiply as unsigned: a signed int * int32_t overflow would be UB.
+  for (unsigned k = 0; k < OUT0_SIZE; k += 1)
+    out0[k] = (int32_t)((uint32_t)in0[k] * (uint32_t)in1[k]);
+}
diff --git a/test/unit_tests/aievec_tests/bf16xbf16_mul_elem/dut.cc b/test/unit_tests/aievec_tests/bf16xbf16_mul_elem/dut.cc
index 47b18d0054..dc11faa000 100644
--- a/test/unit_tests/aievec_tests/bf16xbf16_mul_elem/dut.cc
+++ b/test/unit_tests/aievec_tests/bf16xbf16_mul_elem/dut.cc
@@ -1,19 +1,23 @@
-void dut(bfloat16 *restrict v1, bfloat16 *restrict v2, bfloat16 *restrict v3) {
-  size_t v4 = 16;
-  size_t v5 = 1024;
-  size_t v6 = 0;
-  bfloat16 v7 = 0.0e+00;
-  v32bfloat16 v8 = broadcast_to_v32bfloat16(v7);
-  v16bfloat16 v9 = extract_v16bfloat16(v8, 0);
-  for (size_t v10 = v6; v10 < v5; v10 += v4)
-    chess_prepare_for_pipelining chess_loop_range(64, 64) {
-      v16bfloat16 v11 = *(v16bfloat16 *)(v1 + v10);
-      v16bfloat16 v12 = *(v16bfloat16 *)(v2 + v10);
-      v32bfloat16 v13 = concat(v11, v9);
-      v32bfloat16 v14 = concat(v12, v9);
-      v16accfloat v15 = mul_elem_16_2(v14, v13);
-      v16bfloat16 v16 = to_v16bfloat16(v15);
-      *(v16bfloat16 *)(v3 + v10) = v16;
-    }
+// clang-format off
+void dut(bfloat16 * restrict v1, bfloat16 * restrict v2, bfloat16 * restrict v3) { // v1, v2 are read; v3 is written
+  bfloat16 v4 = 0.0e+00;                         // scalar zero
+  v32bfloat16 v5 = broadcast_to_v32bfloat16(v4); // 32-lane vector built from the zero scalar
+  v16bfloat16 v6 = extract_v16bfloat16(v5, 0);   // 16-lane piece of it, used as padding below
+  size_t v7 = 0;    // loop start
+  size_t v8 = 1024; // loop end (exclusive)
+  size_t v9 = 16;   // elements consumed per iteration
+  for (size_t v10 = v7; v10 < v8; v10 += v9)
+  chess_prepare_for_pipelining
+  chess_loop_range(64, 64)
+  {
+    v16bfloat16 v11 = *(v16bfloat16 *)(v1 + v10); // load 16 x bf16 from v1
+    v16bfloat16 v12 = *(v16bfloat16 *)(v2 + v10); // load 16 x bf16 from v2
+    v32bfloat16 v13 = concat(v11, v6);            // pad the v1 data to 32 lanes with v6
+    v32bfloat16 v14 = concat(v12, v6);            // pad the v2 data to 32 lanes with v6
+    v16accfloat v15 = mul_elem_16_2(v14, v13);    // product lands in a v16accfloat accumulator
+    v16bfloat16 v16 = to_v16bfloat16(v15);        // accumulator converted back to bf16
+    *(v16bfloat16 *)(v3 + v10) = v16;             // store 16 x bf16 to v3
+  }
   return;
 }
+// clang-format on
diff --git a/test/unit_tests/aievec_tests/bf16xbf16_mul_elem_2/dut.cc b/test/unit_tests/aievec_tests/bf16xbf16_mul_elem_2/dut.cc
index 7bfd773ebf..b42f925387 100644
--- a/test/unit_tests/aievec_tests/bf16xbf16_mul_elem_2/dut.cc
+++ b/test/unit_tests/aievec_tests/bf16xbf16_mul_elem_2/dut.cc
@@ -1,4 +1,5 @@
-void dut(bfloat16 *restrict v1, bfloat16 *restrict v2, float *restrict v3) {
+// clang-format off
+void dut(bfloat16 * restrict v1, bfloat16 * restrict v2, float * restrict v3) {
   bfloat16 v4 = 0.0e+00;
   v32bfloat16 v5 = broadcast_to_v32bfloat16(v4);
   v16bfloat16 v6 = extract_v16bfloat16(v5, 0);
@@ -6,14 +7,17 @@ void dut(bfloat16 *restrict v1, bfloat16 *restrict v2, float *restrict v3) {
   size_t v8 = 1024;
   size_t v9 = 16;
   for (size_t v10 = v7; v10 < v8; v10 += v9)
-    chess_prepare_for_pipelining chess_loop_range(64, 64) {
-      v16bfloat16 v11 = *(v16bfloat16 *)(v1 + v10);
-      v16bfloat16 v12 = *(v16bfloat16 *)(v2 + v10);
-      v32bfloat16 v13 = concat(v11, v6);
-      v32bfloat16 v14 = concat(v12, v6);
-      v16accfloat v15 = mul_elem_16_2(v14, v13);
-      v16float v16 = v16float(v15);
-      *(v16float *)(v3 + v10) = v16;
-    }
+  chess_prepare_for_pipelining
+  chess_loop_range(64, 64)
+  {
+    v16bfloat16 v11 = *(v16bfloat16 *)(v1 + v10);
+    v16bfloat16 v12 = *(v16bfloat16 *)(v2 + v10);
+    v32bfloat16 v13 = concat(v11, v6);
+    v32bfloat16 v14 = concat(v12, v6);
+    v16accfloat v15 = mul_elem_16_2(v14, v13);
+    v16float v16 = v16float(v15);
+    *(v16float *)(v3 + v10) = v16;
+  }
   return;
 }
+// clang-format on