Skip to content

Commit

Permalink
Add generic helper for min/max nodes + SIMD helper for floats
Browse files Browse the repository at this point in the history
- Refactored xmaxxminhelper to into a more generic helper function:
  evaluators for fmax/fmin/dmax/dmin nodes use this helper and add an
extra instruction do deal with NaNs
- added helper for fmin/fmax/... nodes that uses SIMD instructions when
  available - again the evaluators use this and add an extra instruction
to deal with NaNs
- idea is to enable other evaluators to use these helpers and deal with
  NaNs as they see fit

Signed-off-by: Matthew Hall <[email protected]>
  • Loading branch information
matthewhall2 committed Oct 4, 2024
1 parent a95eed5 commit 9ec2ebd
Show file tree
Hide file tree
Showing 2 changed files with 196 additions and 50 deletions.
244 changes: 194 additions & 50 deletions compiler/z/codegen/ControlFlowEvaluator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include "codegen/RegisterDependency.hpp"
#include "codegen/RegisterPair.hpp"
#include "codegen/TreeEvaluator.hpp"
#include "z/codegen/OMRTreeEvaluator.hpp"
#include "codegen/S390Evaluator.hpp"
#include "compile/Compilation.hpp"
#include "compile/Method.hpp"
Expand Down Expand Up @@ -323,76 +324,178 @@ generateS390Compare(TR::Node * node, TR::CodeGenerator * cg, TR::InstOpCode::Mne
return branchOpCond;
}

static TR::Register*
xmaxxminHelper(TR::Node* node, TR::CodeGenerator* cg, TR::InstOpCode::Mnemonic compareRROp, TR::InstOpCode::S390BranchCondition branchCond, TR::InstOpCode::Mnemonic moveRROp)
TR::Register *
OMR::Z::TreeEvaluator::xmaxxminhelper(TR::Node * node, TR::CodeGenerator * cg)
{
TR::Node* lhsNode = node->getChild(0);
TR::Node* rhsNode = node->getChild(1);

TR::Register* lhsReg = cg->gprClobberEvaluate(lhsNode);
TR::Register* rhsReg = cg->evaluate(rhsNode);
TR::Node * lhsNode = node->getChild(0);
TR::Node * rhsNode = node->getChild(1);

TR::LabelSymbol* cFlowRegionStart = generateLabelSymbol(cg);
TR::LabelSymbol* cFlowRegionEnd = generateLabelSymbol(cg);

TR::LabelSymbol* swap = generateLabelSymbol(cg);
TR::LabelSymbol* equalRegion = generateLabelSymbol(cg);
TR::Register * operand1 = cg->gprClobberEvaluate(lhsNode);
TR::Register * operand2 = cg->evaluate(rhsNode);

TR::LabelSymbol* cFlowRegionStart = generateLabelSymbol(cg);
generateS390LabelInstruction(cg, TR::InstOpCode::label, node, cFlowRegionStart);
cFlowRegionStart->setStartInternalControlFlow();

generateRREInstruction(cg, compareRROp, node, lhsReg, rhsReg);
generateS390BranchInstruction(cg, TR::InstOpCode::BRC, branchCond, node, cFlowRegionEnd);
//Check for NaN operands for float and double
//Support float and double +0/-0 comparisons adhering to IEEE 754 standard
if (node->getOpCode().isFloatingPoint())
TR::InstOpCode::Mnemonic compareRROp = TR::InstOpCode::NOP;
TR::InstOpCode::S390BranchCondition returnFirstArgCond = TR::InstOpCode::COND_NOP;
bool isMaxOp = node->getOpCode().isMax();
bool isFloatingPointOp = node->getOpCode().isFloatingPoint();
if (isFloatingPointOp)
{
compareRROp = node->getOpCode().isDouble() ? TR::InstOpCode::CDBR : TR::InstOpCode::CEBR;
returnFirstArgCond = isMaxOp ? TR::InstOpCode::COND_MASK2 : TR::InstOpCode::COND_MASK4;
}
else
{
//Checking if operands are equal, then branching to equalRegion, otherwise fall through for NaN case handling
generateS390BranchInstruction(cg, TR::InstOpCode::BRC, TR::InstOpCode::COND_CC0, node, equalRegion);
TR_ASSERT_FATAL(node->getOpCode().isInt() || node->getOpCode().isLong(), "invalid operand type");
compareRROp = node->getOpCode().isLong() ? TR::InstOpCode::CGR : TR::InstOpCode::CR;
returnFirstArgCond = isMaxOp ? TR::InstOpCode::COND_BHR : TR::InstOpCode::COND_BLR;
}

TR::LabelSymbol* cFlowRegionEnd = generateLabelSymbol(cg);
TR::LabelSymbol* swap = generateLabelSymbol(cg);
generateRREInstruction(cg, compareRROp, node, operand1, operand2);
generateS390BranchInstruction(cg, TR::InstOpCode::BRC, returnFirstArgCond, node, cFlowRegionEnd);
if (isFloatingPointOp)
{
/*
Check for NaN operands for float and double
Support float and double +0/-0 comparisons adhering to IEEE 754 standard
Checking if operands are equal, then branching to equalRegion, otherwise fall through for NaN case handling
*/
TR::LabelSymbol* equalRegion = generateLabelSymbol(cg);
generateS390BranchInstruction(cg, TR::InstOpCode::BRC, TR::InstOpCode::COND_MASK8, node, equalRegion);

// If first operand is NaN, then we are done, otherwise fallthrough to move second operand as result
generateRREInstruction(cg, node->getOpCode().isDouble() ? TR::InstOpCode::LTDBR : TR::InstOpCode::LTEBR, node, lhsReg, lhsReg);
generateS390BranchInstruction(cg, TR::InstOpCode::BRC, TR::InstOpCode::COND_CC3, node, cFlowRegionEnd);
//branch to swap label, since either second operand is NaN, or entire satisfies the alternate condition code
TR::InstOpCode::Mnemonic tdcOpcode = node->getOpCode().isDouble() ? TR::InstOpCode::TCDB : TR::InstOpCode::TCEB;
generateRXEInstruction(cg, tdcOpcode, node, operand1, generateS390MemoryReference(0x00F, cg), 0); // can't use load and test here since it would set the quiet bit
generateS390BranchInstruction(cg, TR::InstOpCode::BRC, TR::InstOpCode::COND_MASK4, node, cFlowRegionEnd);

// branch to swap label, since either second operand is NaN, or entire satisfies the alternate condition code
generateS390BranchInstruction(cg, TR::InstOpCode::BRC, TR::InstOpCode::COND_BRC, node, swap);

//code for handling +0/-0 comparisons when operands are equal
// code for handling +0/-0 comparisons when operands are equal
generateS390LabelInstruction(cg, TR::InstOpCode::label, node, equalRegion);
if (node->getOpCode().isMax())
{
//For Max calls, checking if first operand is +0, then we are done, otherwise fall through for swap
generateRXEInstruction(cg, node->getOpCode().isDouble() ? TR::InstOpCode::TCDB : TR::InstOpCode::TCEB, node, lhsReg, generateS390MemoryReference(0x800, cg), 0); // lhsReg is +0 ?
// For Max calls, checking if first operand is +0, then we are done, otherwise fall through for swap
generateRXEInstruction(cg, tdcOpcode, node, operand1, generateS390MemoryReference(0x800, cg), 0); // operand1 is +0 ?
generateS390BranchInstruction(cg, TR::InstOpCode::BRC, TR::InstOpCode::COND_MASK4, node, cFlowRegionEnd); // it is +0
}
else if (node->getOpCode().isMin())
{
//For Min calls, checking if first operand is not +0, then we are done, otherwise fall through for swap
generateRXEInstruction(cg, node->getOpCode().isDouble() ? TR::InstOpCode::TCDB : TR::InstOpCode::TCEB, node, lhsReg, generateS390MemoryReference(0x800, cg), 0); // lhsReg is +0 ?
generateS390BranchInstruction(cg, TR::InstOpCode::BRC, TR::InstOpCode::COND_BE, node, cFlowRegionEnd); // it is not +0
// For Min calls, checking if first operand is not +0, then we are done, otherwise fall through for swap
generateRXEInstruction(cg, tdcOpcode, node, operand1, generateS390MemoryReference(0x400, cg), 0); // operand1 is -0 ?
generateS390BranchInstruction(cg, TR::InstOpCode::BRC, TR::InstOpCode::COND_MASK4, node, cFlowRegionEnd);
}
}

// Move resulting operand to operand1 as fallthrough for alternate Condition Code
generateS390LabelInstruction(cg, TR::InstOpCode::label, node, swap);
//Move resulting operand to lhsReg as fallthrough for alternate Condition Code
generateRREInstruction(cg, moveRROp, node, lhsReg, rhsReg);
TR::InstOpCode::Mnemonic moveRROp = TR::InstOpCode::NOP;
if (isFloatingPointOp)
{
moveRROp = node->getOpCode().isDouble() ? TR::InstOpCode::LDR : TR::InstOpCode::LER;
}
else
{
moveRROp = node->getOpCode().isLong() ? TR::InstOpCode::LGR : TR::InstOpCode::LR;
}
generateRREInstruction(cg, moveRROp, node, operand1, operand2);

TR::RegisterDependencyConditions* deps = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(0, 2, cg);
deps->addPostConditionIfNotAlreadyInserted(lhsReg, TR::RealRegister::AssignAny);
deps->addPostConditionIfNotAlreadyInserted(rhsReg, TR::RealRegister::AssignAny);
deps->addPostConditionIfNotAlreadyInserted(operand1, TR::RealRegister::AssignAny);
deps->addPostConditionIfNotAlreadyInserted(operand2, TR::RealRegister::AssignAny);

generateS390LabelInstruction(cg, TR::InstOpCode::label, node, cFlowRegionEnd, deps);
cFlowRegionEnd->setEndInternalControlFlow();

node->setRegister(lhsReg);
node->setRegister(operand1);
cg->decReferenceCount(lhsNode);
cg->decReferenceCount(rhsNode);
return operand1;
}


TR::Register *
OMR::Z::TreeEvaluator::fpMinMaxVectorHelper(TR::Node *node, TR::CodeGenerator *cg)
{
TR_ASSERT(node->getNumChildren() >= 1 || node->getNumChildren() <= 2, "node has incorrect number of children");
TR_ASSERT_FATAL(node->getOpCode().isFloatingPoint(), "invalid data type");

/* ===================== Allocating Registers ===================== */

TR::Register * argumentSelectorReg = cg->allocateRegister(TR_VRF);
TR::Register * compareResultReg = cg->allocateRegister(TR_VRF);
TR::Register * zeroCheckReg = cg->allocateRegister(TR_VRF);

/* ====== LD FPR0,16(GPR5) Load a ====== */
TR::Register * operand1 = cg->fprClobberEvaluate(node->getFirstChild());

/* ====== LD FPR2, 0(GPR5) Load b ====== */
TR::Register * operand2 = cg->evaluate(node->getSecondChild());

/* ===================== Generating instructions ===================== */

const uint8_t typeMask = node->getOpCode().isDouble() ? 3 : 2;
/* ====== WFTCIDB argumentSelectorReg,operand1,X'F' a == NaN ====== */
generateVRIeInstruction(cg, TR::InstOpCode::VFTCI, node, argumentSelectorReg, operand1, 0xF, 8, typeMask);

/* ====== For Max: WFCHE compareResultReg,operand1,operand2 Compare a > b ====== */
bool isMaxOp = node->getOpCode().isMax();
if(isMaxOp)
{
generateVRRcInstruction(cg, TR::InstOpCode::VFCH, node, compareResultReg, operand1, operand2, 0, 8, typeMask);
}
/* ====== For Min: WFCHE compareResultReg,operand1,operand2 Compare a < b ====== */
else
{
generateVRRcInstruction(cg, TR::InstOpCode::VFCH, node, compareResultReg, operand2, operand1, 0, 8, typeMask);
}

/* ====== VO argumentSelectorReg,argumentSelectorReg,compareResultReg (a > b) || (a == NaN) ====== */
generateVRRcInstruction(cg, TR::InstOpCode::VO, node, argumentSelectorReg, argumentSelectorReg, compareResultReg, 0, 0, 0);

/* ====== For Max: WFTCIDB compareResultReg,operand1,X'800' a == +0 ====== */
if(isMaxOp)
{
generateVRIeInstruction(cg, TR::InstOpCode::VFTCI, node, compareResultReg, operand1, 0x800, 8, typeMask);
}
/* ====== For Min: WFTCIDB compareResultReg,operand1,X'400' a == -0 ====== */
else
{
generateVRIeInstruction(cg, TR::InstOpCode::VFTCI, node, compareResultReg, operand1, 0x400, 8, typeMask);
}
/* ====== WFTCIDB zeroCheckReg,operand2,X'C00' b == 0 ====== */
generateVRIeInstruction(cg, TR::InstOpCode::VFTCI, node, zeroCheckReg, operand2, 0xC00, 8, typeMask);

return lhsReg;
/* ====== VN compareResultReg,compareResultReg,zeroCheckReg (a == -0) && (b == 0) ====== */
generateVRRcInstruction(cg, TR::InstOpCode::VN, node, compareResultReg, compareResultReg, zeroCheckReg, 0, 0, 0);

/* ====== VO argumentSelectorReg,argumentSelectorReg,compareResultReg (a >= b) || (a == NaN) || ((a == -0) && (b == 0)) ====== */
generateVRRcInstruction(cg, TR::InstOpCode::VO, node, argumentSelectorReg, argumentSelectorReg, compareResultReg, 0, 0, 0);

/* ====== VSEL operand1,operand1,operand2,argumentSelectorReg ====== */
generateVRReInstruction(cg, TR::InstOpCode::VSEL, node, operand1, operand1, operand2, argumentSelectorReg);

/* ===================== Deallocating Registers ===================== */
cg->stopUsingRegister(operand2);
cg->stopUsingRegister(argumentSelectorReg);
cg->stopUsingRegister(compareResultReg);
cg->stopUsingRegister(zeroCheckReg);

node->setRegister(operand1);
cg->decReferenceCount(node->getFirstChild());
cg->decReferenceCount(node->getSecondChild());
return operand1;
}

static TR::Register*
imaximinHelper(TR::Node* node, TR::CodeGenerator* cg, TR::InstOpCode::Mnemonic compareRROp, TR::InstOpCode::S390BranchCondition branchCond, TR::InstOpCode::Mnemonic moveRROp)
imaximinHelper(TR::Node* node, TR::CodeGenerator* cg)
{
TR::InstOpCode::S390BranchCondition branchCond = node->getOpCode().isMax() ? TR::InstOpCode::COND_BHR : TR::InstOpCode::COND_BLR;
if (cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_S390_Z15))
{
TR::Node* lhsNode = node->getChild(0);
Expand Down Expand Up @@ -432,13 +535,14 @@ imaximinHelper(TR::Node* node, TR::CodeGenerator* cg, TR::InstOpCode::Mnemonic c
}
else
{
return xmaxxminHelper(node, cg, compareRROp, branchCond, moveRROp);
return OMR::Z::TreeEvaluator::xmaxxminhelper(node, cg);
}
}

static TR::Register*
lmaxlminHelper(TR::Node* node, TR::CodeGenerator* cg, TR::InstOpCode::Mnemonic compareRROp, TR::InstOpCode::S390BranchCondition branchCond, TR::InstOpCode::Mnemonic moveRROp)
lmaxlminHelper(TR::Node* node, TR::CodeGenerator* cg)
{
TR::InstOpCode::S390BranchCondition branchCond = node->getOpCode().isMax() ? TR::InstOpCode::COND_BHR : TR::InstOpCode::COND_BLR;
if (cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_S390_Z15))
{
TR::Node* lhsNode = node->getChild(0);
Expand Down Expand Up @@ -478,61 +582,101 @@ lmaxlminHelper(TR::Node* node, TR::CodeGenerator* cg, TR::InstOpCode::Mnemonic c
}
else
{
return xmaxxminHelper(node, cg, compareRROp, branchCond, moveRROp);
return OMR::Z::TreeEvaluator::xmaxxminhelper(node, cg);
}
}

TR::Register*
OMR::Z::TreeEvaluator::imaxEvaluator(TR::Node* node, TR::CodeGenerator* cg)
{
return imaximinHelper(node, cg, TR::InstOpCode::CR, TR::InstOpCode::COND_BHR, TR::InstOpCode::LR);
return imaximinHelper(node, cg);
}

TR::Register*
OMR::Z::TreeEvaluator::lmaxEvaluator(TR::Node* node, TR::CodeGenerator* cg)
{
return lmaxlminHelper(node, cg, TR::InstOpCode::CGR, TR::InstOpCode::COND_BHR, TR::InstOpCode::LGR);
return lmaxlminHelper(node, cg);
}

TR::Register*
OMR::Z::TreeEvaluator::fmaxEvaluator(TR::Node* node, TR::CodeGenerator* cg)
{

//Passing COND_MASK2 to check CC if operand 1 is already greater than operand 2
return xmaxxminHelper(node, cg, TR::InstOpCode::CEBR, TR::InstOpCode::COND_MASK2, TR::InstOpCode::LER);
TR::Register * result = NULL;
if (cg->getSupportsVectorRegisters())
{
cg->generateDebugCounter("z13/simd/floatMax", 1, TR::DebugCounter::Free);
result = OMR::Z::TreeEvaluator::fpMinMaxVectorHelper(node, cg);
}
else
{
result = OMR::Z::TreeEvaluator::xmaxxminhelper(node, cg);
}
generateRREInstruction(cg, TR::InstOpCode::LTEBR, node, result, result);
return result;
}

TR::Register*
OMR::Z::TreeEvaluator::dmaxEvaluator(TR::Node* node, TR::CodeGenerator* cg)
{
//Passing COND_MASK2 to check CC if operand 1 is already greater than operand 2
return xmaxxminHelper(node, cg, TR::InstOpCode::CDBR, TR::InstOpCode::COND_MASK2, TR::InstOpCode::LDR);
TR::Register * result = NULL;
if (cg->getSupportsVectorRegisters())
{
cg->generateDebugCounter("z13/simd/doubleMax", 1, TR::DebugCounter::Free);
result = OMR::Z::TreeEvaluator::fpMinMaxVectorHelper(node, cg);
}
else
{
result = OMR::Z::TreeEvaluator::xmaxxminhelper(node, cg);
}
generateRREInstruction(cg, TR::InstOpCode::LTDBR, node, result, result);
return result;
}

TR::Register *
OMR::Z::TreeEvaluator::iminEvaluator(TR::Node *node, TR::CodeGenerator *cg)
{
return imaximinHelper(node, cg, TR::InstOpCode::CR, TR::InstOpCode::COND_BLR, TR::InstOpCode::LR);
return imaximinHelper(node, cg);
}

TR::Register *
OMR::Z::TreeEvaluator::lminEvaluator(TR::Node *node, TR::CodeGenerator *cg)
{
return lmaxlminHelper(node, cg, TR::InstOpCode::CGR, TR::InstOpCode::COND_BLR, TR::InstOpCode::LGR);
return lmaxlminHelper(node, cg);
}

TR::Register*
OMR::Z::TreeEvaluator::fminEvaluator(TR::Node* node, TR::CodeGenerator* cg)
{
//Passing COND_MASK4 to check CC if operand 1 is already less than operand 2
return xmaxxminHelper(node, cg, TR::InstOpCode::CEBR, TR::InstOpCode::COND_MASK4, TR::InstOpCode::LER);
TR::Register * reg = NULL;
TR::Register * result = NULL;
if (cg->getSupportsVectorRegisters())
{
cg->generateDebugCounter("z13/simd/floatMin", 1, TR::DebugCounter::Free);
result = OMR::Z::TreeEvaluator::fpMinMaxVectorHelper(node, cg);
}
else
{
result = OMR::Z::TreeEvaluator::xmaxxminhelper(node, cg);
}
generateRREInstruction(cg, TR::InstOpCode::LTEBR, node, result, result);
return result;
}

TR::Register*
OMR::Z::TreeEvaluator::dminEvaluator(TR::Node* node, TR::CodeGenerator* cg)
{
//Passing COND_MASK4 to check CC if operand 1 is already less than operand 2
return xmaxxminHelper(node, cg, TR::InstOpCode::CDBR, TR::InstOpCode::COND_MASK4, TR::InstOpCode::LDR);
TR::Register * result = NULL;
if (cg->getSupportsVectorRegisters())
{
cg->generateDebugCounter("z13/simd/doubleMin", 1, TR::DebugCounter::Free);
result = OMR::Z::TreeEvaluator::fpMinMaxVectorHelper(node, cg);
}
else
{
result = OMR::Z::TreeEvaluator::xmaxxminhelper(node, cg);
}
generateRREInstruction(cg, TR::InstOpCode::LTDBR, node, result, result);
return result;
}

/**
Expand Down
2 changes: 2 additions & 0 deletions compiler/z/codegen/OMRTreeEvaluator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,8 @@ class OMR_EXTENSIBLE TreeEvaluator: public OMR::TreeEvaluator
static TR::Register *MethodEnterHookEvaluator(TR::Node *node, TR::CodeGenerator *cg);
static TR::Register *MethodExitHookEvaluator(TR::Node *node, TR::CodeGenerator *cg);
static TR::Register *PassThroughEvaluator(TR::Node *node, TR::CodeGenerator *cg);
static TR::Register *fpMinMaxVectorHelper(TR::Node *node, TR::CodeGenerator *cg);
static TR::Register *xmaxxminhelper(TR::Node *node, TR::CodeGenerator *cg);

// mask evaluators
static TR::Register *mAnyTrueEvaluator(TR::Node *node, TR::CodeGenerator *cg);
Expand Down

0 comments on commit 9ec2ebd

Please sign in to comment.