From 581db9b57703b74eb51285e302e26203af49b0ae Mon Sep 17 00:00:00 2001 From: Matthew Hall Date: Tue, 8 Oct 2024 11:15:51 -0400 Subject: [PATCH] Support Java and IEEE-754 SPEC w.r.t floating point min/max ops on Z - changes are only for the float point min/max evaluators on z/Architecture - +0.0 compares as strictly greater than -0.0 - Refactored xmaxxminHelper to into a more generic helper function that does not change NaNs - added helper for fmin/fmax/... nodes that uses SIMD instructions when available that has same behaviour as xmaxxminHelper on floats - idea is to enable other evaluators to use these helpers and deal with NaNs as they see fit - omr evaluators support IEEE-754 w.r.t zeros and NaNs (returns the quiet NaN corresponding to the first NaN given if present) Signed-off-by: Matthew Hall --- compiler/z/codegen/ControlFlowEvaluator.cpp | 264 +++++++++++++++++--- compiler/z/codegen/OMRTreeEvaluator.hpp | 36 +++ 2 files changed, 260 insertions(+), 40 deletions(-) diff --git a/compiler/z/codegen/ControlFlowEvaluator.cpp b/compiler/z/codegen/ControlFlowEvaluator.cpp index f518b910cc9..6c7b44f2b67 100644 --- a/compiler/z/codegen/ControlFlowEvaluator.cpp +++ b/compiler/z/codegen/ControlFlowEvaluator.cpp @@ -73,6 +73,7 @@ #include "z/codegen/BinaryAnalyser.hpp" #include "z/codegen/BinaryCommutativeAnalyser.hpp" #include "z/codegen/CompareAnalyser.hpp" +#include "z/codegen/OMRTreeEvaluator.hpp" #include "z/codegen/S390GenerateInstructions.hpp" #include "z/codegen/S390HelperCallSnippet.hpp" #include "z/codegen/S390Instruction.hpp" @@ -323,52 +324,189 @@ generateS390Compare(TR::Node * node, TR::CodeGenerator * cg, TR::InstOpCode::Mne return branchOpCond; } -static TR::Register* -xmaxxminHelper(TR::Node* node, TR::CodeGenerator* cg, TR::InstOpCode::Mnemonic compareRROp, TR::InstOpCode::S390BranchCondition branchCond, TR::InstOpCode::Mnemonic moveRROp) +TR::Register * +OMR::Z::TreeEvaluator::xmaxxminHelper(TR::Node * node, TR::CodeGenerator * cg) { - TR::Node* lhsNode = node->getChild(0); - TR::Node* rhsNode = node->getChild(1); - - TR::Register* lhsReg = cg->gprClobberEvaluate(lhsNode); - TR::Register* rhsReg = cg->evaluate(rhsNode); + TR::Register * operand1 = cg->gprClobberEvaluate(node->getChild(0)); + TR::Register * operand2 = cg->evaluate(node->getChild(1)); TR::LabelSymbol* cFlowRegionStart = generateLabelSymbol(cg); - TR::LabelSymbol* cFlowRegionEnd = generateLabelSymbol(cg); - generateS390LabelInstruction(cg, TR::InstOpCode::label, node, cFlowRegionStart); cFlowRegionStart->setStartInternalControlFlow(); - generateRREInstruction(cg, compareRROp, node, lhsReg, rhsReg); - generateS390BranchInstruction(cg, TR::InstOpCode::BRC, branchCond, node, cFlowRegionEnd); - - //Check for NaN operands for float and double - if (node->getOpCode().isFloatingPoint()) + TR::InstOpCode::Mnemonic compareRROp = TR::InstOpCode::NOP; + TR::InstOpCode::S390BranchCondition returnFirstArgCond = TR::InstOpCode::COND_NOP; + bool isMaxOp = node->getOpCode().isMax(); + bool isFloatingPointOp = node->getOpCode().isFloatingPoint(); + if (isFloatingPointOp) { + compareRROp = node->getOpCode().isDouble() ? TR::InstOpCode::CDBR : TR::InstOpCode::CEBR; + returnFirstArgCond = isMaxOp ? TR::InstOpCode::COND_MASK2 : TR::InstOpCode::COND_MASK4; + } + else + { + TR_ASSERT_FATAL(node->getOpCode().isInt() || node->getOpCode().isLong(), "invalid operand type"); + compareRROp = node->getOpCode().isLong() ? TR::InstOpCode::CGR : TR::InstOpCode::CR; + returnFirstArgCond = isMaxOp ? TR::InstOpCode::COND_BHR : TR::InstOpCode::COND_BLR; + } + + TR::LabelSymbol* cFlowRegionEnd = generateLabelSymbol(cg); + TR::LabelSymbol* swapValues = generateLabelSymbol(cg); + generateRREInstruction(cg, compareRROp, node, operand1, operand2); + generateS390BranchInstruction(cg, TR::InstOpCode::BRC, returnFirstArgCond, node, cFlowRegionEnd); + if (isFloatingPointOp) + { + /** + * Check for NaN operands for float and double + * Support float and double +0/-0 comparisons adhering to IEEE 754 standard + * Checking if operands are equal, then branching to equalRegion, otherwise + * fall through for NaN case handling + */ + TR::LabelSymbol* equalRegion = generateLabelSymbol(cg); + generateS390BranchInstruction(cg, TR::InstOpCode::BRC, TR::InstOpCode::COND_MASK8, node, equalRegion); + // If first operand is NaN, then we are done, otherwise fallthrough to move second operand as result - generateRREInstruction(cg, node->getOpCode().isDouble() ? TR::InstOpCode::LTDBR : TR::InstOpCode::LTEBR, node, lhsReg, lhsReg); - generateS390BranchInstruction(cg, TR::InstOpCode::BRC, TR::InstOpCode::COND_CC3, node, cFlowRegionEnd); + TR::InstOpCode::Mnemonic tdcOpcode = node->getOpCode().isDouble() ? TR::InstOpCode::TCDB : TR::InstOpCode::TCEB; + generateRXEInstruction(cg, tdcOpcode, node, operand1, generateS390MemoryReference(0x00F, cg), 0); // can't use load and test here since it would set the quiet bit + generateS390BranchInstruction(cg, TR::InstOpCode::BRC, TR::InstOpCode::COND_MASK4, node, cFlowRegionEnd); + + // branch to swap label, since either second operand is NaN, or entire satisfies the alternate condition code + generateS390BranchInstruction(cg, TR::InstOpCode::BRC, TR::InstOpCode::COND_BRC, node, swapValues); + + // code for handling +0/-0 comparisons when operands are equal + generateS390LabelInstruction(cg, TR::InstOpCode::label, node, equalRegion); + if (node->getOpCode().isMax()) + { + // For Max calls, checking if first operand is +0, then we are done, otherwise fall through for swap + generateRXEInstruction(cg, tdcOpcode, node, operand1, generateS390MemoryReference(0x800, cg), 0); // operand1 is +0 ? + generateS390BranchInstruction(cg, TR::InstOpCode::BRC, TR::InstOpCode::COND_MASK4, node, cFlowRegionEnd); // it is +0 + } + else if (node->getOpCode().isMin()) + { + // For Min calls, checking if first operand is not +0, then we are done, otherwise fall through for swap + generateRXEInstruction(cg, tdcOpcode, node, operand1, generateS390MemoryReference(0x400, cg), 0); // operand1 is -0 ? + generateS390BranchInstruction(cg, TR::InstOpCode::BRC, TR::InstOpCode::COND_MASK4, node, cFlowRegionEnd); + } } - //Move resulting operand to lhsReg as fallthrough for alternate Condition Code - generateRREInstruction(cg, moveRROp, node, lhsReg, rhsReg); + // Move resulting operand to operand1 as fallthrough for alternate Condition Code + generateS390LabelInstruction(cg, TR::InstOpCode::label, node, swapValues); + TR::InstOpCode::Mnemonic moveRROp = TR::InstOpCode::NOP; + if (isFloatingPointOp) + { + moveRROp = node->getOpCode().isDouble() ? TR::InstOpCode::LDR : TR::InstOpCode::LER; + } + else + { + moveRROp = node->getOpCode().isLong() ? TR::InstOpCode::LGR : TR::InstOpCode::LR; + } + generateRREInstruction(cg, moveRROp, node, operand1, operand2); TR::RegisterDependencyConditions* deps = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(0, 2, cg); - deps->addPostConditionIfNotAlreadyInserted(lhsReg, TR::RealRegister::AssignAny); - deps->addPostConditionIfNotAlreadyInserted(rhsReg, TR::RealRegister::AssignAny); + deps->addPostConditionIfNotAlreadyInserted(operand1, TR::RealRegister::AssignAny); + deps->addPostConditionIfNotAlreadyInserted(operand2, TR::RealRegister::AssignAny); generateS390LabelInstruction(cg, TR::InstOpCode::label, node, cFlowRegionEnd, deps); cFlowRegionEnd->setEndInternalControlFlow(); - node->setRegister(lhsReg); - cg->decReferenceCount(lhsNode); - cg->decReferenceCount(rhsNode); + node->setRegister(operand1); + cg->decReferenceCount(node->getChild(0)); + cg->decReferenceCount(node->getChild(1)); + return operand1; + } + + +TR::Register * +OMR::Z::TreeEvaluator::fpMinMaxVectorHelper(TR::Node *node, TR::CodeGenerator *cg) + { + TR_ASSERT_FATAL(node->getNumChildren() >= 1 && node->getNumChildren() <= 2 && node->getOpCode().isFloatingPoint(), + "node has incorrect number of children or is not floating point type"); + + /* ===================== Allocating Registers ===================== */ - return lhsReg; + TR::Register * argumentSelectorReg = cg->allocateRegister(TR_VRF); + TR::Register * compareResultReg = cg->allocateRegister(TR_VRF); + TR::Register * zeroCheckReg = cg->allocateRegister(TR_VRF); + + /* ====== LD FPR0,16(GPR5) Load a ====== */ + TR::Register * operand1 = cg->fprClobberEvaluate(node->getFirstChild()); + + /* ====== LD FPR2, 0(GPR5) Load b ====== */ + TR::Register * operand2 = cg->evaluate(node->getSecondChild()); + + /* ===================== Generating instructions ===================== */ + + const uint8_t typeMask = node->getOpCode().isDouble() ? 3 : 2; + /* ====== WFTCIDB argumentSelectorReg,operand1,X'F' a == NaN ====== */ + generateVRIeInstruction(cg, TR::InstOpCode::VFTCI, node, argumentSelectorReg, operand1, 0xF, 8, typeMask); + + /* ====== For Max: WFCHE compareResultReg,operand1,operand2 Compare a > b ====== */ + bool isMaxOp = node->getOpCode().isMax(); + if(isMaxOp) + { + generateVRRcInstruction(cg, TR::InstOpCode::VFCH, node, compareResultReg, operand1, operand2, 0, 8, typeMask); + } + /* ====== For Min: WFCHE compareResultReg,operand1,operand2 Compare a < b ====== */ + else + { + generateVRRcInstruction(cg, TR::InstOpCode::VFCH, node, compareResultReg, operand2, operand1, 0, 8, typeMask); + } + + /* ====== VO argumentSelectorReg,argumentSelectorReg,compareResultReg + * VSEL will select first arg when: + * For Max: (a > b) || (a == NaN) + * For Min: (a < b) || (a == NaN) + */ + generateVRRcInstruction(cg, TR::InstOpCode::VO, node, argumentSelectorReg, argumentSelectorReg, compareResultReg, 0, 0, 0); + + /* ====== For Max: WFTCIDB compareResultReg,operand1,X'800' a == +0 ====== */ + if(isMaxOp) + { + generateVRIeInstruction(cg, TR::InstOpCode::VFTCI, node, compareResultReg, operand1, 0x800, 8, typeMask); + } + /* ====== For Min: WFTCIDB compareResultReg,operand1,X'400' a == -0 ====== */ + else + { + generateVRIeInstruction(cg, TR::InstOpCode::VFTCI, node, compareResultReg, operand1, 0x400, 8, typeMask); + } + + /* ====== WFTCIDB zeroCheckReg,operand2,X'C00' b == 0 ====== */ + generateVRIeInstruction(cg, TR::InstOpCode::VFTCI, node, zeroCheckReg, operand2, 0xC00, 8, typeMask); + + /* ====== VN compareResultReg,compareResultReg,zeroCheckReg + * VSEL will select first arg when: + * For Max: (a == +0) && (b == 0) + * For Min: (a == -0) && (b == 0) + */ + generateVRRcInstruction(cg, TR::InstOpCode::VN, node, compareResultReg, compareResultReg, zeroCheckReg, 0, 0, 0); + + /* ====== VO argumentSelectorReg,argumentSelectorReg,compareResultReg + * VSEL will select first arg when: + * For Max: (a > b) || (a == NaN) || ((a == +0) && (b == 0)) + * For Min: (a < b) || (a == NaN) || ((a == -0) && (b == 0)) + * selects second arg otherwise (always selects second arg on (a == b) && (a != 0)) + */ + generateVRRcInstruction(cg, TR::InstOpCode::VO, node, argumentSelectorReg, argumentSelectorReg, compareResultReg, 0, 0, 0); + + /* ====== VSEL operand1,operand1,operand2,argumentSelectorReg ====== */ + generateVRReInstruction(cg, TR::InstOpCode::VSEL, node, operand1, operand1, operand2, argumentSelectorReg); + + /* ===================== Deallocating Registers ===================== */ + cg->stopUsingRegister(operand2); + cg->stopUsingRegister(argumentSelectorReg); + cg->stopUsingRegister(compareResultReg); + cg->stopUsingRegister(zeroCheckReg); + + node->setRegister(operand1); + cg->decReferenceCount(node->getFirstChild()); + cg->decReferenceCount(node->getSecondChild()); + return operand1; } static TR::Register* -imaximinHelper(TR::Node* node, TR::CodeGenerator* cg, TR::InstOpCode::Mnemonic compareRROp, TR::InstOpCode::S390BranchCondition branchCond, TR::InstOpCode::Mnemonic moveRROp) +imaximinHelper(TR::Node* node, TR::CodeGenerator* cg) { + TR::InstOpCode::S390BranchCondition branchCond = node->getOpCode().isMax() ? TR::InstOpCode::COND_BHR : TR::InstOpCode::COND_BLR; if (cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_S390_Z15)) { TR::Node* lhsNode = node->getChild(0); @@ -408,13 +546,14 @@ imaximinHelper(TR::Node* node, TR::CodeGenerator* cg, TR::InstOpCode::Mnemonic c } else { - return xmaxxminHelper(node, cg, compareRROp, branchCond, moveRROp); + return OMR::Z::TreeEvaluator::xmaxxminHelper(node, cg); } } static TR::Register* -lmaxlminHelper(TR::Node* node, TR::CodeGenerator* cg, TR::InstOpCode::Mnemonic compareRROp, TR::InstOpCode::S390BranchCondition branchCond, TR::InstOpCode::Mnemonic moveRROp) +lmaxlminHelper(TR::Node* node, TR::CodeGenerator* cg) { + TR::InstOpCode::S390BranchCondition branchCond = node->getOpCode().isMax() ? TR::InstOpCode::COND_BHR : TR::InstOpCode::COND_BLR; if (cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_S390_Z15)) { TR::Node* lhsNode = node->getChild(0); @@ -454,60 +593,105 @@ lmaxlminHelper(TR::Node* node, TR::CodeGenerator* cg, TR::InstOpCode::Mnemonic c } else { - return xmaxxminHelper(node, cg, compareRROp, branchCond, moveRROp); + return OMR::Z::TreeEvaluator::xmaxxminHelper(node, cg); } } TR::Register* OMR::Z::TreeEvaluator::imaxEvaluator(TR::Node* node, TR::CodeGenerator* cg) { - return imaximinHelper(node, cg, TR::InstOpCode::CR, TR::InstOpCode::COND_BHR, TR::InstOpCode::LR); + return imaximinHelper(node, cg); } TR::Register* OMR::Z::TreeEvaluator::lmaxEvaluator(TR::Node* node, TR::CodeGenerator* cg) { - return lmaxlminHelper(node, cg, TR::InstOpCode::CGR, TR::InstOpCode::COND_BHR, TR::InstOpCode::LGR); + return lmaxlminHelper(node, cg); } TR::Register* OMR::Z::TreeEvaluator::fmaxEvaluator(TR::Node* node, TR::CodeGenerator* cg) { - //Passing COND_MASK10 to check CC if operand 1 is already greater than, or equal to operand 2 - return xmaxxminHelper(node, cg, TR::InstOpCode::CEBR, TR::InstOpCode::COND_MASK10, TR::InstOpCode::LER); + TR::Register * result = NULL; + if (cg->getSupportsVectorRegisters()) + { + cg->generateDebugCounter("z13/simd/floatMax", 1, TR::DebugCounter::Free); + result = OMR::Z::TreeEvaluator::fpMinMaxVectorHelper(node, cg); + } + else + { + result = OMR::Z::TreeEvaluator::xmaxxminHelper(node, cg); + } + // load and test (result <- result) - sets quiet bit on NaN + generateRREInstruction(cg, TR::InstOpCode::LTEBR, node, result, result); + return result; } TR::Register* OMR::Z::TreeEvaluator::dmaxEvaluator(TR::Node* node, TR::CodeGenerator* cg) { - //Passing COND_MASK10 to check CC if operand 1 is already greater than, or equal to operand 2 - return xmaxxminHelper(node, cg, TR::InstOpCode::CDBR, TR::InstOpCode::COND_MASK10, TR::InstOpCode::LDR); + TR::Register * result = NULL; + if (cg->getSupportsVectorRegisters()) + { + cg->generateDebugCounter("z13/simd/doubleMax", 1, TR::DebugCounter::Free); + result = OMR::Z::TreeEvaluator::fpMinMaxVectorHelper(node, cg); + } + else + { + result = OMR::Z::TreeEvaluator::xmaxxminHelper(node, cg); + } + // load and test (result <- result) - sets quiet bit on NaN + generateRREInstruction(cg, TR::InstOpCode::LTDBR, node, result, result); + return result; } TR::Register * OMR::Z::TreeEvaluator::iminEvaluator(TR::Node *node, TR::CodeGenerator *cg) { - return imaximinHelper(node, cg, TR::InstOpCode::CR, TR::InstOpCode::COND_BLR, TR::InstOpCode::LR); + return imaximinHelper(node, cg); } TR::Register * OMR::Z::TreeEvaluator::lminEvaluator(TR::Node *node, TR::CodeGenerator *cg) { - return lmaxlminHelper(node, cg, TR::InstOpCode::CGR, TR::InstOpCode::COND_BLR, TR::InstOpCode::LGR); + return lmaxlminHelper(node, cg); } TR::Register* OMR::Z::TreeEvaluator::fminEvaluator(TR::Node* node, TR::CodeGenerator* cg) { - //Passing COND_MASK12 to check CC if operand 1 is already less than, or equal to operand 2 - return xmaxxminHelper(node, cg, TR::InstOpCode::CEBR, TR::InstOpCode::COND_MASK12, TR::InstOpCode::LER); + TR::Register * reg = NULL; + TR::Register * result = NULL; + if (cg->getSupportsVectorRegisters()) + { + cg->generateDebugCounter("z13/simd/floatMin", 1, TR::DebugCounter::Free); + result = OMR::Z::TreeEvaluator::fpMinMaxVectorHelper(node, cg); + } + else + { + result = OMR::Z::TreeEvaluator::xmaxxminHelper(node, cg); + } + // load and test (result <- result) - sets quiet bit on NaN + generateRREInstruction(cg, TR::InstOpCode::LTEBR, node, result, result); + return result; } TR::Register* OMR::Z::TreeEvaluator::dminEvaluator(TR::Node* node, TR::CodeGenerator* cg) { - //Passing COND_MASK12 to check CC if operand 1 is already less than, or equal to operand 2 - return xmaxxminHelper(node, cg, TR::InstOpCode::CDBR, TR::InstOpCode::COND_MASK12, TR::InstOpCode::LDR); + TR::Register * result = NULL; + if (cg->getSupportsVectorRegisters()) + { + cg->generateDebugCounter("z13/simd/doubleMin", 1, TR::DebugCounter::Free); + result = OMR::Z::TreeEvaluator::fpMinMaxVectorHelper(node, cg); + } + else + { + result = OMR::Z::TreeEvaluator::xmaxxminHelper(node, cg); + } + // load and test (result <- result) - sets quiet bit on NaN + generateRREInstruction(cg, TR::InstOpCode::LTDBR, node, result, result); + return result; } /** diff --git a/compiler/z/codegen/OMRTreeEvaluator.hpp b/compiler/z/codegen/OMRTreeEvaluator.hpp index 7b9ad48db93..52c7aaf58fc 100644 --- a/compiler/z/codegen/OMRTreeEvaluator.hpp +++ b/compiler/z/codegen/OMRTreeEvaluator.hpp @@ -254,6 +254,42 @@ class OMR_EXTENSIBLE TreeEvaluator: public OMR::TreeEvaluator static TR::Register *MethodExitHookEvaluator(TR::Node *node, TR::CodeGenerator *cg); static TR::Register *PassThroughEvaluator(TR::Node *node, TR::CodeGenerator *cg); + /** \brief + * This is a helper function used for floating point max/min operations + * when SIMD instructions are available. +0.0 compares as strictly + * greater than -0.0, and NaNs are returned unchanged if given + * (the quiet bit is not set for Signalling NaNs). + * Generates vector instructions. + * + * \param node + * The node to evaluate + * + * \param cg + * The code generator used to generate the instructions + * + * \return + * The register containing the result of the evaluation + * + */ + static TR::Register *fpMinMaxVectorHelper(TR::Node *node, TR::CodeGenerator *cg); + + /** \brief + * This is a helper function used for integral and floating point max/min operations. + * For floating points, +0.0 compares as strictly greater than -0.0, and NaNs are + * returned unchanged if given (the quiet bit is not set for Signalling NaNs). + * + * \param node + * The node to evaluate + * + * \param cg + * The code generator used to generate the instructions + * + * \return + * The register containing the result of the evaluation + * + */ + static TR::Register *xmaxxminHelper(TR::Node *node, TR::CodeGenerator *cg); + // mask evaluators static TR::Register *mAnyTrueEvaluator(TR::Node *node, TR::CodeGenerator *cg); static TR::Register *mAllTrueEvaluator(TR::Node *node, TR::CodeGenerator *cg);