Skip to content

Commit

Permalink
HPCC-32504 Recoverable failures on agents not handled properly with acknowledge enabled
Browse files Browse the repository at this point in the history

Signed-off-by: Richard Chapman <[email protected]>
  • Loading branch information
richardkchapman committed Sep 4, 2024
1 parent ae93d7a commit c348e4f
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 6 deletions.
1 change: 1 addition & 0 deletions roxie/ccd/ccd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,7 @@ interface IRoxieQueryPacket : extends IInterface

virtual void noteTimeSent() = 0;
virtual void setAcknowledged() = 0;
// Reset the packet's acknowledged state, so that isAcknowledged() no longer reports it as acknowledged.
virtual void clearAcknowledged() = 0;
virtual bool isAcknowledged() const = 0;
virtual bool resendNeeded(unsigned now) = 0;
};
Expand Down
10 changes: 7 additions & 3 deletions roxie/ccd/ccdqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -646,6 +646,11 @@ class CRoxieQueryPacket : public CRoxieQueryPacketBase, implements IRoxieQueryPa
acknowledged = true;
}

// Mark this packet as not acknowledged: resets the 'acknowledged' flag that isAcknowledged() returns.
// NOTE(review): appears to be intended for the case where a channel reports a recoverable failure
// and the packet stays on the server's pending queue - confirm against the callers.
virtual void clearAcknowledged() override
{
acknowledged = false;
}

virtual bool isAcknowledged() const override
{
return acknowledged;
Expand Down Expand Up @@ -2723,6 +2728,8 @@ class RoxieSocketQueueManager : public RoxieReceiverBase
#endif
Owned<ISerializedRoxieQueryPacket> packet = createSerializedRoxiePacket(mb);
unsigned retries = header.thisChannelRetries(mySubchannel);
if (retries >= SUBCHANNEL_MASK)
return; // I already failed unrecoverably on this request - ignore it
if (acknowledgeAllRequests && (header.activityId & ~ROXIE_PRIORITY_MASK) < ROXIE_ACTIVITY_SPECIAL_FIRST)
{
#ifdef DEBUG
Expand All @@ -2742,9 +2749,6 @@ class RoxieSocketQueueManager : public RoxieReceiverBase
{
// MORE - is this fast enough? By the time I am seeing retries I may already be under load. Could move onto a separate thread
assertex(header.channel); // should never see a retry on channel 0
if (retries >= SUBCHANNEL_MASK)
return; // someone sent a failure or something - ignore it

// Send back an out-of-band immediately, to let Roxie server know that channel is still active
if (!(testAgentFailure & 0x800) && !acknowledgeAllRequests)
{
Expand Down
8 changes: 5 additions & 3 deletions roxie/ccd/ccdserver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4164,11 +4164,11 @@ class CRemoteResultAdaptor : implements IEngineRowStream, implements IFinalRoxie
unsigned now = 0;
if (acknowledgeAllRequests)
{
if (doTrace(traceRoxiePackets))
DBGLOG("Checking %d pending packets for ack status", pending.ordinality());
now = msTick();
if (now-lastRetryCheck < packetAcknowledgeTimeout/4)
return;
if (doTrace(traceRoxiePackets))
DBGLOG("Checking %d pending packets for ack status", pending.ordinality());
lastRetryCheck = now;
}
CriticalBlock b(pendingCrit);
Expand Down Expand Up @@ -5235,7 +5235,9 @@ class CRemoteResultAdaptor : implements IEngineRowStream, implements IFinalRoxie
Owned<IMessageUnpackCursor> exceptionData = mr->getCursor(rowManager);
throwRemoteException(exceptionData);
}
// Leave it on pending queue in original location
// One channel has failed, but should be recoverable
// Leave it on pending queue in original location, but clear acknowledged flag
op->clearAcknowledged();
break;

case ROXIE_ALIVE:
Expand Down

0 comments on commit c348e4f

Please sign in to comment.