Skip to content

Commit

Permalink
HPCC-30906 Ensure thormanager exception relayed to agent
Browse files Browse the repository at this point in the history
Ensure exceptions that cause the manager to exit are relayed
through to the agent.

Signed-off-by: Jake Smith <[email protected]>
  • Loading branch information
jakesmith committed Nov 22, 2023
1 parent 0078472 commit 2118784
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 26 deletions.
50 changes: 26 additions & 24 deletions thorlcr/master/thgraphmanager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,28 @@

static const StatisticsMapping podStatistics({StNumPods});


/**
 * Record an exception against a workunit and move the workunit out of its
 * current state.
 *
 * A workunit exception (source "thormasterexception", severity
 * SeverityInformation) is created carrying the message and code of
 * @p exception. The workunit state is then set to WUStateWait if it was
 * WUStateRunning, otherwise to WUStateFailed.
 *
 * Nothing is done if the workunit is already in WUStateWait, which is taken
 * to mean an exception has already been relayed.
 *
 * @param workunit   Workunit to update; it is locked for the duration of the update.
 * @param exception  Exception whose message and error code are copied to the workunit.
 */
void relayWuidException(IConstWorkUnit *workunit, const IException *exception)
{
    // Unlocked check first: if already in wait state, an exception has already been relayed
    if (WUStateWait == workunit->getState())
        return;

    Owned<IWorkUnit> wu = &workunit->lock();

    // Re-read the state through the locked workunit. The value read before
    // taking the lock may be stale if another caller relayed an exception in
    // the meantime; re-testing that stale value would make this check a no-op.
    WUState state = wu->getState();
    if (WUStateWait == state)
        return;

    Owned<IWUException> we = wu->createException();
    we->setSeverity(SeverityInformation);
    StringBuffer errStr;
    exception->errorMessage(errStr);
    we->setExceptionMessage(errStr);
    we->setExceptionSource("thormasterexception");
    we->setExceptionCode(exception->errorCode());

    WUState newState = (WUStateRunning == state) ? WUStateWait : WUStateFailed;
    wu->setState(newState);
}

class CJobManager : public CSimpleInterface, implements IJobManager, implements IExceptionHandler
{
bool stopped, handlingConversation;
Expand Down Expand Up @@ -934,16 +956,7 @@ void CJobManager::reply(IConstWorkUnit *workunit, const char *wuid, IException *
if (!exitException)
{
exitException.setown(e);
Owned<IWorkUnit> w = &workunit->lock();
Owned<IWUException> we = w->createException();
we->setSeverity(SeverityInformation);
StringBuffer errStr;
e->errorMessage(errStr);
we->setExceptionMessage(errStr);
we->setExceptionSource("thormasterexception");
we->setExceptionCode(e->errorCode());
WUState newState = (WUStateRunning == w->getState()) ? WUStateWait : WUStateFailed;
w->setState(newState);
relayWuidException(workunit, e);
}
return;
}
Expand Down Expand Up @@ -1413,28 +1426,17 @@ void thorMain(ILogMsgHandler *logHandler, const char *wuid, const char *graphNam
SocketEndpoint dummyAgentEp;
jobManager->execute(workunit, currentWuid, currentGraphName, dummyAgentEp);
IException *e = jobManager->queryExitException();
Owned<IWorkUnit> w = &workunit->lock();
WUState newState = (WUStateRunning == w->getState()) ? WUStateWait : WUStateFailed;
if (e)
{
if (WUStateWait != w->getState()) // if set already, CJobManager::reply may have already set to WUStateWait
{
Owned<IWUException> we = w->createException();
we->setSeverity(SeverityInformation);
StringBuffer errStr;
e->errorMessage(errStr);
we->setExceptionMessage(errStr);
we->setExceptionSource("thormasterexception");
we->setExceptionCode(e->errorCode());

w->setState(newState);
}
// NB: exitException has already been relayed.
break;
}

Owned<IWorkUnit> w = &workunit->lock();
if (!multiJobLinger && lingerPeriod)
w->setDebugValue(instance, "1", true);

WUState newState = (WUStateRunning == w->getState()) ? WUStateWait : WUStateFailed;
w->setState(newState);
}
currentGraphName.clear();
Expand Down
3 changes: 2 additions & 1 deletion thorlcr/master/thgraphmanager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,15 @@ CSDSServerStatus &openThorServerStatus();
void closeThorServerStatus();
void thorMain(ILogMsgHandler *logHandler, const char *workunit, const char *graphName);

enum ThorExitCodes { TEC_Clean, TEC_CtrlC, TEC_Idle, TEC_Watchdog, TEC_SlaveInit, TEC_Swap, TEC_DaliDown };
// Exit codes passed to setExitCode(). TEC_Exception is set by main() in
// thmastermain.cpp when it is exiting because an exception was caught.
enum ThorExitCodes { TEC_Clean, TEC_CtrlC, TEC_Idle, TEC_Watchdog, TEC_SlaveInit, TEC_Swap, TEC_DaliDown, TEC_Exception };

void abortThor(IException *e, unsigned errCode, bool abortCurrentJob=true);
void setExitCode(int code);
int queryExitCode();

void addConnectedWorkerPod(const char *podName);
void publishPodNames(IWorkUnit *workunit);
void relayWuidException(IConstWorkUnit *wu, const IException *exception);


#endif
8 changes: 7 additions & 1 deletion thorlcr/master/thmastermain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1146,6 +1146,12 @@ int main( int argc, const char *argv[] )
{
try
{
if (exception)
{
Owned<IWorkUnitFactory> factory = getWorkUnitFactory();
Owned<IConstWorkUnit> wu = factory->openWorkUnit(workunit);
relayWuidException(wu, exception);
}
k8s::KeepJobs keepJob = k8s::translateKeepJobs(globals->queryProp("@keepJobs"));
switch (keepJob)
{
Expand Down Expand Up @@ -1180,7 +1186,7 @@ int main( int argc, const char *argv[] )
}
}
}
setExitCode(0);
setExitCode(exception ? TEC_Exception : 0);
}

// cleanup handler to be sure we end
Expand Down

0 comments on commit 2118784

Please sign in to comment.