HPCC-32810 Add Thor mechanism to capture stacks
Via a workunit "Action", trigger the capture of debug info on
the manager and workers, and associate it with the workunit.

Signed-off-by: Jake Smith <[email protected]>
jakesmith committed Oct 17, 2024
1 parent 7bf4acc commit 21c7c0c
Showing 22 changed files with 274 additions and 74 deletions.
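For orientation, a minimal sketch of how the new action might be set on a workunit, assuming the usual IWorkUnitFactory/IWorkUnit update flow from common/workunit/workunit.hpp; the helper function and its name are illustrative, not part of this commit:

// Hedged sketch: request debug-info capture by setting the new workunit
// action. getWorkUnitFactory()/updateWorkUnit() are assumed from the
// existing workunit API; only WUActionGenerateDebugInfo is added here.
#include "workunit.hpp"

void requestDebugInfo(const char *wuid)
{
    Owned<IWorkUnitFactory> factory = getWorkUnitFactory();
    Owned<IWorkUnit> wu = factory->updateWorkUnit(wuid);
    if (wu)
        wu->setAction(WUActionGenerateDebugInfo); // serialized as "debuginfo"
}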
1 change: 1 addition & 0 deletions common/workunit/workunit.cpp
@@ -3673,6 +3673,7 @@ EnumMapping actions[] = {
{ WUActionPause, "pause" },
{ WUActionPauseNow, "pausenow" },
{ WUActionResume, "resume" },
{ WUActionGenerateDebugInfo, "debuginfo" },
{ WUActionSize, NULL },
};
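The actions[] table maps each WUAction to the string persisted in the workunit. A self-contained illustration of the lookup pattern such a table supports (plain C++, standing in for jlib's real EnumMapping helpers, which differ in detail):

// Sketch of the EnumMapping lookup pattern; the sentinel entry has a
// null string, matching the WUActionSize/NULL terminator above.
#include <cstdio>
#include <cstring>

struct EnumMapping { int val; const char *str; };

enum WUAction { WUActionUnknown = 0, WUActionGenerateDebugInfo = 9 };

static const EnumMapping actionMap[] = {
    { WUActionGenerateDebugInfo, "debuginfo" },
    { WUActionUnknown, nullptr },   // sentinel terminates the table
};

static int lookupAction(const char *name)
{
    for (const EnumMapping *m = actionMap; m->str; m++)
        if (0 == strcmp(m->str, name))
            return m->val;
    return WUActionUnknown;
}

int main()
{
    printf("%d\n", lookupAction("debuginfo")); // prints 9
}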

3 changes: 2 additions & 1 deletion common/workunit/workunit.hpp
@@ -152,7 +152,8 @@ enum WUAction
WUActionPause = 5,
WUActionPauseNow = 6,
WUActionResume = 7,
WUActionSize = 8
WUActionSize = 8,
WUActionGenerateDebugInfo = 9,
};


3 changes: 2 additions & 1 deletion esp/services/ws_workunits/ws_workunitsHelpers.cpp
@@ -2704,7 +2704,8 @@ bool WsWuInfo::validateWUAssociatedFile(const char* file, WUFileType type)
//which contains Post Mortem files.
Owned<IFile> postMortemFile = createIFile(name.str());
validatePostMortemFile(postMortemFile, file, validated);
return validated;
if (validated)
return true;
}

if (strieq(file, name.str()))
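The change above replaces an unconditional return of `validated` with a conditional return, so a failed post-mortem match now falls through to the direct filename comparison below. The corrected control flow in isolation (simplified sketch; postMortemMatches is a hypothetical stand-in for validatePostMortemFile):

// Simplified shape of validateWUAssociatedFile after this fix: a failed
// post-mortem check no longer short-circuits the later name comparison.
#include <strings.h> // strcasecmp (POSIX)

static bool postMortemMatches(const char *file, const char *name)
{
    return false; // stand-in: pretend the post-mortem check failed
}

bool validateFile(const char *file, const char *name, bool isPostMortem)
{
    if (isPostMortem)
    {
        if (postMortemMatches(file, name))
            return true;
        // previously: return validated; -- which skipped the check below
    }
    return 0 == strcasecmp(file, name); // strieq equivalent
}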
2 changes: 1 addition & 1 deletion helm/hpcc/templates/eclagent.yaml
@@ -88,7 +88,7 @@ data:
{{- $appCmd := printf "%s %s %s _HPCC_ARGS_" $apptype (include "hpcc.configArg" .me) (include "hpcc.daliArg" (dict "root" .root "component" "ECL Agent" "optional" false )) }}
{{ include "hpcc.addCommandAndLifecycle" (. | merge (dict "command" $appCmd)) | indent 12 }}
env:
{{- include "hpcc.mergeEnvironments" (dict "env" .env "defaultArenas" 1) | nindent 12 }}
{{- include "hpcc.mergeEnvironments" (dict "env" (append (.env | default list) (dict "name" "MY_JOB_NAME" "value" "_HPCC_JOBNAME_")) "defaultArenas" 1) | nindent 12 }}
{{- include "hpcc.generateImageEnv" . | nindent 12 }}
workingDir: /var/lib/HPCCSystems
volumeMounts:
2 changes: 1 addition & 1 deletion helm/hpcc/templates/eclccserver.yaml
@@ -88,7 +88,7 @@ data:
{{- $eclccserverCmd := printf "eclccserver %s %s _HPCC_ARGS_" (include "hpcc.configArg" .me) (include "hpcc.daliArg" (dict "root" .root "component" "ECLCC Server" "optional" false)) }}
{{ include "hpcc.addCommandAndLifecycle" (. | merge (dict "command" $eclccserverCmd)) | indent 12 }}
env:
{{- include "hpcc.mergeEnvironments" (dict "env" .env "defaultArenas" 1) | nindent 12 }}
{{- include "hpcc.mergeEnvironments" (dict "env" (append (.env | default list) (dict "name" "MY_JOB_NAME" "value" "_HPCC_JOBNAME_")) "defaultArenas" 1) | nindent 12 }}
{{- include "hpcc.generateImageEnv" . | nindent 12 }}
workingDir: /tmp
volumeMounts:
6 changes: 3 additions & 3 deletions helm/hpcc/templates/thor.yaml
@@ -115,7 +115,7 @@ data:
{{- $agentCmd := printf "%s %s %s _HPCC_ARGS_" $eclAgentType (include "hpcc.configArg" .me) (include "hpcc.daliArg" (dict "root" .root "component" "Thor" "optional" false)) }}
{{ include "hpcc.addCommandAndLifecycle" (. | merge (dict "command" $agentCmd)) | indent 12 }}
env:
{{- include "hpcc.mergeEnvironments" (dict "env" .env "defaultArenas" 1) | nindent 12 }}
{{- include "hpcc.mergeEnvironments" (dict "env" (append (.env | default list) (dict "name" "MY_JOB_NAME" "value" "_HPCC_JOBNAME_")) "defaultArenas" 1) | nindent 12 }}
{{- include "hpcc.generateImageEnv" . | nindent 12 }}
workingDir: /var/lib/HPCCSystems
volumeMounts:
@@ -183,7 +183,7 @@ data:
{{- $thorManagerCmd := printf "thormaster_lcr %s %s _HPCC_ARGS_" (include "hpcc.configArg" .me) (include "hpcc.daliArg" (dict "root" .root "component" "Thor" "optional" false)) }}
{{ include "hpcc.addCommandAndLifecycle" (. | merge (dict "command" $thorManagerCmd)) | indent 12 }}
env:
{{- include "hpcc.mergeEnvironments" (dict "env" .env "defaultArenas" 2) | nindent 12 }}
{{- include "hpcc.mergeEnvironments" (dict "env" (append (.env | default list) (dict "name" "MY_JOB_NAME" "value" "_HPCC_JOBNAME_")) "defaultArenas" 2) | nindent 12 }}
{{- include "hpcc.generateImageEnv" . | nindent 12 }}
workingDir: /var/lib/HPCCSystems
volumeMounts:
@@ -253,7 +253,7 @@ data:
{{- $thorWorkerCmd := printf "thorslave_lcr %s %s _HPCC_ARGS_ --slaveport=%d" (include "hpcc.configArg" $configCtx.me) (include "hpcc.daliArg" (dict "root" $configCtx.root "component" "Thor" "optional" false)) $slavePort }}
{{ include "hpcc.addCommandAndLifecycle" ($configCtx | merge (dict "command" $thorWorkerCmd)) | indent 12 }}
env:
{{- $env := append ($configCtx.me.env | default list) (dict "name" "MY_CONTAINER_NAME" "value" (printf "%s-%d" $thorWorkerJobName $containerNum)) }}
{{- $env := concat ($configCtx.me.env | default list) (list (dict "name" "MY_JOB_NAME" "value" "_HPCC_JOBNAME_") (dict "name" "MY_CONTAINER_NAME" "value" (printf "%s-%d" $thorWorkerJobName $containerNum))) }}
{{- include "hpcc.mergeEnvironments" (dict "env" $env "defaultArenas" 8) | nindent 12 }}
{{- include "hpcc.generateImageEnv" $configCtx | nindent 12 }}
workingDir: /var/lib/HPCCSystems
7 changes: 4 additions & 3 deletions initfiles/CMakeLists.txt
@@ -39,10 +39,11 @@ if ( PLATFORM AND UNIX )
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/bash-vars.in" "${CMAKE_BINARY_DIR}/bash-vars")
set(bash-vars "${CMAKE_BINARY_DIR}/bash-vars")

configure_file("${CMAKE_CURRENT_SOURCE_DIR}/bin/.gdbinit.in" "${CMAKE_BINARY_DIR}/bin/.gdbinit" @ONLY)
install ( FILES "${CMAKE_BINARY_DIR}/bin/.gdbinit" DESTINATION ${EXEC_DIR} COMPONENT Runtime )
install ( FILES bin/post-mortem-gdb DESTINATION ${EXEC_DIR} COMPONENT Runtime )
install ( FILES lib/libjlib.so-gdb.py DESTINATION ${LIB_DIR} COMPONENT Runtime )
if ( CONTAINERIZED )
install ( FILES bin/.gdbinit DESTINATION ${EXEC_DIR} COMPONENT Runtime )
install ( FILES bin/post-mortem-gdb DESTINATION ${EXEC_DIR} COMPONENT Runtime )
install ( FILES lib/libjlib.so-gdb.py DESTINATION ${LIB_DIR} COMPONENT Runtime )
install ( PROGRAMS bin/k8s_postjob_clearup.sh DESTINATION ${EXEC_DIR} COMPONENT Runtime )
install ( PROGRAMS bin/check_executes DESTINATION ${EXEC_DIR} COMPONENT Runtime )
else ()
3 changes: 0 additions & 3 deletions initfiles/bin/.gdbinit

This file was deleted.

3 changes: 3 additions & 0 deletions initfiles/bin/.gdbinit.in
@@ -0,0 +1,3 @@
# These commands will be executed by gdb on startup
add-auto-load-safe-path @DESTDIR@/opt/HPCCSystems/lib/libjlib.so-gdb.py
set print object 1
2 changes: 1 addition & 1 deletion initfiles/lib/libjlib.so-gdb.py
@@ -165,7 +165,7 @@ def invoke (self, arg, from_tty):
ignoreVars = set(['statsMetaData', 'roAttributes', 'roAttributeValues', 'RandomMain'])
ignorematch = re.compile(" StatisticsMapping ")
varmatch = re.compile("[^a-zA-Z_0-9:]([a-zA-Z_][a-z0-9_A-Z:]*)(\\[.*])?;$")
goodfilematch = re.compile("^File /hpcc-dev/HPCC-Platform/(.*[.]cpp):$")
goodfilematch = re.compile("^File /.*/HPCC-Platform/(.*[.]cpp):$")
filematch = re.compile("^File (.*):$")
infile = None
file_written = False
10 changes: 9 additions & 1 deletion system/jlib/jcontainerized.cpp
@@ -20,7 +20,7 @@

namespace k8s {

static StringBuffer myPodName, myContainerName;
static StringBuffer myPodName, myContainerName, myJobName;

const char *queryMyPodName()
{
@@ -32,6 +32,11 @@ const char *queryMyContainerName()
return myContainerName;
}

const char *queryMyJobName()
{
return myJobName;
}

KeepJobs translateKeepJobs(const char *keepJob)
{
if (!isEmptyString(keepJob)) // common case
@@ -508,6 +513,9 @@ MODULE_INIT(INIT_PRIORITY_STANDARD)
getEnvVar("MY_CONTAINER_NAME", myContainerName.clear());
if (myContainerName.isEmpty())
myContainerName.set(myPodName); // if identical (standard case), not set by templates
getEnvVar("MY_JOB_NAME", myJobName.clear()); // only k8s jobs will have this set
if (myJobName.isEmpty())
myJobName.set(myPodName); // if no explicit job name (not a k8s job) then use pod name
};
if (isContainerized())
podInfoInitCBId = installConfigUpdateHook(updateFunc, true);
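This consumes the MY_JOB_NAME variable injected by the Helm template changes above; only k8s jobs define it, so everything else falls back to the pod name. The fallback chain as a standalone sketch (plain C++; MY_POD_NAME handling is assumed from the surrounding jcontainerized code):

// Standalone sketch of the job-name fallback used in MODULE_INIT above.
#include <cstdlib>
#include <string>

std::string resolveJobName()
{
    const char *job = std::getenv("MY_JOB_NAME"); // set only for k8s jobs
    if (job && *job)
        return job;
    const char *pod = std::getenv("MY_POD_NAME"); // standard pod case
    return pod ? pod : "";
}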
1 change: 1 addition & 0 deletions system/jlib/jcontainerized.hpp
@@ -28,6 +28,7 @@ namespace k8s {
jlib_decl std::vector<std::vector<std::string>> getPodNodes(const char *selector);
jlib_decl const char *queryMyPodName();
jlib_decl const char *queryMyContainerName();
jlib_decl const char *queryMyJobName();

enum class KeepJobs { none, podfailures, all };
jlib_decl KeepJobs translateKeepJobs(const char *keepJobs);
4 changes: 3 additions & 1 deletion system/jlib/jexcept.cpp
@@ -1815,7 +1815,9 @@ bool getDebuggerGetStacksCmd(StringBuffer &output)
output.append("Unable to capture stacks");
return false;
}
return output.appendf("gdb --batch -n -ex 'thread apply all bt' %s %u", exePath, GetCurrentProcessId());

output.appendf("gdb --batch -ix %s/.gdbinit -x %s/post-mortem-gdb %s %u", hpccBuildInfo.execDir, hpccBuildInfo.execDir, exePath, GetCurrentProcessId());
return true;
}

bool getAllStacks(StringBuffer &output)
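getDebuggerGetStacksCmd() now only composes the gdb command line; a caller such as getAllStacks() must run it and collect stdout. A hedged sketch of one way to do that with POSIX popen (jlib's actual implementation may use its own process helpers):

// Run a command and append its stdout to `out`; returns false on failure.
#include <cstdio>
#include <string>

bool runAndCapture(const std::string &cmd, std::string &out)
{
    FILE *p = popen(cmd.c_str(), "r"); // POSIX
    if (!p)
        return false;
    char buf[4096];
    size_t n;
    while ((n = fread(buf, 1, sizeof(buf), p)) > 0)
        out.append(buf, n);
    return 0 == pclose(p); // non-zero exit status means gdb failed
}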
10 changes: 0 additions & 10 deletions thorlcr/graph/thgraph.cpp
@@ -2810,16 +2810,6 @@ void CJobBase::startJob()
keyFileCacheLimit = (querySlaves()+1)*2;
setKeyIndexCacheSize(keyFileCacheLimit);
PROGLOG("Key file cache size set to: %d", keyFileCacheLimit);
if (getOptBool("dumpStacks")) // mainly as an example of printAllStacks() usage
{
StringBuffer output;
if (getAllStacks(output))
{
IERRLOG("%s", output.str());
}
else
IWARNLOG("Failed to capture process stacks: %s", output.str());
}

// NB: these defaults match defaults in jfile rename retry mechanism
constexpr unsigned defaultNumRenameRetries = 4;
152 changes: 138 additions & 14 deletions thorlcr/graph/thgraphmaster.cpp
@@ -20,10 +20,11 @@
#include <future>
#include <vector>
#include <iterator>
#include "jprop.hpp"
#include "jcontainerized.hpp"
#include "jexcept.hpp"
#include "jiter.ipp"
#include "jlzw.hpp"
#include "jprop.hpp"
#include "jsocket.hpp"
#include "jset.hpp"
#include "jsort.hpp"
@@ -1892,26 +1893,110 @@ bool CJobMaster::go()
if (flags & SubscribeOptionAction)
{
job.markWuDirty();
bool abort = false;
bool pause = false;
wu.forceReload();
WUAction action = wu.getAction();
if (action==WUActionPause)
if ((WUActionPause==action) || (WUActionPauseNow==action))
{
// pause after current subgraph
pause = true;
}
else if (action==WUActionPauseNow)
{
// abort current subgraph
abort = true;
pause = true;
}
if (pause)
{
bool abort = (action==WUActionPauseNow); // abort current subgraph
PROGLOG("Pausing job%s", abort?" [now]":"");
job.pause(abort);
}
else if (action==WUActionGenerateDebugInfo)
{
StringBuffer dir;
if (!getConfigurationDirectory(globals->queryPropTree("Directories"), "debug", "thor", globals->queryProp("@name"), dir))
{
if (!isContainerized())
{
appendCurrentDirectory(dir, false);
addPathSepChar(dir);
dir.append("debuginfo"); // use ./debuginfo in non-containerized mode
}
else
{
IWARNLOG("Failed to get debug directory");
return;
}
}
addPathSepChar(dir);
dir.append(job.queryWuid());
if (isContainerized())
{
addPathSepChar(dir);
dir.append(k8s::queryMyJobName());
}
addPathSepChar(dir);
CDateTime now;
now.setNow();
unsigned year, month, day, hour, minute, second, nano;
now.getDate(year, month, day);
now.getTime(hour, minute, second, nano);
VStringBuffer dateStr("%04d%02d%02d-%02d%02d%02d", year, month, day, hour, minute, second);
dir.append(dateStr);

auto managerCaptureFunc = [&dir]()
{
return captureDebugInfo(dir, "thormanager", nullptr);
};
std::future<std::vector<std::string>> managerResultsFuture = std::async(std::launch::async, managerCaptureFunc);

std::vector<std::string> capturedFiles;
auto responseFunc = [&capturedFiles](unsigned worker, MemoryBuffer &mb)
{
bool res;
mb.read(res);
if (!res)
{
Owned<IException> e = deserializeException(mb);
VStringBuffer msg("Failed to get stack trace from worker %u", worker);
IWARNLOG(e, msg);
}
StringAttr file;
while (true)
{
mb.read(file);
if (file.isEmpty())
break;
capturedFiles.push_back(file.get());
}
};
VStringBuffer cmd("<debuginfo dir=\"%s\"/>", dir.str());
job.issueWorkerDebugCmd(cmd, 0, responseFunc);
std::vector<std::string> managerResults = managerResultsFuture.get();
capturedFiles.insert(capturedFiles.end(), managerResults.begin(), managerResults.end());

VStringBuffer description("debuginfo-%s", dateStr.str());
if (isContainerized())
{
VStringBuffer archiveFilename("debuginfo-%s.tar.gz", dateStr.str());
VStringBuffer tarCmd("cd %s && tar -czf %s --exclude=%s --remove-files *", dir.str(), archiveFilename.str(), archiveFilename.str());
if (0 != system(tarCmd))
{
OWARNLOG("Failed to create tarball of debuginfo");
return;
}
Owned<IWorkUnit> lw = &wu.lock();
Owned<IWUQuery> query = lw->updateQuery();
VStringBuffer archiveFilePath("%s/%s", dir.str(), archiveFilename.str());
query->addAssociatedFile(FileTypePostMortem, archiveFilePath, "localhost", description, 0, 0, 0);
}
else
{
Owned<IWorkUnit> lw = &wu.lock();
Owned<IWUQuery> query = lw->updateQuery();
for (auto &file: capturedFiles)
{
RemoteFilename rfn;
rfn.setRemotePath(file.c_str());
StringBuffer localPath;
rfn.getLocalPath(localPath);
StringBuffer host;
rfn.queryEndpoint().getEndpointHostText(host);
query->addAssociatedFile(FileTypeLog, localPath, host, description, 0, 0, 0);
}
}
}
}
}
} workunitStateChangeHandler(*this, *workunit);
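The handler above overlaps the manager's own capture with the round-trip to the workers: captureDebugInfo runs on a std::async task while issueWorkerDebugCmd collects worker files, and the two lists are merged afterwards. The concurrency pattern in isolation (standalone sketch; captureLocal and the pushed path stand in for captureDebugInfo and a worker response):

// Standalone sketch of the fan-out/gather pattern used in the handler.
#include <future>
#include <string>
#include <vector>

static std::vector<std::string> captureLocal() // stand-in for captureDebugInfo()
{
    return {"/debug/thormanager.stacks"};
}

std::vector<std::string> collectAll()
{
    // start the local capture without blocking the worker round-trip
    auto managerFuture = std::async(std::launch::async, captureLocal);

    std::vector<std::string> files;
    files.push_back("/debug/thorworker-1.stacks"); // stand-in for responseFunc()

    auto managerFiles = managerFuture.get(); // join the async capture
    files.insert(files.end(), managerFiles.begin(), managerFiles.end());
    return files;
}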
@@ -2089,6 +2174,45 @@ void CJobMaster::pause(bool doAbort)
}
}

void CJobMaster::issueWorkerDebugCmd(const char *rawText, unsigned workerNum, std::function<void(unsigned, MemoryBuffer &mb)> responseFunc)
{
mptag_t replyTag = createReplyTag();
ICommunicator &comm = queryNodeComm();
CMessageBuffer mbuf;
mbuf.append(DebugRequest);
mbuf.append(queryKey());
serializeMPtag(mbuf, replyTag);
mbuf.append(rawText);
rank_t rank = workerNum ? workerNum : RANK_ALL_OTHER; // 0 == all workers
if (!comm.send(mbuf, rank, masterSlaveMpTag, MP_ASYNC_SEND))
{
DBGLOG("Failed to send debug info to slave");
throwUnexpected();
}

rank = workerNum ? workerNum : RANK_ALL;
unsigned numToRecv = workerNum ? 1 : queryNodes();
while (numToRecv)
{
rank_t sender;
mbuf.clear();
unsigned recvTimeoutCount = 0;
while (!comm.recv(mbuf, rank, replyTag, &sender, SHORTTIMEOUT))
{
if (queryAborted())
return;
++recvTimeoutCount;
if (recvTimeoutCount == 10)
throw makeStringExceptionV(0, "Timed out waiting for debugcmd response from worker %u", workerNum);
IWARNLOG("Waiting for debugcmd response from worker %u", workerNum);
}
while (mbuf.remaining())
responseFunc(sender, mbuf);

numToRecv--;
}
}

bool CJobMaster::queryCreatedFile(const char *file)
{
StringBuffer scopedName;
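issueWorkerDebugCmd() above polls the reply tag with a short timeout so an aborted job is noticed promptly, and gives up after ten consecutive timeouts. The retry/abort shape in isolation (standalone sketch; the std::function parameters stand in for comm.recv and queryAborted):

// Standalone sketch of the receive loop in issueWorkerDebugCmd.
#include <functional>
#include <stdexcept>

bool waitForReply(std::function<bool()> tryRecv,  // stand-in for comm.recv(..., SHORTTIMEOUT)
                  std::function<bool()> aborted)  // stand-in for queryAborted()
{
    unsigned timeouts = 0;
    while (!tryRecv())
    {
        if (aborted())
            return false;          // job aborted: stop waiting quietly
        if (++timeouts == 10)
            throw std::runtime_error("timed out waiting for debugcmd response");
        // otherwise log a warning (IWARNLOG in the real code) and retry
    }
    return true;
}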
1 change: 1 addition & 0 deletions thorlcr/graph/thgraphmaster.ipp
@@ -180,6 +180,7 @@ public:
void saveSpills();
bool go();
void pause(bool abort);
void issueWorkerDebugCmd(const char *rawText, unsigned workerNum, std::function<void(unsigned, MemoryBuffer &mb)> responseFunc);

virtual IConstWorkUnit &queryWorkUnit() const
{