Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Error handling fixes #170

Merged
merged 9 commits into from
Nov 12, 2022
23 changes: 15 additions & 8 deletions src/libraries/JANA/CLI/JSignalHandler.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,9 @@ void send_to_named_pipe(const std::string& path_to_named_pipe, const std::string
close(fd);
}
else {
LOG_ERROR(*g_logger) << "Unable to open named pipe '"
<< g_path_to_named_pipe << "' for writing" << LOG_END;
LOG_WARN(*g_logger) << "Unable to open named pipe '" << g_path_to_named_pipe << "' for writing. \n"
<< " You can use a different named pipe for status info by setting the parameter `jana:status_fname`.\n"
<< " The status report will still show up in the log." << LOG_END;
}
}

Expand All @@ -56,12 +57,12 @@ void produce_thread_report() {
/// If something goes wrong, we want to signal all threads to assemble a report
/// Whereas USR1 is meant to be triggered externally and is caught by one thread,
/// produce_overall_report triggers USR2 and is caught by all threads.
void produce_overall_report() {
std::string produce_overall_report() {
std::stringstream ss;

// Include detailed report from JApplication
auto t = time(nullptr);
ss << "JANA STATUS REPORT: " << ctime(&t) << std::endl;
ss << "JANA status report: " << ctime(&t) << std::endl;
ss << g_app->GetComponentSummary() << std::endl;

// Include backtraces from each individual thread
Expand Down Expand Up @@ -101,9 +102,12 @@ void produce_overall_report() {
else {
ss << "Thread model: unknown" << std::endl;
}
return ss.str();
}

LOG_WARN(*g_logger) << ss.str() << LOG_END;
send_to_named_pipe(g_path_to_named_pipe, ss.str());
/// Entry point for the USR1 status request; handle_usr1 runs this on a detached std::thread.
/// Logs a warning announcing that USR1 was caught and how to view the report
/// (`cat` on the named pipe), then builds the report with produce_overall_report()
/// and hands it to send_to_named_pipe() using the path in g_path_to_named_pipe.
void send_overall_report_to_named_pipe() {
// Announce first, so the hint is logged before the report is assembled and sent.
LOG_WARN(*g_logger) << "Caught USR1 signal! Sending status report to named pipe. `cat " << g_path_to_named_pipe << "` to view." << LOG_END;
// produce_overall_report() returns the report text; it is forwarded unchanged.
send_to_named_pipe(g_path_to_named_pipe, produce_overall_report());
}


Expand All @@ -130,7 +134,7 @@ void handle_sigint(int) {
}

void handle_usr1(int) {
std::thread th(produce_overall_report);
std::thread th(send_overall_report_to_named_pipe);
th.detach();
}

Expand All @@ -139,7 +143,10 @@ void handle_usr2(int) {
}

void handle_sigsegv(int /*signal_number*/, siginfo_t* /*signal_info*/, void* /*context*/) {
produce_overall_report();
LOG_FATAL(*g_logger) << "Segfault detected! Printing backtraces and exiting." << LOG_END;
auto report = produce_overall_report();
LOG_INFO(*g_logger) << report << LOG_END;
exit(static_cast<int>(JApplication::ExitCode::Segfault));
}


Expand Down
4 changes: 2 additions & 2 deletions src/libraries/JANA/Engine/JArrowTopology.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ struct JArrowTopology {
size_t event_processor_chunksize = 1;
size_t location_count = 1;
bool enable_stealing = false;
int affinity = 2;
int locality = 0;
int affinity = 0; // By default, don't pin the CPU at all
int locality = 0; // By default, assume no NUMA domains

JLogger m_logger;

Expand Down
2 changes: 1 addition & 1 deletion src/libraries/JANA/Engine/JTopologyBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class JTopologyBuilder : public JService {
bool m_enable_call_graph_recording = false;
bool m_enable_stealing = false;
bool m_limit_total_events_in_flight = true;
int m_affinity = 2;
int m_affinity = 0;
int m_locality = 0;
JLogger m_arrow_logger;

Expand Down
77 changes: 58 additions & 19 deletions src/libraries/JANA/Utils/JProcessorMapping.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,24 @@
// Subject to the terms in the LICENSE file found in the top-level directory.

#include "JProcessorMapping.h"

#include <JANA/Utils/JTablePrinter.h>
#include <iomanip>
#include <unistd.h>
#include <sys/wait.h>
#include <algorithm>

void JProcessorMapping::initialize(AffinityStrategy affinity, LocalityStrategy locality) {

m_affinity_strategy = affinity;
m_locality_strategy = locality;

if (affinity == AffinityStrategy::None && locality == LocalityStrategy::Global) {
// User doesn't care about NUMA awareness, so we can skip building the processor map completely
m_error_msg = ""; // Denotes "no error" as used by stringifier
return;
}

// Capture lscpu info

int pipe_fd[2]; // We want to pipe lscpu's stdout straight to us
Expand All @@ -30,8 +39,8 @@ void JProcessorMapping::initialize(AffinityStrategy affinity, LocalityStrategy l
dup2(pipe_fd[1], 1); // Redirect stdout to pipe
close(pipe_fd[0]);
close(pipe_fd[1]);
execlp("lscpu", "lscpu", "-b", "-pcpu,core,node,socket", nullptr);
fclose(stdout); // Send an additional EOF so that the parent doesn't hang
execlp("lscpu", "lscpu", "-b", "-pcpu,core,node,socket", (char*) nullptr);
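// Only reached if execlp() fails (e.g. lscpu is not installed or not on PATH); exit nonzero so the parent sees the failure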
exit(-1);
}
else { // We are the parent process
Expand Down Expand Up @@ -67,18 +76,48 @@ void JProcessorMapping::initialize(AffinityStrategy affinity, LocalityStrategy l
}
m_mapping.push_back(row);
}
else {
// On machines with no NUMA domains, lscpu returns "" instead of "0"
int count = sscanf(buffer, "%zu,%zu,,%zu", &row.cpu_id, &row.core_id, &row.socket_id);
row.numa_domain_id = row.socket_id;
if (count == 3) {
switch (m_locality_strategy) {
case LocalityStrategy::CpuLocal: row.location_id = row.cpu_id; break;
case LocalityStrategy::CoreLocal: row.location_id = row.core_id; break;
case LocalityStrategy::NumaDomainLocal: row.location_id = row.numa_domain_id; break;
case LocalityStrategy::SocketLocal: row.location_id = row.socket_id; break;
case LocalityStrategy::Global:
default: row.location_id = 0; break;
}
if (row.location_id >= m_loc_count) {
// Assume all of these ids are zero-indexed and contiguous
m_loc_count = row.location_id + 1;
}
m_mapping.push_back(row);
}

}
}
fclose(infile);
int status = 0;
waitpid(pid, &status, 0); // Wait for child to exit and acknowledge. This prevents child from becoming a zombie.

if (m_mapping.empty()) {
if (!WIFEXITED(status)) {
m_error_msg = "lscpu child process returned abnormally";
return;
}
else if (WEXITSTATUS(status) != 0) {
m_error_msg = "lscpu child process returned with an exit code of " + std::to_string(WEXITSTATUS(status));
return;
}
else if (m_mapping.empty()){
m_error_msg = "Unable to parse lscpu output";
return;
}

// Apply affinity strategy by sorting over sets of columns
switch (m_affinity_strategy) {


case AffinityStrategy::ComputeBound:

std::stable_sort(m_mapping.begin(), m_mapping.end(),
Expand Down Expand Up @@ -126,30 +165,30 @@ std::ostream& operator<<(std::ostream& os, const JProcessorMapping::LocalityStra

std::ostream& operator<<(std::ostream& os, const JProcessorMapping& m) {

os << "NUMA Configuration" << std::endl << std::endl;
os << "NUMA Configuration" << std::endl;
os << " Affinity strategy: " << m.m_affinity_strategy << std::endl;
os << " Locality strategy: " << m.m_locality_strategy << std::endl;
os << " Location count: " << m.m_loc_count << std::endl;
if (m.m_locality_strategy != JProcessorMapping::LocalityStrategy::Global) {
os << " Location count: " << m.m_loc_count << std::endl;
}

if (m.m_initialized) {
os << " +--------+----------+-------+--------+-----------+--------+" << std::endl
<< " | worker | location | cpu | core | numa node | socket |" << std::endl
<< " +--------+----------+-------+--------+-----------+--------+" << std::endl;
JTablePrinter table;
table.AddColumn("worker", JTablePrinter::Justify::Right);
table.AddColumn("location", JTablePrinter::Justify::Right);
table.AddColumn("cpu", JTablePrinter::Justify::Right);
table.AddColumn("core", JTablePrinter::Justify::Right);
table.AddColumn("numa node", JTablePrinter::Justify::Right);
table.AddColumn("socket", JTablePrinter::Justify::Right);

size_t worker_id = 0;
for (const JProcessorMapping::Row& row : m.m_mapping) {
os << " | " << std::right << std::setw(6) << worker_id++;
os << " | " << std::setw(8) << row.location_id;
os << " | " << std::setw(5) << row.cpu_id;
os << " | " << std::setw(6) << row.core_id;
os << " | " << std::setw(9) << row.numa_domain_id;
os << " | " << std::setw(6) << row.socket_id << " |" << std::endl;
table | worker_id++ | row.location_id | row.cpu_id | row.core_id | row.numa_domain_id | row.socket_id;
}

os << " +--------+----------+-------+--------+-----------+--------+" << std::endl;
table.Render(os);
}
else {
os << " ERROR: " << m.m_error_msg << std::endl;
else if (!m.m_error_msg.empty()) {
os << " Error: " << m.m_error_msg << std::endl;
}
return os;
}