Skip to content

Commit

Permalink
Merge pull request lattice#708 from lattice/feature/trace
Browse files Browse the repository at this point in the history
Feature/trace
  • Loading branch information
mathiaswagner authored Jul 9, 2018
2 parents 445cb6b + a771bcb commit 6624045
Show file tree
Hide file tree
Showing 9 changed files with 178 additions and 18 deletions.
20 changes: 20 additions & 0 deletions include/malloc_quda.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,26 @@ namespace quda {
void printPeakMemUsage();
void assertAllMemFree();

/**
@brief Query the peak device memory allocation.
@return peak device memory allocated, in bytes
*/
long device_allocated_peak();

/**
@brief Query the peak pinned memory allocation.
@return peak pinned memory allocated, in bytes
*/
long pinned_allocated_peak();

/**
@brief Query the peak mapped memory allocation.
@return peak mapped memory allocated, in bytes
*/
long mapped_allocated_peak();

/**
@brief Query the peak host memory allocation.
@return peak host memory allocated, in bytes
*/
long host_allocated_peak();

/*
* The following functions should not be called directly. Use the
* macros below instead.
Expand Down
3 changes: 2 additions & 1 deletion lib/dslash_coarse.cu
Original file line number Diff line number Diff line change
Expand Up @@ -624,7 +624,8 @@ namespace quda {
: out(out), inA(inA), inB(inB), Y(Y), X(X), kappa(kappa), parity(parity),
nParity(out.SiteSubset()), nSrc(out.Ndim()==5 ? out.X(4) : 1)
{
strcpy(aux, out.AuxString());
strcpy(aux, "policy_kernel,");
strcat(aux, out.AuxString());
strcat(aux, comm_dim_partitioned_string());

// record the location of where each pack buffer is in [2*dim+dir] ordering
Expand Down
3 changes: 2 additions & 1 deletion lib/dslash_pack.cu
Original file line number Diff line number Diff line change
Expand Up @@ -854,7 +854,8 @@ namespace quda {
unsigned int minThreads() const { return threads(); }

void fillAux() {
strcpy(aux, in->AuxString());
strcpy(aux,"policy_kernel,");
strcat(aux, in->AuxString());
char comm[5];
comm[0] = (commDim[0] ? '1' : '0');
comm[1] = (commDim[1] ? '1' : '0');
Expand Down
14 changes: 7 additions & 7 deletions lib/dslash_quda.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -536,14 +536,14 @@ public:

fillAuxBase();
#ifdef MULTI_GPU
fillAux(INTERIOR_KERNEL, "type=interior");
fillAux(EXTERIOR_KERNEL_ALL, "type=exterior_all");
fillAux(EXTERIOR_KERNEL_X, "type=exterior_x");
fillAux(EXTERIOR_KERNEL_Y, "type=exterior_y");
fillAux(EXTERIOR_KERNEL_Z, "type=exterior_z");
fillAux(EXTERIOR_KERNEL_T, "type=exterior_t");
fillAux(INTERIOR_KERNEL, "policy_kernel=interior");
fillAux(EXTERIOR_KERNEL_ALL, "policy_kernel=exterior_all");
fillAux(EXTERIOR_KERNEL_X, "policy_kernel=exterior_x");
fillAux(EXTERIOR_KERNEL_Y, "policy_kernel=exterior_y");
fillAux(EXTERIOR_KERNEL_Z, "policy_kernel=exterior_z");
fillAux(EXTERIOR_KERNEL_T, "policy_kernel=exterior_t");
#else
fillAux(INTERIOR_KERNEL, "type=single-GPU");
fillAux(INTERIOR_KERNEL, "policy_kernel=single-GPU");
#endif // MULTI_GPU
fillAux(KERNEL_POLICY, "policy");

Expand Down
8 changes: 8 additions & 0 deletions lib/malloc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,14 @@ namespace quda {
static long total_host_bytes, max_total_host_bytes;
static long total_pinned_bytes, max_total_pinned_bytes;

/** @return the DEVICE entry of max_total_bytes (peak device allocation in bytes) */
long device_allocated_peak()
{
  return max_total_bytes[DEVICE];
}

/** @return the PINNED entry of max_total_bytes (peak pinned allocation in bytes) */
long pinned_allocated_peak()
{
  return max_total_bytes[PINNED];
}

/** @return the MAPPED entry of max_total_bytes (peak mapped allocation in bytes) */
long mapped_allocated_peak()
{
  return max_total_bytes[MAPPED];
}

/** @return the HOST entry of max_total_bytes (peak host allocation in bytes) */
long host_allocated_peak()
{
  return max_total_bytes[HOST];
}

static void print_trace (void) {
void *array[10];
size_t size;
Expand Down
7 changes: 5 additions & 2 deletions lib/multi_reduce_core.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ template<typename doubleN, typename ReduceType, typename FloatN, int M, int NXZ,
for (int i=0; i<NXZ; i++) {
for (int j=0; j<arg.NYW; j++) {
result[i*arg.NYW+j] = set(((ReduceType*)getHostReduceBuffer())[j*NXZ+i]);
if (tp.grid.z==2) result[i*arg.NYW+j] = set(((ReduceType*)getHostReduceBuffer())[NXZ*arg.NYW+j*NXZ+i]);
if (tp.grid.z==2) sum(result[i*arg.NYW+j], ((ReduceType*)getHostReduceBuffer())[NXZ*arg.NYW+j*NXZ+i]);
}
}
}
Expand Down Expand Up @@ -278,7 +278,10 @@ public:
strcpy(name, num_to_string<NXZ>::value);
strcat(name, std::to_string(NYW).c_str());
strcat(name, typeid(arg.r).name());
return TuneKey(blasStrings.vol_str, name, blasStrings.aux_tmp);
char aux[TuneKey::aux_n];
strcpy(aux, "policy_kernel,");
strcat(aux,blasStrings.aux_tmp);
return TuneKey(blasStrings.vol_str, name, aux);
}

void apply(const cudaStream_t &stream){
Expand Down
126 changes: 121 additions & 5 deletions lib/tune.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <fstream>
#include <typeinfo>
#include <map>
#include <list>
#include <unistd.h>

#include <deque>
Expand All @@ -28,13 +29,68 @@ quda::TuneKey getLastTuneKey() { return quda::last_key; }
namespace quda {
typedef std::map<TuneKey, TuneParam> map;

/**
 * One entry of the kernel trace: the tune key identifying a launch,
 * the time value supplied for it, and a snapshot of the peak memory
 * allocations taken when the entry is constructed.
 */
struct TraceKey {

  TuneKey key;      // key (volume, name, aux) identifying the launch
  float time = 0.0f; // time value passed in by the caller

  // peak allocations (bytes) sampled at construction time
  long device_bytes = 0;
  long pinned_bytes = 0;
  long mapped_bytes = 0;
  long host_bytes = 0;

  /** Default entry: default-constructed key, zero time and zero byte counts. */
  TraceKey() = default;

  /**
   * @param key  tune key of the launch being recorded
   * @param time time value to store alongside the key
   *
   * The four byte counters are filled from the *_allocated_peak()
   * queries at the moment of construction.
   */
  TraceKey(const TuneKey &key, float time)
    : key(key), time(time),
      device_bytes(device_allocated_peak()),
      pinned_bytes(pinned_allocated_peak()),
      mapped_bytes(mapped_allocated_peak()),
      host_bytes(host_allocated_peak()) { }

  // Member-wise copy is exactly what is wanted, so let the compiler generate it.
  TraceKey(const TraceKey &trace) = default;
  TraceKey& operator=(const TraceKey &trace) = default;
};

// Linked list of trace entries, kept in insertion order.  A TraceKey is
// appended each time a kernel launch is recorded while tracing is
// enabled; serializeTrace() writes the entries out in this order.
static std::list<TraceKey> trace_list;
// Cached result of the QUDA_ENABLE_TRACE lookup performed by traceEnabled()
static bool enable_trace = false;

/**
 * Report whether kernel tracing is switched on.
 *
 * The environment variable QUDA_ENABLE_TRACE is read on the first call
 * only; tracing is enabled when its value is exactly "1".  Every later
 * call returns the cached answer without consulting the environment.
 *
 * @return true if tracing is enabled
 */
bool traceEnabled() {
  static bool queried = false;

  // environment already inspected: hand back the cached flag
  if (queried) return enable_trace;
  queried = true;

  const char *env_value = getenv("QUDA_ENABLE_TRACE");
  if (env_value != nullptr && strcmp(env_value, "1") == 0) enable_trace = true;

  return enable_trace;
}

static const std::string quda_hash = QUDA_HASH; // defined in lib/Makefile
static std::string resource_path;
static map tunecache;
static map::iterator it;
static size_t initial_cache_size = 0;


#define STR_(x) #x
#define STR(x) STR_(x)
static const std::string quda_version = STR(QUDA_VERSION_MAJOR) "." STR(QUDA_VERSION_MINOR) "." STR(QUDA_VERSION_SUBMINOR);
Expand Down Expand Up @@ -179,6 +235,34 @@ namespace quda {
async_out << std::endl << "# Total time spent in asynchronous execution = " << async_total_time << " seconds" << std::endl;
}

/**
* Serialize trace to an ostream, useful for writing to a file or sending to other nodes.
*/
static void serializeTrace(std::ostream &out)
{
for (auto it = trace_list.begin(); it != trace_list.end(); it++) {

TuneKey &key = it->key;

// special case kernel members of a policy
char tmp[14] = { };
strncpy(tmp, key.aux, 13);
bool is_policy_kernel = strcmp(tmp, "policy_kernel") == 0 ? true : false;

out << std::setw(12) << it->time << "\t";
out << std::setw(12) << it->device_bytes << "\t";
out << std::setw(12) << it->pinned_bytes << "\t";
out << std::setw(12) << it->mapped_bytes << "\t";
out << std::setw(12) << it->host_bytes << "\t";
out << std::setw(16) << key.volume << "\t";
if (is_policy_kernel) out << "\t";
out << key.name << "\t";
if (!is_policy_kernel) out << "\t";
out << key.aux << std::endl;

}
}


/**
* Distribute the tunecache from node 0 to all other nodes.
Expand Down Expand Up @@ -389,8 +473,8 @@ namespace quda {
{
time_t now;
int lock_handle;
std::string lock_path, profile_path, async_profile_path;
std::ofstream profile_file, async_profile_file;
std::string lock_path, profile_path, async_profile_path, trace_path;
std::ofstream profile_file, async_profile_file, trace_file;

if (resource_path.empty()) return;

Expand Down Expand Up @@ -419,18 +503,21 @@ namespace quda {
char *profile_fname = getenv("QUDA_PROFILE_OUTPUT_BASE");

if (!profile_fname) {
warningQuda("Environment variable QUDA_PROFILE_OUTPUT_BASE is not set; writing to profile.tsv and profile_async.tsv");
warningQuda("Environment variable QUDA_PROFILE_OUTPUT_BASE not set; writing to profile.tsv and profile_async.tsv");
profile_path = resource_path + "/profile_" + std::to_string(count) + ".tsv";
async_profile_path = resource_path + "/profile_async_" + std::to_string(count) + ".tsv";
if (traceEnabled()) trace_path = resource_path + "/trace_" + std::to_string(count) + ".tsv";
} else {
profile_path = resource_path + "/" + profile_fname + "_" + std::to_string(count) + ".tsv";
async_profile_path = resource_path + "/" + profile_fname + "_" + std::to_string(count) + "_async.tsv";
if (traceEnabled()) trace_path = resource_path + "/" + profile_fname + "_trace_" + std::to_string(count) + ".tsv";
}

count++;

profile_file.open(profile_path.c_str());
async_profile_file.open(async_profile_path.c_str());
if (traceEnabled()) trace_file.open(trace_path.c_str());

if (getVerbosity() >= QUDA_SUMMARIZE) {
// compute number of non-zero entries that will be output in the profile
Expand All @@ -448,6 +535,7 @@ namespace quda {

printfQuda("Saving %d sets of cached parameters to %s\n", n_entry, profile_path.c_str());
printfQuda("Saving %d sets of cached profiles to %s\n", n_policy, async_profile_path.c_str());
if (traceEnabled()) printfQuda("Saving trace list with %lu entries to %s\n", trace_list.size(), trace_path.c_str());
}

time(&now);
Expand Down Expand Up @@ -477,6 +565,24 @@ namespace quda {
profile_file.close();
async_profile_file.close();

if (traceEnabled()) {
trace_file << "trace" << "\t" << quda_version;
#ifdef GITVERSION
trace_file << "\t" << gitversion;
#else
trace_file << "\t" << quda_version;
#endif
trace_file << "\t" << quda_hash << "\t# Last updated " << ctime(&now) << std::endl;

trace_file << std::setw(12) << "time\t" << std::setw(12) << "device-mem\t" << std::setw(12) << "pinned-mem\t";
trace_file << std::setw(12) << "mapped-mem\t" << std::setw(12) << "host-mem\t";
trace_file << std::setw(16) << "volume" << "\tname\taux" << std::endl;

serializeTrace(trace_file);

trace_file.close();
}

// Release lock.
close(lock_handle);
remove(lock_path.c_str());
Expand All @@ -486,7 +592,6 @@ namespace quda {
#endif
}


static TimeProfile launchTimer("tuneLaunch");

// static int tally = 0;
Expand Down Expand Up @@ -550,6 +655,12 @@ namespace quda {
launchTimer.TPSTOP(QUDA_PROFILE_EPILOGUE);
launchTimer.TPSTOP(QUDA_PROFILE_TOTAL);
#endif

if (traceEnabled()) {
TraceKey trace_entry(key, param.time);
trace_list.push_back(trace_entry);
}

return param;
}

Expand Down Expand Up @@ -663,6 +774,11 @@ namespace quda {
}
param = tunecache[key]; // read this now for all processes

if (traceEnabled()) {
TraceKey trace_entry(key, param.time);
trace_list.push_back(trace_entry);
}

} else if (&tunable != active_tunable) {
errorQuda("Unexpected call to tuneLaunch() in %s::apply()", typeid(tunable).name());
}
Expand Down
4 changes: 3 additions & 1 deletion tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,9 @@ endif()


## BLAS test
add_test(NAME blas_test COMMAND blas_test --sdim 16 --tdim 16 --gtest_output=xml:blas_test.xml)

add_test(NAME blas_test_parity COMMAND blas_test --sdim 16 --tdim 16 --solve-type direct-pc --gtest_output=xml:blas_test_parity.xml)
add_test(NAME blas_test_full COMMAND blas_test --sdim 16 --tdim 16 --solve-type direct --gtest_output=xml:blas_test_full.xml)


# loop over Dslash policies
Expand Down
11 changes: 10 additions & 1 deletion tests/blas_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ extern int niter;
extern bool verify_results;
extern int Nsrc;
extern int Msrc;
extern QudaSolveType solve_type;

extern void usage(char** );

Expand Down Expand Up @@ -108,7 +109,15 @@ void initFields(int prec)
param.nDim = 4; // number of spacetime dimensions

param.pad = 0; // padding must be zero for cpu fields
param.siteSubset = QUDA_PARITY_SITE_SUBSET;

if (solve_type == QUDA_DIRECT_PC_SOLVE) {
param.siteSubset = QUDA_PARITY_SITE_SUBSET;
} else if (solve_type == QUDA_DIRECT_SOLVE) {
param.siteSubset = QUDA_FULL_SITE_SUBSET;
} else {
errorQuda("Unexpected solve_type=%d\n", solve_type);
}

if (param.siteSubset == QUDA_PARITY_SITE_SUBSET) param.x[0] = xdim/2;
else param.x[0] = xdim;
param.x[1] = ydim;
Expand Down

0 comments on commit 6624045

Please sign in to comment.