forked from pytorch/pytorch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
profiler.h
247 lines (214 loc) · 6.69 KB
/
profiler.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
#pragma once
#include <thread>
#include <iostream>
#include <mutex>
#include <memory>
#include <vector>
#include <cstdint>
#include <string>
#include <list>
#include <sstream>
#include <forward_list>
#include <tuple>
#include <ATen/ATen.h>
#include <torch/csrc/WindowsTorchApiMacro.h>
#ifndef _WIN32
#include <ctime>
#endif
#include <torch/csrc/jit/code_template.h>
// Opaque event handle used by the CUDAStubs interface below: a pointer to
// struct CUevent_st, which is only forward-declared by this typedef and never
// defined in this header.
typedef struct CUevent_st* CUDAEventStub;
namespace torch { namespace autograd {
struct Function;
namespace profiler {
// Virtual interface through which the profiler reaches CUDA functionality.
// Every default implementation except enabled() calls fail(), which invokes
// AT_ERROR with "CUDA used in profiler but not enabled."; enabled() simply
// returns false.
// NOTE(review): presumably a subclass providing the real behavior is installed
// through registerCUDAMethods() -- confirm against the implementation file.
struct TORCH_API CUDAStubs {
  virtual void record(int* device, CUDAEventStub* event, int64_t* cpu_ns) { fail(); }

  virtual float elapsed(CUDAEventStub event, CUDAEventStub event2) {
    fail();
    // Fallback value; only reached if fail() returns.
    return 0.f;
  }

  virtual void nvtxMarkA(const char* name) { fail(); }

  virtual void nvtxRangePushA(const char* name) { fail(); }

  virtual void nvtxRangePop() { fail(); }

  // The only default that does not fail: reports false.
  virtual bool enabled() { return false; }

  virtual void onEachDevice(std::function<void(int)> op) { fail(); }

  virtual void synchronize() { fail(); }

  // Declared here, defined out of line.
  virtual ~CUDAStubs();

private:
  // Shared error path of the default implementations above.
  void fail() { AT_ERROR("CUDA used in profiler but not enabled."); }
};
// Declaration only; the definition is not in this header.
// NOTE(review): presumably makes `stubs` the CUDAStubs implementation used by
// the profiler -- confirm ownership/lifetime expectations at the definition.
TORCH_API void registerCUDAMethods(CUDAStubs* stubs);
// Rounds `a` up to the nearest multiple of `b`; a value that already is a
// multiple of `b` is returned unchanged. `b` must be non-zero because the
// computation divides by it.
constexpr inline size_t ceilToMultiple(size_t a, size_t b) {
  return b * ((a + (b - 1)) / b);
}
#if defined(__MACH__) && !defined(CLOCK_REALTIME)
#include <sys/time.h>
// clock_gettime is not implemented on older versions of OS X (< 10.12).
// If implemented, CLOCK_REALTIME will have already been defined.
#endif
// Returns a timestamp in nanoseconds. The clock depends on the platform:
//  - Windows: std::chrono::high_resolution_clock when it is steady, otherwise
//    std::chrono::steady_clock;
//  - Mach without CLOCK_REALTIME: gettimeofday(), i.e. microsecond resolution
//    scaled to nanoseconds;
//  - everything else: clock_gettime(CLOCK_MONOTONIC).
inline int64_t getTime() {
#ifdef _WIN32
  namespace chr = std::chrono;
  using steady_hr_clock = std::conditional<
      chr::high_resolution_clock::is_steady,
      chr::high_resolution_clock,
      chr::steady_clock>::type;
  const auto since_epoch = steady_hr_clock::now().time_since_epoch();
  return chr::duration_cast<chr::nanoseconds>(since_epoch).count();
#elif defined(__MACH__) && !defined(CLOCK_REALTIME)
  struct timeval tv;
  gettimeofday(&tv, nullptr);
  const int64_t whole_seconds_ns = static_cast<int64_t>(tv.tv_sec) * 1000000000;
  const int64_t fraction_ns = static_cast<int64_t>(tv.tv_usec) * 1000;
  return whole_seconds_ns + fraction_ns;
#else
  // clock_gettime is *much* faster than std::chrono implementation on Linux
  struct timespec ts{};
  clock_gettime(CLOCK_MONOTONIC, &ts);
  const int64_t whole_seconds_ns = static_cast<int64_t>(ts.tv_sec) * 1000000000;
  return whole_seconds_ns + static_cast<int64_t>(ts.tv_nsec);
#endif
}
// Kind of a profiler Event, stored in 16 bits. Event::kind() renders the
// enumerators as "mark", "push" and "pop" respectively.
enum class EventKind : uint16_t {
  Mark = 0,
  PushRange = 1,
  PopRange = 2,
};
// One recorded profiler event. It stores its kind, a name, the id of the
// thread it belongs to, a CPU timestamp in nanoseconds and, optionally, a
// CUDA event handle together with a device index.
struct TORCH_API Event final {
  // Creates an event that owns its name: the string is moved into a
  // heap-allocated std::string and name() points into that copy. Calls
  // record(record_cuda) once the members are set up.
  Event(EventKind kind, std::string name, uint16_t thread_id, bool record_cuda)
  : owned_name_(new std::string(std::move(name)))
  , name_ptr_(owned_name_->c_str())
  , kind_(kind)
  , thread_id_(thread_id) { record(record_cuda); }

  // Creates an event that only borrows `name`: the pointer is stored as-is
  // (owned_name_ stays null), so the characters must remain valid for as long
  // as name() may be used. Calls record(record_cuda) as well.
  Event(EventKind kind, const char* name, uint16_t thread_id, bool record_cuda)
  : name_ptr_(name)
  , kind_(kind)
  , thread_id_(thread_id) { record(record_cuda); }

  // Defined out of line.
  // NOTE(review): presumably fills in cpu_ns_ and, when record_cuda is true,
  // device_ and event -- confirm against the definition.
  void record(bool record_cuda);

  // Human-readable kind: "mark", "push" or "pop".
  // Throws std::runtime_error for a kind_ value outside the enumerators.
  std::string kind() const {
    switch(kind_) {
      case EventKind::Mark: return "mark";
      case EventKind::PushRange: return "push";
      case EventKind::PopRange: return "pop";
    }
    throw std::runtime_error("unknown EventKind");
  }

  // Name given at construction (owned or borrowed, see the constructors).
  const char* name() const {
    return name_ptr_;
  }

  uint16_t thread_id() const {
    return thread_id_;
  }

  // Microseconds from this event to `e`, computed from the CPU timestamps
  // (e.cpu_ns_ - cpu_ns_, scaled from ns to us). The result is negative when
  // `e` carries the smaller timestamp. Const-qualified because it only reads
  // state, which makes it callable on const events.
  double cpu_elapsed_us(const Event & e) const {
    return (e.cpu_ns_ - cpu_ns_)/(1000.0);
  }

  // Defined out of line; the CUDA counterpart of cpu_elapsed_us().
  double cuda_elapsed_us(const Event & e);

  // True when a CUDA event handle is attached to this event.
  bool has_cuda() const {
    return event != nullptr;
  }

  // Device index; -1 unless it has been set.
  int device() const {
    return device_;
  }
private:
  int64_t cpu_ns_ = 0; // signed to allow for negative intervals, initialized for safety.
  // std::string is a very large object (usually around 32B),
  // and this field is used only for user-created ranges, so
  // it's better to save on size of Events.
  std::unique_ptr<std::string> owned_name_;
  // Must stay declared after owned_name_: the owning constructor initializes
  // it from owned_name_->c_str(), and members are initialized in declaration order.
  const char * name_ptr_;
  EventKind kind_;
  uint16_t thread_id_;
  int device_ = -1;
  struct CUevent_st* event = nullptr;
};
// a linked-list of fixed sized vectors, to avoid
// a std::vector resize from taking a large amount of time inside
// a profiling event
struct RangeEventList {
constexpr static size_t MB = 1024 * 1024;
constexpr static size_t event_block_size = 16 * MB;
constexpr static size_t num_block_elements =
event_block_size / ceilToMultiple(sizeof(Event), alignof(Event));
static_assert(sizeof(Event[num_block_elements]) <= event_block_size,
"num_block_elements is calculated incorrectly");
using block_type = std::vector<Event>;
void allocBlock() {
blocks.emplace_front();
auto & new_block = blocks.front();
new_block.reserve(num_block_elements);
// Materialize all pages in the new block to release jitter when recording events.
const char * const end_ptr = reinterpret_cast<char*>(new_block.data() + num_block_elements);
for (volatile const char * ptr = reinterpret_cast<char*>(new_block.data());
ptr < end_ptr; ptr += 4 * 1024) {
(*ptr);
}
}
template<typename... Args>
void record(Args&&... args) {
if (blocks.empty() || blocks.front().size() == num_block_elements) {
allocBlock();
}
blocks.front().emplace_back(std::forward<Args>(args)...);
}
std::vector<Event> consolidate() {
std::vector<Event> result;
for (auto & block : blocks) {
result.insert(result.begin(),
std::make_move_iterator(block.begin()),
std::make_move_iterator(block.end()));
}
blocks.clear();
return result;
}
std::forward_list<block_type> blocks;
};
// Profiler modes accepted by enableProfiler().
enum class ProfilerState {
  Disabled = 0,
  // CPU-only profiling
  CPU = 1,
  // CPU + CUDA events
  CUDA = 2,
  // only emit NVTX markers
  NVTX = 3,
};
// Profiler entry points. They are only declared in this header.
// NOTE(review): presumably returns the event list the caller should record
// into (per thread?) -- confirm against the definition.
TORCH_API RangeEventList& getEventList();
// `name` is taken by value; `include_cuda` defaults to true.
TORCH_API void mark(std::string name, bool include_cuda = true);
// NOTE(review): pushRange()/popRange() look like a begin/end pair (compare
// EventKind::PushRange / EventKind::PopRange) -- confirm against the definitions.
TORCH_API void pushRange(std::string name);
// Also called by RecordFunction's destructor.
TORCH_API void popRange();
// Guard object whose destructor calls popRange(). The constructors are only
// declared here.
// NOTE(review): presumably each constructor opens the range that the
// destructor closes -- confirm against the definitions.
// NOTE(review): the type is implicitly copyable, and the destructor of every
// copy calls popRange() as well -- confirm that instances are never copied.
struct TORCH_API RecordFunction {
explicit RecordFunction(Function* fn);
explicit RecordFunction(std::string name);
explicit RecordFunction(const char* name);
explicit RecordFunction(const char* name, int64_t current_sequence_nr);
~RecordFunction() {
popRange();
}
};
// A list of event lists, as returned by disableProfiler().
// NOTE(review): going by the alias name, presumably one inner vector per thread -- confirm.
using thread_event_lists = std::vector<std::vector<Event>>;
// NOTE: changing profiler modes is **NOT THREAD SAFE**. You should ensure that
// no autograd functions are being executed when these functions are used.
TORCH_API void enableProfiler(ProfilerState new_state);
TORCH_API thread_event_lists disableProfiler();
// Usage:
// {
// RecordProfile guard("filename.trace");
// // code you want to profile
// }
// Then open filename.trace in chrome://tracing
//
// Only the interface is visible in this header; constructors, destructor and
// the private helpers are defined out of line.
struct TORCH_API RecordProfile {
// Takes a caller-provided output stream.
RecordProfile(std::ostream& out);
// Takes a file name, as in the usage example above.
RecordProfile(const std::string& filename);
~RecordProfile();
private:
void init();
// NOTE(review): presumably holds the stream opened by the filename
// constructor and stays null for the std::ostream& constructor -- confirm.
std::unique_ptr<std::ofstream> file_;
// Reference member, declared after file_ and therefore initialized after it.
std::ostream& out_;
void processEvents(const std::vector<Event*>& events);
};
} // namespace profiler
}} // namespace torch::autograd