tcmalloc: collocate Sampler and __rseq_abi
Sampler and __rseq_abi are both accessed on malloc/free fast paths.
Currently they are separate variables placed on separate cache lines.
Place them in the same cache line.
This is hackier than I would like, but all assumptions are
checked with static/runtime checks, so it should hopefully work reliably.

PiperOrigin-RevId: 569294540
Change-Id: I250cf1943f89f85aa8ec25e46c77446700222051
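A rough sketch (illustrative only, not part of the commit) of the intended TLS layout, assuming a 64-byte cache line and the constants defined in percpu.h below; the struct name is hypothetical, and the real block is emitted in percpu_rseq_asm.S and verified at runtime in InitPerCpu():

struct alignas(64) CollocatedTlsSketch {
  char pad[44];       // 64 + 32 - TCMALLOC_SAMPLER_SIZE - 4 bytes of padding
  char sampler[48];   // tcmalloc_sampler; hot fields start at offset 33
  char slabs_low[4];  // tcmalloc_slabs (its top 4 bytes overlap cpu_id_start)
  char rseq_abi[32];  // __rseq_abi, 32-byte aligned as the ABI requires
};
static_assert(sizeof(CollocatedTlsSketch) == 128,
              "sampler hot fields, slabs and __rseq_abi share the second cache line");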
dvyukov authored and copybara-github committed Sep 28, 2023
1 parent d466616 commit 58c40a3
Showing 6 changed files with 91 additions and 57 deletions.
22 changes: 20 additions & 2 deletions tcmalloc/allocation_sampling.h
@@ -22,6 +22,7 @@
#include <memory>
#include <utility>

#include "absl/base/attributes.h"
#include "absl/base/optimization.h"
#include "absl/debugging/stacktrace.h"
#include "absl/time/clock.h"
@@ -31,6 +32,7 @@
#include "tcmalloc/guarded_allocations.h"
#include "tcmalloc/internal/allocation_guard.h"
#include "tcmalloc/internal/logging.h"
#include "tcmalloc/internal/percpu.h"
#include "tcmalloc/malloc_extension.h"
#include "tcmalloc/pagemap.h"
#include "tcmalloc/sampler.h"
@@ -97,10 +99,26 @@ static std::unique_ptr<const ProfileBase> DumpHeapProfile(State& state) {
return profile;
}

ABSL_CONST_INIT static thread_local Sampler thread_sampler_
extern "C" ABSL_CONST_INIT thread_local Sampler tcmalloc_sampler
ABSL_ATTRIBUTE_INITIAL_EXEC;

inline Sampler* GetThreadSampler() { return &thread_sampler_; }
// The compiler needs to see the definition of this variable to generate more
// efficient code for -fPIE/PIC. If the compiler does not see the definition,
// it assumes the variable may come from another dynamic library, so even for
// the initial-exec model it needs to emit an access via the GOT (GOTTPOFF).
// When it sees the definition, it can emit a direct %fs:TPOFF access.
// So we provide a weak definition here, but the actual definition is in
// percpu_rseq_asm.S.
ABSL_CONST_INIT ABSL_ATTRIBUTE_WEAK thread_local Sampler tcmalloc_sampler
ABSL_ATTRIBUTE_INITIAL_EXEC;

inline Sampler* GetThreadSampler() {
static_assert(sizeof(tcmalloc_sampler) == TCMALLOC_SAMPLER_SIZE,
"update TCMALLOC_SAMPLER_SIZE");
static_assert(Sampler::HotDataOffset() == TCMALLOC_SAMPLER_HOT_OFFSET,
"update TCMALLOC_SAMPLER_HOT_OFFSET");
return &tcmalloc_sampler;
}
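
A minimal standalone sketch of the weak-definition trick described in the comment above (hypothetical names, GCC/Clang attributes assumed): a visible weak definition lets the compiler emit a direct %fs:TPOFF access under -fPIE instead of an indirection through the GOT, while the strong definition can still live in an assembly file.

#include <cstdint>

// example_tls_slot is hypothetical; in tcmalloc the corresponding strong
// definition lives in percpu_rseq_asm.S.
extern "C" __attribute__((weak, tls_model("initial-exec"))) thread_local
    uint64_t example_tls_slot = 0;

// With the definition visible above, this read can compile to a single
// %fs-relative load rather than a GOTTPOFF access.
inline uint64_t ReadExampleSlot() { return example_tls_slot; }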

inline bool ShouldGuardingBeAttempted(
Profile::Sample::GuardedStatus guarded_status) {
28 changes: 17 additions & 11 deletions tcmalloc/internal/percpu.cc
@@ -56,6 +56,8 @@ ABSL_CONST_INIT static absl::once_flag init_per_cpu_once;
ABSL_CONST_INIT static std::atomic<bool> using_upstream_fence{false};
#endif // TCMALLOC_INTERNAL_PERCPU_USE_RSEQ

extern "C" thread_local char tcmalloc_sampler ABSL_ATTRIBUTE_INITIAL_EXEC;

// Is this thread's __rseq_abi struct currently registered with the kernel?
static bool ThreadRegistered() { return RseqCpuId() >= kCpuIdInitialized; }

@@ -85,19 +87,23 @@ static void InitPerCpu() {
init_status = kFastMode;

#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ
// See struct Rseq comment for details.
// Ensure __rseq_abi alignment required by ABI.
CHECK_CONDITION(reinterpret_cast<uintptr_t>(&__rseq_abi) % 32 == 0);
// See the comment about data layout in percpu.h for details.
auto sampler_addr = reinterpret_cast<uintptr_t>(&tcmalloc_sampler);
// Have to use volatile because the C++ compiler refuses to believe that
// objects can overlap.
volatile auto slabs_addr = reinterpret_cast<uintptr_t>(&tcmalloc_slabs);
auto rseq_abi_addr = reinterpret_cast<uintptr_t>(&__rseq_abi);
// Ensure __rseq_abi alignment required by ABI.
CHECK_CONDITION(rseq_abi_addr % 32 == 0);
// Ensure that all our TLS data is in a single cache line.
CHECK_CONDITION(reinterpret_cast<uintptr_t>(&tcmalloc_rseq) % 64 == 0);
// Ensure that tcmalloc_rseq and __rseq_abi overlap as we expect.
CHECK_CONDITION(reinterpret_cast<uintptr_t>(&tcmalloc_rseq.abi) ==
reinterpret_cast<uintptr_t>(&__rseq_abi));
// And in particular that tcmalloc_rseq.slabs partially overlap with
CHECK_CONDITION((rseq_abi_addr / 64) == (slabs_addr / 64));
CHECK_CONDITION((rseq_abi_addr / 64) ==
((sampler_addr + TCMALLOC_SAMPLER_HOT_OFFSET) / 64));
// Ensure that tcmalloc_slabs partially overlap with
// __rseq_abi.cpu_id_start as we expect.
CHECK_CONDITION(reinterpret_cast<uintptr_t>(&tcmalloc_rseq.slabs) ==
reinterpret_cast<uintptr_t>(&__rseq_abi) +
TCMALLOC_RSEQ_SLABS_OFFSET);
CHECK_CONDITION(slabs_addr == rseq_abi_addr + TCMALLOC_RSEQ_SLABS_OFFSET);
// Ensure that tcmalloc_sampler is located right before tcmalloc_slabs.
CHECK_CONDITION(sampler_addr + TCMALLOC_SAMPLER_SIZE == slabs_addr);

constexpr int kMEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ = (1 << 8);
// It is safe to make the syscall below multiple times.
24 changes: 10 additions & 14 deletions tcmalloc/internal/percpu.h
@@ -15,6 +15,11 @@
#ifndef TCMALLOC_INTERNAL_PERCPU_H_
#define TCMALLOC_INTERNAL_PERCPU_H_

// sizeof(Sampler)
#define TCMALLOC_SAMPLER_SIZE 48
// Sampler::HotDataOffset()
#define TCMALLOC_SAMPLER_HOT_OFFSET 33

#define TCMALLOC_PERCPU_SLABS_MASK 0xFFFFFFFFFFFFFF00

// Offset from __rseq_abi to the cached slabs address.
@@ -149,21 +154,12 @@ inline constexpr int kCpuIdInitialized = 0;
// address to be contained within a single cache line (64 bytes), rather than
// split across 2 cache lines. To achieve that we locate __rseq_abi in the second
// part of a cache line.
// For performance reasons we also collocate tcmalloc_sampler with __rseq_abi
// in the same cache line.
// InitPerCpu contains checks that the resulting data layout is as expected.
struct alignas(64) Rseq {
union {
struct alignas(64) {
char unused[28];
// Top 4 bytes of this variable overlap with __rseq_abi.cpu_id_start.
uintptr_t slabs;
} ABSL_ATTRIBUTE_PACKED;
struct alignas(64) {
char pad[32];
kernel_rseq abi;
} ABSL_ATTRIBUTE_PACKED;
};
} ABSL_ATTRIBUTE_PACKED;
extern "C" ABSL_CONST_INIT thread_local volatile Rseq tcmalloc_rseq

// Top 4 bytes of this variable overlap with __rseq_abi.cpu_id_start.
extern "C" ABSL_CONST_INIT thread_local volatile uintptr_t tcmalloc_slabs
ABSL_ATTRIBUTE_INITIAL_EXEC;
extern "C" ABSL_CONST_INIT thread_local volatile kernel_rseq __rseq_abi
ABSL_ATTRIBUTE_INITIAL_EXEC;
18 changes: 12 additions & 6 deletions tcmalloc/internal/percpu_rseq_asm.S
@@ -25,15 +25,20 @@
#error "RSEQ support expected, but not found."
#endif

// See struct Rseq comment.
.type tcmalloc_rseq, @object
// See the comment about data layout in percpu.h for details.
.type tcmalloc_sampler, @object
.type tcmalloc_slabs, @object
.type __rseq_abi, @object
.section .tdata, "awT", @progbits, unique, 1
.globl tcmalloc_rseq
.globl tcmalloc_sampler
.globl tcmalloc_slabs
.globl __rseq_abi
.p2align 6
tcmalloc_rseq:
.zero 32
.zero 64 + 32 - TCMALLOC_SAMPLER_SIZE - 4
tcmalloc_sampler:
.zero TCMALLOC_SAMPLER_SIZE
tcmalloc_slabs:
.long 0
__rseq_abi:
.long 0 // cpu_id_start
.long 0xffffffff // cpu_id (kCpuIdUninitialized)
@@ -43,7 +48,8 @@ __rseq_abi:
.short 0xffff // numa_node_id (kCpuIdUninitialized)
.short 0xffff // vcpu_id (kCpuIdUninitialized)
.size __rseq_abi, 32
.size tcmalloc_rseq, 64
.size tcmalloc_sampler, TCMALLOC_SAMPLER_SIZE
.size tcmalloc_slabs, 8

#endif // TCMALLOC_PERCPU_RSEQ_SUPPORTED_PLATFORM
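
As a back-of-the-envelope check (a sketch only, mirroring but not replacing the runtime checks in InitPerCpu()), the offsets implied by the directives above work out as follows, using TCMALLOC_SAMPLER_SIZE = 48 and TCMALLOC_SAMPLER_HOT_OFFSET = 33 from percpu.h:

// Byte offsets within the 64-byte-aligned TLS block emitted above.
constexpr int kSamplerSize = 48;                  // TCMALLOC_SAMPLER_SIZE
constexpr int kSamplerHotOffset = 33;             // TCMALLOC_SAMPLER_HOT_OFFSET
constexpr int kPad = 64 + 32 - kSamplerSize - 4;  // 44 bytes of leading padding
constexpr int kSamplerOffset = kPad;              // tcmalloc_sampler at 44
constexpr int kSlabsOffset = kSamplerOffset + kSamplerSize;  // tcmalloc_slabs at 92
constexpr int kRseqAbiOffset = kSlabsOffset + 4;             // __rseq_abi at 96
static_assert(kRseqAbiOffset % 32 == 0, "__rseq_abi alignment required by the ABI");
static_assert(kRseqAbiOffset / 64 == kSlabsOffset / 64,
              "tcmalloc_slabs and __rseq_abi share a cache line");
static_assert(kRseqAbiOffset / 64 == (kSamplerOffset + kSamplerHotOffset) / 64,
              "the Sampler hot fields share that cache line too");
static_assert(kSamplerOffset + kSamplerSize == kSlabsOffset,
              "tcmalloc_sampler sits immediately before tcmalloc_slabs");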

21 changes: 10 additions & 11 deletions tcmalloc/internal/percpu_tcmalloc.h
@@ -71,7 +71,7 @@ struct PerCPUMetadataState {
size_t resident_size;
};

// The bit denotes that tcmalloc_rseq.slabs contains valid slabs offset.
// The bit denotes that tcmalloc_slabs contains valid slabs offset.
constexpr inline uintptr_t kCachedSlabsBit = 63;
constexpr inline uintptr_t kCachedSlabsMask = 1ul << kCachedSlabsBit;

@@ -392,7 +392,7 @@ class TcmallocSlab {
bool PushSlow(size_t size_class, void* item, OverflowHandler overflow_handler,
void* arg);

// Caches the current cpu slab offset in tcmalloc_rseq.slabs if it wasn't
// Caches the current cpu slab offset in tcmalloc_slabs if it wasn't
// cached and the slab is not resizing. Returns -1 if the offset was cached
// and Push/Pop needs to be retried. Returns the current CPU ID (>=0) when
// the slabs offset was already cached and we need to call underflow/overflow
@@ -593,7 +593,7 @@ static inline ABSL_ATTRIBUTE_ALWAYS_INLINE bool TcmallocSlab_Internal_Push(
asm volatile(
#endif
TCMALLOC_RSEQ_PROLOGUE(TcmallocSlab_Internal_Push)
// scratch = tcmalloc_rseq.slabs;
// scratch = tcmalloc_slabs;
"movq " TCMALLOC_RSEQ_TLS_ADDR(rseq_slabs_offset) ", %[scratch]\n"
// if (scratch & TCMALLOC_CACHED_SLABS_MASK>) goto overflow_label;
// scratch &= ~TCMALLOC_CACHED_SLABS_MASK;
@@ -657,7 +657,7 @@ static inline ABSL_ATTRIBUTE_ALWAYS_INLINE bool TcmallocSlab_Internal_Push(
asm volatile(
#endif
TCMALLOC_RSEQ_PROLOGUE(TcmallocSlab_Internal_Push)
// region_start = tcmalloc_rseq.slabs;
// region_start = tcmalloc_slabs;
"ldr %[region_start], " TCMALLOC_RSEQ_TLS_ADDR(rseq_slabs_offset) "\n"
// if (region_start & TCMALLOC_CACHED_SLABS_MASK) goto overflow_label;
// region_start &= ~TCMALLOC_CACHED_SLABS_MASK;
@@ -795,7 +795,7 @@ inline ABSL_ATTRIBUTE_ALWAYS_INLINE auto TcmallocSlab<NumClasses>::Pop(
asm(
#endif
TCMALLOC_RSEQ_PROLOGUE(TcmallocSlab_Internal_Pop)
// scratch = tcmalloc_rseq.slabs;
// scratch = tcmalloc_slabs;
"movq " TCMALLOC_RSEQ_TLS_ADDR(rseq_slabs_offset) ", %[scratch]\n"
// if (scratch & TCMALLOC_CACHED_SLABS_MASK) goto overflow_label;
// scratch &= ~TCMALLOC_CACHED_SLABS_MASK;
@@ -880,7 +880,7 @@ inline ABSL_ATTRIBUTE_ALWAYS_INLINE auto TcmallocSlab<NumClasses>::Pop(
asm(
#endif
TCMALLOC_RSEQ_PROLOGUE(TcmallocSlab_Internal_Pop)
// region_start = tcmalloc_rseq.slabs;
// region_start = tcmalloc_slabs;
"ldr %[region_start], " TCMALLOC_RSEQ_TLS_ADDR(rseq_slabs_offset) "\n"
// if (region_start & TCMALLOC_CACHED_SLABS_MASK) goto overflow_label;
// region_start &= ~TCMALLOC_CACHED_SLABS_MASK;
@@ -983,8 +983,7 @@ int TcmallocSlab<NumClasses>::CacheCpuSlab() {
int cpu = VirtualRseqCpuId(virtual_cpu_id_offset_);
ASSERT(cpu >= 0);
#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ
if (ABSL_PREDICT_FALSE((tcmalloc_rseq.slabs & TCMALLOC_CACHED_SLABS_MASK) ==
0)) {
if (ABSL_PREDICT_FALSE((tcmalloc_slabs & TCMALLOC_CACHED_SLABS_MASK) == 0)) {
return CacheCpuSlabSlow();
}
// We already have slab offset cached, so the slab is indeed full/empty
@@ -998,7 +997,7 @@ template <size_t NumClasses>
ABSL_ATTRIBUTE_NOINLINE int TcmallocSlab<NumClasses>::CacheCpuSlabSlow() {
int cpu = VirtualRseqCpuId(virtual_cpu_id_offset_);
for (;;) {
intptr_t val = tcmalloc_rseq.slabs;
intptr_t val = tcmalloc_slabs;
ASSERT(!(val & TCMALLOC_CACHED_SLABS_MASK));
const auto [slabs, shift] = GetSlabsAndShift(std::memory_order_relaxed);
Slabs* start = CpuMemoryStart(slabs, shift, cpu);
@@ -1007,7 +1006,7 @@ ABSL_ATTRIBUTE_NOINLINE int TcmallocSlab<NumClasses>::CacheCpuSlabSlow() {
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Waddress-of-packed-member"
auto* ptr = reinterpret_cast<std::atomic<intptr_t>*>(
const_cast<uintptr_t*>(&tcmalloc_rseq.slabs));
const_cast<uintptr_t*>(&tcmalloc_slabs));
#pragma GCC diagnostic pop
int new_cpu =
CompareAndSwapUnsafe(cpu, ptr, val, new_val, virtual_cpu_id_offset_);
@@ -1025,7 +1024,7 @@ ABSL_ATTRIBUTE_NOINLINE int TcmallocSlab<NumClasses>::CacheCpuSlabSlow() {
// in ResizeSlabs, this prevents possibility of mismatching shift/slabs.
CompilerBarrier();
if (resizing_.load(std::memory_order_relaxed)) {
tcmalloc_rseq.slabs = 0;
tcmalloc_slabs = 0;
return cpu;
}
return -1;
35 changes: 22 additions & 13 deletions tcmalloc/sampler.h
@@ -133,26 +133,22 @@ class Sampler {
// The following are public for the purposes of testing
static uint64_t NextRandom(uint64_t rnd_); // Returns the next prng value

// Used to ensure that the hot fields are collocated in the same cache line
// as __rseq_abi.
static constexpr size_t HotDataOffset() {
return offsetof(Sampler, was_on_fast_path_);
}

constexpr Sampler()
: bytes_until_sample_(0),
sample_period_(0),
: sample_period_(0),
true_bytes_until_sample_(0),
allocs_until_guarded_sample_(0),
rnd_(0),
initialized_(false),
was_on_fast_path_(false) {}
was_on_fast_path_(false),
bytes_until_sample_(0) {}

private:
// Bytes until we sample next.
//
// More specifically when bytes_until_sample_ is X, we can allocate
// X bytes without triggering sampling; on the (X+1)th allocated
// byte, the containing allocation will be sampled.
//
// Always non-negative with only very brief exceptions (see
// DecrementFast{,Finish}), so casting to size_t is ok.
ssize_t bytes_until_sample_;

// Saved copy of the sampling period from when we actually set
// (true_)bytes_until_sample_. This allows us to properly calculate the sample
// weight of the first sample after the sampling period is changed.
@@ -168,8 +164,21 @@

uint64_t rnd_; // Cheap random number generator
bool initialized_;

// was_on_fast_path_/bytes_until_sample_ are accessed on every malloc/free,
// so we place them last and collocate with __rseq_abi.
bool was_on_fast_path_;

// Bytes until we sample next.
//
// More specifically when bytes_until_sample_ is X, we can allocate
// X bytes without triggering sampling; on the (X+1)th allocated
// byte, the containing allocation will be sampled.
//
// Always non-negative with only very brief exceptions (see
// DecrementFast{,Finish}), so casting to size_t is ok.
ssize_t bytes_until_sample_;

private:
friend class SamplerTest;
// Initialize this sampler.
