From 1482f00275c4496ff9b2b7f348e39370c7547adc Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 19 Dec 2024 10:54:12 +0000 Subject: [PATCH 1/6] Add unsafe api calls checker to track down issues such as #4195 This checker is used to detect accidental thread scheduling switching points happening during profiling sampling. See the bigger comment in unsafe_api_calls_check.h . I was able to check that this checker correctly triggers for the bug in #4195, and also the bug I'm going to fix next, which is the use of `rb_hash_lookup` in the otel context reading code. --- .../collectors_thread_context.c | 78 +++++++++++++++++-- .../profiling.c | 2 + .../unsafe_api_calls_check.c | 47 +++++++++++ .../unsafe_api_calls_check.h | 25 ++++++ lib/datadog/core/configuration.rb | 2 +- .../collectors/thread_context_spec.rb | 23 +++--- 6 files changed, 159 insertions(+), 18 deletions(-) create mode 100644 ext/datadog_profiling_native_extension/unsafe_api_calls_check.c create mode 100644 ext/datadog_profiling_native_extension/unsafe_api_calls_check.h diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c index 0b56268bed5..b9606797106 100644 --- a/ext/datadog_profiling_native_extension/collectors_thread_context.c +++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c @@ -9,6 +9,7 @@ #include "private_vm_api_access.h" #include "stack_recorder.h" #include "time_helpers.h" +#include "unsafe_api_calls_check.h" // Used to trigger sampling of threads, based on external "events", such as: // * periodic timer for cpu-time and wall-time @@ -203,10 +204,10 @@ static int hash_map_per_thread_context_mark(st_data_t key_thread, st_data_t _val static int hash_map_per_thread_context_free_values(st_data_t _thread, st_data_t value_per_thread_context, st_data_t _argument); static VALUE _native_new(VALUE klass); static VALUE _native_initialize(int argc, VALUE *argv, DDTRACE_UNUSED VALUE _self); -static VALUE _native_sample(VALUE self, VALUE collector_instance, VALUE profiler_overhead_stack_thread); +static VALUE _native_sample(VALUE self, VALUE collector_instance, VALUE profiler_overhead_stack_thread, VALUE allow_exception); static VALUE _native_on_gc_start(VALUE self, VALUE collector_instance); static VALUE _native_on_gc_finish(VALUE self, VALUE collector_instance); -static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE reset_monotonic_to_system_state); +static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE reset_monotonic_to_system_state, VALUE allow_exception); static void update_metrics_and_sample( struct thread_context_collector_state *state, VALUE thread_being_sampled, @@ -310,11 +311,11 @@ void collectors_thread_context_init(VALUE profiling_module) { rb_define_singleton_method(collectors_thread_context_class, "_native_initialize", _native_initialize, -1); rb_define_singleton_method(collectors_thread_context_class, "_native_inspect", _native_inspect, 1); rb_define_singleton_method(collectors_thread_context_class, "_native_reset_after_fork", _native_reset_after_fork, 1); - rb_define_singleton_method(testing_module, "_native_sample", _native_sample, 2); + rb_define_singleton_method(testing_module, "_native_sample", _native_sample, 3); rb_define_singleton_method(testing_module, "_native_sample_allocation", _native_sample_allocation, 3); rb_define_singleton_method(testing_module, "_native_on_gc_start", _native_on_gc_start, 1); 
rb_define_singleton_method(testing_module, "_native_on_gc_finish", _native_on_gc_finish, 1); - rb_define_singleton_method(testing_module, "_native_sample_after_gc", _native_sample_after_gc, 2); + rb_define_singleton_method(testing_module, "_native_sample_after_gc", _native_sample_after_gc, 3); rb_define_singleton_method(testing_module, "_native_thread_list", _native_thread_list, 0); rb_define_singleton_method(testing_module, "_native_per_thread_context", _native_per_thread_context, 1); rb_define_singleton_method(testing_module, "_native_stats", _native_stats, 1); @@ -504,31 +505,49 @@ static VALUE _native_initialize(int argc, VALUE *argv, DDTRACE_UNUSED VALUE _sel // This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec. // It SHOULD NOT be used for other purposes. -static VALUE _native_sample(DDTRACE_UNUSED VALUE _self, VALUE collector_instance, VALUE profiler_overhead_stack_thread) { +static VALUE _native_sample(DDTRACE_UNUSED VALUE _self, VALUE collector_instance, VALUE profiler_overhead_stack_thread, VALUE allow_exception) { + ENFORCE_BOOLEAN(allow_exception); + if (!is_thread_alive(profiler_overhead_stack_thread)) rb_raise(rb_eArgError, "Unexpected: profiler_overhead_stack_thread is not alive"); + if (allow_exception == Qfalse) debug_enter_unsafe_context(); + thread_context_collector_sample(collector_instance, monotonic_wall_time_now_ns(RAISE_ON_FAILURE), profiler_overhead_stack_thread); + + if (allow_exception == Qfalse) debug_leave_unsafe_context(); + return Qtrue; } // This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec. // It SHOULD NOT be used for other purposes. static VALUE _native_on_gc_start(DDTRACE_UNUSED VALUE self, VALUE collector_instance) { + debug_enter_unsafe_context(); + thread_context_collector_on_gc_start(collector_instance); + + debug_leave_unsafe_context(); + return Qtrue; } // This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec. // It SHOULD NOT be used for other purposes. static VALUE _native_on_gc_finish(DDTRACE_UNUSED VALUE self, VALUE collector_instance) { + debug_enter_unsafe_context(); + (void) !thread_context_collector_on_gc_finish(collector_instance); + + debug_leave_unsafe_context(); + return Qtrue; } // This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec. // It SHOULD NOT be used for other purposes. 
-static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE reset_monotonic_to_system_state) { +static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE reset_monotonic_to_system_state, VALUE allow_exception) { ENFORCE_BOOLEAN(reset_monotonic_to_system_state); + ENFORCE_BOOLEAN(allow_exception); struct thread_context_collector_state *state; TypedData_Get_Struct(collector_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state); @@ -537,7 +556,12 @@ static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_ state->time_converter_state = (monotonic_to_system_epoch_state) MONOTONIC_TO_SYSTEM_EPOCH_INITIALIZER; } + if (allow_exception == Qfalse) debug_enter_unsafe_context(); + thread_context_collector_sample_after_gc(collector_instance); + + if (allow_exception == Qfalse) debug_leave_unsafe_context(); + return Qtrue; } @@ -982,7 +1006,13 @@ static void trigger_sample_for_thread( // It SHOULD NOT be used for other purposes. static VALUE _native_thread_list(DDTRACE_UNUSED VALUE _self) { VALUE result = rb_ary_new(); + + debug_enter_unsafe_context(); + ddtrace_thread_list(result); + + debug_leave_unsafe_context(); + return result; } @@ -1501,7 +1531,12 @@ void thread_context_collector_sample_allocation(VALUE self_instance, unsigned in // This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec. // It SHOULD NOT be used for other purposes. static VALUE _native_sample_allocation(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE sample_weight, VALUE new_object) { + debug_enter_unsafe_context(); + thread_context_collector_sample_allocation(collector_instance, NUM2UINT(sample_weight), new_object); + + debug_leave_unsafe_context(); + return Qtrue; } @@ -1640,7 +1675,12 @@ void thread_context_collector_sample_skipped_allocation_samples(VALUE self_insta } static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE skipped_samples) { + debug_enter_unsafe_context(); + thread_context_collector_sample_skipped_allocation_samples(collector_instance, NUM2UINT(skipped_samples)); + + debug_leave_unsafe_context(); + return Qtrue; } @@ -1979,31 +2019,53 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) { static VALUE _native_on_gvl_waiting(DDTRACE_UNUSED VALUE self, VALUE thread) { ENFORCE_THREAD(thread); + debug_enter_unsafe_context(); + thread_context_collector_on_gvl_waiting(thread_from_thread_object(thread)); + + debug_leave_unsafe_context(); + return Qnil; } static VALUE _native_gvl_waiting_at_for(DDTRACE_UNUSED VALUE self, VALUE thread) { ENFORCE_THREAD(thread); + debug_enter_unsafe_context(); + intptr_t gvl_waiting_at = gvl_profiling_state_thread_object_get(thread); + + debug_leave_unsafe_context(); + return LONG2NUM(gvl_waiting_at); } static VALUE _native_on_gvl_running(DDTRACE_UNUSED VALUE self, VALUE thread) { ENFORCE_THREAD(thread); - return thread_context_collector_on_gvl_running(thread_from_thread_object(thread)) == ON_GVL_RUNNING_SAMPLE ? Qtrue : Qfalse; + debug_enter_unsafe_context(); + + VALUE result = thread_context_collector_on_gvl_running(thread_from_thread_object(thread)) == ON_GVL_RUNNING_SAMPLE ? 
Qtrue : Qfalse; + + debug_leave_unsafe_context(); + + return result; } static VALUE _native_sample_after_gvl_running(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread) { ENFORCE_THREAD(thread); - return thread_context_collector_sample_after_gvl_running( + debug_enter_unsafe_context(); + + VALUE result = thread_context_collector_sample_after_gvl_running( collector_instance, thread, monotonic_wall_time_now_ns(RAISE_ON_FAILURE) ); + + debug_leave_unsafe_context(); + + return result; } static VALUE _native_apply_delta_to_cpu_time_at_previous_sample_ns(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread, VALUE delta_ns) { diff --git a/ext/datadog_profiling_native_extension/profiling.c b/ext/datadog_profiling_native_extension/profiling.c index 315ca0d2954..a7bfe0d466b 100644 --- a/ext/datadog_profiling_native_extension/profiling.c +++ b/ext/datadog_profiling_native_extension/profiling.c @@ -11,6 +11,7 @@ #include "ruby_helpers.h" #include "setup_signal_handler.h" #include "time_helpers.h" +#include "unsafe_api_calls_check.h" // Each class/module here is implemented in their separate file void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module); @@ -56,6 +57,7 @@ void DDTRACE_EXPORT Init_datadog_profiling_native_extension(void) { collectors_thread_context_init(profiling_module); http_transport_init(profiling_module); stack_recorder_init(profiling_module); + unsafe_api_calls_check_init(); // Hosts methods used for testing the native code using RSpec VALUE testing_module = rb_define_module_under(native_extension_module, "Testing"); diff --git a/ext/datadog_profiling_native_extension/unsafe_api_calls_check.c b/ext/datadog_profiling_native_extension/unsafe_api_calls_check.c new file mode 100644 index 00000000000..c3c23b7f1c7 --- /dev/null +++ b/ext/datadog_profiling_native_extension/unsafe_api_calls_check.c @@ -0,0 +1,47 @@ +#include +#include +#include + +#include "datadog_ruby_common.h" +#include "unsafe_api_calls_check.h" +#include "extconf.h" + +static bool inside_unsafe_context = false; + +#ifndef NO_POSTPONED_TRIGGER + static rb_postponed_job_handle_t check_for_unsafe_api_calls_handle; +#endif + +static void check_for_unsafe_api_calls(DDTRACE_UNUSED void *_unused); + +void unsafe_api_calls_check_init(void) { + #ifndef NO_POSTPONED_TRIGGER + int unused_flags = 0; + + check_for_unsafe_api_calls_handle = rb_postponed_job_preregister(unused_flags, check_for_unsafe_api_calls, NULL); + + if (check_for_unsafe_api_calls_handle == POSTPONED_JOB_HANDLE_INVALID) { + rb_raise(rb_eRuntimeError, "Failed to register check_for_unsafe_api_calls_handle postponed job (got POSTPONED_JOB_HANDLE_INVALID)"); + } + #endif +} + +void debug_enter_unsafe_context(void) { + inside_unsafe_context = true; + + #ifndef NO_POSTPONED_TRIGGER + rb_postponed_job_trigger(check_for_unsafe_api_calls_handle); + #else + rb_postponed_job_register(0, check_for_unsafe_api_calls, NULL); + #endif +} + +void debug_leave_unsafe_context(void) { + inside_unsafe_context = false; +} + +static void check_for_unsafe_api_calls(DDTRACE_UNUSED void *_unused) { + if (inside_unsafe_context) rb_bug( + "Datadog Ruby profiler detected callback nested inside sample. 
Please report this at https://github.com/datadog/dd-trace-rb/blob/master/CONTRIBUTING.md#found-a-bug"
+  );
+}
diff --git a/ext/datadog_profiling_native_extension/unsafe_api_calls_check.h b/ext/datadog_profiling_native_extension/unsafe_api_calls_check.h
new file mode 100644
index 00000000000..092a59f30b3
--- /dev/null
+++ b/ext/datadog_profiling_native_extension/unsafe_api_calls_check.h
@@ -0,0 +1,25 @@
+#pragma once
+
+// This checker is used to detect accidental thread scheduling switching points happening during profiling sampling.
+//
+// Specifically, when the profiler is sampling, we're never supposed to call into Ruby code (e.g. methods
+// implemented using Ruby code) or allocate Ruby objects.
+// That's because those events introduce thread switch points, and really we don't want the VM switching between threads
+// in the middle of the profiler sampling.
+// This includes raising exceptions, unless we're trying to stop the profiler, and even then we must be careful.
+//
+// The above is especially true in situations such as GC profiling or allocation/heap profiling, as in those situations
+// we can even crash the Ruby VM if we switch away at the wrong time.
+//
+// The below APIs can be used to detect these situations. They work by relying on the following observation:
+// in most (all?) thread switch points, Ruby will check for interrupts and run the postponed jobs.
+//
+// Thus, if we set a flag while we're sampling (inside_unsafe_context), trigger the postponed job, and then only unset
+// the flag after sampling, the correct thing to happen is that the postponed job should never see the flag.
+//
+// If, however, we have a bug and there's a thread switch point, our postponed job will see the flag and immediately
+// stop the Ruby VM before further damage happens (and hopefully giving us a stack trace clearly pointing to the culprit).
+
+void unsafe_api_calls_check_init(void);
+void debug_enter_unsafe_context(void);
+void debug_leave_unsafe_context(void);
diff --git a/lib/datadog/core/configuration.rb b/lib/datadog/core/configuration.rb
index 826a3bb414a..4b2b21d6ee7 100644
--- a/lib/datadog/core/configuration.rb
+++ b/lib/datadog/core/configuration.rb
@@ -236,7 +236,7 @@ def safely_synchronize
       rescue ThreadError => e
         logger_without_components.error(
           'Detected deadlock during datadog initialization.
' \ - 'Please report this at https://github.com/DataDog/dd-trace-rb/blob/master/CONTRIBUTING.md#found-a-bug' \ + 'Please report this at https://github.com/datadog/dd-trace-rb/blob/master/CONTRIBUTING.md#found-a-bug' \ "\n\tSource:\n\t#{Array(e.backtrace).join("\n\t")}" ) nil diff --git a/spec/datadog/profiling/collectors/thread_context_spec.rb b/spec/datadog/profiling/collectors/thread_context_spec.rb index e7b1bd28ba0..7145729bcec 100644 --- a/spec/datadog/profiling/collectors/thread_context_spec.rb +++ b/spec/datadog/profiling/collectors/thread_context_spec.rb @@ -66,8 +66,8 @@ end end - def sample(profiler_overhead_stack_thread: Thread.current) - described_class::Testing._native_sample(cpu_and_wall_time_collector, profiler_overhead_stack_thread) + def sample(profiler_overhead_stack_thread: Thread.current, allow_exception: false) + described_class::Testing._native_sample(cpu_and_wall_time_collector, profiler_overhead_stack_thread, allow_exception) end def on_gc_start @@ -78,8 +78,12 @@ def on_gc_finish described_class::Testing._native_on_gc_finish(cpu_and_wall_time_collector) end - def sample_after_gc(reset_monotonic_to_system_state: false) - described_class::Testing._native_sample_after_gc(cpu_and_wall_time_collector, reset_monotonic_to_system_state) + def sample_after_gc(reset_monotonic_to_system_state: false, allow_exception: false) + described_class::Testing._native_sample_after_gc( + cpu_and_wall_time_collector, + reset_monotonic_to_system_state, + allow_exception, + ) end def sample_allocation(weight:, new_object: Object.new) @@ -782,18 +786,18 @@ def otel_span_id_to_i(span_id) ) end - context 'raises an exception' do + context 'when an exception is raised' do before { setup_failure } after { expect(ran_log).to eq [:ran_code] } it 'does not leave the exception pending' do - sample + sample(allow_exception: true) expect($!).to be nil end it 'omits the "local root span id" and "span id" labels in the sample' do - sample + sample(allow_exception: true) expect(t1_sample.labels.keys).to_not include(:"local root span id", :"span id") end @@ -1433,7 +1437,7 @@ def sample_and_check(expected_state:) context "when called before on_gc_start/on_gc_finish" do it do - expect { sample_after_gc }.to raise_error(RuntimeError, /Unexpected call to sample_after_gc/) + expect { sample_after_gc(allow_exception: true) }.to raise_error(RuntimeError, /Unexpected call to sample_after_gc/) end end @@ -1451,7 +1455,8 @@ def sample_and_check(expected_state:) it do sample_after_gc - expect { sample_after_gc }.to raise_error(RuntimeError, /Unexpected call to sample_after_gc/) + expect { sample_after_gc(allow_exception: true) } + .to raise_error(RuntimeError, /Unexpected call to sample_after_gc/) end end From 1c26fd855211760e26278406b44d06cef76fb646 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 19 Dec 2024 14:17:03 +0000 Subject: [PATCH 2/6] Fix going into Ruby code when looking up otel context `rb_hash_lookup` calls `#hash` on the key being looked up so it's safe to use unless during sampling. 
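As a rough illustration of the underlying Ruby behavior (this sketch is not part of the change, and `NoisyKey` is a
made-up class), a plain Hash lookup dispatches to the key object's `#hash` (and `#eql?`) methods, which is arbitrary
Ruby code -- exactly the kind of thing we must not run in the middle of a sample:

```ruby
# Minimal sketch: Hash lookups call the key's #hash, i.e. they run Ruby code.
class NoisyKey
  def hash
    puts "#hash called!" # imagine this running in the middle of a profiler sample
    super
  end
end

key = NoisyKey.new
entries = { key => "current span" } # prints "#hash called!" (inserting also hashes the key)
entries[key]                        # prints "#hash called!" again during the lookup
```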
This can cause the same issue as we saw in #4195 leading to ``` [BUG] unexpected situation - recordd:1 current:0 -- C level backtrace information ------------------------------------------- ruby(rb_print_backtrace+0x11) [0x55ba03ccf90f] vm_dump.c:820 ruby(rb_vm_bugreport) vm_dump.c:1151 ruby(bug_report_end+0x0) [0x55ba03e91607] error.c:1042 ruby(rb_bug_without_die) error.c:1042 ruby(die+0x0) [0x55ba03ac0998] error.c:1050 ruby(rb_bug) error.c:1052 ruby(disallow_reentry+0x0) [0x55ba03ab6dcc] vm_sync.c:226 ruby(rb_ec_vm_lock_rec_check+0x1a) [0x55ba03cb17aa] eval_intern.h:144 ruby(rb_ec_tag_state) eval_intern.h:155 ruby(rb_vm_exec) vm.c:2484 ruby(vm_invoke_proc+0x201) [0x55ba03cb62b1] vm.c:1509 ruby(rb_vm_invoke_proc+0x33) [0x55ba03cb65d3] vm.c:1728 ruby(thread_do_start_proc+0x176) [0x55ba03c63516] thread.c:598 ruby(thread_do_start+0x12) [0x55ba03c648a2] thread.c:615 ruby(thread_start_func_2) thread.c:672 ruby(nt_start+0x107) [0x55ba03c65137] thread_pthread.c:2187 /lib/x86_64-linux-gnu/libpthread.so.0(start_thread+0xd9) [0x7ff360b66609] /lib/x86_64-linux-gnu/libc.so.6(clone+0x43) [0x7ff360a70353] ``` --- .../collectors_thread_context.c | 39 ++++++++- .../collectors/thread_context_spec.rb | 84 ++++++++++++++++++- 2 files changed, 120 insertions(+), 3 deletions(-) diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c index b9606797106..b8e7aedfefb 100644 --- a/ext/datadog_profiling_native_extension/collectors_thread_context.c +++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c @@ -291,6 +291,7 @@ static void otel_without_ddtrace_trace_identifiers_for( ); static struct otel_span otel_span_from(VALUE otel_context, VALUE otel_current_span_key); static uint64_t otel_span_id_to_uint(VALUE otel_span_id); +static VALUE safely_lookup_hash_without_going_into_ruby_code(VALUE hash, VALUE key); void collectors_thread_context_init(VALUE profiling_module) { VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors"); @@ -1632,7 +1633,7 @@ static void ddtrace_otel_trace_identifiers_for( // trace and span representing it. Each ddtrace trace is then connected to the previous otel span, forming a linked // list. The local root span is going to be the trace/span we find at the end of this linked list. 
while (otel_values != Qnil) { - VALUE otel_span = rb_hash_lookup(otel_values, otel_current_span_key); + VALUE otel_span = safely_lookup_hash_without_going_into_ruby_code(otel_values, otel_current_span_key); if (otel_span == Qnil) break; VALUE next_trace = rb_ivar_get(otel_span, at_datadog_trace_id); if (next_trace == Qnil) break; @@ -1766,7 +1767,7 @@ static struct otel_span otel_span_from(VALUE otel_context, VALUE otel_current_sp if (context_entries == Qnil || !RB_TYPE_P(context_entries, T_HASH)) return failed; // If it exists, context_entries is expected to be a Hash[OpenTelemetry::Context::Key, OpenTelemetry::Trace::Span] - VALUE span = rb_hash_lookup(context_entries, otel_current_span_key); + VALUE span = safely_lookup_hash_without_going_into_ruby_code(context_entries, otel_current_span_key); if (span == Qnil) return failed; // If it exists, span_context is expected to be a OpenTelemetry::Trace::SpanContext (don't confuse it with OpenTelemetry::Context) @@ -2092,3 +2093,37 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) { DDTRACE_UNUSED long current_cpu_time_ns ) { return false; } #endif // NO_GVL_INSTRUMENTATION + +#define MAX_SAFE_LOOKUP_SIZE 16 + +typedef struct { VALUE lookup_key; VALUE result; } safe_lookup_hash_state; + +static int safe_lookup_hash_iterate(VALUE key, VALUE value, VALUE state_ptr) { + safe_lookup_hash_state *state = (safe_lookup_hash_state *) state_ptr; + + if (key == state->lookup_key) { + state->result = value; + return ST_STOP; + } + + return ST_CONTINUE; +} + +// This method exists because we need to look up a hash during sampling, but we don't want to invoke any +// Ruby code as a side effect. To be able to look up by hash, `rb_hash_lookup` calls `#hash` on the key, +// which we want to avoid. +// Thus, instead, we opt to just iterate through the hash and check if we can find what we're looking for. +// +// To avoid having too much overhead here we only iterate in hashes up to MAX_SAFE_LOOKUP_SIZE. +// Note that we don't even try to iterate if the hash is bigger -- this is to avoid flaky behavior where +// depending on the internal storage order of the hash we may or not find the key, and instead we always +// enforce the size. +static VALUE safely_lookup_hash_without_going_into_ruby_code(VALUE hash, VALUE key) { + if (!RB_TYPE_P(hash, T_HASH) || RHASH_SIZE(hash) > MAX_SAFE_LOOKUP_SIZE) return Qnil; + + safe_lookup_hash_state state = {.lookup_key = key, .result = Qnil}; + + rb_hash_foreach(hash, safe_lookup_hash_iterate, (VALUE) &state); + + return state.result; +} diff --git a/spec/datadog/profiling/collectors/thread_context_spec.rb b/spec/datadog/profiling/collectors/thread_context_spec.rb index 7145729bcec..0296be0ff72 100644 --- a/spec/datadog/profiling/collectors/thread_context_spec.rb +++ b/spec/datadog/profiling/collectors/thread_context_spec.rb @@ -588,6 +588,7 @@ def self.otel_otlp_exporter_available? false end + # When opentelemetry-sdk is on the Gemfile, but not opentelemetry-exporter-otlp context "when trace comes from otel sdk", if: otel_sdk_available? && !otel_otlp_exporter_available? do let(:otel_tracer) do require "datadog/opentelemetry" @@ -622,6 +623,31 @@ def self.otel_otlp_exporter_available? expect(t1_sample.labels).to_not include("trace endpoint": anything) end + describe 'accessing the current span' do + before do + allow(Datadog.logger).to receive(:error) + + # initialize otel context reading + sample + # clear samples + recorder.serialize! 
+ end + + it 'does not try to hash the CURRENT_SPAN_KEY' do + inner_check_ran = false + + otel_tracer.in_span("profiler.test") do |_span| + expect(OpenTelemetry::Trace.const_get(:CURRENT_SPAN_KEY)).to_not receive(:hash) + + sample_allocation(weight: 1) + + inner_check_ran = true + end + + expect(inner_check_ran).to be true + end + end + context "when there are multiple otel spans nested" do let(:t1) do Thread.new(ready_queue, otel_tracer) do |ready_queue, otel_tracer| @@ -721,6 +747,7 @@ def self.otel_otlp_exporter_available? end end + # When opentelemetry-sdk AND opentelemetry-exporter-otlp are on the Gemfile context( "when trace comes from otel sdk and the ddtrace otel support is not loaded", if: otel_sdk_available? && otel_otlp_exporter_available? @@ -769,7 +796,7 @@ def otel_span_id_to_i(span_id) expect(t1_sample.labels).to_not include("trace endpoint": anything) end - context 'reading CURRENT_SPAN_KEY' do + describe 'reading CURRENT_SPAN_KEY into otel_current_span_key' do let!(:ran_log) { [] } let(:setup_failure) do @@ -818,6 +845,61 @@ def otel_span_id_to_i(span_id) end end + describe 'accessing the current span' do + before do + allow(OpenTelemetry.logger).to receive(:error) + + # initialize otel context reading + sample + # clear samples + recorder.serialize! + end + + it 'does not try to hash the CURRENT_SPAN_KEY' do + inner_check_ran = false + + otel_tracer.in_span("profiler.test") do |_span| + expect(OpenTelemetry::Trace.const_get(:CURRENT_SPAN_KEY)).to_not receive(:hash) + + sample_allocation(weight: 1) + + inner_check_ran = true + end + + expect(inner_check_ran).to be true + end + + context 'when there are more than MAX_SAFE_LOOKUP_SIZE entries in the otel context' do + let(:max_safe_lookup_size) { 16 } # Value of MAX_SAFE_LOOKUP_SIZE in C code + + it 'does not try to look up the context' do + otel_tracer.in_span("profiler.test") do |_span| + current_size = OpenTelemetry::Context.current.instance_variable_get(:@entries).size + + OpenTelemetry::Context.with_values( + Array.new((max_safe_lookup_size + 1 - current_size)) { |it| ["key_#{it}", it] }.to_h + ) do + sample_allocation(weight: 12) + end + + OpenTelemetry::Context.with_values( + Array.new((max_safe_lookup_size - current_size)) { |it| ["key_#{it}", it] }.to_h + ) do + sample_allocation(weight: 34) + end + end + + result = samples_for_thread(samples, Thread.current) + + expect(result.size).to be 2 + expect(result.find { |it| it.values.fetch(:"alloc-samples") == 12 }.labels.keys) + .to_not include(:"local root span id", :"span id") + expect(result.find { |it| it.values.fetch(:"alloc-samples") == 34 }.labels.keys) + .to include(:"local root span id", :"span id") + end + end + end + context 'when otel_context_enabled is false' do let(:otel_context_enabled) { false } From eeed3a95ab6774dcbfce1929eadf5b860a0670d5 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 19 Dec 2024 14:42:39 +0000 Subject: [PATCH 3/6] Avoid trying to sample allocations when VM is raising exception During my experiments to reproduce issues around allocation profiling, I've noted that the VM is in an especially delicate state during exception raising, so let's just decline to sample in this situation. 
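As a rough illustration (not part of the change itself), the raise/rescue path allocates Ruby objects along the way
(the exception, its message string, backtrace entries), which is how the allocation profiler's newobj hook can end up
running while the VM is still in the middle of raising:

```ruby
# Minimal sketch: raising and rescuing an exception allocates several Ruby objects,
# so an allocation tracepoint/newobj hook can fire while the VM is mid-raise.
allocated_before = GC.stat(:total_allocated_objects)

begin
  raise "boom"
rescue
  nil
end

puts GC.stat(:total_allocated_objects) - allocated_before # typically a handful of objects
```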
--- .../collectors_cpu_and_wall_time_worker.c | 10 ++++++++++ .../private_vm_api_access.c | 3 +++ .../private_vm_api_access.h | 2 ++ 3 files changed, 15 insertions(+) diff --git a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c index e4e9b69473e..723e2d93b33 100644 --- a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +++ b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c @@ -1171,6 +1171,16 @@ static void on_newobj_event(DDTRACE_UNUSED VALUE unused1, DDTRACE_UNUSED void *u return; } + // If Ruby is in the middle of raising an exception, we don't want to try to sample. This is because if we accidentally + // trigger an exception inside the profiler code, bad things will happen (specifically, Ruby will try to kill off the + // thread even though we may try to catch the exception). + // + // Note that "in the middle of raising an exception" means the exception itself has already been allocated. + // What's getting allocated now is probably the backtrace objects (@ivoanjo or at least that's what I've observed) + if (is_raised_flag_set(rb_thread_current())) { + return; + } + // Hot path: Dynamic sampling rate is usually enabled and the sampling decision is usually false if (RB_LIKELY(state->dynamic_sampling_rate_enabled && !discrete_dynamic_sampler_should_sample(&state->allocation_sampler))) { state->stats.allocation_skipped++; diff --git a/ext/datadog_profiling_native_extension/private_vm_api_access.c b/ext/datadog_profiling_native_extension/private_vm_api_access.c index 59e5c644882..f7c188861d7 100644 --- a/ext/datadog_profiling_native_extension/private_vm_api_access.c +++ b/ext/datadog_profiling_native_extension/private_vm_api_access.c @@ -800,3 +800,6 @@ static inline int ddtrace_imemo_type(VALUE imemo) { return current_thread; } #endif + +// Is the VM smack in the middle of raising an exception? 
+bool is_raised_flag_set(VALUE thread) { return thread_struct_from_object(thread)->ec->raised_flag > 0; } diff --git a/ext/datadog_profiling_native_extension/private_vm_api_access.h b/ext/datadog_profiling_native_extension/private_vm_api_access.h index c40992274cb..3e412f51ea5 100644 --- a/ext/datadog_profiling_native_extension/private_vm_api_access.h +++ b/ext/datadog_profiling_native_extension/private_vm_api_access.h @@ -68,3 +68,5 @@ const char *imemo_kind(VALUE imemo); #define ENFORCE_THREAD(value) \ { if (RB_UNLIKELY(!rb_typeddata_is_kind_of(value, RTYPEDDATA_TYPE(rb_thread_current())))) raise_unexpected_type(value, ADD_QUOTES(value), "Thread", __FILE__, __LINE__, __func__); } + +bool is_raised_flag_set(VALUE thread); From 31997bff2d41df72089d19a4d8aa8d4ecc530904 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 19 Dec 2024 14:51:46 +0000 Subject: [PATCH 4/6] Update tests with new signatures for test methods --- benchmarks/profiler_gc.rb | 6 +++--- benchmarks/profiler_sample_gvl.rb | 2 +- benchmarks/profiler_sample_loop_v2.rb | 2 +- benchmarks/profiler_sample_serialize.rb | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/profiler_gc.rb b/benchmarks/profiler_gc.rb index e96198cdfba..401af152d84 100644 --- a/benchmarks/profiler_gc.rb +++ b/benchmarks/profiler_gc.rb @@ -14,7 +14,7 @@ def create_profiler # We take a dummy sample so that the context for the main thread is created, as otherwise the GC profiling methods do # not create it (because we don't want to do memory allocations in the middle of GC) - Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample(@collector, Thread.current) + Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample(@collector, Thread.current, false) end def run_benchmark @@ -29,7 +29,7 @@ def run_benchmark x.report('profiler gc') do Datadog::Profiling::Collectors::ThreadContext::Testing._native_on_gc_start(@collector) Datadog::Profiling::Collectors::ThreadContext::Testing._native_on_gc_finish(@collector) - Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample_after_gc(@collector, false) + Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample_after_gc(@collector, false, false) end x.save! 
"#{File.basename(__FILE__)}-results.json" unless VALIDATE_BENCHMARK_MODE @@ -52,7 +52,7 @@ def run_benchmark estimated_gc_per_minute.times do Datadog::Profiling::Collectors::ThreadContext::Testing._native_on_gc_start(@collector) Datadog::Profiling::Collectors::ThreadContext::Testing._native_on_gc_finish(@collector) - Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample_after_gc(@collector, false) + Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample_after_gc(@collector, false, false) end @recorder.serialize diff --git a/benchmarks/profiler_sample_gvl.rb b/benchmarks/profiler_sample_gvl.rb index 76e8528f417..7688730b714 100644 --- a/benchmarks/profiler_sample_gvl.rb +++ b/benchmarks/profiler_sample_gvl.rb @@ -27,7 +27,7 @@ def initialize @target_thread = thread_with_very_deep_stack # Sample once to trigger thread context creation for all threads (including @target_thread) - Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample(@collector, PROFILER_OVERHEAD_STACK_THREAD) + Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample(@collector, PROFILER_OVERHEAD_STACK_THREAD, false) end def create_profiler diff --git a/benchmarks/profiler_sample_loop_v2.rb b/benchmarks/profiler_sample_loop_v2.rb index e007375cf6c..624f29bad25 100644 --- a/benchmarks/profiler_sample_loop_v2.rb +++ b/benchmarks/profiler_sample_loop_v2.rb @@ -37,7 +37,7 @@ def run_benchmark ) x.report("stack collector #{ENV['CONFIG']}") do - Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample(@collector, PROFILER_OVERHEAD_STACK_THREAD) + Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample(@collector, PROFILER_OVERHEAD_STACK_THREAD, false) end x.save! "#{File.basename(__FILE__)}-results.json" unless VALIDATE_BENCHMARK_MODE diff --git a/benchmarks/profiler_sample_serialize.rb b/benchmarks/profiler_sample_serialize.rb index 47ae1b6854a..b90997c42d5 100644 --- a/benchmarks/profiler_sample_serialize.rb +++ b/benchmarks/profiler_sample_serialize.rb @@ -35,7 +35,7 @@ def run_benchmark simulate_seconds = 60 (samples_per_second * simulate_seconds).times do - Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample(@collector, PROFILER_OVERHEAD_STACK_THREAD) + Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample(@collector, PROFILER_OVERHEAD_STACK_THREAD, false) end @recorder.serialize From 54c9d05d576f0a8bde9b35703101ac4644b1619e Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 19 Dec 2024 14:56:37 +0000 Subject: [PATCH 5/6] Check if symbol is static before calling SYM2ID on it It occurs to me that if a symbol is dynamic, we were causing it to become a static symbol (e.g. making it never be able to be garbage collected). This can be very bad! And also, we know the symbol we're looking for must be a static symbol because if nothing else, our initialization caused it to become a static symbol. Thus, if we see a dynamic symbol, we can stop there, since by definition it won't be the symbol we're looking after. This is... really awkward to add a specific unit test for, so I've just relied on our existing test coverage to show that this has not affected the correctness of our otel code. 
---
 .../collectors_thread_context.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c
index b8e7aedfefb..4afb23c5a9e 100644
--- a/ext/datadog_profiling_native_extension/collectors_thread_context.c
+++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c
@@ -1750,7 +1750,7 @@ static void otel_without_ddtrace_trace_identifiers_for(
   VALUE root_span_type = rb_ivar_get(local_root_span.span, at_kind_id /* @kind */);
   // We filter out spans that don't have `kind: :server`
-  if (root_span_type == Qnil || !RB_TYPE_P(root_span_type, T_SYMBOL) || SYM2ID(root_span_type) != server_id) return;
+  if (root_span_type == Qnil || !RB_TYPE_P(root_span_type, T_SYMBOL) || !RB_STATIC_SYM_P(root_span_type) || SYM2ID(root_span_type) != server_id) return;

   VALUE trace_resource = rb_ivar_get(local_root_span.span, at_name_id /* @name */);
   if (!RB_TYPE_P(trace_resource, T_STRING)) return;

From 7e493241465e0a6e1a817dc0be9b4e5502bb6a9a Mon Sep 17 00:00:00 2001
From: Ivo Anjo
Date: Thu, 19 Dec 2024 17:20:49 +0000
Subject: [PATCH 6/6] Document that unsafe api calls checker is only for test code

---
 .../unsafe_api_calls_check.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/ext/datadog_profiling_native_extension/unsafe_api_calls_check.h b/ext/datadog_profiling_native_extension/unsafe_api_calls_check.h
index 092a59f30b3..c6c8dc56cf5 100644
--- a/ext/datadog_profiling_native_extension/unsafe_api_calls_check.h
+++ b/ext/datadog_profiling_native_extension/unsafe_api_calls_check.h
@@ -21,5 +21,11 @@
 // stop the Ruby VM before further damage happens (and hopefully giving us a stack trace clearly pointing to the culprit).

 void unsafe_api_calls_check_init(void);
+
+// IMPORTANT: This method **MUST** only be called from test code, as it causes an immediate hard-crash on the Ruby VM
+// when it detects a potential issue, and that's not something we want for production apps.
+//
+// In the future we may introduce some kind of setting (off by default) to also allow this to be safely used
+// in production code if needed.
 void debug_enter_unsafe_context(void);
 void debug_leave_unsafe_context(void);