diff --git a/benchmarks/profiler_gc.rb b/benchmarks/profiler_gc.rb
index e96198cdfba..401af152d84 100644
--- a/benchmarks/profiler_gc.rb
+++ b/benchmarks/profiler_gc.rb
@@ -14,7 +14,7 @@ def create_profiler
 
   # We take a dummy sample so that the context for the main thread is created, as otherwise the GC profiling methods do
   # not create it (because we don't want to do memory allocations in the middle of GC)
-  Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample(@collector, Thread.current)
+  Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample(@collector, Thread.current, false)
 end
 
 def run_benchmark
@@ -29,7 +29,7 @@ def run_benchmark
     x.report('profiler gc') do
       Datadog::Profiling::Collectors::ThreadContext::Testing._native_on_gc_start(@collector)
       Datadog::Profiling::Collectors::ThreadContext::Testing._native_on_gc_finish(@collector)
-      Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample_after_gc(@collector, false)
+      Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample_after_gc(@collector, false, false)
     end
 
     x.save! "#{File.basename(__FILE__)}-results.json" unless VALIDATE_BENCHMARK_MODE
@@ -52,7 +52,7 @@ def run_benchmark
   estimated_gc_per_minute.times do
     Datadog::Profiling::Collectors::ThreadContext::Testing._native_on_gc_start(@collector)
     Datadog::Profiling::Collectors::ThreadContext::Testing._native_on_gc_finish(@collector)
-    Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample_after_gc(@collector, false)
+    Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample_after_gc(@collector, false, false)
   end
 
   @recorder.serialize
diff --git a/benchmarks/profiler_sample_gvl.rb b/benchmarks/profiler_sample_gvl.rb
index 76e8528f417..7688730b714 100644
--- a/benchmarks/profiler_sample_gvl.rb
+++ b/benchmarks/profiler_sample_gvl.rb
@@ -27,7 +27,7 @@ def initialize
   @target_thread = thread_with_very_deep_stack
 
   # Sample once to trigger thread context creation for all threads (including @target_thread)
-  Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample(@collector, PROFILER_OVERHEAD_STACK_THREAD)
+  Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample(@collector, PROFILER_OVERHEAD_STACK_THREAD, false)
 end
 
 def create_profiler
diff --git a/benchmarks/profiler_sample_loop_v2.rb b/benchmarks/profiler_sample_loop_v2.rb
index e007375cf6c..624f29bad25 100644
--- a/benchmarks/profiler_sample_loop_v2.rb
+++ b/benchmarks/profiler_sample_loop_v2.rb
@@ -37,7 +37,7 @@ def run_benchmark
     )
 
     x.report("stack collector #{ENV['CONFIG']}") do
-      Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample(@collector, PROFILER_OVERHEAD_STACK_THREAD)
+      Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample(@collector, PROFILER_OVERHEAD_STACK_THREAD, false)
     end
 
     x.save! "#{File.basename(__FILE__)}-results.json" unless VALIDATE_BENCHMARK_MODE
diff --git a/benchmarks/profiler_sample_serialize.rb b/benchmarks/profiler_sample_serialize.rb
index 47ae1b6854a..b90997c42d5 100644
--- a/benchmarks/profiler_sample_serialize.rb
+++ b/benchmarks/profiler_sample_serialize.rb
@@ -35,7 +35,7 @@ def run_benchmark
     simulate_seconds = 60
 
     (samples_per_second * simulate_seconds).times do
-      Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample(@collector, PROFILER_OVERHEAD_STACK_THREAD)
+      Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample(@collector, PROFILER_OVERHEAD_STACK_THREAD, false)
    end
 
    @recorder.serialize
diff --git a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
index e4e9b69473e..723e2d93b33 100644
--- a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
+++ b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
@@ -1171,6 +1171,16 @@ static void on_newobj_event(DDTRACE_UNUSED VALUE unused1, DDTRACE_UNUSED void *u
     return;
   }
 
+  // If Ruby is in the middle of raising an exception, we don't want to try to sample. This is because if we accidentally
+  // trigger an exception inside the profiler code, bad things will happen (specifically, Ruby will try to kill off the
+  // thread even though we may try to catch the exception).
+  //
+  // Note that "in the middle of raising an exception" means the exception itself has already been allocated.
+  // What's getting allocated now is probably the backtrace objects (@ivoanjo: or at least that's what I've observed)
+  if (is_raised_flag_set(rb_thread_current())) {
+    return;
+  }
+
   // Hot path: Dynamic sampling rate is usually enabled and the sampling decision is usually false
   if (RB_LIKELY(state->dynamic_sampling_rate_enabled && !discrete_dynamic_sampler_should_sample(&state->allocation_sampler))) {
     state->stats.allocation_skipped++;
diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c
index 0b56268bed5..4afb23c5a9e 100644
--- a/ext/datadog_profiling_native_extension/collectors_thread_context.c
+++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c
@@ -9,6 +9,7 @@
 #include "private_vm_api_access.h"
 #include "stack_recorder.h"
 #include "time_helpers.h"
+#include "unsafe_api_calls_check.h"
 
 // Used to trigger sampling of threads, based on external "events", such as:
 // * periodic timer for cpu-time and wall-time
@@ -203,10 +204,10 @@ static int hash_map_per_thread_context_mark(st_data_t key_thread, st_data_t _val
 static int hash_map_per_thread_context_free_values(st_data_t _thread, st_data_t value_per_thread_context, st_data_t _argument);
 static VALUE _native_new(VALUE klass);
 static VALUE _native_initialize(int argc, VALUE *argv, DDTRACE_UNUSED VALUE _self);
-static VALUE _native_sample(VALUE self, VALUE collector_instance, VALUE profiler_overhead_stack_thread);
+static VALUE _native_sample(VALUE self, VALUE collector_instance, VALUE profiler_overhead_stack_thread, VALUE allow_exception);
 static VALUE _native_on_gc_start(VALUE self, VALUE collector_instance);
 static VALUE _native_on_gc_finish(VALUE self, VALUE collector_instance);
-static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE reset_monotonic_to_system_state);
+static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE reset_monotonic_to_system_state, VALUE allow_exception);
 static void update_metrics_and_sample(
   struct thread_context_collector_state *state,
   VALUE thread_being_sampled,
@@ -290,6 +291,7 @@ static void otel_without_ddtrace_trace_identifiers_for(
 );
 static struct otel_span otel_span_from(VALUE otel_context, VALUE otel_current_span_key);
 static uint64_t otel_span_id_to_uint(VALUE otel_span_id);
+static VALUE safely_lookup_hash_without_going_into_ruby_code(VALUE hash, VALUE key);
 
 void collectors_thread_context_init(VALUE profiling_module) {
   VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors");
@@ -310,11 +312,11 @@ void collectors_thread_context_init(VALUE profiling_module) {
   rb_define_singleton_method(collectors_thread_context_class, "_native_initialize", _native_initialize, -1);
   rb_define_singleton_method(collectors_thread_context_class, "_native_inspect", _native_inspect, 1);
   rb_define_singleton_method(collectors_thread_context_class, "_native_reset_after_fork", _native_reset_after_fork, 1);
-  rb_define_singleton_method(testing_module, "_native_sample", _native_sample, 2);
+  rb_define_singleton_method(testing_module, "_native_sample", _native_sample, 3);
   rb_define_singleton_method(testing_module, "_native_sample_allocation", _native_sample_allocation, 3);
   rb_define_singleton_method(testing_module, "_native_on_gc_start", _native_on_gc_start, 1);
   rb_define_singleton_method(testing_module, "_native_on_gc_finish", _native_on_gc_finish, 1);
-  rb_define_singleton_method(testing_module, "_native_sample_after_gc", _native_sample_after_gc, 2);
+  rb_define_singleton_method(testing_module, "_native_sample_after_gc", _native_sample_after_gc, 3);
   rb_define_singleton_method(testing_module, "_native_thread_list", _native_thread_list, 0);
   rb_define_singleton_method(testing_module, "_native_per_thread_context", _native_per_thread_context, 1);
   rb_define_singleton_method(testing_module, "_native_stats", _native_stats, 1);
@@ -504,31 +506,49 @@ static VALUE _native_initialize(int argc, VALUE *argv, DDTRACE_UNUSED VALUE _sel
 
 // This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec.
 // It SHOULD NOT be used for other purposes.
-static VALUE _native_sample(DDTRACE_UNUSED VALUE _self, VALUE collector_instance, VALUE profiler_overhead_stack_thread) {
+static VALUE _native_sample(DDTRACE_UNUSED VALUE _self, VALUE collector_instance, VALUE profiler_overhead_stack_thread, VALUE allow_exception) {
+  ENFORCE_BOOLEAN(allow_exception);
+
   if (!is_thread_alive(profiler_overhead_stack_thread)) rb_raise(rb_eArgError, "Unexpected: profiler_overhead_stack_thread is not alive");
 
+  if (allow_exception == Qfalse) debug_enter_unsafe_context();
+
   thread_context_collector_sample(collector_instance, monotonic_wall_time_now_ns(RAISE_ON_FAILURE), profiler_overhead_stack_thread);
+
+  if (allow_exception == Qfalse) debug_leave_unsafe_context();
+
   return Qtrue;
 }
 
 // This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec.
 // It SHOULD NOT be used for other purposes.
 static VALUE _native_on_gc_start(DDTRACE_UNUSED VALUE self, VALUE collector_instance) {
+  debug_enter_unsafe_context();
+
   thread_context_collector_on_gc_start(collector_instance);
+
+  debug_leave_unsafe_context();
+
   return Qtrue;
 }
 
 // This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec.
 // It SHOULD NOT be used for other purposes.
 static VALUE _native_on_gc_finish(DDTRACE_UNUSED VALUE self, VALUE collector_instance) {
+  debug_enter_unsafe_context();
+
   (void) !thread_context_collector_on_gc_finish(collector_instance);
+
+  debug_leave_unsafe_context();
+
   return Qtrue;
 }
 
 // This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec.
 // It SHOULD NOT be used for other purposes.
-static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE reset_monotonic_to_system_state) {
+static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE reset_monotonic_to_system_state, VALUE allow_exception) {
   ENFORCE_BOOLEAN(reset_monotonic_to_system_state);
+  ENFORCE_BOOLEAN(allow_exception);
 
   struct thread_context_collector_state *state;
   TypedData_Get_Struct(collector_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state);
@@ -537,7 +557,12 @@ static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_
     state->time_converter_state = (monotonic_to_system_epoch_state) MONOTONIC_TO_SYSTEM_EPOCH_INITIALIZER;
   }
 
+  if (allow_exception == Qfalse) debug_enter_unsafe_context();
+
   thread_context_collector_sample_after_gc(collector_instance);
+
+  if (allow_exception == Qfalse) debug_leave_unsafe_context();
+
   return Qtrue;
 }
 
@@ -982,7 +1007,13 @@ static void trigger_sample_for_thread(
 // It SHOULD NOT be used for other purposes.
 static VALUE _native_thread_list(DDTRACE_UNUSED VALUE _self) {
   VALUE result = rb_ary_new();
+
+  debug_enter_unsafe_context();
+
   ddtrace_thread_list(result);
+
+  debug_leave_unsafe_context();
+
   return result;
 }
 
@@ -1501,7 +1532,12 @@ void thread_context_collector_sample_allocation(VALUE self_instance, unsigned in
 // This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec.
 // It SHOULD NOT be used for other purposes.
 static VALUE _native_sample_allocation(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE sample_weight, VALUE new_object) {
+  debug_enter_unsafe_context();
+
   thread_context_collector_sample_allocation(collector_instance, NUM2UINT(sample_weight), new_object);
+
+  debug_leave_unsafe_context();
+
   return Qtrue;
 }
 
@@ -1597,7 +1633,7 @@ static void ddtrace_otel_trace_identifiers_for(
   // trace and span representing it. Each ddtrace trace is then connected to the previous otel span, forming a linked
   // list. The local root span is going to be the trace/span we find at the end of this linked list.
   while (otel_values != Qnil) {
-    VALUE otel_span = rb_hash_lookup(otel_values, otel_current_span_key);
+    VALUE otel_span = safely_lookup_hash_without_going_into_ruby_code(otel_values, otel_current_span_key);
     if (otel_span == Qnil) break;
     VALUE next_trace = rb_ivar_get(otel_span, at_datadog_trace_id);
     if (next_trace == Qnil) break;
@@ -1640,7 +1676,12 @@ void thread_context_collector_sample_skipped_allocation_samples(VALUE self_insta
 }
 
 static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE skipped_samples) {
+  debug_enter_unsafe_context();
+
   thread_context_collector_sample_skipped_allocation_samples(collector_instance, NUM2UINT(skipped_samples));
+
+  debug_leave_unsafe_context();
+
   return Qtrue;
 }
 
@@ -1709,7 +1750,7 @@ static void otel_without_ddtrace_trace_identifiers_for(
   VALUE root_span_type = rb_ivar_get(local_root_span.span, at_kind_id /* @kind */);
 
   // We filter out spans that don't have `kind: :server`
-  if (root_span_type == Qnil || !RB_TYPE_P(root_span_type, T_SYMBOL) || SYM2ID(root_span_type) != server_id) return;
+  if (root_span_type == Qnil || !RB_TYPE_P(root_span_type, T_SYMBOL) || !RB_STATIC_SYM_P(root_span_type) || SYM2ID(root_span_type) != server_id) return;
 
   VALUE trace_resource = rb_ivar_get(local_root_span.span, at_name_id /* @name */);
   if (!RB_TYPE_P(trace_resource, T_STRING)) return;
@@ -1726,7 +1767,7 @@ static struct otel_span otel_span_from(VALUE otel_context, VALUE otel_current_sp
   if (context_entries == Qnil || !RB_TYPE_P(context_entries, T_HASH)) return failed;
 
   // If it exists, context_entries is expected to be a Hash[OpenTelemetry::Context::Key, OpenTelemetry::Trace::Span]
-  VALUE span = rb_hash_lookup(context_entries, otel_current_span_key);
+  VALUE span = safely_lookup_hash_without_going_into_ruby_code(context_entries, otel_current_span_key);
   if (span == Qnil) return failed;
 
   // If it exists, span_context is expected to be a OpenTelemetry::Trace::SpanContext (don't confuse it with OpenTelemetry::Context)
@@ -1979,31 +2020,53 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {
 static VALUE _native_on_gvl_waiting(DDTRACE_UNUSED VALUE self, VALUE thread) {
   ENFORCE_THREAD(thread);
 
+  debug_enter_unsafe_context();
+
   thread_context_collector_on_gvl_waiting(thread_from_thread_object(thread));
+
+  debug_leave_unsafe_context();
+
   return Qnil;
 }
 
 static VALUE _native_gvl_waiting_at_for(DDTRACE_UNUSED VALUE self, VALUE thread) {
   ENFORCE_THREAD(thread);
 
+  debug_enter_unsafe_context();
+
   intptr_t gvl_waiting_at = gvl_profiling_state_thread_object_get(thread);
+
+  debug_leave_unsafe_context();
+
   return LONG2NUM(gvl_waiting_at);
 }
 
 static VALUE _native_on_gvl_running(DDTRACE_UNUSED VALUE self, VALUE thread) {
   ENFORCE_THREAD(thread);
 
-  return thread_context_collector_on_gvl_running(thread_from_thread_object(thread)) == ON_GVL_RUNNING_SAMPLE ? Qtrue : Qfalse;
+  debug_enter_unsafe_context();
+
+  VALUE result = thread_context_collector_on_gvl_running(thread_from_thread_object(thread)) == ON_GVL_RUNNING_SAMPLE ? Qtrue : Qfalse;
+
+  debug_leave_unsafe_context();
+
+  return result;
 }
 
 static VALUE _native_sample_after_gvl_running(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread) {
   ENFORCE_THREAD(thread);
 
-  return thread_context_collector_sample_after_gvl_running(
+  debug_enter_unsafe_context();
+
+  VALUE result = thread_context_collector_sample_after_gvl_running(
     collector_instance,
     thread,
     monotonic_wall_time_now_ns(RAISE_ON_FAILURE)
   );
+
+  debug_leave_unsafe_context();
+
+  return result;
 }
 
 static VALUE _native_apply_delta_to_cpu_time_at_previous_sample_ns(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread, VALUE delta_ns) {
@@ -2030,3 +2093,37 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {
     DDTRACE_UNUSED long current_cpu_time_ns
   ) { return false; }
 #endif // NO_GVL_INSTRUMENTATION
+
+#define MAX_SAFE_LOOKUP_SIZE 16
+
+typedef struct { VALUE lookup_key; VALUE result; } safe_lookup_hash_state;
+
+static int safe_lookup_hash_iterate(VALUE key, VALUE value, VALUE state_ptr) {
+  safe_lookup_hash_state *state = (safe_lookup_hash_state *) state_ptr;
+
+  if (key == state->lookup_key) {
+    state->result = value;
+    return ST_STOP;
+  }
+
+  return ST_CONTINUE;
+}
+
+// This method exists because we need to look up a hash during sampling, but we don't want to invoke any
+// Ruby code as a side effect. To be able to look up by hash, `rb_hash_lookup` calls `#hash` on the key,
+// which we want to avoid.
+// Thus, instead, we opt to just iterate through the hash and check if we can find what we're looking for.
+//
+// To avoid having too much overhead here we only iterate in hashes up to MAX_SAFE_LOOKUP_SIZE.
+// Note that we don't even try to iterate if the hash is bigger -- this is to avoid flaky behavior where
+// depending on the internal storage order of the hash we may or may not find the key, and instead we always
+// enforce the size.
+static VALUE safely_lookup_hash_without_going_into_ruby_code(VALUE hash, VALUE key) {
+  if (!RB_TYPE_P(hash, T_HASH) || RHASH_SIZE(hash) > MAX_SAFE_LOOKUP_SIZE) return Qnil;
+
+  safe_lookup_hash_state state = {.lookup_key = key, .result = Qnil};
+
+  rb_hash_foreach(hash, safe_lookup_hash_iterate, (VALUE) &state);
+
+  return state.result;
+}
diff --git a/ext/datadog_profiling_native_extension/private_vm_api_access.c b/ext/datadog_profiling_native_extension/private_vm_api_access.c
index 59e5c644882..f7c188861d7 100644
--- a/ext/datadog_profiling_native_extension/private_vm_api_access.c
+++ b/ext/datadog_profiling_native_extension/private_vm_api_access.c
@@ -800,3 +800,6 @@ static inline int ddtrace_imemo_type(VALUE imemo) {
     return current_thread;
   }
 #endif
+
+// Is the VM smack in the middle of raising an exception?
+bool is_raised_flag_set(VALUE thread) { return thread_struct_from_object(thread)->ec->raised_flag > 0; }
diff --git a/ext/datadog_profiling_native_extension/private_vm_api_access.h b/ext/datadog_profiling_native_extension/private_vm_api_access.h
index c40992274cb..3e412f51ea5 100644
--- a/ext/datadog_profiling_native_extension/private_vm_api_access.h
+++ b/ext/datadog_profiling_native_extension/private_vm_api_access.h
@@ -68,3 +68,5 @@ const char *imemo_kind(VALUE imemo);
 
 #define ENFORCE_THREAD(value) \
   { if (RB_UNLIKELY(!rb_typeddata_is_kind_of(value, RTYPEDDATA_TYPE(rb_thread_current())))) raise_unexpected_type(value, ADD_QUOTES(value), "Thread", __FILE__, __LINE__, __func__); }
+
+bool is_raised_flag_set(VALUE thread);
diff --git a/ext/datadog_profiling_native_extension/profiling.c b/ext/datadog_profiling_native_extension/profiling.c
index 315ca0d2954..a7bfe0d466b 100644
--- a/ext/datadog_profiling_native_extension/profiling.c
+++ b/ext/datadog_profiling_native_extension/profiling.c
@@ -11,6 +11,7 @@
 #include "ruby_helpers.h"
 #include "setup_signal_handler.h"
 #include "time_helpers.h"
+#include "unsafe_api_calls_check.h"
 
 // Each class/module here is implemented in their separate file
 void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module);
@@ -56,6 +57,7 @@ void DDTRACE_EXPORT Init_datadog_profiling_native_extension(void) {
   collectors_thread_context_init(profiling_module);
   http_transport_init(profiling_module);
   stack_recorder_init(profiling_module);
+  unsafe_api_calls_check_init();
 
   // Hosts methods used for testing the native code using RSpec
   VALUE testing_module = rb_define_module_under(native_extension_module, "Testing");
diff --git a/ext/datadog_profiling_native_extension/unsafe_api_calls_check.c b/ext/datadog_profiling_native_extension/unsafe_api_calls_check.c
new file mode 100644
index 00000000000..c3c23b7f1c7
--- /dev/null
+++ b/ext/datadog_profiling_native_extension/unsafe_api_calls_check.c
@@ -0,0 +1,47 @@
+#include <ruby.h>
+#include <ruby/debug.h>
+#include <stdbool.h>
+
+#include "datadog_ruby_common.h"
+#include "unsafe_api_calls_check.h"
+#include "extconf.h"
+
+static bool inside_unsafe_context = false;
+
+#ifndef NO_POSTPONED_TRIGGER
+  static rb_postponed_job_handle_t check_for_unsafe_api_calls_handle;
+#endif
+
+static void check_for_unsafe_api_calls(DDTRACE_UNUSED void *_unused);
+
+void unsafe_api_calls_check_init(void) {
+  #ifndef NO_POSTPONED_TRIGGER
+    int unused_flags = 0;
+
+    check_for_unsafe_api_calls_handle = rb_postponed_job_preregister(unused_flags, check_for_unsafe_api_calls, NULL);
+
+    if (check_for_unsafe_api_calls_handle == POSTPONED_JOB_HANDLE_INVALID) {
+      rb_raise(rb_eRuntimeError, "Failed to register check_for_unsafe_api_calls_handle postponed job (got POSTPONED_JOB_HANDLE_INVALID)");
+    }
+  #endif
+}
+
+void debug_enter_unsafe_context(void) {
+  inside_unsafe_context = true;
+
+  #ifndef NO_POSTPONED_TRIGGER
+    rb_postponed_job_trigger(check_for_unsafe_api_calls_handle);
+  #else
+    rb_postponed_job_register(0, check_for_unsafe_api_calls, NULL);
+  #endif
+}
+
+void debug_leave_unsafe_context(void) {
+  inside_unsafe_context = false;
+}
+
+static void check_for_unsafe_api_calls(DDTRACE_UNUSED void *_unused) {
+  if (inside_unsafe_context) rb_bug(
+    "Datadog Ruby profiler detected callback nested inside sample. Please report this at https://github.com/datadog/dd-trace-rb/blob/master/CONTRIBUTING.md#found-a-bug"
+  );
+}
diff --git a/ext/datadog_profiling_native_extension/unsafe_api_calls_check.h b/ext/datadog_profiling_native_extension/unsafe_api_calls_check.h
new file mode 100644
index 00000000000..c6c8dc56cf5
--- /dev/null
+++ b/ext/datadog_profiling_native_extension/unsafe_api_calls_check.h
@@ -0,0 +1,31 @@
+#pragma once
+
+// This checker is used to detect accidental thread scheduling switching points happening during profiling sampling.
+//
+// Specifically, when the profiler is sampling, we're never supposed to call into Ruby code (e.g. methods
+// implemented using Ruby code) or allocate Ruby objects.
+// That's because those events introduce thread switch points, and really we don't want the VM switching between threads
+// in the middle of the profiler sampling.
+// This includes raising exceptions, unless we're trying to stop the profiler, and even then we must be careful.
+//
+// The above is especially true in situations such as GC profiling or allocation/heap profiling, as in those situations
+// we can even crash the Ruby VM if we switch away at the wrong time.
+//
+// The below APIs can be used to detect these situations. They work by relying on the following observation:
+// in most (all?) thread switch points, Ruby will check for interrupts and run the postponed jobs.
+//
+// Thus, if we set a flag while we're sampling (inside_unsafe_context), trigger the postponed job, and then only unset
+// the flag after sampling, the correct thing to happen is that the postponed job should never see the flag.
+//
+// If, however, we have a bug and there's a thread switch point, our postponed job will see the flag and immediately
+// stop the Ruby VM before further damage happens (and hopefully giving us a stack trace clearly pointing to the culprit).
+
+void unsafe_api_calls_check_init(void);
+
+// IMPORTANT: This method **MUST** only be called from test code, as it causes an immediate hard-crash on the Ruby VM
+// when it detects a potential issue, and that's not something we want for production apps.
+//
+// In the future we may introduce some kind of setting (off by default) to also allow this to be safely used
+// in production code if needed.
+void debug_enter_unsafe_context(void);
+void debug_leave_unsafe_context(void);
diff --git a/lib/datadog/core/configuration.rb b/lib/datadog/core/configuration.rb
index 826a3bb414a..4b2b21d6ee7 100644
--- a/lib/datadog/core/configuration.rb
+++ b/lib/datadog/core/configuration.rb
@@ -236,7 +236,7 @@ def safely_synchronize
       rescue ThreadError => e
         logger_without_components.error(
           'Detected deadlock during datadog initialization. ' \
-          'Please report this at https://github.com/DataDog/dd-trace-rb/blob/master/CONTRIBUTING.md#found-a-bug' \
+          'Please report this at https://github.com/datadog/dd-trace-rb/blob/master/CONTRIBUTING.md#found-a-bug' \
           "\n\tSource:\n\t#{Array(e.backtrace).join("\n\t")}"
         )
         nil
diff --git a/spec/datadog/profiling/collectors/thread_context_spec.rb b/spec/datadog/profiling/collectors/thread_context_spec.rb
index e7b1bd28ba0..0296be0ff72 100644
--- a/spec/datadog/profiling/collectors/thread_context_spec.rb
+++ b/spec/datadog/profiling/collectors/thread_context_spec.rb
@@ -66,8 +66,8 @@
     end
   end
 
-  def sample(profiler_overhead_stack_thread: Thread.current)
-    described_class::Testing._native_sample(cpu_and_wall_time_collector, profiler_overhead_stack_thread)
+  def sample(profiler_overhead_stack_thread: Thread.current, allow_exception: false)
+    described_class::Testing._native_sample(cpu_and_wall_time_collector, profiler_overhead_stack_thread, allow_exception)
   end
 
   def on_gc_start
@@ -78,8 +78,12 @@ def on_gc_finish
     described_class::Testing._native_on_gc_finish(cpu_and_wall_time_collector)
   end
 
-  def sample_after_gc(reset_monotonic_to_system_state: false)
-    described_class::Testing._native_sample_after_gc(cpu_and_wall_time_collector, reset_monotonic_to_system_state)
+  def sample_after_gc(reset_monotonic_to_system_state: false, allow_exception: false)
+    described_class::Testing._native_sample_after_gc(
+      cpu_and_wall_time_collector,
+      reset_monotonic_to_system_state,
+      allow_exception,
+    )
   end
 
   def sample_allocation(weight:, new_object: Object.new)
@@ -584,6 +588,7 @@ def self.otel_otlp_exporter_available?
     false
   end
 
+  # When opentelemetry-sdk is on the Gemfile, but not opentelemetry-exporter-otlp
   context "when trace comes from otel sdk", if: otel_sdk_available? && !otel_otlp_exporter_available? do
     let(:otel_tracer) do
      require "datadog/opentelemetry"
@@ -618,6 +623,31 @@ def self.otel_otlp_exporter_available?
       expect(t1_sample.labels).to_not include("trace endpoint": anything)
     end
 
+    describe 'accessing the current span' do
+      before do
+        allow(Datadog.logger).to receive(:error)
+
+        # initialize otel context reading
+        sample
+        # clear samples
+        recorder.serialize!
+      end
+
+      it 'does not try to hash the CURRENT_SPAN_KEY' do
+        inner_check_ran = false
+
+        otel_tracer.in_span("profiler.test") do |_span|
+          expect(OpenTelemetry::Trace.const_get(:CURRENT_SPAN_KEY)).to_not receive(:hash)
+
+          sample_allocation(weight: 1)
+
+          inner_check_ran = true
+        end
+
+        expect(inner_check_ran).to be true
+      end
+    end
+
     context "when there are multiple otel spans nested" do
       let(:t1) do
         Thread.new(ready_queue, otel_tracer) do |ready_queue, otel_tracer|
@@ -717,6 +747,7 @@ def self.otel_otlp_exporter_available?
     end
   end
 
+  # When opentelemetry-sdk AND opentelemetry-exporter-otlp are on the Gemfile
  context(
    "when trace comes from otel sdk and the ddtrace otel support is not loaded",
    if: otel_sdk_available? && otel_otlp_exporter_available?
@@ -765,7 +796,7 @@ def otel_span_id_to_i(span_id)
       expect(t1_sample.labels).to_not include("trace endpoint": anything)
     end
 
-    context 'reading CURRENT_SPAN_KEY' do
+    describe 'reading CURRENT_SPAN_KEY into otel_current_span_key' do
       let!(:ran_log) { [] }
 
       let(:setup_failure) do
@@ -782,18 +813,18 @@ def otel_span_id_to_i(span_id)
         )
       end
 
-      context 'raises an exception' do
+      context 'when an exception is raised' do
         before { setup_failure }
         after { expect(ran_log).to eq [:ran_code] }
 
         it 'does not leave the exception pending' do
-          sample
+          sample(allow_exception: true)
 
           expect($!).to be nil
         end
 
         it 'omits the "local root span id" and "span id" labels in the sample' do
-          sample
+          sample(allow_exception: true)
 
           expect(t1_sample.labels.keys).to_not include(:"local root span id", :"span id")
         end
@@ -814,6 +845,61 @@ def otel_span_id_to_i(span_id)
       end
     end
 
+    describe 'accessing the current span' do
+      before do
+        allow(OpenTelemetry.logger).to receive(:error)
+
+        # initialize otel context reading
+        sample
+        # clear samples
+        recorder.serialize!
+      end
+
+      it 'does not try to hash the CURRENT_SPAN_KEY' do
+        inner_check_ran = false
+
+        otel_tracer.in_span("profiler.test") do |_span|
+          expect(OpenTelemetry::Trace.const_get(:CURRENT_SPAN_KEY)).to_not receive(:hash)
+
+          sample_allocation(weight: 1)
+
+          inner_check_ran = true
+        end
+
+        expect(inner_check_ran).to be true
+      end
+
+      context 'when there are more than MAX_SAFE_LOOKUP_SIZE entries in the otel context' do
+        let(:max_safe_lookup_size) { 16 } # Value of MAX_SAFE_LOOKUP_SIZE in C code
+
+        it 'does not try to look up the context' do
+          otel_tracer.in_span("profiler.test") do |_span|
+            current_size = OpenTelemetry::Context.current.instance_variable_get(:@entries).size
+
+            OpenTelemetry::Context.with_values(
+              Array.new((max_safe_lookup_size + 1 - current_size)) { |it| ["key_#{it}", it] }.to_h
+            ) do
+              sample_allocation(weight: 12)
+            end
+
+            OpenTelemetry::Context.with_values(
+              Array.new((max_safe_lookup_size - current_size)) { |it| ["key_#{it}", it] }.to_h
+            ) do
+              sample_allocation(weight: 34)
+            end
+          end
+
+          result = samples_for_thread(samples, Thread.current)
+
+          expect(result.size).to be 2
+          expect(result.find { |it| it.values.fetch(:"alloc-samples") == 12 }.labels.keys)
+            .to_not include(:"local root span id", :"span id")
+          expect(result.find { |it| it.values.fetch(:"alloc-samples") == 34 }.labels.keys)
+            .to include(:"local root span id", :"span id")
+        end
+      end
+    end
+
     context 'when otel_context_enabled is false' do
       let(:otel_context_enabled) { false }
 
@@ -1433,7 +1519,7 @@ def sample_and_check(expected_state:)
     context "when called before on_gc_start/on_gc_finish" do
       it do
-        expect { sample_after_gc }.to raise_error(RuntimeError, /Unexpected call to sample_after_gc/)
+        expect { sample_after_gc(allow_exception: true) }.to raise_error(RuntimeError, /Unexpected call to sample_after_gc/)
       end
     end
 
@@ -1451,7 +1537,8 @@ def sample_and_check(expected_state:)
       it do
         sample_after_gc
 
-        expect { sample_after_gc }.to raise_error(RuntimeError, /Unexpected call to sample_after_gc/)
+        expect { sample_after_gc(allow_exception: true) }
+          .to raise_error(RuntimeError, /Unexpected call to sample_after_gc/)
       end
     end
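Illustrative sketch (not part of the patch): how the updated Testing helpers are driven once they take the extra allow_exception argument. The collector below is assumed to be a Datadog::Profiling::Collectors::ThreadContext wired up as in benchmarks/profiler_gc.rb; passing false keeps the debug_enter_unsafe_context/debug_leave_unsafe_context guard armed, while specs that deliberately raise from inside sampling pass true to skip it.

# Hypothetical driver; `collector` is assumed to be set up as in the benchmarks above.
testing = Datadog::Profiling::Collectors::ThreadContext::Testing

# Regular sampling path: the trailing false is the new allow_exception flag.
testing._native_sample(collector, Thread.current, false)

# GC sampling path, mirroring the benchmark loop.
testing._native_on_gc_start(collector)
testing._native_on_gc_finish(collector)
testing._native_sample_after_gc(collector, false, false) # reset_monotonic_to_system_state, allow_exception

# Specs that expect an exception from inside sampling opt out of the guard,
# e.g. sample(allow_exception: true) in thread_context_spec.rb.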