diff --git a/CHANGELOG.md b/CHANGELOG.md index e573f61e55e..a19036caee0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - Fix inconsistent request body closing in `go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp`. (#5954) - Fix inconsistent request body closing in `go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp`. (#5954) - Fix invalid exemplar keys in `go.opentelemetry.io/otel/exporters/prometheus`. (#5995) +- Fix attribute value truncation in `go.opentelemetry.io/otel/sdk/trace`. (#5997) diff --git a/sdk/trace/span.go b/sdk/trace/span.go index 17f883c2c86..15cd16a3254 100644 --- a/sdk/trace/span.go +++ b/sdk/trace/span.go @@ -347,47 +347,92 @@ func truncateAttr(limit int, attr attribute.KeyValue) attribute.KeyValue { } switch attr.Value.Type() { case attribute.STRING: - if v := attr.Value.AsString(); len(v) > limit { - return attr.Key.String(safeTruncate(v, limit)) - } + v := attr.Value.AsString() + return attr.Key.String(truncate(limit, v)) case attribute.STRINGSLICE: v := attr.Value.AsStringSlice() for i := range v { - if len(v[i]) > limit { - v[i] = safeTruncate(v[i], limit) - } + v[i] = truncate(limit, v[i]) } return attr.Key.StringSlice(v) } return attr } -// safeTruncate truncates the string and guarantees valid UTF-8 is returned. -func safeTruncate(input string, limit int) string { - if trunc, ok := safeTruncateValidUTF8(input, limit); ok { - return trunc +// truncate returns a truncated version of s such that it contains less than +// the limit number of characters. Truncation is applied by returning the limit +// number of valid characters contained in s. +// +// If limit is negative, it returns the original string. +// +// UTF-8 is supported. When truncating, all invalid characters are dropped +// before applying truncation. +// +// If s already contains less than the limit number of bytes, it is returned +// unchanged. No invalid characters are removed. +func truncate(limit int, s string) string { + // This prioritize performance in the following order based on the most + // common expected use-cases. + // + // - Short values less than the default limit (128). + // - Strings with valid encodings that exceed the limit. + // - No limit. + // - Strings with invalid encodings that exceed the limit. + if limit < 0 || len(s) <= limit { + return s + } + + // Optimistically, assume all valid UTF-8. + var b strings.Builder + count := 0 + for i, c := range s { + if c != utf8.RuneError { + count++ + if count > limit { + return s[:i] + } + continue + } + + _, size := utf8.DecodeRuneInString(s[i:]) + if size == 1 { + // Invalid encoding. + b.Grow(len(s) - 1) + _, _ = b.WriteString(s[:i]) + s = s[i:] + break + } + } + + // Fast-path, no invalid input. + if b.Cap() == 0 { + return s } - trunc, _ := safeTruncateValidUTF8(strings.ToValidUTF8(input, ""), limit) - return trunc -} -// safeTruncateValidUTF8 returns a copy of the input string safely truncated to -// limit. The truncation is ensured to occur at the bounds of complete UTF-8 -// characters. If invalid encoding of UTF-8 is encountered, input is returned -// with false, otherwise, the truncated input will be returned with true. -func safeTruncateValidUTF8(input string, limit int) (string, bool) { - for cnt := 0; cnt <= limit; { - r, size := utf8.DecodeRuneInString(input[cnt:]) - if r == utf8.RuneError { - return input, false + // Truncate while validating UTF-8. + for i := 0; i < len(s) && count < limit; { + c := s[i] + if c < utf8.RuneSelf { + // Optimization for single byte runes (common case). + _ = b.WriteByte(c) + i++ + count++ + continue } - if cnt+size > limit { - return input[:cnt], true + _, size := utf8.DecodeRuneInString(s[i:]) + if size == 1 { + // We checked for all 1-byte runes above, this is a RuneError. + i++ + continue } - cnt += size + + _, _ = b.WriteString(s[i : i+size]) + i += size + count++ } - return input, true + + return b.String() } // End ends the span. This method does nothing if the span is already ended or diff --git a/sdk/trace/span_limits_test.go b/sdk/trace/span_limits_test.go index bd57fa662b9..74f8997c8b0 100644 --- a/sdk/trace/span_limits_test.go +++ b/sdk/trace/span_limits_test.go @@ -183,7 +183,7 @@ func TestSpanLimits(t *testing.T) { // Ensure string and string slice attributes are truncated. assert.Contains(t, attrs, attribute.String("string", "ab")) assert.Contains(t, attrs, attribute.StringSlice("stringSlice", []string{"ab", "de"})) - assert.Contains(t, attrs, attribute.String("euro", "")) + assert.Contains(t, attrs, attribute.String("euro", "€")) limits.AttributeValueLengthLimit = 0 attrs = testSpanLimits(t, limits).Attributes() diff --git a/sdk/trace/span_test.go b/sdk/trace/span_test.go index 38bf2b2e8a9..5a0355c0bad 100644 --- a/sdk/trace/span_test.go +++ b/sdk/trace/span_test.go @@ -201,38 +201,158 @@ func TestTruncateAttr(t *testing.T) { attr: strSliceAttr, want: strSliceAttr, }, + } + + for _, test := range tests { + name := fmt.Sprintf("%s->%s(limit:%d)", test.attr.Key, test.attr.Value.Emit(), test.limit) + t.Run(name, func(t *testing.T) { + assert.Equal(t, test.want, truncateAttr(test.limit, test.attr)) + }) + } +} + +func TestTruncate(t *testing.T) { + type group struct { + limit int + input string + expected string + } + + tests := []struct { + name string + groups []group + }{ + // Edge case: limit is negative, no truncation should occur + { + name: "NoTruncation", + groups: []group{ + {-1, "No truncation!", "No truncation!"}, + }, + }, + + // Edge case: string is already shorter than the limit, no truncation + // should occur { - // This tests the ordinary safeTruncate(). - limit: 10, - attr: attribute.String(key, "€€€€"), // 3 bytes each - want: attribute.String(key, "€€€"), + name: "ShortText", + groups: []group{ + {10, "Short text", "Short text"}, + {15, "Short text", "Short text"}, + {100, "Short text", "Short text"}, + }, }, + + // Edge case: truncation happens with ASCII characters only { - // This tests truncation with an invalid UTF-8 input. - // - // Note that after removing the invalid rune, - // the string is over length and still has to - // be truncated on a code point boundary. - limit: 10, - attr: attribute.String(key, "€"[0:2]+"hello€€"), // corrupted first rune, then over limit - want: attribute.String(key, "hello€"), + name: "ASCIIOnly", + groups: []group{ + {1, "Hello World!", "H"}, + {5, "Hello World!", "Hello"}, + {12, "Hello World!", "Hello World!"}, + }, }, + + // Truncation including multi-byte characters (UTF-8) { - // This tests the fallback to invalidTruncate() - // where after validation the string does not require - // truncation. - limit: 6, - attr: attribute.String(key, "€"[0:2]+"hello"), // corrupted first rune, then not over limit - want: attribute.String(key, "hello"), + name: "ValidUTF-8", + groups: []group{ + {7, "Hello, 世界", "Hello, "}, + {8, "Hello, 世界", "Hello, 世"}, + {2, "こんにちは", "こん"}, + {3, "こんにちは", "こんに"}, + {5, "こんにちは", "こんにちは"}, + {12, "こんにちは", "こんにちは"}, + }, + }, + + // Truncation with invalid UTF-8 characters + { + name: "InvalidUTF-8", + groups: []group{ + {11, "Invalid\x80text", "Invalidtext"}, + // Do not modify invalid text if equal to limit. + {11, "Valid text\x80", "Valid text\x80"}, + // Do not modify invalid text if under limit. + {15, "Valid text\x80", "Valid text\x80"}, + {5, "Hello\x80World", "Hello"}, + {11, "Hello\x80World\x80!", "HelloWorld!"}, + {15, "Hello\x80World\x80Test", "HelloWorldTest"}, + {15, "Hello\x80\x80\x80World\x80Test", "HelloWorldTest"}, + {15, "\x80\x80\x80Hello\x80\x80\x80World\x80Test\x80\x80", "HelloWorldTest"}, + }, + }, + + // Truncation with mixed validn and invalid UTF-8 characters + { + name: "MixedUTF-8", + groups: []group{ + {6, "€"[0:2] + "hello€€", "hello€"}, + {6, "€" + "€"[0:2] + "hello", "€hello"}, + {11, "Valid text\x80📜", "Valid text📜"}, + {11, "Valid text📜\x80", "Valid text📜"}, + {14, "😊 Hello\x80World🌍🚀", "😊 HelloWorld🌍🚀"}, + {14, "😊\x80 Hello\x80World🌍🚀", "😊 HelloWorld🌍🚀"}, + {14, "😊\x80 Hello\x80World🌍\x80🚀", "😊 HelloWorld🌍🚀"}, + {14, "😊\x80 Hello\x80World🌍\x80🚀\x80", "😊 HelloWorld🌍🚀"}, + {14, "\x80😊\x80 Hello\x80World🌍\x80🚀\x80", "😊 HelloWorld🌍🚀"}, + }, + }, + + // Edge case: empty string, should return empty string + { + name: "Empty", + groups: []group{ + {5, "", ""}, + }, + }, + + // Edge case: limit is 0, should return an empty string + { + name: "Zero", + groups: []group{ + {0, "Some text", ""}, + {0, "", ""}, + }, }, } - for _, test := range tests { - name := fmt.Sprintf("%s->%s(limit:%d)", test.attr.Key, test.attr.Value.Emit(), test.limit) - t.Run(name, func(t *testing.T) { - assert.Equal(t, test.want, truncateAttr(test.limit, test.attr)) - }) + for _, tt := range tests { + for _, g := range tt.groups { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + got := truncate(g.limit, g.input) + assert.Equalf( + t, g.expected, got, + "input: %q([]rune%v))\ngot: %q([]rune%v)\nwant %q([]rune%v)", + g.input, []rune(g.input), + got, []rune(got), + g.expected, []rune(g.expected), + ) + }) + } + } +} + +func BenchmarkTruncate(b *testing.B) { + run := func(limit int, input string) func(b *testing.B) { + return func(b *testing.B) { + b.ReportAllocs() + b.RunParallel(func(pb *testing.PB) { + var out string + for pb.Next() { + out = truncate(limit, input) + } + _ = out + }) + } } + b.Run("Unlimited", run(-1, "hello 😊 world 🌍🚀")) + b.Run("Zero", run(0, "Some text")) + b.Run("Short", run(10, "Short Text")) + b.Run("ASCII", run(5, "Hello, World!")) + b.Run("ValidUTF-8", run(10, "hello 😊 world 🌍🚀")) + b.Run("InvalidUTF-8", run(6, "€"[0:2]+"hello€€")) + b.Run("MixedUTF-8", run(14, "\x80😊\x80 Hello\x80World🌍\x80🚀\x80")) } func TestLogDropAttrs(t *testing.T) {