feat(term): ansi: implement ANSI aware truncation

This implements an ANSI- and wide-character-aware truncation algorithm that uses the newly merged [ANSI parser state machine][statemachine] and the fantastic uniseg library. Since it is built on the ANSI state machine, it is compatible with `CSI m` (SGR) style sequences, `OSC 8` (hyperlinks), and basically any other escape sequence supported by the state machine (DCS, ESC, SOS, PM, APC). Related: muesli/reflow#71. [statemachine]: https://github.com/charmbracelet/x/blob/main/exp/term/ansi/parser/transition_table.go
charmbracelet · Mar 13, 2024 · 471d31b · 471d31b
1 parent 8cc69f8
commit 471d31b
Show file tree

Hide file tree

Showing 2 changed files with 173 additions and 0 deletions.
diff --git a/exp/term/ansi/truncate.go b/exp/term/ansi/truncate.go
@@ -0,0 +1,116 @@
+package ansi
+
+import (
+	"bytes"
+
+	. "github.com/charmbracelet/x/exp/term/ansi/parser"
+	"github.com/rivo/uniseg"
+)
+
+// Truncate truncates s so that its printable content plus tail occupies at
+// most length terminal cells. The tail is written at the point where the
+// first printable character or grapheme cluster has to be dropped.
+//
+// Room for the tail is reserved before anything is measured, so content that
+// is wider than length minus the width of tail is cut even if it would have
+// fit within length on its own. If tail alone is wider than length, Truncate
+// returns an empty string.
+//
+// This function is aware of ANSI escape codes and will not break them: bytes
+// that belong to escape sequences are always copied to the output, including
+// sequences that appear after the cut. It also accounts for wide characters
+// (such as East Asian characters and emojis) by measuring multi-byte text one
+// grapheme cluster at a time.
+func Truncate(s string, length int, tail string) string {
+	// Reserve room for the tail up front.
+	length -= StringWidth(tail)
+	if length < 0 {
+		return ""
+	}
+
+	var (
+		cluster  []byte
+		buf      bytes.Buffer
+		curWidth int  // cells of printable content written so far
+		ignoring bool // true once the tail has been written
+	)
+	gstate := -1          // grapheme state threaded through uniseg calls
+	pstate := GroundState // parser state before the current byte
+	b := []byte(s)
+	i := 0
+
+	// Walk the input byte by byte through the parser table. Printable
+	// characters are counted and copied until the width budget is used up;
+	// from then on printable content is dropped and only escape-sequence
+	// bytes are copied. Invariant: curWidth never exceeds length, because it
+	// is only increased after checking that the addition fits.
+	for i < len(b) {
+		state, action := Table.Transition(pstate, b[i])
+
+		switch action {
+		case CollectAction:
+			if utf8ByteLen(b[i]) <= 1 {
+				// A single byte collected as part of an escape sequence.
+				// Copy it and advance; without the increment the same byte
+				// would be dispatched again under the new state and end up
+				// duplicated in the output (or loop forever).
+				buf.WriteByte(b[i])
+				i++
+				break
+			}
+
+			// Lead byte of a multi-byte UTF-8 sequence: measure and consume
+			// the whole grapheme cluster in one step.
+			var width int
+			cluster, _, width, gstate = uniseg.FirstGraphemeCluster(b[i:], gstate)
+			i += len(cluster)
+
+			// Already past the cut: drop the cluster.
+			if ignoring {
+				continue
+			}
+
+			// The cluster does not fit: write the tail and start dropping.
+			// A cluster that exactly fills the remaining width is kept, and
+			// the tail is only written if more printable content follows.
+			if curWidth+width > length {
+				ignoring = true
+				buf.WriteString(tail)
+				continue
+			}
+
+			curWidth += width
+			buf.Write(cluster)
+
+			// Done collecting, now we're back in the ground state.
+			pstate = GroundState
+			continue
+		case PrintAction:
+			// No room left for another cell: write the tail once and start
+			// dropping printable characters.
+			if curWidth >= length && !ignoring {
+				ignoring = true
+				buf.WriteString(tail)
+			}
+
+			// Skip to the next byte if we're ignoring.
+			if ignoring {
+				i++
+				continue
+			}
+
+			// Printable ASCII occupies exactly one cell.
+			curWidth++
+			buf.WriteByte(b[i])
+			i++
+		default:
+			// Everything else is copied through verbatim, whether or not
+			// the cut has already happened.
+			buf.WriteByte(b[i])
+			i++
+		}
+
+		// Transition to the next state.
+		pstate = state
+	}
+
+	return buf.String()
+}
diff --git a/exp/term/ansi/truncate_test.go b/exp/term/ansi/truncate_test.go
@@ -0,0 +1,57 @@
+package ansi
+
+import (
+	"testing"
+)
+
+// tcases is the table of inputs for TestTruncate. Each entry gives the string
+// to truncate, the tail and width passed to Truncate, and the exact output
+// expected back.
+var tcases = []struct {
+	name   string // subtest name
+	input  string // string passed to Truncate
+	tail   string // tail passed to Truncate
+	width  int    // length passed to Truncate, in cells
+	expect string // exact expected result
+}{
+	{"empty", "", "", 0, ""},
+	{"simple", "foobar", "", 3, "foo"},
+	{"passthrough", "foobar", "", 10, "foobar"},
+	{"ascii", "hello", "", 3, "hel"},
+	{"emoji", "👋", "", 2, "👋"},
+	{"wideemoji", "🫧", "", 2, "🫧"},
+	{"controlemoji", "\x1b[31mhello 👋abc\x1b[0m", "", 8, "\x1b[31mhello 👋\x1b[0m"},
+	{"osc8", "\x1b]8;;https://charm.sh\x1b\\Charmbracelet 🫧\x1b]8;;\x1b\\", "", 5, "\x1b]8;;https://charm.sh\x1b\\Charm\x1b]8;;\x1b\\"},
+	{"osc8_8bit", "\x9d8;;https://charm.sh\x9cCharmbracelet 🫧\x9d8;;\x9c", "", 5, "\x9d8;;https://charm.sh\x9cCharm\x9d8;;\x9c"},
+	{"style_tail", "\x1B[38;5;219mHiya!", "…", 3, "\x1B[38;5;219mHi…"},
+	{"double_style_tail", "\x1B[38;5;219mHiya!\x1B[38;5;219mHello", "…", 7, "\x1B[38;5;219mHiya!\x1B[38;5;219mH…"},
+	{"noop", "\x1B[7m--", "", 2, "\x1B[7m--"},
+	{"double_width", "\x1B[38;2;249;38;114m你好\x1B[0m", "", 3, "\x1B[38;2;249;38;114m你\x1B[0m"},
+	// A two-cell rune cannot be split, so a one-cell budget yields nothing.
+	{"double_width_rune", "你", "", 1, ""},
+	{"double_width_runes", "你好", "", 2, "你"},
+	{"spaces_only", "    ", "…", 2, " …"},
+	// The tail alone is wider than the requested width.
+	{"longer_tail", "foo", "...", 2, ""},
+	// The tail uses the whole width, so only the tail (and any escape
+	// sequences) remains.
+	{"same_tail_width", "foo", "...", 3, "..."},
+	{"same_tail_width_control", "\x1b[31mfoo\x1b[0m", "...", 3, "\x1b[31m...\x1b[0m"},
+	{"same_width", "foo", "", 3, "foo"},
+	{"truncate_with_tail", "foobar", ".", 4, "foo."},
+	// Escape sequences after the cut are still emitted.
+	{"style", "I really \x1B[38;2;249;38;114mlove\x1B[0m Go!", "", 8, "I really\x1B[38;2;249;38;114m\x1B[0m"},
+	{"dcs", "\x1BPq#0;2;0;0;0#1;2;100;100;0#2;2;0;100;0#1~~@@vv@@~~@@~~$#2??}}GG}}??}}??-#1!14@\x1B\\foobar", "…", 4, "\x1BPq#0;2;0;0;0#1;2;100;100;0#2;2;0;100;0#1~~@@vv@@~~@@~~$#2??}}GG}}??}}??-#1!14@\x1B\\foo…"},
+}
+
+// TestTruncate runs every entry of tcases as a named subtest and reports an
+// error when Truncate's output differs from the expected string.
+func TestTruncate(t *testing.T) {
+	for idx, tc := range tcases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := Truncate(tc.input, tc.width, tc.tail)
+			if got == tc.expect {
+				return
+			}
+			// Case numbers in the message are 1-based.
+			t.Errorf("test case %d failed: expected %q, got %q", idx+1, tc.expect, got)
+		})
+	}
+}
+
+// BenchmarkTruncateString measures Truncate on a short ASCII string, calling
+// it from the goroutines started by RunParallel.
+func BenchmarkTruncateString(b *testing.B) {
+	// Configure the benchmark once, before the workers start. The function
+	// passed to RunParallel runs in every worker goroutine and must not touch
+	// the timer, because the timer functions have global effect. No setup
+	// precedes this point, so no timer reset is needed.
+	b.ReportAllocs()
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			Truncate("foo", 2, "")
+		}
+	})
+}