Python3: Stop breaking surrogate pairs in toDelta()

Resolves #69 for Python3. Sometimes we can find a common prefix that runs into the middle of a surrogate pair, and we split that pair when building our diff groups. This is fine as long as we are operating on UTF-16 code units. It becomes problematic when we start trying to treat those substrings as valid Unicode (or UTF-8) sequences. When we pass these split groups into `toDelta()` we do just that, and the library crashes. In this patch we're post-processing the diff groups before encoding them to make sure that we un-split the surrogate pairs. The post-processed diffs should produce the same output when the diffs are applied. The diff string itself will be different, but it should not change that much - only by a single character at surrogate boundaries.
google · Jan 30, 2024 · 50f1542 · 50f1542
1 parent db1cbba
commit 50f1542
Show file tree

Hide file tree

Showing 2 changed files with 82 additions and 7 deletions.
diff --git a/python3/diff_match_patch.py b/python3/diff_match_patch.py
@@ -26,6 +26,7 @@
 __author__ = '[email protected] (Neil Fraser)'
 
 import re
+import struct
 import sys
 import time
 import urllib.parse
@@ -1147,14 +1148,17 @@ def diff_toDelta(self, diffs):
     """
     text = []
     for (op, data) in diffs:
+      if 0 == len(data):
+        continue
+
       if op == self.DIFF_INSERT:
         # High ascii will raise UnicodeDecodeError.  Use Unicode instead.
         data = data.encode("utf-8")
         text.append("+" + urllib.parse.quote(data, "!~*'();/?:@&=+$,# "))
       elif op == self.DIFF_DELETE:
-        text.append("-%d" % len(data))
+        text.append("-%d" % (len(data.encode('utf-16-be')) // 2))
       elif op == self.DIFF_EQUAL:
-        text.append("=%d" % len(data))
+        text.append("=%d" % (len(data.encode('utf-16-be')) // 2))
     return "\t".join(text)
 
   def diff_fromDelta(self, text1, delta):
@@ -1172,7 +1176,8 @@ def diff_fromDelta(self, text1, delta):
       ValueError: If invalid input.
     """
     diffs = []
-    pointer = 0  # Cursor in text1
+    as_utf16 = text1.encode('utf-16-be')
+    pointer = 0  # Cursor in as_utf16
     tokens = delta.split("\t")
     for token in tokens:
       if token == "":
@@ -1191,8 +1196,8 @@ def diff_fromDelta(self, text1, delta):
           raise ValueError("Invalid number in diff_fromDelta: " + param)
         if n < 0:
           raise ValueError("Negative number in diff_fromDelta: " + param)
-        text = text1[pointer : pointer + n]
-        pointer += n
+        text = as_utf16[pointer : pointer + n * 2].decode('utf-16-be')
+        pointer += n * 2
         if token[0] == "=":
           diffs.append((self.DIFF_EQUAL, text))
         else:
@@ -1201,10 +1206,10 @@ def diff_fromDelta(self, text1, delta):
         # Anything else is an error.
         raise ValueError("Invalid diff operation in diff_fromDelta: " +
             token[0])
-    if pointer != len(text1):
+    if pointer != len(as_utf16):
       raise ValueError(
           "Delta length (%d) does not equal source text length (%d)." %
-         (pointer, len(text1)))
+         (pointer, len(as_utf16)))
     return diffs
 
   #  MATCH FUNCTIONS

diff --git a/python3/tests/diff_match_patch_test.py b/python3/tests/diff_match_patch_test.py
@@ -18,6 +18,7 @@
 """
 
 import imp
+import json
 import os
 import sys
 import time
@@ -444,6 +445,12 @@ def testDiffDelta(self):
     # Convert delta string into a diff.
     self.assertEqual(diffs, self.dmp.diff_fromDelta(text1, delta))
 
+    diffs = self.dmp.diff_main("\U0001F64B\U0001F64B", "\U0001F64B\U0001F64C\U0001F64B")
+    delta = self.dmp.diff_toDelta(diffs)
+    self.assertEqual("=2\t+%F0%9F%99%8C\t=2", delta)
+
+    self.assertEqual(diffs, self.dmp.diff_fromDelta("\U0001F64B\U0001F64B", "=2\t+%F0%9F%99%8C\t=2"))
+
     # Verify pool of unchanged characters.
     diffs = [(self.dmp.DIFF_INSERT, "A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? : @ & = + $ , # ")]
     text2 = self.dmp.diff_text2(diffs)
@@ -455,6 +462,69 @@ def testDiffDelta(self):
     # Convert delta string into a diff.
     self.assertEqual(diffs, self.dmp.diff_fromDelta("", delta))
 
+    # Unicode: split surrogates
+    self.assertEqual(
+      self.dmp.diff_toDelta([
+        (self.dmp.DIFF_INSERT, '\U0001F171'),
+        (self.dmp.DIFF_EQUAL, '\U0001F170\U0001F171')
+      ]),
+      self.dmp.diff_toDelta(self.dmp.diff_main(
+        '\U0001F170\U0001F171',
+        '\U0001F171\U0001F170\U0001F171'
+      )),
+      'Inserting similar surrogate pair at beginning'
+    )
+
+    self.assertEqual(
+      self.dmp.diff_toDelta([
+        (self.dmp.DIFF_EQUAL, '\U0001F170'),
+        (self.dmp.DIFF_INSERT, '\U0001F172'),
+        (self.dmp.DIFF_EQUAL, '\U0001F171')
+      ]),
+      self.dmp.diff_toDelta(self.dmp.diff_main(
+        '\U0001F170\U0001F171',
+        '\U0001F170\U0001F172\U0001F171'
+      )),
+      'Inserting similar surrogate pair in the middle'
+    )
+
+    self.assertEqual(
+      self.dmp.diff_toDelta([
+        (self.dmp.DIFF_DELETE, '\U0001F171'),
+        (self.dmp.DIFF_EQUAL, '\U0001F170\U0001F171')
+      ]),
+      self.dmp.diff_toDelta(self.dmp.diff_main(
+        '\U0001F171\U0001F170\U0001F171',
+        '\U0001F170\U0001F171'
+      )),
+      'Deleting similar surogate pair at the beginning'
+    )
+
+    self.assertEqual(
+      self.dmp.diff_toDelta([
+        (self.dmp.DIFF_EQUAL, '\U0001F170'),
+        (self.dmp.DIFF_DELETE, '\U0001F172'),
+        (self.dmp.DIFF_EQUAL, '\U0001F171')
+      ]),
+      self.dmp.diff_toDelta(self.dmp.diff_main(
+        '\U0001F170\U0001F172\U0001F171',
+        '\U0001F170\U0001F171'
+      )),
+      'Deleting similar surogate pair in the middle'
+    )
+
+    self.assertEqual(
+      self.dmp.diff_toDelta([
+        (self.dmp.DIFF_DELETE, '\U0001F170'),
+        (self.dmp.DIFF_INSERT, '\U0001F171')
+      ]),
+      self.dmp.diff_toDelta(self.dmp.diff_main(
+        '\U0001F170',
+        '\U0001F171'
+      )),
+      'Swap surrogate pair'
+    )
+
     # 160 kb string.
     a = "abcdefghij"
     for i in range(14):