Skip to content
This repository has been archived by the owner on Aug 5, 2024. It is now read-only.

Commit

Permalink
Python3: Stop breaking surrogate pairs in toDelta()
Browse files Browse the repository at this point in the history
Resolves #69 for Python3

Sometimes we can find a common prefix that runs into the middle of a
surrogate pair and we split that pair when building our diff groups.

This is fine as long as we are operating on UTF-16 code units. It
becomes problematic when we start trying to treat those substrings as
valid Unicode (or UTF-8) sequences.

When we pass these split groups into `toDelta()` we do just that and the
library crashes. In this patch we're post-processing the diff groups
before encoding them to make sure that we un-split the surrogate pairs.

The post-processed diffs should produce the same output when the diffs
are applied. The diff string itself will be different but shouldn't change
that much - only by a single character at surrogate boundaries.
  • Loading branch information
dmsnell committed Jan 30, 2024
1 parent db1cbba commit 50f1542
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 7 deletions.
19 changes: 12 additions & 7 deletions python3/diff_match_patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
__author__ = '[email protected] (Neil Fraser)'

import re
import struct
import sys
import time
import urllib.parse
Expand Down Expand Up @@ -1147,14 +1148,17 @@ def diff_toDelta(self, diffs):
"""
text = []
for (op, data) in diffs:
if 0 == len(data):
continue

if op == self.DIFF_INSERT:
# High ascii will raise UnicodeDecodeError. Use Unicode instead.
data = data.encode("utf-8")
text.append("+" + urllib.parse.quote(data, "!~*'();/?:@&=+$,# "))
elif op == self.DIFF_DELETE:
text.append("-%d" % len(data))
text.append("-%d" % (len(data.encode('utf-16-be')) // 2))
elif op == self.DIFF_EQUAL:
text.append("=%d" % len(data))
text.append("=%d" % (len(data.encode('utf-16-be')) // 2))
return "\t".join(text)

def diff_fromDelta(self, text1, delta):
Expand All @@ -1172,7 +1176,8 @@ def diff_fromDelta(self, text1, delta):
ValueError: If invalid input.
"""
diffs = []
pointer = 0 # Cursor in text1
as_utf16 = text1.encode('utf-16-be')
pointer = 0 # Cursor in as_utf16
tokens = delta.split("\t")
for token in tokens:
if token == "":
Expand All @@ -1191,8 +1196,8 @@ def diff_fromDelta(self, text1, delta):
raise ValueError("Invalid number in diff_fromDelta: " + param)
if n < 0:
raise ValueError("Negative number in diff_fromDelta: " + param)
text = text1[pointer : pointer + n]
pointer += n
text = as_utf16[pointer : pointer + n * 2].decode('utf-16-be')
pointer += n * 2
if token[0] == "=":
diffs.append((self.DIFF_EQUAL, text))
else:
Expand All @@ -1201,10 +1206,10 @@ def diff_fromDelta(self, text1, delta):
# Anything else is an error.
raise ValueError("Invalid diff operation in diff_fromDelta: " +
token[0])
if pointer != len(text1):
if pointer != len(as_utf16):
raise ValueError(
"Delta length (%d) does not equal source text length (%d)." %
(pointer, len(text1)))
(pointer, len(as_utf16)))
return diffs

# MATCH FUNCTIONS
Expand Down
70 changes: 70 additions & 0 deletions python3/tests/diff_match_patch_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"""

import imp
import json
import os
import sys
import time
Expand Down Expand Up @@ -444,6 +445,12 @@ def testDiffDelta(self):
# Convert delta string into a diff.
self.assertEqual(diffs, self.dmp.diff_fromDelta(text1, delta))

diffs = self.dmp.diff_main("\U0001F64B\U0001F64B", "\U0001F64B\U0001F64C\U0001F64B")
delta = self.dmp.diff_toDelta(diffs)
self.assertEqual("=2\t+%F0%9F%99%8C\t=2", delta)

self.assertEqual(diffs, self.dmp.diff_fromDelta("\U0001F64B\U0001F64B", "=2\t+%F0%9F%99%8C\t=2"))

# Verify pool of unchanged characters.
diffs = [(self.dmp.DIFF_INSERT, "A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? : @ & = + $ , # ")]
text2 = self.dmp.diff_text2(diffs)
Expand All @@ -455,6 +462,69 @@ def testDiffDelta(self):
# Convert delta string into a diff.
self.assertEqual(diffs, self.dmp.diff_fromDelta("", delta))

# Unicode: split surrogates
self.assertEqual(
self.dmp.diff_toDelta([
(self.dmp.DIFF_INSERT, '\U0001F171'),
(self.dmp.DIFF_EQUAL, '\U0001F170\U0001F171')
]),
self.dmp.diff_toDelta(self.dmp.diff_main(
'\U0001F170\U0001F171',
'\U0001F171\U0001F170\U0001F171'
)),
'Inserting similar surrogate pair at beginning'
)

self.assertEqual(
self.dmp.diff_toDelta([
(self.dmp.DIFF_EQUAL, '\U0001F170'),
(self.dmp.DIFF_INSERT, '\U0001F172'),
(self.dmp.DIFF_EQUAL, '\U0001F171')
]),
self.dmp.diff_toDelta(self.dmp.diff_main(
'\U0001F170\U0001F171',
'\U0001F170\U0001F172\U0001F171'
)),
'Inserting similar surrogate pair in the middle'
)

self.assertEqual(
self.dmp.diff_toDelta([
(self.dmp.DIFF_DELETE, '\U0001F171'),
(self.dmp.DIFF_EQUAL, '\U0001F170\U0001F171')
]),
self.dmp.diff_toDelta(self.dmp.diff_main(
'\U0001F171\U0001F170\U0001F171',
'\U0001F170\U0001F171'
)),
'Deleting similar surogate pair at the beginning'
)

self.assertEqual(
self.dmp.diff_toDelta([
(self.dmp.DIFF_EQUAL, '\U0001F170'),
(self.dmp.DIFF_DELETE, '\U0001F172'),
(self.dmp.DIFF_EQUAL, '\U0001F171')
]),
self.dmp.diff_toDelta(self.dmp.diff_main(
'\U0001F170\U0001F172\U0001F171',
'\U0001F170\U0001F171'
)),
'Deleting similar surogate pair in the middle'
)

self.assertEqual(
self.dmp.diff_toDelta([
(self.dmp.DIFF_DELETE, '\U0001F170'),
(self.dmp.DIFF_INSERT, '\U0001F171')
]),
self.dmp.diff_toDelta(self.dmp.diff_main(
'\U0001F170',
'\U0001F171'
)),
'Swap surrogate pair'
)

# 160 kb string.
a = "abcdefghij"
for i in range(14):
Expand Down

0 comments on commit 50f1542

Please sign in to comment.