This repository has been archived by the owner on Aug 5, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Python3: Stop breaking surrogate pairs in toDelta()
Resolves #69 for Python3. Sometimes we can find a common prefix that runs into the middle of a surrogate pair, and we split that pair when building our diff groups. This is fine as long as we are operating on UTF-16 code units. It becomes problematic when we start trying to treat those substrings as valid Unicode (or UTF-8) sequences. When we pass these split groups into `toDelta()` we do just that, and the library crashes. In this patch we're post-processing the diff groups before encoding them to make sure that we un-split the surrogate pairs. The post-processed diffs should produce the same output when applying the diffs. The diff string itself will be different but shouldn't change that much - only by a single character at surrogate boundaries.
- Loading branch information
Showing
2 changed files
with
82 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,6 +26,7 @@ | |
__author__ = '[email protected] (Neil Fraser)' | ||
|
||
import re | ||
import struct | ||
import sys | ||
import time | ||
import urllib.parse | ||
|
@@ -1147,14 +1148,17 @@ def diff_toDelta(self, diffs): | |
""" | ||
text = [] | ||
for (op, data) in diffs: | ||
if 0 == len(data): | ||
continue | ||
|
||
if op == self.DIFF_INSERT: | ||
# High ascii will raise UnicodeDecodeError. Use Unicode instead. | ||
data = data.encode("utf-8") | ||
text.append("+" + urllib.parse.quote(data, "!~*'();/?:@&=+$,# ")) | ||
elif op == self.DIFF_DELETE: | ||
text.append("-%d" % len(data)) | ||
text.append("-%d" % (len(data.encode('utf-16-be')) // 2)) | ||
elif op == self.DIFF_EQUAL: | ||
text.append("=%d" % len(data)) | ||
text.append("=%d" % (len(data.encode('utf-16-be')) // 2)) | ||
return "\t".join(text) | ||
|
||
def diff_fromDelta(self, text1, delta): | ||
|
@@ -1172,7 +1176,8 @@ def diff_fromDelta(self, text1, delta): | |
ValueError: If invalid input. | ||
""" | ||
diffs = [] | ||
pointer = 0 # Cursor in text1 | ||
as_utf16 = text1.encode('utf-16-be') | ||
pointer = 0 # Cursor in as_utf16 | ||
tokens = delta.split("\t") | ||
for token in tokens: | ||
if token == "": | ||
|
@@ -1191,8 +1196,8 @@ def diff_fromDelta(self, text1, delta): | |
raise ValueError("Invalid number in diff_fromDelta: " + param) | ||
if n < 0: | ||
raise ValueError("Negative number in diff_fromDelta: " + param) | ||
text = text1[pointer : pointer + n] | ||
pointer += n | ||
text = as_utf16[pointer : pointer + n * 2].decode('utf-16-be') | ||
pointer += n * 2 | ||
if token[0] == "=": | ||
diffs.append((self.DIFF_EQUAL, text)) | ||
else: | ||
|
@@ -1201,10 +1206,10 @@ def diff_fromDelta(self, text1, delta): | |
# Anything else is an error. | ||
raise ValueError("Invalid diff operation in diff_fromDelta: " + | ||
token[0]) | ||
if pointer != len(text1): | ||
if pointer != len(as_utf16): | ||
raise ValueError( | ||
"Delta length (%d) does not equal source text length (%d)." % | ||
(pointer, len(text1))) | ||
(pointer, len(as_utf16))) | ||
return diffs | ||
|
||
# MATCH FUNCTIONS | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters