Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stop breaking surrogate pairs in toDelta()/fromDelta() #1

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
461b008
JavaScript: Stop breaking surrogate pairs in toDelta()
dmsnell Nov 9, 2019
71646fb
Add fixes for Java client
dmsnell Nov 9, 2019
8d2a5f8
Add fixes for ObjectiveC
dmsnell Nov 10, 2019
535e29e
Add fixes for Python2
dmsnell Nov 10, 2019
aafbd58
Add fixes for Python3
dmsnell Nov 10, 2019
4fc0073
Add fixes for Python3 and fix counter in Python2
dmsnell Nov 10, 2019
d0a578f
Adjust Python3 code
dmsnell Nov 10, 2019
08de57e
Fix reconstructing diff in Python3
dmsnell Nov 10, 2019
df810f7
Updates from review feedback
dmsnell Nov 11, 2019
1d45818
fixup! Updates from review feedback
dmsnell Nov 11, 2019
82c1111
fixup! fixup! Updates from review feedback
dmsnell Nov 11, 2019
ea303f2
Rebuild compressed JavaScript with Closure Compiler
dmsnell Nov 11, 2019
a5f35ab
Handle cases where we delete characters
dmsnell Dec 13, 2019
55a8779
Augment diff_fromDelta with custom decodeURI for encoded surrogate ha…
dmsnell Dec 13, 2019
7e5a643
Port the improvements to Java
dmsnell Dec 13, 2019
42bc948
Java: Use lookup vs. parseInt() for URI decoding
dmsnell Dec 13, 2019
7eeca25
Add more test cases, update code and decoder
dmsnell Dec 16, 2019
1d166fe
Apply patch fix to objective-c
dmsnell Dec 16, 2019
9dede7f
Add objective-c tests
dmsnell Dec 16, 2019
dbada06
JavaScript: Use `digit16` instead of `parseInt()`
dmsnell Dec 16, 2019
6e03198
Objective-C: Introduce failing test for receiving an invalid patch
dmsnell Dec 16, 2019
989d6c6
Python: Update patch and handle wide/narrow compilation
dmsnell Dec 17, 2019
22cb396
Python: Always run encoded toDelta
dmsnell Dec 17, 2019
3c31e62
Python: Add tests and update Python3 toDelta fix
dmsnell Dec 17, 2019
df21753
Update Objective-C code to handle invalid incoming deltas
dmsnell Dec 19, 2019
9a741a4
JavaScript: Fix known-broken diffs from objective-c
dmsnell Dec 19, 2019
8e5241b
Objective-C: Update from feedback - idiomatic code and bounds checks
dmsnell Dec 19, 2019
3a00a83
ObjectiveC: Stop doing bounds checking and rely on exception-catching
dmsnell Dec 19, 2019
57c8246
ObjectiveC: Use `uint16` and `uint32` for more specified bits
dmsnell Dec 19, 2019
9703fc4
ObjectiveC: Use NSUInteger and explain some bit shifting
dmsnell Dec 19, 2019
19606f0
ObjectiveC: Guard against high-surrogate(null)low-surrogate
dmsnell Dec 19, 2019
477b5a6
Fix typo
dmsnell Dec 19, 2019
fa122d3
Java: Use StringBuildler in decodeURI and guard against H(null)L pattern
dmsnell Dec 19, 2019
21aebb4
Remove attempt at fixing the (null) bug
dmsnell Jan 15, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 124 additions & 8 deletions java/src/name/fraser/neil/plaintext/diff_match_patch.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
package name.fraser.neil.plaintext;

import java.io.UnsupportedEncodingException;
import java.lang.Character;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.*;
Expand Down Expand Up @@ -1429,7 +1430,31 @@ public int diff_levenshtein(List<Diff> diffs) {
*/
public String diff_toDelta(List<Diff> diffs) {
StringBuilder text = new StringBuilder();
char lastEnd = 0;
boolean isFirst = true;
for (Diff aDiff : diffs) {
if (aDiff.text.isEmpty()) {
continue;
}

char thisTop = aDiff.text.charAt(0);
char thisEnd = aDiff.text.charAt(aDiff.text.length() - 1);

if (Character.isHighSurrogate(thisEnd)) {
lastEnd = thisEnd;
aDiff.text = aDiff.text.substring(0, aDiff.text.length() - 1);
}

if (! isFirst && Character.isHighSurrogate(lastEnd) && Character.isLowSurrogate(thisTop)) {
aDiff.text = lastEnd + aDiff.text;
}

isFirst = false;

if ( aDiff.text.isEmpty() ) {
continue;
}

switch (aDiff.operation) {
case INSERT:
try {
Expand Down Expand Up @@ -1457,6 +1482,103 @@ public String diff_toDelta(List<Diff> diffs) {
return delta;
}

private int digit16(char b) throws IllegalArgumentException {
switch (b) {
case '0': return 0;
case '1': return 1;
case '2': return 2;
case '3': return 3;
case '4': return 4;
case '5': return 5;
case '6': return 6;
case '7': return 7;
case '8': return 8;
case '9': return 9;
case 'A': case 'a': return 10;
case 'B': case 'b': return 11;
case 'C': case 'c': return 12;
case 'D': case 'd': return 13;
case 'E': case 'e': return 14;
case 'F': case 'f': return 15;
default:
throw new IllegalArgumentException();
}
}

private String decodeURI(String text) throws IllegalArgumentException {
int i = 0;
StringBuilder decoded = new StringBuilder(text.length());

while (i < text.length()) {
if (text.charAt(i) != '%') {
decoded.append(text.charAt(i++));
continue;
}

// start a percent-sequence
int byte1 = (digit16(text.charAt(i + 1)) << 4) + digit16(text.charAt(i + 2));
if ((byte1 & 0x80) == 0) {
decoded.append(Character.toChars(byte1));
i += 3;
continue;
}

if ( text.charAt(i + 3) != '%') {
throw new IllegalArgumentException();
}

int byte2 = (digit16(text.charAt(i + 4)) << 4) + digit16(text.charAt(i + 5));
if ((byte2 & 0xC0) != 0x80) {
throw new IllegalArgumentException();
}
byte2 = byte2 & 0x3F;
if ((byte1 & 0xE0) == 0xC0) {
decoded.append(Character.toChars(((byte1 & 0x1F) << 6) | byte2));
i += 6;
continue;
}

if (text.charAt(i + 6) != '%') {
throw new IllegalArgumentException();
}

int byte3 = (digit16(text.charAt(i + 7)) << 4) + digit16(text.charAt(i + 8));
if ((byte3 & 0xC0) != 0x80) {
throw new IllegalArgumentException();
}
byte3 = byte3 & 0x3F;
if ((byte1 & 0xF0) == 0xE0) {
// unpaired surrogate are fine here
decoded.append(Character.toChars(((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3));
i += 9;
continue;
}

if (text.charAt(i + 9) != '%') {
throw new IllegalArgumentException();
}

int byte4 = (digit16(text.charAt(i + 10)) << 4) + digit16(text.charAt(i + 11));
if ((byte4 & 0xC0) != 0x80) {
throw new IllegalArgumentException();
}
byte4 = byte4 & 0x3F;
if ((byte1 & 0xF8) == 0xF0) {
int codePoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0C) | (byte3 << 0x06) | byte4;
if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) {
decoded.append(Character.toChars((codePoint & 0xFFFF) >>> 10 & 0x3FF | 0xD800));
decoded.append(Character.toChars(0xDC00 | (codePoint & 0xFFFF) & 0x3FF));
i += 12;
continue;
}
}

throw new IllegalArgumentException();
}

return decoded.toString();
}

/**
* Given the original text1, and an encoded string which describes the
* operations required to transform text1 into text2, compute the full diff.
Expand All @@ -1483,10 +1605,7 @@ public LinkedList<Diff> diff_fromDelta(String text1, String delta)
// decode would change all "+" to " "
param = param.replace("+", "%2B");
try {
param = URLDecoder.decode(param, "UTF-8");
} catch (UnsupportedEncodingException e) {
// Not likely on modern system.
throw new Error("This system does not support UTF-8.", e);
param = this.decodeURI(param);
} catch (IllegalArgumentException e) {
// Malformed URI sequence.
throw new IllegalArgumentException(
Expand Down Expand Up @@ -2269,10 +2388,7 @@ public List<Patch> patch_fromText(String textline)
line = text.getFirst().substring(1);
line = line.replace("+", "%2B"); // decode would change all "+" to " "
try {
line = URLDecoder.decode(line, "UTF-8");
} catch (UnsupportedEncodingException e) {
// Not likely on modern system.
throw new Error("This system does not support UTF-8.", e);
line = this.decodeURI(line);
} catch (IllegalArgumentException e) {
// Malformed URI sequence.
throw new IllegalArgumentException(
Expand Down
36 changes: 36 additions & 0 deletions java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,42 @@ public static void testDiffDelta() {

assertEquals("diff_fromDelta: Unicode.", diffs, dmp.diff_fromDelta(text1, delta));

diffs = diffList(new Diff(EQUAL, "\ud83d\ude4b\ud83d"), new Diff(INSERT, "\ude4c\ud83d"), new Diff(EQUAL, "\ude4b"));
delta = dmp.diff_toDelta(diffs);
assertEquals("diff_toDelta: Surrogate Pairs.", "=2\t+%F0%9F%99%8C\t=2", delta);

assertEquals(
"diff_toDelta: insert surrogate pair between similar high surrogates",
dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c\udd70"), new Diff(INSERT, "\ud83c\udd70"), new Diff(EQUAL, "\ud83c\udd71"))),
dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c\udd70\ud83c"), new Diff(INSERT, "\udd70\ud83c"), new Diff(EQUAL, "\udd71")))
);

assertEquals(
"diff_toDelta: swap surrogate pairs delete/insert",
dmp.diff_toDelta(diffList(new Diff(DELETE, "\ud83c\udd70"), new Diff(INSERT, "\ud83c\udd71"))),
dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c"), new Diff(DELETE, "\udd70"), new Diff(INSERT, "\udd71")))
);

assertEquals(
"diff_toDelta: swap surrogate pairs insert/delete",
dmp.diff_toDelta(diffList(new Diff(INSERT, "\ud83c\udd70"), new Diff(DELETE, "\ud83c\udd71"))),
dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c"), new Diff(INSERT, "\udd70"), new Diff(DELETE, "\udd71")))
);

assertEquals(
"diff_toDelta: empty diff groups",
dmp.diff_toDelta(diffList(new Diff(EQUAL, "abcdef"), new Diff(DELETE, ""), new Diff(INSERT, "ghijk"))),
dmp.diff_toDelta(diffList(new Diff(EQUAL, "abcdef"), new Diff(INSERT, "ghijk")))
);

// Different versions of the library may have created deltas with
// half of a surrogate pair encoded as if it were valid UTF-8
assertEquals(
"diff_toDelta: surrogate half encoded as UTF8",
dmp.diff_toDelta(dmp.diff_fromDelta("\ud83c\udd70", "-2\t+%F0%9F%85%B1")),
dmp.diff_toDelta(dmp.diff_fromDelta("\ud83c\udd70", "=1\t-1\t+%ED%B5%B1"))
);

// Verify pool of unchanged characters.
diffs = diffList(new Diff(INSERT, "A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? : @ & = + $ , # "));
String text2 = dmp.diff_text2(diffs);
Expand Down
Loading