Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix/stop breaking surrogate pairs #2

Merged
merged 1 commit into from
May 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
178 changes: 177 additions & 1 deletion index.js
Original file line number Diff line number Diff line change
Expand Up @@ -1176,6 +1176,166 @@ diff_match_patch.prototype.diff_cleanupMerge = function(diffs) {
}
};

/**
 * Rearrange diff boundaries that split Unicode surrogate pairs.
 *
 * A diff whose text ends in a high (leading) surrogate has that code unit
 * removed and carried forward; every following diff whose text begins with a
 * low (trailing) surrogate gets the carried high surrogate prepended. Diffs
 * that are empty, or become empty, are removed.
 *
 * The array and its tuples are modified in place (some callers ignore the
 * return value and rely on that); the same array is also returned.
 *
 * @param {!Array.<!diff_match_patch.Diff>} diffs Array of diff tuples.
 * @return {!Array.<!diff_match_patch.Diff>} The same, cleaned-up array.
 */
diff_match_patch.prototype.diff_cleanupSplitSurrogates = function(diffs) {
  // High surrogate most recently stripped from the end of a diff.
  var carriedHigh;
  for (var x = 0; x < diffs.length; x++) {
    var thisDiff = diffs[x];
    var text = thisDiff[1];

    if (0 === text.length) {
      diffs.splice(x--, 1);
      continue;
    }

    var startsLow = this.isLowSurrogate(text[0]);
    var endsHigh = this.isHighSurrogate(text[text.length - 1]);

    // Repair the head first, using the high surrogate carried from an
    // EARLIER diff. Doing this before capturing this diff's own tail keeps
    // the tail from being glued onto this diff's own head.
    if (carriedHigh && startsLow) {
      text = carriedHigh + text;
    }

    // Then strip a dangling high surrogate from the tail and carry it forward
    // so that the following edits can complete the pair.
    if (endsHigh) {
      carriedHigh = text[text.length - 1];
      text = text.slice(0, -1);
    }

    thisDiff[1] = text;

    if (0 === text.length) {
      diffs.splice(x--, 1);
      continue;
    }
  }

  return diffs;
};

/**
 * Tell whether a string begins with a UTF-16 high (leading) surrogate.
 *
 * @param {string} c String whose first code unit is examined.
 * @return {boolean} True if the first code unit is in 0xD800..0xDBFF.
 */
diff_match_patch.prototype.isHighSurrogate = function(c) {
  // High surrogates are exactly the code units whose top six bits are 110110.
  return (c.charCodeAt(0) & 0xFC00) === 0xD800;
};

/**
 * Tell whether a string begins with a UTF-16 low (trailing) surrogate.
 *
 * @param {string} c String whose first code unit is examined.
 * @return {boolean} True if the first code unit is in 0xDC00..0xDFFF.
 */
diff_match_patch.prototype.isLowSurrogate = function(c) {
  // Low surrogates are exactly the code units whose top six bits are 110111.
  return (c.charCodeAt(0) & 0xFC00) === 0xDC00;
};

/**
 * Convert a single hexadecimal digit character to its numeric value.
 *
 * @param {string} c One character: '0'-'9', 'a'-'f' or 'A'-'F'.
 * @return {number} Value of the digit, 0 through 15.
 * @throws {Error} If c is anything other than a single hex digit character.
 */
diff_match_patch.prototype.digit16 = function(c) {
  // Only a primitive one-character string can be a digit; anything else
  // (undefined, '', longer strings, numbers) falls through to the error.
  if (typeof c === 'string' && c.length === 1) {
    var unit = c.charCodeAt(0);
    if (unit >= 0x30 && unit <= 0x39) {  // '0'..'9'
      return unit - 0x30;
    }
    if (unit >= 0x41 && unit <= 0x46) {  // 'A'..'F'
      return unit - 0x41 + 10;
    }
    if (unit >= 0x61 && unit <= 0x66) {  // 'a'..'f'
      return unit - 0x61 + 10;
    }
  }
  throw new Error('Invalid hex-code');
};

/**
 * Decode URI-encoded string but allow for encoded surrogate halves
 *
 * diff_match_patch needs this relaxation of the requirements because
 * not all libraries and versions produce valid URI strings in toDelta
 * and we don't want to crash this code when the input is valid input
 * but at the same time invalid utf-8
 *
 * The native decodeURI() is tried first and its result is returned as-is.
 * Only when it throws is the text decoded by hand: every %XX escape is
 * read as a UTF-8 byte and sequences of one to four bytes are turned into
 * UTF-16 code units, with three-byte sequences allowed to produce a lone
 * surrogate half.
 *
 * Note: the native decodeURI() leaves escapes of URI-reserved characters
 * (for example %3A) untouched, while the hand-written fallback decodes
 * every escape, so such escapes decode differently depending on which path
 * handled the string.
 *
 * @example: decodeURI( 'abcd%25 %F0%9F%85%B0' ) = 'abcd% \ud83c\udd70'
 * @example: decodeURI( 'abcd%3A %ED%A0%BC' ) = 'abcd: \ud83c'
 *
 * @cite: @mathiasbynens utf8.js at https://github.com/mathiasbynens/utf8.js
 *
 * @param {String} text input string encoded by encodeURI() or equivalent
 * @return {String}
 * @throws {URIError} if a multi-byte sequence is truncated, has an invalid
 *     continuation byte or lead byte, or encodes a value outside
 *     U+10000..U+10FFFF in four bytes.
 * @throws {Error} if a '%' is not followed by two hexadecimal digits.
 */
diff_match_patch.prototype.decodeURI = function(text) {
  try {
    return decodeURI(text);
  } catch ( e ) {
    var self = this;

    // Value of the two hex digits that follow the '%' located at index `at`.
    var readByte = function(at) {
      return (self.digit16(text[at + 1]) << 4) + self.digit16(text[at + 2]);
    };

    // Payload (low six bits) of the UTF-8 continuation byte escaped at `at`.
    var readContinuation = function(at) {
      if ('%' !== text[at]) {
        throw new URIError('URI malformed');
      }
      var value = readByte(at);
      if ((value & 0xC0) !== 0x80) {
        throw new URIError('URI malformed');
      }
      return value & 0x3F;
    };

    var i = 0;
    var decoded = '';

    while (i < text.length) {
      if ( text[i] !== '%' ) {
        decoded += text[i++];
        continue;
      }

      // start a percent-sequence
      var byte1 = readByte(i);
      if ((byte1 & 0x80) === 0) {
        // single byte: plain ASCII
        decoded += String.fromCharCode(byte1);
        i += 3;
        continue;
      }

      var byte2 = readContinuation(i + 3);
      if ((byte1 & 0xE0) === 0xC0) {
        // two-byte sequence
        decoded += String.fromCharCode(((byte1 & 0x1F) << 6) | byte2);
        i += 6;
        continue;
      }

      var byte3 = readContinuation(i + 6);
      if ((byte1 & 0xF0) === 0xE0) {
        // three-byte sequence; unpaired surrogate are fine here
        decoded += String.fromCharCode(((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3);
        i += 9;
        continue;
      }

      var byte4 = readContinuation(i + 9);
      if ((byte1 & 0xF8) === 0xF0) {
        var codePoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0C) | (byte3 << 0x06) | byte4;
        if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) {
          // UTF-16 encodes (codePoint - 0x10000) as two 10-bit halves.
          // Subtracting (rather than masking with 0xFFFF) keeps the plane
          // bits, which matters for every code point at or above U+20000.
          var offset = codePoint - 0x010000;
          decoded += String.fromCharCode(
              0xD800 | (offset >>> 10),
              0xDC00 | (offset & 0x3FF));
          i += 12;
          continue;
        }
      }

      throw new URIError('URI malformed');
    }

    return decoded;
  }
};

/**
* loc is a location in text1, compute and return the equivalent location in
Expand Down Expand Up @@ -1219,6 +1379,7 @@ diff_match_patch.prototype.diff_xIndex = function(diffs, loc) {
* @return {string} HTML representation.
*/
diff_match_patch.prototype.diff_prettyHtml = function(diffs) {
diffs = this.diff_cleanupSplitSurrogates(diffs);
var html = [];
var pattern_amp = /&/g;
var pattern_lt = /</g;
Expand Down Expand Up @@ -1319,6 +1480,7 @@ diff_match_patch.prototype.diff_levenshtein = function(diffs) {
* @return {string} Delta text.
*/
diff_match_patch.prototype.diff_toDelta = function(diffs) {
diffs = this.diff_cleanupSplitSurrogates(diffs);
var text = [];
for (var x = 0; x < diffs.length; x++) {
switch (diffs[x][0]) {
Expand Down Expand Up @@ -1361,7 +1523,7 @@ diff_match_patch.prototype.diff_fromDelta = function(text1, delta) {
switch (tokens[x].charAt(0)) {
case '+':
try {
diffs[diffsLength++] = [DIFF_INSERT, decodeURI(param)];
diffs[diffsLength++] = [DIFF_INSERT, this.decodeURI(param)];
} catch (ex) {
// Malformed URI sequence.
throw new Error('Illegal escape in diff_fromDelta: ' + param);
Expand Down Expand Up @@ -1597,11 +1759,23 @@ diff_match_patch.prototype.patch_addContext_ = function(patch, text) {
padding += this.Patch_Margin;

// Add the prefix.
if (
patch.start2 - padding > 0 &&
diff_match_patch.prototype.isLowSurrogate(text[patch.start2 - padding])
) {
padding++;
}
var prefix = text.substring(patch.start2 - padding, patch.start2);
if (prefix) {
patch.diffs.unshift([DIFF_EQUAL, prefix]);
}
// Add the suffix.
if (
patch.start2 + patch.length1 + padding < text.length &&
diff_match_patch.prototype.isHighSurrogate(text[patch.start2 + patch.length1 + padding])
) {
padding++;
}
var suffix = text.substring(patch.start2 + patch.length1,
patch.start2 + patch.length1 + padding);
if (suffix) {
Expand Down Expand Up @@ -1675,6 +1849,7 @@ diff_match_patch.prototype.patch_make = function(a, opt_b, opt_c) {
if (diffs.length === 0) {
return []; // Get rid of the null case.
}
diffs = this.diff_cleanupSplitSurrogates(diffs);
var patches = [];
var patch = new diff_match_patch.patch_obj();
var patchDiffLength = 0; // Keeping our own length var is faster in JS.
Expand Down Expand Up @@ -2171,6 +2346,7 @@ diff_match_patch.patch_obj.prototype.toString = function() {
var text = ['@@ -' + coords1 + ' +' + coords2 + ' @@\n'];
var op;
// Escape the body of the patch with %xx notation.
diff_match_patch.prototype.diff_cleanupSplitSurrogates(this.diffs);
for (var x = 0; x < this.diffs.length; x++) {
switch (this.diffs[x][0]) {
case DIFF_INSERT:
Expand Down
133 changes: 132 additions & 1 deletion test/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,137 @@ function testDiffDelta() {
// Convert delta string into a diff.
assertEquivalent(diffs, dmp.diff_fromDelta(text1, delta));


diffs = [[DIFF_EQUAL, '\ud83d\ude4b\ud83d'], [DIFF_INSERT, '\ude4c\ud83d'], [DIFF_EQUAL, '\ude4b']];
try {
delta = dmp.diff_toDelta(diffs);
assertEquals('=2\t+%F0%9F%99%8C\t=2', delta);
} catch ( e ) {
assertEquals(false, true);
}

(function(){
const originalText = `U+1F17x 🅰️ 🅱️ 🅾️ 🅿️ safhawifhkw
U+1F18x 🆎
0 1 2 3 4 5 6 7 8 9 A B C D E F
U+1F19x 🆑 🆒 🆓 🆔 🆕 🆖 🆗 🆘 🆙 🆚
U+1F20x 🈁 🈂️ sfss.,_||saavvvbbds
U+1F21x 🈚
U+1F22x 🈯
U+1F23x 🈲 🈳 🈴 🈵 🈶 🈷️ 🈸 🈹 🈺
U+1F25x 🉐 🉑
U+1F30x 🌀 🌁 🌂 🌃 🌄 🌅 🌆 🌇 🌈 🌉 🌊 🌋 🌌 🌍 🌎 🌏
U+1F31x 🌐 🌑 🌒 🌓 🌔 🌕 🌖 🌗 🌘 🌙 🌚 🌛 🌜 🌝 🌞 `;

// Applies some random edits to a string and returns the new, edited string.
// One of three edit kinds (swap, remove, add) is picked at random and then
// repeated 0 to 4 times at random positions. The text is handled as an array
// of code points, so an edit always moves/removes/copies whole characters and
// never half of a surrogate pair.
function applyRandomTextEdit(text) {
// Spreading a string iterates by code point, not by UTF-16 code unit.
let textArr = [...text];
let r = Math.random();
if(r < 1/3) { // swap
// Exchange the elements at two random positions (they may coincide).
let swapCount = Math.floor(Math.random()*5);
for(let i = 0; i < swapCount; i++) {
let swapPos1 = Math.floor(Math.random()*textArr.length);
let swapPos2 = Math.floor(Math.random()*textArr.length);
let char1 = textArr[swapPos1];
let char2 = textArr[swapPos2];
textArr[swapPos1] = char2;
textArr[swapPos2] = char1;
}
} else if(r < 2/3) { // remove
// Blank out random elements; the array length stays the same and the
// blanked entries vanish when the array is joined.
let removeCount = Math.floor(Math.random()*5);
for(let i = 0; i < removeCount; i++) {
let removePos = Math.floor(Math.random()*textArr.length);
textArr[removePos] = "";
}
} else { // add
// Append a copy of one random element to another random element.
let addCount = Math.floor(Math.random()*5);
for(let i = 0; i < addCount; i++) {
let addPos = Math.floor(Math.random()*textArr.length);
let addFromPos = Math.floor(Math.random()*textArr.length);
textArr[addPos] = textArr[addPos] + textArr[addFromPos];
}
}
return textArr.join("");
}

for(let i = 0; i < 1000; i++) {
const newText = applyRandomTextEdit(originalText);
dmp.diff_toDelta(dmp.diff_main(originalText, newText));
}
})();

// Unicode - splitting surrogates
try {
assertEquivalent(
dmp.diff_toDelta([[DIFF_INSERT,'\ud83c\udd71'], [DIFF_EQUAL, '\ud83c\udd70\ud83c\udd71']]),
dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd71', '\ud83c\udd71\ud83c\udd70\ud83c\udd71'))
);
} catch ( e ) {
assertEquals('Inserting similar surrogate pair at beginning', 'crashed');
}

try {
assertEquivalent(
dmp.diff_toDelta([[DIFF_EQUAL,'\ud83c\udd70'], [DIFF_INSERT, '\ud83c\udd70'], [DIFF_EQUAL, '\ud83c\udd71']]),
dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd71', '\ud83c\udd70\ud83c\udd70\ud83c\udd71'))
);
} catch ( e ) {
assertEquals('Inserting similar surrogate pair in the middle', 'crashed');
}

try {
assertEquivalent(
dmp.diff_toDelta([[DIFF_DELETE,'\ud83c\udd71'], [DIFF_EQUAL, '\ud83c\udd70\ud83c\udd71']]),
dmp.diff_toDelta(dmp.diff_main('\ud83c\udd71\ud83c\udd70\ud83c\udd71', '\ud83c\udd70\ud83c\udd71'))
);
} catch ( e ) {
assertEquals('Deleting similar surrogate pair at the beginning', 'crashed');
}

try {
assertEquivalent(
dmp.diff_toDelta([[DIFF_EQUAL, '\ud83c\udd70'], [DIFF_DELETE,'\ud83c\udd72'], [DIFF_EQUAL, '\ud83c\udd71']]),
dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd72\ud83c\udd71', '\ud83c\udd70\ud83c\udd71'))
);
} catch ( e ) {
assertEquals('Deleting similar surrogate pair in the middle', 'crashed');
}

try {
assertEquivalent(
dmp.diff_toDelta([[DIFF_DELETE, '\ud83c\udd70'], [DIFF_INSERT, '\ud83c\udd71']]),
dmp.diff_toDelta([[DIFF_EQUAL, '\ud83c'], [DIFF_DELETE, '\udd70'], [DIFF_INSERT, '\udd71']]),
);
} catch ( e ) {
assertEquals('Swap surrogate pair', 'crashed');
}

try {
assertEquivalent(
dmp.diff_toDelta([[DIFF_INSERT, '\ud83c\udd70'], [DIFF_DELETE, '\ud83c\udd71']]),
dmp.diff_toDelta([[DIFF_EQUAL, '\ud83c'], [DIFF_INSERT, '\udd70'], [DIFF_DELETE, '\udd71']]),
);
} catch ( e ) {
assertEquals('Swap surrogate pair', 'crashed');
}

// Empty diff groups
assertEquivalent(
dmp.diff_toDelta([[DIFF_EQUAL, 'abcdef'], [DIFF_DELETE, ''], [DIFF_INSERT, 'ghijk']]),
dmp.diff_toDelta([[DIFF_EQUAL, 'abcdef'], [DIFF_INSERT, 'ghijk']]),
);

// Different versions of the library may have created deltas with
// half of a surrogate pair encoded as if it were valid UTF-8
try {
assertEquivalent(
dmp.diff_toDelta(dmp.diff_fromDelta('\ud83c\udd70', '-2\t+%F0%9F%85%B1')),
dmp.diff_toDelta(dmp.diff_fromDelta('\ud83c\udd70', '=1\t-1\t+%ED%B5%B1'))
);
} catch ( e ) {
assertEquals('Decode UTF8-encoded surrogate half', 'crashed');
}

// Verify pool of unchanged characters.
diffs = [[DIFF_INSERT, 'A-Z a-z 0-9 - _ . ! ~ * \' ( ) ; / ? : @ & = + $ , # ']];
var text2 = dmp.diff_text2(diffs);
Expand Down Expand Up @@ -1019,4 +1150,4 @@ var tests = [

for (var x = 0; x < tests.length; x++) {
test(tests[x], eval(tests[x]))
}
}