Skip to content

Commit

Permalink
Merge pull request #2 from hackmdio/fix/pathc-stop-breaking-surrogate…
Browse files Browse the repository at this point in the history
…-pairs

fix/stop breaking surrogate pairs
  • Loading branch information
stanley2058 authored May 12, 2023
2 parents c2f8fb9 + 1964735 commit fdb2b05
Show file tree
Hide file tree
Showing 2 changed files with 309 additions and 2 deletions.
178 changes: 177 additions & 1 deletion index.js
Original file line number Diff line number Diff line change
Expand Up @@ -1176,6 +1176,166 @@ diff_match_patch.prototype.diff_cleanupMerge = function(diffs) {
}
};

/**
 * Rearrange diff boundaries that split Unicode surrogate pairs.
 *
 * The diff text is walked one tuple at a time. A tuple whose text ends in a
 * high surrogate has that code unit removed and remembered; every later tuple
 * whose text starts with a low surrogate gets the remembered high surrogate
 * prepended. Tuples whose text is (or becomes) empty are removed.
 *
 * The remembered high surrogate is intentionally kept across tuples so that a
 * shared prefix half such as [EQUAL 'H'], [DELETE 'L1'], [INSERT 'L2'] is
 * restored on both the delete and the insert.
 *
 * The array and its tuples are modified in place; callers may ignore the
 * return value.
 *
 * NOTE(review): an EQUAL tuple that starts with a low surrogate can only
 * receive one high surrogate, so input where the two texts need different
 * high halves at that point is still not represented exactly.
 *
 * @param {!Array.<!diff_match_patch.Diff>} diffs Array of diff tuples.
 * @return {!Array.<!diff_match_patch.Diff>} The same array, cleaned up.
 */
diff_match_patch.prototype.diff_cleanupSplitSurrogates = function(diffs) {
  var pendingHigh;
  for (var x = 0; x < diffs.length; x++) {
    var thisDiff = diffs[x];
    var text = thisDiff[1];

    // Drop empty groups outright.
    if (0 === text.length) {
      diffs.splice(x--, 1);
      continue;
    }

    var thisTop = text[0];
    var thisEnd = text[text.length - 1];

    // The high surrogate that may complete THIS tuple's leading low surrogate
    // is the one left over from an earlier tuple. Capture it before the
    // trailing high surrogate of this tuple replaces it.
    var carriedHigh = pendingHigh;

    if (this.isHighSurrogate(thisEnd)) {
      pendingHigh = thisEnd;
      text = text.slice(0, -1);
    }

    if (carriedHigh && this.isLowSurrogate(thisTop)) {
      text = carriedHigh + text;
    }

    thisDiff[1] = text;

    // A tuple that consisted solely of a high surrogate is now empty.
    if (0 === text.length) {
      diffs.splice(x--, 1);
    }
  }

  return diffs;
};

/**
 * Test whether the first UTF-16 code unit of a string is a high (leading)
 * surrogate, i.e. lies in the range U+D800..U+DBFF.
 *
 * @param {string} c String whose first code unit is examined.
 * @return {boolean} True if that code unit is a high surrogate.
 */
diff_match_patch.prototype.isHighSurrogate = function(c) {
  // All high surrogates share the top six bits 110110; an empty string
  // yields NaN, which masks to 0 and therefore reports false.
  var unit = c.charCodeAt(0);
  return (unit & 0xFC00) === 0xD800;
};

/**
 * Test whether the first UTF-16 code unit of a string is a low (trailing)
 * surrogate, i.e. lies in the range U+DC00..U+DFFF.
 *
 * @param {string} c String whose first code unit is examined.
 * @return {boolean} True if that code unit is a low surrogate.
 */
diff_match_patch.prototype.isLowSurrogate = function(c) {
  // All low surrogates share the top six bits 110111; an empty string
  // yields NaN, which masks to 0 and therefore reports false.
  var unit = c.charCodeAt(0);
  return (unit & 0xFC00) === 0xDC00;
};

/**
 * Convert a single hexadecimal digit character to its numeric value.
 *
 * Only a one-character primitive string from 0-9, A-F or a-f is accepted;
 * anything else (including undefined, which callers pass when indexing past
 * the end of a string) is rejected.
 *
 * @param {string} c One hexadecimal digit.
 * @return {number} Value of the digit, 0 through 15.
 * @throws {Error} 'Invalid hex-code' if c is not a single hex digit.
 */
diff_match_patch.prototype.digit16 = function(c) {
  if (typeof c === 'string' && c.length === 1) {
    var code = c.charCodeAt(0);
    if (code >= 0x30 && code <= 0x39) {  // '0'..'9'
      return code - 0x30;
    }
    if (code >= 0x41 && code <= 0x46) {  // 'A'..'F'
      return code - 0x41 + 10;
    }
    if (code >= 0x61 && code <= 0x66) {  // 'a'..'f'
      return code - 0x61 + 10;
    }
  }
  throw new Error('Invalid hex-code');
};

/**
 * Decode URI-encoded string but allow for encoded surrogate halves
 *
 * diff_match_patch needs this relaxation of the requirements because
 * not all libraries and versions produce valid URI strings in toDelta
 * and we don't want to crash this code when the input is valid input
 * but at the same time invalid utf-8
 *
 * The native decodeURI() is tried first. Only when it throws is the text
 * decoded by hand, one percent-sequence at a time, accepting three-byte
 * sequences that encode a lone surrogate.
 *
 * NOTE(review): the native path leaves escapes of URI-reserved characters
 * (for example %3A) untouched, whereas the fallback decodes every
 * single-byte escape. Output for such input therefore depends on which path
 * runs.
 *
 * @example: decodeURI( 'abcd%20%F0%9F%85%B0' ) = 'abcd \ud83c\udd70'
 * @example: decodeURI( 'abcd%3A %ED%A0%BC' ) = 'abcd: \ud83c'
 *
 * @cite: @mathiasbynens utf8.js at https://github.com/mathiasbynens/utf8.js
 *
 * @param {String} text input string encoded by encodeURI() or equivalent
 * @return {String}
 * @throws {URIError} if a percent-sequence is not a decodable byte sequence
 * @throws {Error} 'Invalid hex-code' (from digit16) if a '%' is not followed
 *     by two hexadecimal digits
 */
diff_match_patch.prototype.decodeURI = function(text) {
  try {
    return decodeURI(text);
  } catch ( e ) {
    var i = 0;
    var decoded = '';

    while (i < text.length) {
      if ( text[i] !== '%' ) {
        decoded += text[i++];
        continue;
      }

      // start a percent-sequence
      var byte1 = (this.digit16(text[i + 1]) << 4) + this.digit16(text[i + 2]);
      if ((byte1 & 0x80) === 0) {
        // single byte: 0xxxxxxx
        decoded += String.fromCharCode(byte1);
        i += 3;
        continue;
      }

      if ('%' !== text[i + 3]) {
        throw new URIError('URI malformed');
      }

      var byte2 = (this.digit16(text[i + 4]) << 4) + this.digit16(text[i + 5]);
      if ((byte2 & 0xC0) !== 0x80) {
        throw new URIError('URI malformed');
      }
      byte2 = byte2 & 0x3F;
      if ((byte1 & 0xE0) === 0xC0) {
        // two bytes: 110xxxxx 10xxxxxx
        decoded += String.fromCharCode(((byte1 & 0x1F) << 6) | byte2);
        i += 6;
        continue;
      }

      if ('%' !== text[i + 6]) {
        throw new URIError('URI malformed');
      }

      var byte3 = (this.digit16(text[i + 7]) << 4) + this.digit16(text[i + 8]);
      if ((byte3 & 0xC0) !== 0x80) {
        throw new URIError('URI malformed');
      }
      byte3 = byte3 & 0x3F;
      if ((byte1 & 0xF0) === 0xE0) {
        // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
        // unpaired surrogate are fine here
        decoded += String.fromCharCode(((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3);
        i += 9;
        continue;
      }

      if ('%' !== text[i + 9]) {
        throw new URIError('URI malformed');
      }

      var byte4 = (this.digit16(text[i + 10]) << 4) + this.digit16(text[i + 11]);
      if ((byte4 & 0xC0) !== 0x80) {
        throw new URIError('URI malformed');
      }
      byte4 = byte4 & 0x3F;
      if ((byte1 & 0xF8) === 0xF0) {
        // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        var codePoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0C) | (byte3 << 0x06) | byte4;
        if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) {
          // UTF-16 encodes (codePoint - 0x10000) as two 10-bit halves.
          // Masking with 0xFFFF instead would discard the plane bits and is
          // only correct for U+10000..U+1FFFF.
          var offset = codePoint - 0x10000;
          decoded += String.fromCharCode(0xD800 | (offset >>> 10));
          decoded += String.fromCharCode(0xDC00 | (offset & 0x3FF));
          i += 12;
          continue;
        }
      }

      throw new URIError('URI malformed');
    }

    return decoded;
  }
};

/**
* loc is a location in text1, compute and return the equivalent location in
Expand Down Expand Up @@ -1219,6 +1379,7 @@ diff_match_patch.prototype.diff_xIndex = function(diffs, loc) {
* @return {string} HTML representation.
*/
diff_match_patch.prototype.diff_prettyHtml = function(diffs) {
diffs = this.diff_cleanupSplitSurrogates(diffs);
var html = [];
var pattern_amp = /&/g;
var pattern_lt = /</g;
Expand Down Expand Up @@ -1319,6 +1480,7 @@ diff_match_patch.prototype.diff_levenshtein = function(diffs) {
* @return {string} Delta text.
*/
diff_match_patch.prototype.diff_toDelta = function(diffs) {
diffs = this.diff_cleanupSplitSurrogates(diffs);
var text = [];
for (var x = 0; x < diffs.length; x++) {
switch (diffs[x][0]) {
Expand Down Expand Up @@ -1361,7 +1523,7 @@ diff_match_patch.prototype.diff_fromDelta = function(text1, delta) {
switch (tokens[x].charAt(0)) {
case '+':
try {
diffs[diffsLength++] = [DIFF_INSERT, decodeURI(param)];
diffs[diffsLength++] = [DIFF_INSERT, this.decodeURI(param)];
} catch (ex) {
// Malformed URI sequence.
throw new Error('Illegal escape in diff_fromDelta: ' + param);
Expand Down Expand Up @@ -1597,11 +1759,23 @@ diff_match_patch.prototype.patch_addContext_ = function(patch, text) {
padding += this.Patch_Margin;

// Add the prefix.
if (
patch.start2 - padding > 0 &&
diff_match_patch.prototype.isLowSurrogate(text[patch.start2 - padding])
) {
padding++;
}
var prefix = text.substring(patch.start2 - padding, patch.start2);
if (prefix) {
patch.diffs.unshift([DIFF_EQUAL, prefix]);
}
// Add the suffix.
if (
patch.start2 + patch.length1 + padding < text.length &&
diff_match_patch.prototype.isHighSurrogate(text[patch.start2 + patch.length1 + padding])
) {
padding++;
}
var suffix = text.substring(patch.start2 + patch.length1,
patch.start2 + patch.length1 + padding);
if (suffix) {
Expand Down Expand Up @@ -1675,6 +1849,7 @@ diff_match_patch.prototype.patch_make = function(a, opt_b, opt_c) {
if (diffs.length === 0) {
return []; // Get rid of the null case.
}
diffs = this.diff_cleanupSplitSurrogates(diffs);
var patches = [];
var patch = new diff_match_patch.patch_obj();
var patchDiffLength = 0; // Keeping our own length var is faster in JS.
Expand Down Expand Up @@ -2171,6 +2346,7 @@ diff_match_patch.patch_obj.prototype.toString = function() {
var text = ['@@ -' + coords1 + ' +' + coords2 + ' @@\n'];
var op;
// Escape the body of the patch with %xx notation.
diff_match_patch.prototype.diff_cleanupSplitSurrogates(this.diffs);
for (var x = 0; x < this.diffs.length; x++) {
switch (this.diffs[x][0]) {
case DIFF_INSERT:
Expand Down
133 changes: 132 additions & 1 deletion test/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,137 @@ function testDiffDelta() {
// Convert delta string into a diff.
assertEquivalent(diffs, dmp.diff_fromDelta(text1, delta));


diffs = [[DIFF_EQUAL, '\ud83d\ude4b\ud83d'], [DIFF_INSERT, '\ude4c\ud83d'], [DIFF_EQUAL, '\ude4b']];
try {
delta = dmp.diff_toDelta(diffs);
assertEquals('=2\t+%F0%9F%99%8C\t=2', delta);
} catch ( e ) {
assertEquals(false, true);
}

(function(){
const originalText = `U+1F17x 🅰️ 🅱️ 🅾️ 🅿️ safhawifhkw
U+1F18x 🆎
0 1 2 3 4 5 6 7 8 9 A B C D E F
U+1F19x 🆑 🆒 🆓 🆔 🆕 🆖 🆗 🆘 🆙 🆚
U+1F20x 🈁 🈂️ sfss.,_||saavvvbbds
U+1F21x 🈚
U+1F22x 🈯
U+1F23x 🈲 🈳 🈴 🈵 🈶 🈷️ 🈸 🈹 🈺
U+1F25x 🉐 🉑
U+1F30x 🌀 🌁 🌂 🌃 🌄 🌅 🌆 🌇 🌈 🌉 🌊 🌋 🌌 🌍 🌎 🌏
U+1F31x 🌐 🌑 🌒 🌓 🌔 🌕 🌖 🌗 🌘 🌙 🌚 🌛 🌜 🌝 🌞 `;

// Applies one randomly chosen kind of edit (swap, remove or add) up to four
// times to the code points of `text` and returns the edited string.
// Working on code points keeps surrogate pairs intact in the result.
function applyRandomTextEdit(text) {
  const chars = [...text];
  const randomIndex = () => Math.floor(Math.random() * chars.length);

  // First draw picks the edit kind, second draw picks how many edits (0-4).
  const choice = Math.random();
  const editCount = Math.floor(Math.random() * 5);

  for (let n = 0; n < editCount; n++) {
    if (choice < 1 / 3) {
      // swap two positions
      const first = randomIndex();
      const second = randomIndex();
      [chars[first], chars[second]] = [chars[second], chars[first]];
    } else if (choice < 2 / 3) {
      // remove: blank the slot so that indices stay stable
      chars[randomIndex()] = "";
    } else {
      // add: append a copy of one slot's content to another slot
      const target = randomIndex();
      const source = randomIndex();
      chars[target] = chars[target] + chars[source];
    }
  }

  return chars.join("");
}

for(let i = 0; i < 1000; i++) {
const newText = applyRandomTextEdit(originalText);
dmp.diff_toDelta(dmp.diff_main(originalText, newText));
}
})();

// Unicode - splitting surrogates
try {
assertEquivalent(
dmp.diff_toDelta([[DIFF_INSERT,'\ud83c\udd71'], [DIFF_EQUAL, '\ud83c\udd70\ud83c\udd71']]),
dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd71', '\ud83c\udd71\ud83c\udd70\ud83c\udd71'))
);
} catch ( e ) {
assertEquals('Inserting similar surrogate pair at beginning', 'crashed');
}

try {
assertEquivalent(
dmp.diff_toDelta([[DIFF_EQUAL,'\ud83c\udd70'], [DIFF_INSERT, '\ud83c\udd70'], [DIFF_EQUAL, '\ud83c\udd71']]),
dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd71', '\ud83c\udd70\ud83c\udd70\ud83c\udd71'))
);
} catch ( e ) {
assertEquals('Inserting similar surrogate pair in the middle', 'crashed');
}

try {
assertEquivalent(
dmp.diff_toDelta([[DIFF_DELETE,'\ud83c\udd71'], [DIFF_EQUAL, '\ud83c\udd70\ud83c\udd71']]),
dmp.diff_toDelta(dmp.diff_main('\ud83c\udd71\ud83c\udd70\ud83c\udd71', '\ud83c\udd70\ud83c\udd71'))
);
} catch ( e ) {
assertEquals('Deleting similar surrogate pair at the beginning', 'crashed');
}

try {
assertEquivalent(
dmp.diff_toDelta([[DIFF_EQUAL, '\ud83c\udd70'], [DIFF_DELETE,'\ud83c\udd72'], [DIFF_EQUAL, '\ud83c\udd71']]),
dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd72\ud83c\udd71', '\ud83c\udd70\ud83c\udd71'))
);
} catch ( e ) {
assertEquals('Deleting similar surrogate pair in the middle', 'crashed');
}

try {
assertEquivalent(
dmp.diff_toDelta([[DIFF_DELETE, '\ud83c\udd70'], [DIFF_INSERT, '\ud83c\udd71']]),
dmp.diff_toDelta([[DIFF_EQUAL, '\ud83c'], [DIFF_DELETE, '\udd70'], [DIFF_INSERT, '\udd71']]),
);
} catch ( e ) {
assertEquals('Swap surrogate pair', 'crashed');
}

try {
assertEquivalent(
dmp.diff_toDelta([[DIFF_INSERT, '\ud83c\udd70'], [DIFF_DELETE, '\ud83c\udd71']]),
dmp.diff_toDelta([[DIFF_EQUAL, '\ud83c'], [DIFF_INSERT, '\udd70'], [DIFF_DELETE, '\udd71']]),
);
} catch ( e ) {
assertEquals('Swap surrogate pair', 'crashed');
}

// Empty diff groups
assertEquivalent(
dmp.diff_toDelta([[DIFF_EQUAL, 'abcdef'], [DIFF_DELETE, ''], [DIFF_INSERT, 'ghijk']]),
dmp.diff_toDelta([[DIFF_EQUAL, 'abcdef'], [DIFF_INSERT, 'ghijk']]),
);

// Different versions of the library may have created deltas with
// half of a surrogate pair encoded as if it were valid UTF-8
try {
assertEquivalent(
dmp.diff_toDelta(dmp.diff_fromDelta('\ud83c\udd70', '-2\t+%F0%9F%85%B1')),
dmp.diff_toDelta(dmp.diff_fromDelta('\ud83c\udd70', '=1\t-1\t+%ED%B5%B1'))
);
} catch ( e ) {
assertEquals('Decode UTF8-encoded surrogate half', 'crashed');
}

// Verify pool of unchanged characters.
diffs = [[DIFF_INSERT, 'A-Z a-z 0-9 - _ . ! ~ * \' ( ) ; / ? : @ & = + $ , # ']];
var text2 = dmp.diff_text2(diffs);
Expand Down Expand Up @@ -1019,4 +1150,4 @@ var tests = [

for (var x = 0; x < tests.length; x++) {
test(tests[x], eval(tests[x]))
}
}

0 comments on commit fdb2b05

Please sign in to comment.