Merge pull request #2 from hackmdio/fix/pathc-stop-breaking-surrogate…

…-pairs fix/stop breaking surrogate pairs
hackmdio · May 12, 2023 · fdb2b05 · fdb2b05
2 parents c2f8fb9 + 1964735
commit fdb2b05
Show file tree

Hide file tree

Showing 2 changed files with 309 additions and 2 deletions.
diff --git a/index.js b/index.js
@@ -1176,6 +1176,166 @@ diff_match_patch.prototype.diff_cleanupMerge = function(diffs) {
   }
 };
 
+/**
+ * Rearrange diff boundaries that split Unicode surrogate pairs.
+ *
+ * Walks the diffs in order. A high surrogate found at the end of a diff is
+ * cut off and remembered; a diff that starts with a low surrogate gets the
+ * remembered high surrogate prepended. Diffs that are empty on input, or
+ * that become empty after trimming, are removed. The array and its tuples
+ * are modified in place.
+ *
+ * The remembered high surrogate is never cleared, so every later diff that
+ * starts with a low surrogate reuses it (e.g. a DELETE and an INSERT that
+ * both follow an EQUAL ending in the shared high surrogate).
+ * NOTE(review): an EQUAL that starts with a low surrogate is joined only with
+ * the most recently remembered high surrogate, even when the two texts
+ * disagree about which high surrogate precedes it.
+ *
+ * @param {!Array.<!diff_match_patch.Diff>} diffs Array of diff tuples.
+ * @return {!Array.<!diff_match_patch.Diff>} The same array, after cleanup.
+ */
+diff_match_patch.prototype.diff_cleanupSplitSurrogates = function(diffs) {
+  var lastEnd;
+  for (var x = 0; x < diffs.length; x++) {
+    var thisDiff = diffs[x];
+
+    // Drop diffs that carry no text; x-- re-visits the index that the
+    // splice just refilled.
+    if (0 === thisDiff[1].length) {
+      diffs.splice(x--, 1);
+      continue;
+    }
+
+    var thisTop  = thisDiff[1][0];
+    var thisEnd  = thisDiff[1][thisDiff[1].length - 1];
+
+    // Re-join first, using the high surrogate saved from an EARLIER diff.
+    // Doing this before lastEnd is updated keeps a diff that both starts
+    // with a low surrogate and ends with a high one from being paired with
+    // its own trailing half.
+    if (lastEnd && this.isLowSurrogate(thisTop)) {
+      thisDiff[1] = lastEnd + thisDiff[1];
+    }
+
+    // A trailing high surrogate belongs with the start of a following diff:
+    // remember it and cut it off here.
+    if (this.isHighSurrogate(thisEnd)) {
+      lastEnd = thisEnd;
+      thisDiff[1] = thisDiff[1].slice(0, -1);
+    }
+
+    // Trimming may have emptied a diff that held only a high surrogate.
+    if (0 === thisDiff[1].length) {
+      diffs.splice(x--, 1);
+      continue;
+    }
+  }
+
+  return diffs;
+};
+
+/**
+ * Test whether the first UTF-16 code unit of a string is a high (leading)
+ * surrogate, i.e. lies in the range 0xD800-0xDBFF.
+ *
+ * @param {string} c String whose first code unit is examined.
+ * @return {boolean} True if that code unit is a high surrogate.
+ */
+diff_match_patch.prototype.isHighSurrogate = function(c) {
+  var codeUnit = c.charCodeAt(0);
+  return 0xD800 <= codeUnit && codeUnit <= 0xDBFF;
+};
+
+/**
+ * Test whether the first UTF-16 code unit of a string is a low (trailing)
+ * surrogate, i.e. lies in the range 0xDC00-0xDFFF.
+ *
+ * @param {string} c String whose first code unit is examined.
+ * @return {boolean} True if that code unit is a low surrogate.
+ */
+diff_match_patch.prototype.isLowSurrogate = function(c) {
+  var codeUnit = c.charCodeAt(0);
+  return 0xDC00 <= codeUnit && codeUnit <= 0xDFFF;
+};
+
+/**
+ * Convert a single hexadecimal digit character to its numeric value.
+ *
+ * @param {string} c Exactly one character out of 0-9, A-F or a-f.
+ * @return {number} Value of the digit, 0 through 15.
+ * @throws {Error} 'Invalid hex-code' if c is anything else, including
+ *     undefined, an empty string or a longer string.
+ */
+diff_match_patch.prototype.digit16 = function(c) {
+  // Only a primitive one-character hex digit is accepted; the typeof guard
+  // stops the regex from coercing non-string values into a match.
+  if (typeof c !== 'string' || !/^[0-9A-Fa-f]$/.test(c)) {
+    throw new Error('Invalid hex-code');
+  }
+  return parseInt(c, 16);
+};
+
+/**
+ * Decode URI-encoded string but allow for encoded surrogate halves
+ *
+ * diff_match_patch needs this relaxation of the requirements because
+ * not all libraries and versions produce valid URI strings in toDelta
+ * and we don't want to crash this code when the input is valid input
+ * but at the same time invalid utf-8
+ *
+ * The native decodeURI() is tried first. Only when it throws is the text
+ * decoded by hand, one percent-sequence at a time: 1-, 2-, 3- and 4-byte
+ * UTF-8 sequences are recognised, and a 3-byte sequence is accepted even
+ * when it encodes a lone surrogate half.
+ *
+ * @example: decodeURI( 'abcd%3A %F0%9F%85%B0' ) = 'abcd: \ud83c\udd70'
+ * @example: decodeURI( 'abcd%3A %ED%A0%BC' ) = 'abcd: \ud83c'
+ *
+ * @cite: @mathiasbynens utf8.js at https://github.com/mathiasbynens/utf8.js
+ *
+ * @param {String} text input string encoded by encodeURI() or equivalent
+ * @return {String}
+ * @throws {URIError} 'URI malformed' when a multi-byte sequence is cut
+ *     short, has a bad continuation byte, or encodes a value outside
+ *     U+10000..U+10FFFF in its 4-byte form.
+ * @throws {Error} 'Invalid hex-code' (from digit16) when a '%' is not
+ *     followed by two hexadecimal digits.
+ */
+diff_match_patch.prototype.decodeURI = function(text) {
+  try {
+    return decodeURI(text);
+  } catch ( e ) {
+    // The native decoder rejected the text; fall back to the lenient
+    // hand-written decoder below.
+    var i = 0;
+    var decoded = '';
+
+    while (i < text.length) {
+      if ( text[i] !== '%' ) {
+        decoded += text[i++];
+        continue;
+      }
+
+      // start a percent-sequence
+      var byte1 = (this.digit16(text[i + 1]) << 4) + this.digit16(text[i + 2]);
+      if ((byte1 & 0x80) === 0) {
+        // single byte (ASCII)
+        decoded += String.fromCharCode(byte1);
+        i += 3;
+        continue;
+      }
+
+      if ('%' !== text[i + 3]) {
+        throw new URIError('URI malformed');
+      }
+
+      var byte2 = (this.digit16(text[i + 4]) << 4) + this.digit16(text[i + 5]);
+      if ((byte2 & 0xC0) !== 0x80) {
+        throw new URIError('URI malformed');
+      }
+      byte2 = byte2 & 0x3F;
+      if ((byte1 & 0xE0) === 0xC0) {
+        // two-byte sequence: 110xxxxx 10xxxxxx
+        decoded += String.fromCharCode(((byte1 & 0x1F) << 6) | byte2);
+        i += 6;
+        continue;
+      }
+
+      if ('%' !== text[i + 6]) {
+        throw new URIError('URI malformed');
+      }
+
+      var byte3 = (this.digit16(text[i + 7]) << 4) + this.digit16(text[i + 8]);
+      if ((byte3 & 0xC0) !== 0x80) {
+        throw new URIError('URI malformed');
+      }
+      byte3 = byte3 & 0x3F;
+      if ((byte1 & 0xF0) === 0xE0) {
+        // three-byte sequence; unpaired surrogates are fine here
+        decoded += String.fromCharCode(((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3);
+        i += 9;
+        continue;
+      }
+
+      if ('%' !== text[i + 9]) {
+        throw new URIError('URI malformed');
+      }
+
+      var byte4 = (this.digit16(text[i + 10]) << 4) + this.digit16(text[i + 11]);
+      if ((byte4 & 0xC0) !== 0x80) {
+        throw new URIError('URI malformed');
+      }
+      byte4 = byte4 & 0x3F;
+      if ((byte1 & 0xF8) === 0xF0) {
+        var codePoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0C) | (byte3 << 0x06) | byte4;
+        if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) {
+          // UTF-16 stores (codePoint - 0x10000) as two 10-bit halves.
+          // Subtracting, rather than masking with 0xFFFF, keeps the plane
+          // bits so code points from U+20000 up get the right high surrogate.
+          var offset = codePoint - 0x10000;
+          decoded += String.fromCharCode(0xD800 | (offset >>> 10));
+          decoded += String.fromCharCode(0xDC00 | (offset & 0x3FF));
+          i += 12;
+          continue;
+        }
+      }
+
+      throw new URIError('URI malformed');
+    }
+
+    return decoded;
+  }
+};
 
 /**
  * loc is a location in text1, compute and return the equivalent location in
@@ -1219,6 +1379,7 @@ diff_match_patch.prototype.diff_xIndex = function(diffs, loc) {
  * @return {string} HTML representation.
  */
 diff_match_patch.prototype.diff_prettyHtml = function(diffs) {
+  diffs = this.diff_cleanupSplitSurrogates(diffs);
   var html = [];
   var pattern_amp = /&/g;
   var pattern_lt = /</g;
@@ -1319,6 +1480,7 @@ diff_match_patch.prototype.diff_levenshtein = function(diffs) {
  * @return {string} Delta text.
  */
 diff_match_patch.prototype.diff_toDelta = function(diffs) {
+  diffs = this.diff_cleanupSplitSurrogates(diffs);
   var text = [];
   for (var x = 0; x < diffs.length; x++) {
     switch (diffs[x][0]) {
@@ -1361,7 +1523,7 @@ diff_match_patch.prototype.diff_fromDelta = function(text1, delta) {
     switch (tokens[x].charAt(0)) {
       case '+':
         try {
-          diffs[diffsLength++] = [DIFF_INSERT, decodeURI(param)];
+          diffs[diffsLength++] = [DIFF_INSERT, this.decodeURI(param)];
         } catch (ex) {
           // Malformed URI sequence.
           throw new Error('Illegal escape in diff_fromDelta: ' + param);
@@ -1597,11 +1759,23 @@ diff_match_patch.prototype.patch_addContext_ = function(patch, text) {
   padding += this.Patch_Margin;
 
   // Add the prefix.
+  if (
+    patch.start2 - padding > 0 &&
+    diff_match_patch.prototype.isLowSurrogate(text[patch.start2 - padding])
+  ) {
+    padding++;
+  }
   var prefix = text.substring(patch.start2 - padding, patch.start2);
   if (prefix) {
     patch.diffs.unshift([DIFF_EQUAL, prefix]);
   }
   // Add the suffix.
+  if (
+    patch.start2 + patch.length1 + padding < text.length &&
+    diff_match_patch.prototype.isHighSurrogate(text[patch.start2 + patch.length1 + padding])
+  ) {
+    padding++;
+  }
   var suffix = text.substring(patch.start2 + patch.length1,
                               patch.start2 + patch.length1 + padding);
   if (suffix) {
@@ -1675,6 +1849,7 @@ diff_match_patch.prototype.patch_make = function(a, opt_b, opt_c) {
   if (diffs.length === 0) {
     return [];  // Get rid of the null case.
   }
+  diffs = this.diff_cleanupSplitSurrogates(diffs);
   var patches = [];
   var patch = new diff_match_patch.patch_obj();
   var patchDiffLength = 0;  // Keeping our own length var is faster in JS.
@@ -2171,6 +2346,7 @@ diff_match_patch.patch_obj.prototype.toString = function() {
   var text = ['@@ -' + coords1 + ' +' + coords2 + ' @@\n'];
   var op;
   // Escape the body of the patch with %xx notation.
+  diff_match_patch.prototype.diff_cleanupSplitSurrogates(this.diffs);
   for (var x = 0; x < this.diffs.length; x++) {
     switch (this.diffs[x][0]) {
       case DIFF_INSERT:

diff --git a/test/index.js b/test/index.js
@@ -483,6 +483,137 @@ function testDiffDelta() {
   // Convert delta string into a diff.
   assertEquivalent(diffs, dmp.diff_fromDelta(text1, delta));
 
+
+  diffs = [[DIFF_EQUAL, '\ud83d\ude4b\ud83d'], [DIFF_INSERT, '\ude4c\ud83d'], [DIFF_EQUAL, '\ude4b']];
+  try {
+    delta = dmp.diff_toDelta(diffs);
+    assertEquals('=2\t+%F0%9F%99%8C\t=2', delta);
+  } catch ( e ) {
+    assertEquals(false, true);
+  }
+
+  (function(){
+    const originalText = `U+1F17x	🅰️	🅱️		🅾️	🅿️ safhawifhkw
+    U+1F18x															🆎	
+    0	1	2	3	4	5	6	7	8	9	A	B	C	D	E	F
+    U+1F19x		🆑	🆒	🆓	🆔	🆕	🆖	🆗	🆘	🆙	🆚					
+    U+1F20x		🈁	🈂️							sfss.,_||saavvvbbds						
+    U+1F21x	🈚					
+    U+1F22x			🈯
+    U+1F23x			🈲	🈳	🈴	🈵	🈶	🈷️	🈸	🈹	🈺					
+    U+1F25x	🉐	🉑		
+    U+1F30x	🌀	🌁	🌂	🌃	🌄	🌅	🌆	🌇	🌈	🌉	🌊	🌋	🌌	🌍	🌎	🌏
+    U+1F31x	🌐	🌑	🌒	🌓	🌔	🌕	🌖	🌗	🌘	🌙	🌚	🌛	🌜	🌝	🌞	`;
+
+    // Test helper: applies one randomly chosen kind of edit (swap, remove or add) at random positions and returns the new, edited string.
+    function applyRandomTextEdit(text) {
+      let textArr = [...text]; // spread iterates by code point, so a surrogate pair stays in one element
+      let r = Math.random(); // picks the kind of edit, one third of the range each
+      if(r < 1/3) { // swap: exchange the elements at two random positions, 0 to 4 times
+      let swapCount = Math.floor(Math.random()*5);
+        for(let i = 0; i < swapCount; i++) {
+        let swapPos1 = Math.floor(Math.random()*textArr.length);
+          let swapPos2 = Math.floor(Math.random()*textArr.length);
+          let char1 = textArr[swapPos1];
+          let char2 = textArr[swapPos2];
+          textArr[swapPos1] = char2;
+          textArr[swapPos2] = char1;
+        }
+      } else if(r < 2/3) { // remove: blank out the element at a random position, 0 to 4 times
+        let removeCount = Math.floor(Math.random()*5);
+        for(let i = 0; i < removeCount; i++) {
+          let removePos = Math.floor(Math.random()*textArr.length);
+          textArr[removePos] = "";
+        }
+      } else { // add: append a copy of one random element to another random element, 0 to 4 times
+        let addCount = Math.floor(Math.random()*5);
+        for(let i = 0; i < addCount; i++) {
+          let addPos = Math.floor(Math.random()*textArr.length);
+          let addFromPos = Math.floor(Math.random()*textArr.length);
+          textArr[addPos] = textArr[addPos] + textArr[addFromPos];
+        }
+      }
+      return textArr.join("");
+    }
+
+    for(let i = 0; i < 1000; i++) {
+      const newText = applyRandomTextEdit(originalText);
+      dmp.diff_toDelta(dmp.diff_main(originalText, newText));
+    }
+  })();
+
+  // Unicode - splitting surrogates
+  try {
+    assertEquivalent(
+      dmp.diff_toDelta([[DIFF_INSERT,'\ud83c\udd71'], [DIFF_EQUAL, '\ud83c\udd70\ud83c\udd71']]),
+      dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd71', '\ud83c\udd71\ud83c\udd70\ud83c\udd71'))
+    );
+  } catch ( e ) {
+    assertEquals('Inserting similar surrogate pair at beginning', 'crashed');
+  }
+
+  try {
+    assertEquivalent(
+      dmp.diff_toDelta([[DIFF_EQUAL,'\ud83c\udd70'], [DIFF_INSERT, '\ud83c\udd70'], [DIFF_EQUAL, '\ud83c\udd71']]),
+      dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd71', '\ud83c\udd70\ud83c\udd70\ud83c\udd71'))
+    );
+  } catch ( e ) {
+    assertEquals('Inserting similar surrogate pair in the middle', 'crashed');
+  }
+
+  try {
+    assertEquivalent(
+      dmp.diff_toDelta([[DIFF_DELETE,'\ud83c\udd71'], [DIFF_EQUAL, '\ud83c\udd70\ud83c\udd71']]),
+      dmp.diff_toDelta(dmp.diff_main('\ud83c\udd71\ud83c\udd70\ud83c\udd71', '\ud83c\udd70\ud83c\udd71'))
+    );
+  } catch ( e ) {
+    assertEquals('Deleting similar surrogate pair at the beginning', 'crashed');
+  }
+
+  try {
+    assertEquivalent(
+      dmp.diff_toDelta([[DIFF_EQUAL, '\ud83c\udd70'], [DIFF_DELETE,'\ud83c\udd72'], [DIFF_EQUAL, '\ud83c\udd71']]),
+      dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd72\ud83c\udd71', '\ud83c\udd70\ud83c\udd71'))
+    );
+  } catch ( e ) {
+    assertEquals('Deleting similar surrogate pair in the middle', 'crashed');
+  }
+
+  try {
+    assertEquivalent(
+      dmp.diff_toDelta([[DIFF_DELETE, '\ud83c\udd70'], [DIFF_INSERT, '\ud83c\udd71']]),
+      dmp.diff_toDelta([[DIFF_EQUAL, '\ud83c'], [DIFF_DELETE, '\udd70'], [DIFF_INSERT, '\udd71']]),
+    );
+  } catch ( e ) {
+    assertEquals('Swap surrogate pair', 'crashed');
+  }
+
+  try {
+    assertEquivalent(
+      dmp.diff_toDelta([[DIFF_INSERT, '\ud83c\udd70'], [DIFF_DELETE, '\ud83c\udd71']]),
+      dmp.diff_toDelta([[DIFF_EQUAL, '\ud83c'], [DIFF_INSERT, '\udd70'], [DIFF_DELETE, '\udd71']]),
+    );
+  } catch ( e ) {
+    assertEquals('Swap surrogate pair', 'crashed');
+  }
+
+  // Empty diff groups
+  assertEquivalent(
+    dmp.diff_toDelta([[DIFF_EQUAL, 'abcdef'], [DIFF_DELETE, ''], [DIFF_INSERT, 'ghijk']]),
+    dmp.diff_toDelta([[DIFF_EQUAL, 'abcdef'], [DIFF_INSERT, 'ghijk']]),
+  );
+
+  // Different versions of the library may have created deltas with
+  // half of a surrogate pair encoded as if it were valid UTF-8
+  try {
+    assertEquivalent(
+      dmp.diff_toDelta(dmp.diff_fromDelta('\ud83c\udd70', '-2\t+%F0%9F%85%B1')),
+      dmp.diff_toDelta(dmp.diff_fromDelta('\ud83c\udd70', '=1\t-1\t+%ED%B5%B1'))
+    );
+  } catch ( e ) {
+    assertEquals('Decode UTF8-encoded surrogate half', 'crashed');
+  }
+
   // Verify pool of unchanged characters.
   diffs = [[DIFF_INSERT, 'A-Z a-z 0-9 - _ . ! ~ * \' ( ) ; / ? : @ & = + $ , # ']];
   var text2 = dmp.diff_text2(diffs);
@@ -1019,4 +1150,4 @@ var tests = [
 
   for (var x = 0; x < tests.length; x++) {
     test(tests[x], eval(tests[x]))
-  }
+  }