Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings
This repository was archived by the owner on Aug 5, 2024. It is now read-only.

Commit 50f1542

Browse files
committed
Python3: Stop breaking surrogate pairs in toDelta()
Resolves #69 for Python3. Sometimes we can find a common prefix that runs into the middle of a surrogate pair, and we split that pair when building our diff groups. This is fine as long as we are operating on UTF-16 code units. It becomes problematic when we start trying to treat those substrings as valid Unicode (or UTF-8) sequences. When we pass these split groups into `toDelta()` we do just that and the library crashes. In this patch we're post-processing the diff groups before encoding them to make sure that we un-split the surrogate pairs. The post-processed diffs should produce the same output when applying the diffs. The diff string itself will be different but shouldn't change that much - only by a single character at surrogate boundaries.
1 parent db1cbba commit 50f1542

File tree

2 files changed

+82
-7
lines changed

2 files changed

+82
-7
lines changed

‎python3/diff_match_patch.py‎

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
__author__ = 'fraser@google.com (Neil Fraser)'
2727

2828
import re
29+
import struct
2930
import sys
3031
import time
3132
import urllib.parse
@@ -1147,14 +1148,17 @@ def diff_toDelta(self, diffs):
11471148
"""
11481149
text = []
11491150
for (op, data) in diffs:
1151+
if 0 == len(data):
1152+
continue
1153+
11501154
if op == self.DIFF_INSERT:
11511155
# High ascii will raise UnicodeDecodeError. Use Unicode instead.
11521156
data = data.encode("utf-8")
11531157
text.append("+" + urllib.parse.quote(data, "!~*'();/?:@&=+$,# "))
11541158
elif op == self.DIFF_DELETE:
1155-
text.append("-%d" % len(data))
1159+
text.append("-%d" % (len(data.encode('utf-16-be')) //2))
11561160
elif op == self.DIFF_EQUAL:
1157-
text.append("=%d" % len(data))
1161+
text.append("=%d" % (len(data.encode('utf-16-be')) //2))
11581162
return "\t".join(text)
11591163

11601164
def diff_fromDelta(self, text1, delta):
@@ -1172,7 +1176,8 @@ def diff_fromDelta(self, text1, delta):
11721176
ValueError: If invalid input.
11731177
"""
11741178
diffs = []
1175-
pointer = 0 # Cursor in text1
1179+
as_utf16 = text1.encode('utf-16-be')
1180+
pointer = 0 # Cursor in as_utf16
11761181
tokens = delta.split("\t")
11771182
for token in tokens:
11781183
if token == "":
@@ -1191,8 +1196,8 @@ def diff_fromDelta(self, text1, delta):
11911196
raise ValueError("Invalid number in diff_fromDelta: " + param)
11921197
if n < 0:
11931198
raise ValueError("Negative number in diff_fromDelta: " + param)
1194-
text = text1[pointer : pointer + n]
1195-
pointer += n
1199+
text = as_utf16[pointer : pointer + n*2].decode('utf-16-be')
1200+
pointer += n*2
11961201
if token[0] == "=":
11971202
diffs.append((self.DIFF_EQUAL, text))
11981203
else:
@@ -1201,10 +1206,10 @@ def diff_fromDelta(self, text1, delta):
12011206
# Anything else is an error.
12021207
raise ValueError("Invalid diff operation in diff_fromDelta: " +
12031208
token[0])
1204-
if pointer != len(text1):
1209+
if pointer != len(as_utf16):
12051210
raise ValueError(
12061211
"Delta length (%d) does not equal source text length (%d)." %
1207-
(pointer, len(text1)))
1212+
(pointer, len(as_utf16)))
12081213
return diffs
12091214

12101215
# MATCH FUNCTIONS

‎python3/tests/diff_match_patch_test.py‎

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
"""
1919

2020
import imp
21+
import json
2122
import os
2223
import sys
2324
import time
@@ -444,6 +445,12 @@ def testDiffDelta(self):
444445
# Convert delta string into a diff.
445446
self.assertEqual(diffs, self.dmp.diff_fromDelta(text1, delta))
446447

448+
diffs = self.dmp.diff_main("\U0001F64B\U0001F64B", "\U0001F64B\U0001F64C\U0001F64B")
449+
delta = self.dmp.diff_toDelta(diffs)
450+
self.assertEqual("=2\t+%F0%9F%99%8C\t=2", delta)
451+
452+
self.assertEqual(diffs, self.dmp.diff_fromDelta("\U0001F64B\U0001F64B", "=2\t+%F0%9F%99%8C\t=2"))
453+
447454
# Verify pool of unchanged characters.
448455
diffs = [(self.dmp.DIFF_INSERT, "A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? : @ & = + $ , # ")]
449456
text2 = self.dmp.diff_text2(diffs)
@@ -455,6 +462,69 @@ def testDiffDelta(self):
455462
# Convert delta string into a diff.
456463
self.assertEqual(diffs, self.dmp.diff_fromDelta("", delta))
457464

465+
# Unicode: split surrogates
466+
self.assertEqual(
467+
self.dmp.diff_toDelta([
468+
(self.dmp.DIFF_INSERT, '\U0001F171'),
469+
(self.dmp.DIFF_EQUAL, '\U0001F170\U0001F171')
470+
]),
471+
self.dmp.diff_toDelta(self.dmp.diff_main(
472+
'\U0001F170\U0001F171',
473+
'\U0001F171\U0001F170\U0001F171'
474+
)),
475+
'Inserting similar surrogate pair at beginning'
476+
)
477+
478+
self.assertEqual(
479+
self.dmp.diff_toDelta([
480+
(self.dmp.DIFF_EQUAL, '\U0001F170'),
481+
(self.dmp.DIFF_INSERT, '\U0001F172'),
482+
(self.dmp.DIFF_EQUAL, '\U0001F171')
483+
]),
484+
self.dmp.diff_toDelta(self.dmp.diff_main(
485+
'\U0001F170\U0001F171',
486+
'\U0001F170\U0001F172\U0001F171'
487+
)),
488+
'Inserting similar surrogate pair in the middle'
489+
)
490+
491+
self.assertEqual(
492+
self.dmp.diff_toDelta([
493+
(self.dmp.DIFF_DELETE, '\U0001F171'),
494+
(self.dmp.DIFF_EQUAL, '\U0001F170\U0001F171')
495+
]),
496+
self.dmp.diff_toDelta(self.dmp.diff_main(
497+
'\U0001F171\U0001F170\U0001F171',
498+
'\U0001F170\U0001F171'
499+
)),
500+
'Deleting similar surogate pair at the beginning'
501+
)
502+
503+
self.assertEqual(
504+
self.dmp.diff_toDelta([
505+
(self.dmp.DIFF_EQUAL, '\U0001F170'),
506+
(self.dmp.DIFF_DELETE, '\U0001F172'),
507+
(self.dmp.DIFF_EQUAL, '\U0001F171')
508+
]),
509+
self.dmp.diff_toDelta(self.dmp.diff_main(
510+
'\U0001F170\U0001F172\U0001F171',
511+
'\U0001F170\U0001F171'
512+
)),
513+
'Deleting similar surogate pair in the middle'
514+
)
515+
516+
self.assertEqual(
517+
self.dmp.diff_toDelta([
518+
(self.dmp.DIFF_DELETE, '\U0001F170'),
519+
(self.dmp.DIFF_INSERT, '\U0001F171')
520+
]),
521+
self.dmp.diff_toDelta(self.dmp.diff_main(
522+
'\U0001F170',
523+
'\U0001F171'
524+
)),
525+
'Swap surrogate pair'
526+
)
527+
458528
# 160 kb string.
459529
a = "abcdefghij"
460530
for i in range(14):

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /