Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit db1cbba

Browse files
committed
Python2: Stop breaking surrogate pairs in toDelta()
Resolves google#69 for Python2. Sometimes we can find a common prefix that runs into the middle of a surrogate pair and we split that pair when building our diff groups. This is fine as long as we are operating on UTF-16 code units. It becomes problematic when we start trying to treat those substrings as valid Unicode (or UTF-8) sequences. When we pass these split groups into `toDelta()` we do just that and the library crashes. In this patch we're post-processing the diff groups before encoding them to make sure that we un-split the surrogate pairs. The post-processed diffs should produce the same output when applying the diffs. The diff string itself will be different but shouldn't change that much - only by a single character at surrogate boundaries.
1 parent dfadc9c commit db1cbba

File tree

2 files changed

+110
-4
lines changed

2 files changed

+110
-4
lines changed

‎python2/diff_match_patch.py‎

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
__author__ = 'fraser@google.com (Neil Fraser)'
2929

3030
import re
31+
import struct
3132
import sys
3233
import time
3334
import urllib
@@ -1135,6 +1136,14 @@ def diff_levenshtein(self, diffs):
11351136
levenshtein += max(insertions, deletions)
11361137
return levenshtein
11371138

1139+
@classmethod
def is_high_surrogate(cls, c):
  """Tell whether one UTF-16BE code unit is a high (leading) surrogate.

  Args:
    c: Exactly two bytes holding a single big-endian UTF-16 code unit.

  Returns:
    True if the code unit lies in the range 0xD800-0xDBFF, else False.
  """
  # '>H' decodes the two bytes as one big-endian unsigned 16-bit integer.
  (code_unit,) = struct.unpack('>H', c)
  return code_unit >= 0xd800 and code_unit <= 0xdbff
1142+
1143+
@classmethod
def is_low_surrogate(cls, c):
  """Tell whether one UTF-16BE code unit is a low (trailing) surrogate.

  Args:
    c: Exactly two bytes holding a single big-endian UTF-16 code unit.

  Returns:
    True if the code unit lies in the range 0xDC00-0xDFFF, else False.
  """
  # '>H' decodes the two bytes as one big-endian unsigned 16-bit integer.
  (code_unit,) = struct.unpack('>H', c)
  return code_unit >= 0xdc00 and code_unit <= 0xdfff
1146+
11381147
def diff_toDelta(self, diffs):
11391148
"""Crush the diff into an encoded string which describes the operations
11401149
required to transform text1 into text2.
@@ -1148,15 +1157,32 @@ def diff_toDelta(self, diffs):
11481157
Delta text.
11491158
"""
11501159
text = []
1160+
last_end = None
11511161
for (op, data) in diffs:
1162+
if 0 == len(data):
1163+
continue
1164+
1165+
encoded = data.encode('utf-16be')
1166+
this_top = encoded[0:2]
1167+
this_end = encoded[-2:]
1168+
1169+
if self.is_high_surrogate(this_end):
1170+
last_end = this_end
1171+
encoded = encoded[0:-2]
1172+
1173+
if last_end and self.is_high_surrogate(last_end) and self.is_low_surrogate(this_top):
1174+
encoded = last_end + encoded
1175+
1176+
if 0 == len(encoded):
1177+
continue
1178+
11521179
if op == self.DIFF_INSERT:
11531180
# High ascii will raise UnicodeDecodeError. Use Unicode instead.
1154-
data = data.encode("utf-8")
1155-
text.append("+" + urllib.quote(data, "!~*'();/?:@&=+$,# "))
1181+
text.append("+" + urllib.quote(encoded.decode('utf-16be').encode('utf-8'), "!~*'();/?:@&=+$,# "))
11561182
elif op == self.DIFF_DELETE:
1157-
text.append("-%d" % len(data))
1183+
text.append("-%d" % (len(encoded) //2))
11581184
elif op == self.DIFF_EQUAL:
1159-
text.append("=%d" % len(data))
1185+
text.append("=%d" % (len(encoded) //2))
11601186
return "\t".join(text)
11611187

11621188
def diff_fromDelta(self, text1, delta):

‎python2/tests/diff_match_patch_test.py‎

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -441,6 +441,86 @@ def testDiffDelta(self):
441441
# Convert delta string into a diff.
442442
self.assertEquals(diffs, self.dmp.diff_fromDelta(text1, delta))
443443

444+
diffs = [(self.dmp.DIFF_EQUAL, u"\ud83d\ude4b\ud83d"), (self.dmp.DIFF_INSERT, u"\ude4c\ud83d"), (self.dmp.DIFF_EQUAL, u"\ude4b")]
445+
delta = self.dmp.diff_toDelta(diffs)
446+
self.assertEquals("=2\t+%F0%9F%99%8C\t=2", delta)
447+
448+
# Unicode: split surrogates
449+
# Inserting similar surrogate pair at beginning
450+
self.assertEquals(
451+
self.dmp.diff_toDelta([
452+
(self.dmp.DIFF_INSERT, u'\U0001F171'),
453+
(self.dmp.DIFF_EQUAL, u'\U0001F170\U0001F171')
454+
]),
455+
self.dmp.diff_toDelta(self.dmp.diff_main(
456+
u'\U0001F170\U0001F171',
457+
u'\U0001F171\U0001F170\U0001F171'
458+
))
459+
)
460+
461+
# Inserting similar surrogate pair in the middle
462+
self.assertEquals(
463+
self.dmp.diff_toDelta([
464+
(self.dmp.DIFF_EQUAL, u'\U0001F170'),
465+
(self.dmp.DIFF_INSERT, u'\U0001F172'),
466+
(self.dmp.DIFF_EQUAL, u'\U0001F171')
467+
]),
468+
self.dmp.diff_toDelta(self.dmp.diff_main(
469+
u'\U0001F170\U0001F171',
470+
u'\U0001F170\U0001F172\U0001F171'
471+
))
472+
)
473+
474+
# Deleting similar surrogate pair at the beginning
475+
self.assertEquals(
476+
self.dmp.diff_toDelta([
477+
(self.dmp.DIFF_DELETE, u'\U0001F171'),
478+
(self.dmp.DIFF_EQUAL, u'\U0001F170\U0001F171')
479+
]),
480+
self.dmp.diff_toDelta(self.dmp.diff_main(
481+
u'\U0001F171\U0001F170\U0001F171',
482+
u'\U0001F170\U0001F171'
483+
))
484+
)
485+
486+
# Deleting similar surrogate pair in the middle
487+
self.assertEquals(
488+
self.dmp.diff_toDelta([
489+
(self.dmp.DIFF_EQUAL, u'\U0001F170'),
490+
(self.dmp.DIFF_DELETE, u'\U0001F172'),
491+
(self.dmp.DIFF_EQUAL, u'\U0001F171')
492+
]),
493+
self.dmp.diff_toDelta(self.dmp.diff_main(
494+
u'\U0001F170\U0001F172\U0001F171',
495+
u'\U0001F170\U0001F171'
496+
))
497+
)
498+
499+
# Swap surrogate pair
500+
self.assertEquals(
501+
self.dmp.diff_toDelta([
502+
(self.dmp.DIFF_DELETE, u'\U0001F170'),
503+
(self.dmp.DIFF_INSERT, u'\U0001F171')
504+
]),
505+
self.dmp.diff_toDelta(self.dmp.diff_main(
506+
u'\U0001F170',
507+
u'\U0001F171'
508+
))
509+
)
510+
511+
# Swap surrogate pair, force the invalid diff groups
512+
self.assertEquals(
513+
self.dmp.diff_toDelta([
514+
(self.dmp.DIFF_INSERT, u'\U0001F170'),
515+
(self.dmp.DIFF_DELETE, u'\U0001F171')
516+
]),
517+
self.dmp.diff_toDelta([
518+
(self.dmp.DIFF_EQUAL, u'\ud83c'),
519+
(self.dmp.DIFF_INSERT, u'\udd70'),
520+
(self.dmp.DIFF_DELETE, u'\udd71')
521+
])
522+
)
523+
444524
# Verify pool of unchanged characters.
445525
diffs = [(self.dmp.DIFF_INSERT, "A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? : @ & = + $ , # ")]
446526
text2 = self.dmp.diff_text2(diffs)

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /