Commit db1cbba

committed

Python2: Stop breaking surrogate pairs in toDelta()

Resolves google#69 for Python2. Sometimes we can find a common prefix that runs into the middle of a surrogate pair, and we split that pair when building our diff groups. This is fine as long as we are operating on UTF-16 code units. It becomes problematic when we start trying to treat those substrings as valid Unicode (or UTF-8) sequences. When we pass these split groups into `toDelta()` we do just that, and the library crashes. In this patch we're post-processing the diff groups before encoding them to make sure that we un-split the surrogate pairs. The post-processed diffs should produce the same output when applying the diffs. The diff string itself will be different but should not change that much - only by a single character at surrogate boundaries.

1 parent dfadc9c commit db1cbbaCopy full SHA for db1cbba

File tree

2 files changed

+110

-4

lines changed

python2
- diff_match_patch.py
- tests
  - diff_match_patch_test.py

2 files changed

+110

-4

lines changed

`‎python2/diff_match_patch.py‎`

Lines changed: 30 additions & 4 deletions

Original file line number	Diff line number	Diff line change
`@@ -28,6 +28,7 @@`
`28`	`28`	`__author__ = 'fraser@google.com (Neil Fraser)'`
`29`	`29`
`30`	`30`	`import re`
	`31`	`+import struct`
`31`	`32`	`import sys`
`32`	`33`	`import time`
`33`	`34`	`import urllib`
`@@ -1135,6 +1136,14 @@ def diff_levenshtein(self, diffs):`
`1135`	`1136`	`levenshtein += max(insertions, deletions)`
`1136`	`1137`	`return levenshtein`
`1137`	`1138`
	`1139`	`+ @classmethod`
	`1140`	`+ def is_high_surrogate(cls, c):`
	`1141`	`+ return 0xd800 <= struct.unpack('>H', c)[0] <= 0xdbff`
	`1142`	`+`
	`1143`	`+ @classmethod`
	`1144`	`+ def is_low_surrogate(cls, c):`
	`1145`	`+ return 0xdc00 <= struct.unpack('>H', c)[0] <= 0xdfff`
	`1146`	`+`
`1138`	`1147`	`def diff_toDelta(self, diffs):`
`1139`	`1148`	`"""Crush the diff into an encoded string which describes the operations`
`1140`	`1149`	`required to transform text1 into text2.`
`@@ -1148,15 +1157,32 @@ def diff_toDelta(self, diffs):`
`1148`	`1157`	`Delta text.`
`1149`	`1158`	`"""`
`1150`	`1159`	`text = []`
	`1160`	`+ last_end = None`
`1151`	`1161`	`for (op, data) in diffs:`
	`1162`	`+ if 0 == len(data):`
	`1163`	`+ continue`
	`1164`	`+`
	`1165`	`+ encoded = data.encode('utf-16be')`
	`1166`	`+ this_top = encoded[0:2]`
	`1167`	`+ this_end = encoded[-2:]`
	`1168`	`+`
	`1169`	`+ if self.is_high_surrogate(this_end):`
	`1170`	`+ last_end = this_end`
	`1171`	`+ encoded = encoded[0:-2]`
	`1172`	`+`
	`1173`	`+ if last_end and self.is_high_surrogate(last_end) and self.is_low_surrogate(this_top):`
	`1174`	`+ encoded = last_end + encoded`
	`1175`	`+`
	`1176`	`+ if 0 == len(encoded):`
	`1177`	`+ continue`
	`1178`	`+`
`1152`	`1179`	`if op == self.DIFF_INSERT:`
`1153`	`1180`	`# High ascii will raise UnicodeDecodeError. Use Unicode instead.`
`1154`		`- data = data.encode("utf-8")`
`1155`		`- text.append("+" + urllib.quote(data, "!~*'();/?:@&=+$,# "))`
	`1181`	`+ text.append("+" + urllib.quote(encoded.decode('utf-16be').encode('utf-8'), "!~*'();/?:@&=+$,# "))`
`1156`	`1182`	`elif op == self.DIFF_DELETE:`
`1157`		`- text.append("-%d" % len(data))`
	`1183`	`+ text.append("-%d" % (len(encoded) //2))`
`1158`	`1184`	`elif op == self.DIFF_EQUAL:`
`1159`		`- text.append("=%d" % len(data))`
	`1185`	`+ text.append("=%d" % (len(encoded) //2))`
`1160`	`1186`	`return "\t".join(text)`
`1161`	`1187`
`1162`	`1188`	`def diff_fromDelta(self, text1, delta):`

`‎python2/tests/diff_match_patch_test.py‎`

Lines changed: 80 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -441,6 +441,86 @@ def testDiffDelta(self):`
`441`	`441`	`# Convert delta string into a diff.`
`442`	`442`	`self.assertEquals(diffs, self.dmp.diff_fromDelta(text1, delta))`
`443`	`443`
	`444`	`+ diffs = [(self.dmp.DIFF_EQUAL, u"\ud83d\ude4b\ud83d"), (self.dmp.DIFF_INSERT, u"\ude4c\ud83d"), (self.dmp.DIFF_EQUAL, u"\ude4b")]`
	`445`	`+ delta = self.dmp.diff_toDelta(diffs)`
	`446`	`+ self.assertEquals("=2\t+%F0%9F%99%8C\t=2", delta)`
	`447`	`+`
	`448`	`+ # Unicode: split surrogates`
	`449`	`+ # Inserting similar surrogate pair at beginning`
	`450`	`+ self.assertEquals(`
	`451`	`+ self.dmp.diff_toDelta([`
	`452`	`+ (self.dmp.DIFF_INSERT, u'\U0001F171'),`
	`453`	`+ (self.dmp.DIFF_EQUAL, u'\U0001F170\U0001F171')`
	`454`	`+ ]),`
	`455`	`+ self.dmp.diff_toDelta(self.dmp.diff_main(`
	`456`	`+ u'\U0001F170\U0001F171',`
	`457`	`+ u'\U0001F171\U0001F170\U0001F171'`
	`458`	`+ ))`
	`459`	`+ )`
	`460`	`+`
	`461`	`+ # Inserting similar surrogate pair in the middle`
	`462`	`+ self.assertEquals(`
	`463`	`+ self.dmp.diff_toDelta([`
	`464`	`+ (self.dmp.DIFF_EQUAL, u'\U0001F170'),`
	`465`	`+ (self.dmp.DIFF_INSERT, u'\U0001F172'),`
	`466`	`+ (self.dmp.DIFF_EQUAL, u'\U0001F171')`
	`467`	`+ ]),`
	`468`	`+ self.dmp.diff_toDelta(self.dmp.diff_main(`
	`469`	`+ u'\U0001F170\U0001F171',`
	`470`	`+ u'\U0001F170\U0001F172\U0001F171'`
	`471`	`+ ))`
	`472`	`+ )`
	`473`	`+`
	`474`	`+ # Deleting similar surrogate pair at the beginning`
	`475`	`+ self.assertEquals(`
	`476`	`+ self.dmp.diff_toDelta([`
	`477`	`+ (self.dmp.DIFF_DELETE, u'\U0001F171'),`
	`478`	`+ (self.dmp.DIFF_EQUAL, u'\U0001F170\U0001F171')`
	`479`	`+ ]),`
	`480`	`+ self.dmp.diff_toDelta(self.dmp.diff_main(`
	`481`	`+ u'\U0001F171\U0001F170\U0001F171',`
	`482`	`+ u'\U0001F170\U0001F171'`
	`483`	`+ ))`
	`484`	`+ )`
	`485`	`+`
	`486`	`+ # Deleting similar surrogate pair in the middle`
	`487`	`+ self.assertEquals(`
	`488`	`+ self.dmp.diff_toDelta([`
	`489`	`+ (self.dmp.DIFF_EQUAL, u'\U0001F170'),`
	`490`	`+ (self.dmp.DIFF_DELETE, u'\U0001F172'),`
	`491`	`+ (self.dmp.DIFF_EQUAL, u'\U0001F171')`
	`492`	`+ ]),`
	`493`	`+ self.dmp.diff_toDelta(self.dmp.diff_main(`
	`494`	`+ u'\U0001F170\U0001F172\U0001F171',`
	`495`	`+ u'\U0001F170\U0001F171'`
	`496`	`+ ))`
	`497`	`+ )`
	`498`	`+`
	`499`	`+ # Swap surrogate pair`
	`500`	`+ self.assertEquals(`
	`501`	`+ self.dmp.diff_toDelta([`
	`502`	`+ (self.dmp.DIFF_DELETE, u'\U0001F170'),`
	`503`	`+ (self.dmp.DIFF_INSERT, u'\U0001F171')`
	`504`	`+ ]),`
	`505`	`+ self.dmp.diff_toDelta(self.dmp.diff_main(`
	`506`	`+ u'\U0001F170',`
	`507`	`+ u'\U0001F171'`
	`508`	`+ ))`
	`509`	`+ )`
	`510`	`+`
	`511`	`+ # Swap surrogate pair, force the invalid diff groups`
	`512`	`+ self.assertEquals(`
	`513`	`+ self.dmp.diff_toDelta([`
	`514`	`+ (self.dmp.DIFF_INSERT, u'\U0001F170'),`
	`515`	`+ (self.dmp.DIFF_DELETE, u'\U0001F171')`
	`516`	`+ ]),`
	`517`	`+ self.dmp.diff_toDelta([`
	`518`	`+ (self.dmp.DIFF_EQUAL, u'\ud83c'),`
	`519`	`+ (self.dmp.DIFF_INSERT, u'\udd70'),`
	`520`	`+ (self.dmp.DIFF_DELETE, u'\udd71')`
	`521`	`+ ])`
	`522`	`+ )`
	`523`	`+`
`444`	`524`	`# Verify pool of unchanged characters.`
`445`	`525`	`diffs = [(self.dmp.DIFF_INSERT, "A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? : @ & = + $ , # ")]`
`446`	`526`	`text2 = self.dmp.diff_text2(diffs)`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit db1cbba

File tree

2 files changed

2 files changed

`‎python2/diff_match_patch.py‎`

`‎python2/tests/diff_match_patch_test.py‎`

0 commit comments