I pulled this out of my code bucket and dusted it off earlier today in response to a post over on SO that made me cringe. This was originally written to highlight changes in Excel cells in real time via Worksheet_Change
, so it is designed with an eye toward raw speed to avoid blocking the UI. Benchmarks on current hardware are running around 3 seconds to compare 2 1kb strings.
I cleaned it up a bit to modernize the coding style, but I'm mainly looking for input on the algorithm used and suggestions for making the code a bit more understandable for somebody who isn't familiar with it.
It basically works on byte arrays and tracks the current working position in each array with a set of index pointers for the "start" and "end" of each substring that it's working with. The algorithm is similar to a binary search. The entry point function finds the longest matching substring in the two byte arrays, excludes it, logs all the differences to the output, and then recursively calls itself on the slices of the arrays to the right and left of the match:
'Returns a comma delimited string containing the positions of differences in the passed byte arrays. Recursive.
'Arrays are not modified, index parameters specify where the pointers are in the arrays on each subsequent call.
Private Function FindDifferences(ByRef first() As Byte, ByRef other() As Byte, Optional ByVal firstStartIndex As Long = -1, _
Optional ByVal firstEndIndex As Long, Optional ByVal otherStartIndex As Long, _
Optional ByVal otherEndIndex As Long) As String
If firstStartIndex = -1 Then
'Find matching substrings and set index markers.
SkipSubstringMatches first, other, firstStartIndex, firstEndIndex, otherStartIndex, otherEndIndex
If firstEndIndex = -1 And otherEndIndex > 0 Then
'All matches in first.
Exit Function
ElseIf otherEndIndex = -1 And firstEndIndex > 0 Then
'All matches in other.
FindDifferences = FormatIndexSpanForOutput(firstStartIndex, firstEndIndex)
Exit Function
ElseIf firstEndIndex = -1 And otherEndIndex = -1 Then
'Identical input.
Exit Function
End If
End If
Dim matchLength As Long
Dim firstMatch As Long
Dim otherMatch As Long
FindNextMatch first, other, firstStartIndex, firstEndIndex, otherStartIndex, otherEndIndex, firstMatch, otherMatch, matchLength
Dim differences As String
Dim returnValue As String
'Test to see if there are unmatched chars.
If matchLength <> 0 Then
differences = FindDifferences(first, other, firstStartIndex, firstMatch - 1, otherStartIndex, otherMatch - 1)
If Len(differences) <> 0 Then returnValue = returnValue & "," & differences
differences = FindDifferences(first, other, firstMatch + matchLength, firstEndIndex, otherMatch + matchLength, otherEndIndex)
If Len(differences) <> 0 Then returnValue = returnValue & "," & differences
Else
returnValue = returnValue & "," & FormatIndexSpanForOutput(firstStartIndex, firstEndIndex)
End If
If Right$(returnValue, 1) = "," Then
returnValue = Left$(returnValue, Len(returnValue) - 1)
End If
If Left$(returnValue, 1) = "," Then
returnValue = Right$(returnValue, Len(returnValue) - 1)
End If
FindDifferences = returnValue
End Function
There are a couple helper functions for managing the array indexes - the first skips matching substrings in the 2 arrays:
'Sets ByRef index parameters to the position of the first mismatched byte from both the start and end. Arrays are not modified.
Private Sub SkipSubstringMatches(ByRef first() As Byte, ByRef other() As Byte, ByRef firstStartIndex As Long, _
ByRef firstEndIndex As Long, ByRef otherStartIndex As Long, ByRef otherEndIndex As Long)
Dim topFirst As Long
Dim topOther As Long
Dim baseFirst As Long
topFirst = UBound(first)
topOther = UBound(other)
baseFirst = LBound(first)
Dim lower As Long
If topFirst >= topOther Then
lower = topOther
Else
lower = topFirst
End If
Dim index As Long
Do Until index > lower
If first(index) <> other(index) Then
Exit Do
End If
index = index + 1
Loop
firstStartIndex = index
otherStartIndex = index
'-1 indicates all matches.
If index > topFirst Then
firstEndIndex = -1
otherEndIndex = topOther
ElseIf index > topOther Then
otherEndIndex = -1
firstEndIndex = topFirst
End If
If firstEndIndex = -1 Or otherEndIndex = -1 Then
Exit Sub
Else
Do Until first(topFirst) <> other(topOther)
topFirst = topFirst - 1
topOther = topOther - 1
If topFirst < baseFirst Or topOther < baseFirst Then
Exit Do
End If
Loop
firstEndIndex = topFirst
otherEndIndex = topOther
End If
End Sub
...and a procedure that seeks the next match (finds the start and ending indexes of the mismatch):
'Advance indexes until the next matches are found.
Private Sub FindNextMatch(ByRef first() As Byte, ByRef other() As Byte, ByRef firstStartIndex As Long, _
ByRef firstEndIndex As Long, ByRef otherStartIndex As Long, ByRef otherEndIndex As Long, _
ByRef matchPositionFirst As Long, ByRef matchPositionOther As Long, ByRef matchLength As Long)
Dim tempIndex As Long
Dim result As Long
Dim firstIndex As Long
Dim otherIndex As Long
For otherIndex = otherStartIndex To otherEndIndex
firstIndex = firstStartIndex
Do Until firstIndex >= firstEndIndex
'Seek forward in first until there is a match.
Do Until other(otherIndex) = first(firstIndex)
firstIndex = firstIndex + 1
If firstIndex = firstEndIndex Then
Exit Do
End If
Loop
'Concurrently seek forward in both until a mismatch is found.
tempIndex = otherIndex
Do Until other(tempIndex) <> first(firstIndex)
tempIndex = tempIndex + 1
firstIndex = firstIndex + 1
If firstIndex > firstEndIndex Then
Exit Do
End If
If tempIndex > otherEndIndex Then
Exit Do
End If
Loop
'Calculate match indexes and length.
result = tempIndex - otherIndex
If result > matchLength Then
matchLength = result
matchPositionOther = otherIndex
matchPositionFirst = firstIndex - matchLength
End If
If matchLength > firstEndIndex - firstIndex Then
Exit Do
End If
Loop
If matchLength + otherIndex > otherEndIndex Then
'No possible matches left.
Exit For
End If
Next
End Sub
Finally, there's a helper function that formats the output. This could be pretty much anything - in this case it's a comma delimited string of 1 based character indexes (that was the convenient output for the original calling code):
'Returns a comma delimited string of indexes between starting and ending, rebases to 1 base.
Private Function FormatIndexSpanForOutput(ByVal starting As Long, ByVal ending As Long) As String
Dim returnValue As String
If starting = ending Then
returnValue = CStr(ending + 1)
Else
Dim index As Long
For index = starting To ending - 1
returnValue = returnValue & CStr(index + 1) & ","
Next index
If starting < ending Then
returnValue = returnValue & CStr(ending + 1)
End If
End If
FormatIndexSpanForOutput = returnValue
End Function
...and a simple wrapper for calling it with string input instead of array input - note that this isn't unicode aware.
'Just a wrapper for passing strings instead of byte arrays.
Public Function StringDiffs(ByVal first As String, ByVal other As String) As String
Dim firstChars() As Byte
Dim otherChars() As Byte
firstChars = StrConv(first, vbFromUnicode)
otherChars = StrConv(other, vbFromUnicode)
StringDiffs = FindDifferences(firstChars, otherChars)
End Function
For convenience (bad side-scrolling here), the full module is also on Pastebin.
Sample usage:
Public Sub Demo()
Dim first As String
Dim other As String
first = "This is a test string."
other = "This was a test thing."
Debug.Print StringDiffs(first, other) & " in string 1 were deleted."
Debug.Print StringDiffs(other, first) & " in string 2 were inserted."
End Sub
Output:
6,16,18 in string 1 were deleted. 6,7,18 in string 2 were inserted.
-
\$\begingroup\$ @Raystafarian - Wow, I'm blind today. I get "2,4,12,13 in string 1 were deleted." and "2,3,12,14 in string 2 were inserted.", which is correct output. Delete the 'e' in "test" and the 'hi' is "this" and the 's' is unchanged. Then insert the 't' at the end of test. \$\endgroup\$Comintern– Comintern2017年03月02日 17:11:05 +00:00Commented Mar 2, 2017 at 17:11
-
\$\begingroup\$ I guess I misunderstood, it's more of a transmutation of strings? \$\endgroup\$Raystafarian– Raystafarian2017年03月02日 17:15:34 +00:00Commented Mar 2, 2017 at 17:15
-
1\$\begingroup\$ @Raystafarian - Yes and no - consider it like a diff file on, say, git or a Word document with track changes on. The way it was originally used was to color text red and strike it through if it was "deleted" and color text that was added in green. If you remove the character positions in the deletions from left to right, then insert the characters positions in the inserts from left to right it should give you the altered string. \$\endgroup\$Comintern– Comintern2017年03月02日 17:22:53 +00:00Commented Mar 2, 2017 at 17:22
1 Answer 1
Nothing much to complain about with this code, so here's a short list of opportunities that may or may not fall under the realm of personal preference:
Dual-branch conditional assignments like this, where there's only a single, non-side-effecting instruction in each branch:
If topFirst >= topOther Then lower = topOther Else lower = topFirst End If
Can be written as a one-liner assignment with the IIf
function:
lower = IIf(topFirst >= topOther, topOther, topFirst)
Single-liner conditional blocks that basically act as guard clauses:
If firstIndex = firstEndIndex Then Exit Do End If
Can be written with the conditional statement syntax:
If firstIndex = firstEndIndex Then Exit Do
Note that doing that removes a nesting level here:
If firstEndIndex = -1 Or otherEndIndex = -1 Then Exit Sub Else Do Until first(topFirst) <> other(topOther) topFirst = topFirst - 1 topOther = topOther - 1 If topFirst < baseFirst Or topOther < baseFirst Then Exit Do End If Loop firstEndIndex = topFirst otherEndIndex = topOther End If
Becomes:
If firstEndIndex = -1 Or otherEndIndex = -1 Then Exit Sub
Do Until first(topFirst) <> other(topOther)
topFirst = topFirst - 1
topOther = topOther - 1
If topFirst < baseFirst Or topOther < baseFirst Then
Exit Do
End If
Loop
firstEndIndex = topFirst
otherEndIndex = topOther
The declarations-as-close-as-possible-to-first-use style in StringDiffs
wrapper function is not consistent with the rest of the code:
Public Function StringDiffs(ByVal first As String, ByVal other As String) As String Dim firstChars() As Byte Dim otherChars() As Byte firstChars = StrConv(first, vbFromUnicode) otherChars = StrConv(other, vbFromUnicode) StringDiffs = FindDifferences(firstChars, otherChars) End Function
Would have been:
Public Function StringDiffs(ByVal first As String, ByVal other As String) As String
Dim firstChars() As Byte
firstChars = StrConv(first, vbFromUnicode)
Dim otherChars() As Byte
otherChars = StrConv(other, vbFromUnicode)
StringDiffs = FindDifferences(firstChars, otherChars)
End Function
This bit looks a little packed, some vertical whitespace wouldn't hurt:
If matchLength <> 0 Then differences = FindDifferences(first, other, firstStartIndex, firstMatch - 1, otherStartIndex, otherMatch - 1) If Len(differences) <> 0 Then returnValue = returnValue & "," & differences differences = FindDifferences(first, other, firstMatch + matchLength, firstEndIndex, otherMatch + matchLength, otherEndIndex) If Len(differences) <> 0 Then returnValue = returnValue & "," & differences Else
But otherwise the whole code looks great.