11
\$\begingroup\$

I pulled this out of my code bucket and dusted it off earlier today in response to a post over on SO that made me cringe. This was originally written to highlight changes in Excel cells in real time via Worksheet_Change, so it is designed with an eye toward raw speed to avoid blocking the UI. Benchmarks on current hardware are running around 3 seconds to compare 2 1kb strings.

I cleaned it up a bit to modernize the coding style, but I'm mainly looking for input on the algorithm used and suggestions for making the code a bit more understandable for somebody who isn't familiar with it.

It basically works on byte arrays and tracks the current working position in each array with a set of index pointers for the "start" and "end" of each substring that it's working with. The algorithm is similar to a binary search. The entry point function finds the longest matching substring in the two byte arrays, excludes it, logs all the differences to the output, and then recursively calls itself on the slices of the arrays to the right and left of the match:

'Returns a comma delimited string containing the positions of differences in the passed byte arrays. Recursive.
'Arrays are not modified, index parameters specify where the pointers are in the arrays on each subsequent call.
Private Function FindDifferences(ByRef first() As Byte, ByRef other() As Byte, Optional ByVal firstStartIndex As Long = -1, _
 Optional ByVal firstEndIndex As Long, Optional ByVal otherStartIndex As Long, _
 Optional ByVal otherEndIndex As Long) As String
 If firstStartIndex = -1 Then
 'Find matching substrings and set index markers.
 SkipSubstringMatches first, other, firstStartIndex, firstEndIndex, otherStartIndex, otherEndIndex
 If firstEndIndex = -1 And otherEndIndex > 0 Then
 'All matches in first.
 Exit Function
 ElseIf otherEndIndex = -1 And firstEndIndex > 0 Then
 'All matches in other.
 FindDifferences = FormatIndexSpanForOutput(firstStartIndex, firstEndIndex)
 Exit Function
 ElseIf firstEndIndex = -1 And otherEndIndex = -1 Then
 'Identical input.
 Exit Function
 End If
 End If
 Dim matchLength As Long
 Dim firstMatch As Long
 Dim otherMatch As Long
 FindNextMatch first, other, firstStartIndex, firstEndIndex, otherStartIndex, otherEndIndex, firstMatch, otherMatch, matchLength
 Dim differences As String
 Dim returnValue As String
 'Test to see if there are unmatched chars.
 If matchLength <> 0 Then
 differences = FindDifferences(first, other, firstStartIndex, firstMatch - 1, otherStartIndex, otherMatch - 1)
 If Len(differences) <> 0 Then returnValue = returnValue & "," & differences
 differences = FindDifferences(first, other, firstMatch + matchLength, firstEndIndex, otherMatch + matchLength, otherEndIndex)
 If Len(differences) <> 0 Then returnValue = returnValue & "," & differences
 Else
 returnValue = returnValue & "," & FormatIndexSpanForOutput(firstStartIndex, firstEndIndex)
 End If
 If Right$(returnValue, 1) = "," Then
 returnValue = Left$(returnValue, Len(returnValue) - 1)
 End If
 If Left$(returnValue, 1) = "," Then
 returnValue = Right$(returnValue, Len(returnValue) - 1)
 End If
 FindDifferences = returnValue
End Function

There are a couple helper functions for managing the array indexes - the first skips matching substrings in the 2 arrays:

'Sets ByRef index parameters to the position of the first mismatched byte from both the start and end. Arrays are not modified.
Private Sub SkipSubstringMatches(ByRef first() As Byte, ByRef other() As Byte, ByRef firstStartIndex As Long, _
 ByRef firstEndIndex As Long, ByRef otherStartIndex As Long, ByRef otherEndIndex As Long)
 Dim topFirst As Long
 Dim topOther As Long
 Dim baseFirst As Long
 topFirst = UBound(first)
 topOther = UBound(other)
 baseFirst = LBound(first)
 Dim lower As Long
 If topFirst >= topOther Then
 lower = topOther
 Else
 lower = topFirst
 End If
 Dim index As Long
 Do Until index > lower
 If first(index) <> other(index) Then
 Exit Do
 End If
 index = index + 1
 Loop
 firstStartIndex = index
 otherStartIndex = index
 '-1 indicates all matches.
 If index > topFirst Then
 firstEndIndex = -1
 otherEndIndex = topOther
 ElseIf index > topOther Then
 otherEndIndex = -1
 firstEndIndex = topFirst
 End If
 If firstEndIndex = -1 Or otherEndIndex = -1 Then
 Exit Sub
 Else
 Do Until first(topFirst) <> other(topOther)
 topFirst = topFirst - 1
 topOther = topOther - 1
 If topFirst < baseFirst Or topOther < baseFirst Then
 Exit Do
 End If
 Loop
 firstEndIndex = topFirst
 otherEndIndex = topOther
 End If
End Sub

...and a procedure that seeks the next match (finds the start and ending indexes of the mismatch):

'Advance indexes until the next matches are found.
Private Sub FindNextMatch(ByRef first() As Byte, ByRef other() As Byte, ByRef firstStartIndex As Long, _
 ByRef firstEndIndex As Long, ByRef otherStartIndex As Long, ByRef otherEndIndex As Long, _
 ByRef matchPositionFirst As Long, ByRef matchPositionOther As Long, ByRef matchLength As Long)
 Dim tempIndex As Long
 Dim result As Long
 Dim firstIndex As Long
 Dim otherIndex As Long
 For otherIndex = otherStartIndex To otherEndIndex
 firstIndex = firstStartIndex
 Do Until firstIndex >= firstEndIndex
 'Seek forward in first until there is a match.
 Do Until other(otherIndex) = first(firstIndex)
 firstIndex = firstIndex + 1
 If firstIndex = firstEndIndex Then
 Exit Do
 End If
 Loop
 'Concurrently seek forward in both until a mismatch is found.
 tempIndex = otherIndex
 Do Until other(tempIndex) <> first(firstIndex)
 tempIndex = tempIndex + 1
 firstIndex = firstIndex + 1
 If firstIndex > firstEndIndex Then
 Exit Do
 End If
 If tempIndex > otherEndIndex Then
 Exit Do
 End If
 Loop
 'Calculate match indexes and length.
 result = tempIndex - otherIndex
 If result > matchLength Then
 matchLength = result
 matchPositionOther = otherIndex
 matchPositionFirst = firstIndex - matchLength
 End If
 If matchLength > firstEndIndex - firstIndex Then
 Exit Do
 End If
 Loop
 If matchLength + otherIndex > otherEndIndex Then
 'No possible matches left.
 Exit For
 End If
 Next
End Sub

Finally, there's a helper function that formats the output. This could be pretty much anything - in this case it's a comma delimited string of 1 based character indexes (that was the convenient output for the original calling code):

'Returns a comma delimited string of indexes between starting and ending, rebases to 1 base.
Private Function FormatIndexSpanForOutput(ByVal starting As Long, ByVal ending As Long) As String
 Dim returnValue As String
 If starting = ending Then
 returnValue = CStr(ending + 1)
 Else
 Dim index As Long
 For index = starting To ending - 1
 returnValue = returnValue & CStr(index + 1) & ","
 Next index
 If starting < ending Then
 returnValue = returnValue & CStr(ending + 1)
 End If
 End If
 FormatIndexSpanForOutput = returnValue
End Function

...and a simple wrapper for calling it with string input instead of array input - note that this isn't unicode aware.

'Just a wrapper for passing strings instead of byte arrays.
Public Function StringDiffs(ByVal first As String, ByVal other As String) As String
 Dim firstChars() As Byte
 Dim otherChars() As Byte
 firstChars = StrConv(first, vbFromUnicode)
 otherChars = StrConv(other, vbFromUnicode)
 StringDiffs = FindDifferences(firstChars, otherChars)
End Function

For convenience (bad side-scrolling here), the full module is also on Pastebin.

Sample usage:

Public Sub Demo()
 Dim first As String
 Dim other As String
 first = "This is a test string."
 other = "This was a test thing."
 Debug.Print StringDiffs(first, other) & " in string 1 were deleted."
 Debug.Print StringDiffs(other, first) & " in string 2 were inserted."
End Sub

Output:

6,16,18 in string 1 were deleted. 
6,7,18 in string 2 were inserted.
asked Mar 2, 2017 at 4:47
\$\endgroup\$
3
  • \$\begingroup\$ @Raystafarian - Wow, I'm blind today. I get "2,4,12,13 in string 1 were deleted." and "2,3,12,14 in string 2 were inserted.", which is correct output. Delete the 'e' in "test" and the 'hi' is "this" and the 's' is unchanged. Then insert the 't' at the end of test. \$\endgroup\$ Commented Mar 2, 2017 at 17:11
  • \$\begingroup\$ I guess I misunderstood, it's more of a transmutation of strings? \$\endgroup\$ Commented Mar 2, 2017 at 17:15
  • 1
    \$\begingroup\$ @Raystafarian - Yes and no - consider it like a diff file on, say, git or a Word document with track changes on. The way it was originally used was to color text red and strike it through if it was "deleted" and color text that was added in green. If you remove the character positions in the deletions from left to right, then insert the characters positions in the inserts from left to right it should give you the altered string. \$\endgroup\$ Commented Mar 2, 2017 at 17:22

1 Answer 1

3
\$\begingroup\$

Nothing much to complain about with this code, so here's a short list of opportunities that may or may not fall under the realm of personal preference:

Dual-branch conditional assignments like this, where there's only a single, non-side-effecting instruction in each branch:

If topFirst >= topOther Then
 lower = topOther
Else
 lower = topFirst
End If

Can be written as a one-liner assignment with the IIf function:

lower = IIf(topFirst >= topOther, topOther, topFirst)

Single-liner conditional blocks that basically act as guard clauses:

If firstIndex = firstEndIndex Then
 Exit Do
End If

Can be written with the conditional statement syntax:

If firstIndex = firstEndIndex Then Exit Do

Note that doing that removes a nesting level here:

If firstEndIndex = -1 Or otherEndIndex = -1 Then
 Exit Sub
Else
 Do Until first(topFirst) <> other(topOther)
 topFirst = topFirst - 1
 topOther = topOther - 1
 If topFirst < baseFirst Or topOther < baseFirst Then
 Exit Do
 End If
 Loop
 firstEndIndex = topFirst
 otherEndIndex = topOther
End If

Becomes:

If firstEndIndex = -1 Or otherEndIndex = -1 Then Exit Sub
Do Until first(topFirst) <> other(topOther)
 topFirst = topFirst - 1
 topOther = topOther - 1
 If topFirst < baseFirst Or topOther < baseFirst Then
 Exit Do
 End If
Loop
firstEndIndex = topFirst
otherEndIndex = topOther

The declarations-as-close-as-possible-to-first-use style in StringDiffs wrapper function is not consistent with the rest of the code:

Public Function StringDiffs(ByVal first As String, ByVal other As String) As String
 Dim firstChars() As Byte
 Dim otherChars() As Byte
 firstChars = StrConv(first, vbFromUnicode)
 otherChars = StrConv(other, vbFromUnicode)
 StringDiffs = FindDifferences(firstChars, otherChars)
End Function

Would have been:

Public Function StringDiffs(ByVal first As String, ByVal other As String) As String
 Dim firstChars() As Byte
 firstChars = StrConv(first, vbFromUnicode)
 Dim otherChars() As Byte
 otherChars = StrConv(other, vbFromUnicode)
 StringDiffs = FindDifferences(firstChars, otherChars) 
End Function

This bit looks a little packed, some vertical whitespace wouldn't hurt:

If matchLength <> 0 Then
 differences = FindDifferences(first, other, firstStartIndex, firstMatch - 1, otherStartIndex, otherMatch - 1)
 If Len(differences) <> 0 Then returnValue = returnValue & "," & differences
 differences = FindDifferences(first, other, firstMatch + matchLength, firstEndIndex, otherMatch + matchLength, otherEndIndex)
 If Len(differences) <> 0 Then returnValue = returnValue & "," & differences
Else

But otherwise the whole code looks great.

answered Dec 14, 2017 at 15:36
\$\endgroup\$

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.