4
\$\begingroup\$

I'm trying to loop through around 100k files in varying degrees of nested directories using vba and md5 hashing each of those files. Right now, the current state of the program takes around 1.5 to 2 hours to run completely. What can I do to increase the speed it is running at?

Option Explicit
Public Const OriginalFolder As String = "\\path\to\original\"
Public Const NewFolder As String = "\\path\to\the\copy\"
Public Sub runRec()
 Dim StartTime As Double
 StartTime = Timer
 Dim originalFiles As Dictionary
 Set originalFiles = New Dictionary
 
 recurse OriginalFolder, OriginalFolder, originalFiles
 
 Debug.Print "Runtime to first recurse: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
 Debug.Print "Original File Count: " & originalFiles.count
 
 Dim newFiles As Dictionary
 Set newFiles = New Dictionary
 
 recurse NewFolder, NewFolder, newFiles
 
 Debug.Print "Runtime to second recurse: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
 Debug.Print "Copy File Count: " & newFiles.count
 
 'Remove matching md5 hashes
 Dim originalKey As Variant
 
 For Each originalKey In originalFiles.Keys
 If newFiles.Exists(originalKey) Then
 originalFiles.Remove originalKey
 newFiles.Remove originalKey
 End If
 Next
 
 Debug.Print "Runtime to md5: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
 
 Dim originalColl As Collection
 Set originalColl = New Collection
 Dim copyColl As New Collection
 Set copyColl = New Collection
 
 'Remove any files with same filename and relative path
 Dim copyKey As Variant
 
 For Each originalKey In originalFiles.Keys
 For Each copyKey In newFiles.Keys
 If originalFiles.item(originalKey) = newFiles.item(copyKey) Then
 originalColl.Add originalKey
 copyColl.Add copyKey
 End If
 Next
 Next
 
 For Each originalKey In originalColl
 originalFiles.Remove originalKey
 Next
 
 For Each copyKey In copyColl
 newFiles.Remove copyKey
 Next
 
 Debug.Print "Runtime to relative: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
 
 'Report any files that have the same name but different location as an anomaly
 
 prepDictionary originalFiles
 prepDictionary newFiles
 
 Set originalColl = New Collection
 
 For Each originalKey In originalFiles.Keys
 For Each copyKey In newFiles.Keys
 If originalFiles.item(originalKey) = newFiles.item(copyKey) Then
 originalColl.Add originalKey
 End If
 Next
 Next
 
 Debug.Print "Anomaly count: " & originalColl.count
 
 Debug.Print "Runtime to Anomaly : " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
 
 'Report missing files
 
 Debug.Print "Original File Count: " & originalFiles.count
 Debug.Print "Copy File Count: " & newFiles.count
 
End Sub
Private Sub prepDictionary(ByRef dict As Dictionary)
 Dim dictKey As Variant
 Dim slashLocation As Long
 
 For Each dictKey In dict.Keys
 slashLocation = InStrRev(dict.item(dictKey), "\")
 
 dict.item(dictKey) = Right(dict.item(dictKey), Len(dict.item(dictKey)) - slashLocation)
 Next
End Sub
Private Sub recurse(ByVal basePath As String, ByVal folderPath As String, ByRef fileDict As Dictionary)
 Dim FSO As FileSystemObject
 Dim currentFolder As Folder
 Dim subFolder As Folder
 Dim folderFile As File
 
 Set FSO = New FileSystemObject
 
 Set currentFolder = FSO.GetFolder(folderPath)
 
 Dim fileHash As String
 For Each folderFile In currentFolder.Files
 'Need to ignore files that dont have a size and lock files
 If FileLen(folderFile.path) > 0 And Left(folderFile.Name, 2) <> "~$" And folderFile.Name <> "Thumbs.db" Then
 fileHash = FileToMD5Hex(folderFile.path)
 
 If Not fileDict.Exists(fileHash) Then
 fileDict.Add fileHash, Right(folderFile.path, Len(folderFile.path) - Len(basePath))
 End If
 End If
 Next
 
 For Each subFolder In currentFolder.SubFolders
 recurse basePath, subFolder.path, fileDict
 Next
End Sub
Public Function FileToMD5Hex(sFileName As String) As String
 Dim enc
 Dim bytes
 Dim outstr As String
 Dim pos As Integer
 Set enc = CreateObject("System.Security.Cryptography.MD5CryptoServiceProvider")
 
 'Convert the string to a byte array and hash it
 bytes = GetFileBytes(sFileName)
 bytes = enc.ComputeHash_2((bytes))
 
 'Convert the byte array to a hex string
 For pos = 1 To LenB(bytes)
 outstr = outstr & LCase(Right("0" & Hex(AscB(MidB(bytes, pos, 1))), 2))
 Next
 
 FileToMD5Hex = outstr
 
 Set enc = Nothing
End Function
Private Function GetFileBytes(ByVal path As String) As Byte()
 Dim lngFileNum As Long
 Dim bytRtnVal() As Byte
 
 lngFileNum = FreeFile
 
 If LenB(Dir(path, vbNormal + vbReadOnly + vbHidden)) Then ''// Does file exist?
 Open path For Binary Access Read As lngFileNum
 ReDim bytRtnVal(LOF(lngFileNum) - 1&) As Byte
 Get lngFileNum, , bytRtnVal
 Close lngFileNum
 Else
 Err.Raise 53
 End If
 
 GetFileBytes = bytRtnVal
 
 Erase bytRtnVal
End Function
Sᴀᴍ Onᴇᴌᴀ
29.5k16 gold badges45 silver badges201 bronze badges
asked Jul 31, 2024 at 19:58
\$\endgroup\$
9
  • \$\begingroup\$ what does that debug print? Just to get an idea of your performance measures? \$\endgroup\$ Commented Aug 1, 2024 at 10:53
  • \$\begingroup\$ @Greedo Yes, since it won't be running in any short amount of time. I'd just like to get a general idea of the performance \$\endgroup\$ Commented Aug 1, 2024 at 14:36
  • 1
    \$\begingroup\$ Sorry I meant would you be able to give the log so we can see what your profiling has found. You can edit the post to add this info \$\endgroup\$ Commented Aug 1, 2024 at 14:45
  • 1
    \$\begingroup\$ As @Greedo notes: Your own diagnostics (which parts are taking the most time) would be helpful. I have tried to follow the code, but it is difficult, it seems to follow a tortured path. I don't have time to tease out all the issues. \$\endgroup\$ Commented Aug 3, 2024 at 7:23
  • 1
    \$\begingroup\$ Out of scope of your question, I notice you raise an error at one point, rather than handling it gracefully. Why (given that there may be a good reason for this)? For me, any Err action is a red flag unless properly explained. \$\endgroup\$ Commented Aug 3, 2024 at 7:29

1 Answer 1

3
\$\begingroup\$

Get files

You can speed up retrieving the files by importing LibFileTools module from VBA-FileTools. Your recurse method then becomes:

Private Sub recurse(ByVal folderPath As String, ByRef fileDict As Dictionary)
 Dim filePath As Variant
 Dim fileHash As String
 '
 For Each filePath In GetFiles(folderPath, includeSubFolders:=True)
 If (Not filePath Like "*~$*") And (Not filePath Like "*Thumbs.db") Then
 fileHash = FileToMD5Hex(CStr(filePath))
 If Not fileDict.Exists(fileHash) Then
 fileDict.Add fileHash, Right(filePath, Len(filePath) - Len(folderPath))
 End If
 End If
 Next
End Sub

Since you're ignoring duplicated hashes, and storing just first found, you could just store last found by updating to this:

Private Sub recurse(ByVal folderPath As String, ByRef fileDict As Dictionary)
 Dim filePath As Variant
 '
 For Each filePath In GetFiles(folderPath, includeSubFolders:=True)
 If (Not filePath Like "*~$*") And (Not filePath Like "*Thumbs.db") Then
 fileDict(FileToMD5Hex(CStr(filePath))) = Right(filePath, Len(filePath) - Len(folderPath))
 End If
 Next
End Sub

Of course you will then need to update the main runRec function and:

'Replace this:
recurse OriginalFolder, OriginalFolder, originalFiles
'with this:
recurse OriginalFolder, originalFiles
'And replace this:
recurse NewFolder, NewFolder, newFiles
'with this:
recurse NewFolder, newFiles

Finally, the GetFiles function does the recursion for you so you might want to rename the method from recurse to something like HashFilesMD5.

Hashing

We can improve on the FileToMD5Hex function by:

  1. using a map for the hex values - this avoid unnecessary function calls
  2. using a buffer for the result - because string concatenation is slow
Public Function FileToMD5Hex(sFileName As String) As String
 Static enc As Object
 Dim bytes() As Byte
 If enc Is Nothing Then Set enc = CreateObject("System.Security.Cryptography.MD5CryptoServiceProvider")
 
 ReadBytes sFileName, bytes
 On Error Resume Next
 bytes = enc.ComputeHash_2((bytes))
 If Err.Number <> 0 Then
 Err.Clear
 Exit Function
 End If
 On Error GoTo 0
 
 'Convert the byte array to a hex string
 Static map(0 To 255) As String
 Dim i As Long
 
 'Cache hex values for bytes
 If LenB(map(0)) = 0 Then
 For i = 0 To 255
 map(i) = LCase(Right$("0" & Hex$(i), 2))
 Next i
 End If
 FileToMD5Hex = Space$((UBound(bytes) - LBound(bytes) + 1) * 2)
 For i = LBound(bytes) To UBound(bytes)
 Mid$(FileToMD5Hex, i * 2 + 1) = map(bytes(i))
 Next i
End Function

Notice that ReadBytes is already part of same library.

Mac

In case you want to use this on a Mac operating system, then you need to update the prepDictionary method to something like this:

Private Sub prepDictionary(ByRef dict As Dictionary)
 Dim dictKey As Variant
 Dim dictItem As String
 Dim separatorLocation As Long
 
 For Each dictKey In dict
 dictItem = dict.Item(dictKey)
 separatorLocation = InStrRev(dictItem, PATH_SEPARATOR)
 dict.Item(dictKey) = Right$(dictItem, Len(dictItem) - separatorLocation)
 Next
End Sub

Notice that PATH_SEPARATOR is a constant defined in the same library I linked to.

Dictionary

You can import the Dictionary class from VBA-FastDictionary to make your code work on Mac. The added advantage is that this class is generally faster on Windows as well, especially for more than 32k key-item pairs - see benchmarking.

Final code

Notice that I removed all occurences of [dict].Keys and left only [dict] because a For Each.. loop on a dictionary will loop through the keys using a faster iterator object while looping .Keys makes a copy of the internal array of keys first before the iteration happens on the actual array.

Option Explicit
Public Const OriginalFolder As String = "\\path\to\original\"
Public Const NewFolder As String = "\\path\to\the\copy\"
Public Sub runRec()
 Dim StartTime As Double
 StartTime = Timer
 Dim originalFiles As Dictionary
 Set originalFiles = New Dictionary
 
 HashFilesMD5 OriginalFolder, originalFiles
 
 Debug.Print "Runtime to first recurse: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
 Debug.Print "Original File Count: " & originalFiles.Count
 
 Dim newFiles As Dictionary
 Set newFiles = New Dictionary
 
 HashFilesMD5 NewFolder, newFiles
 
 Debug.Print "Runtime to second recurse: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
 Debug.Print "Copy File Count: " & newFiles.Count
 
 'Remove matching md5 hashes
 Dim originalKey As Variant
 
 For Each originalKey In originalFiles
 If newFiles.Exists(originalKey) Then
 originalFiles.Remove originalKey
 newFiles.Remove originalKey
 End If
 Next
 
 Debug.Print "Runtime to md5: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
 
 Dim originalColl As Collection
 Set originalColl = New Collection
 Dim copyColl As New Collection
 Set copyColl = New Collection
 
 'Remove any files with same filename and relative path
 Dim copyKey As Variant
 
 For Each originalKey In originalFiles
 For Each copyKey In newFiles
 If originalFiles.Item(originalKey) = newFiles.Item(copyKey) Then
 originalColl.Add originalKey
 copyColl.Add copyKey
 End If
 Next
 Next
 
 For Each originalKey In originalColl
 originalFiles.Remove originalKey
 Next
 
 For Each copyKey In copyColl
 newFiles.Remove copyKey
 Next
 
 Debug.Print "Runtime to relative: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
 
 'Report any files that have the same name but different location as an anomaly
 
 PrepDictionary originalFiles
 PrepDictionary newFiles
 
 Set originalColl = New Collection
 
 For Each originalKey In originalFiles
 For Each copyKey In newFiles
 If originalFiles.Item(originalKey) = newFiles.Item(copyKey) Then
 originalColl.Add originalKey
 End If
 Next
 Next
 
 Debug.Print "Anomaly count: " & originalColl.Count
 
 Debug.Print "Runtime to Anomaly : " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
 
 'Report missing files
 
 Debug.Print "Original File Count: " & originalFiles.Count
 Debug.Print "Copy File Count: " & newFiles.Count
 
End Sub
Private Sub PrepDictionary(ByRef dict As Dictionary)
 Dim dictKey As Variant
 Dim dictItem As String
 Dim separatorLocation As Long
 
 For Each dictKey In dict
 dictItem = dict.Item(dictKey)
 separatorLocation = InStrRev(dictItem, PATH_SEPARATOR)
 dict.Item(dictKey) = Right$(dictItem, Len(dictItem) - separatorLocation)
 Next
End Sub
Private Sub HashFilesMD5(ByVal folderPath As String, ByRef fileDict As Dictionary)
 Dim filePath As Variant
 '
 For Each filePath In GetFiles(folderPath, includeSubFolders:=True)
 If (Not filePath Like "*~$*") And (Not filePath Like "*Thumbs.db") Then
 fileDict(FileToMD5Hex(CStr(filePath))) = Right(filePath, Len(filePath) - Len(folderPath))
 End If
 Next
End Sub
Public Function FileToMD5Hex(sFileName As String) As String
 Static enc As Object
 Dim bytes() As Byte
 If enc Is Nothing Then Set enc = CreateObject("System.Security.Cryptography.MD5CryptoServiceProvider")
 
 ReadBytes sFileName, bytes
 On Error Resume Next
 bytes = enc.ComputeHash_2((bytes))
 If Err.Number <> 0 Then
 Err.Clear
 Exit Function
 End If
 On Error GoTo 0
 
 'Convert the byte array to a hex string
 Static map(0 To 255) As String
 Dim i As Long
 
 'Cache hex values for bytes
 If LenB(map(0)) = 0 Then
 For i = 0 To 255
 map(i) = LCase(Right$("0" & Hex$(i), 2))
 Next i
 End If
 FileToMD5Hex = Space$((UBound(bytes) - LBound(bytes) + 1) * 2)
 For i = LBound(bytes) To UBound(bytes)
 Mid$(FileToMD5Hex, i * 2 + 1) = map(bytes(i))
 Next i
End Function

Final thoughts

I did not modify code like this:

For Each originalKey In originalFiles
 For Each copyKey In newFiles
 If originalFiles.Item(originalKey) = newFiles.Item(copyKey) Then
 originalColl.Add originalKey
 copyColl.Add copyKey
 End If
 Next
Next

but if I were to try to improve it, I would probably create a new dictionary with the keys being the actual paths in originalFiles and then just use dict.Exists while looping through the files in newFiles.

answered Aug 8, 2024 at 13:50
\$\endgroup\$

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.