I'm trying to loop through around 100k files in varying degrees of nested directories using vba and md5 hashing each of those files. Right now, the current state of the program takes around 1.5 to 2 hours to run completely. What can I do to increase the speed it is running at?
Option Explicit
Public Const OriginalFolder As String = "\\path\to\original\"
Public Const NewFolder As String = "\\path\to\the\copy\"
Public Sub runRec()
Dim StartTime As Double
StartTime = Timer
Dim originalFiles As Dictionary
Set originalFiles = New Dictionary
recurse OriginalFolder, OriginalFolder, originalFiles
Debug.Print "Runtime to first recurse: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
Debug.Print "Original File Count: " & originalFiles.count
Dim newFiles As Dictionary
Set newFiles = New Dictionary
recurse NewFolder, NewFolder, newFiles
Debug.Print "Runtime to second recurse: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
Debug.Print "Copy File Count: " & newFiles.count
'Remove matching md5 hashes
Dim originalKey As Variant
For Each originalKey In originalFiles.Keys
If newFiles.Exists(originalKey) Then
originalFiles.Remove originalKey
newFiles.Remove originalKey
End If
Next
Debug.Print "Runtime to md5: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
Dim originalColl As Collection
Set originalColl = New Collection
Dim copyColl As New Collection
Set copyColl = New Collection
'Remove any files with same filename and relative path
Dim copyKey As Variant
For Each originalKey In originalFiles.Keys
For Each copyKey In newFiles.Keys
If originalFiles.item(originalKey) = newFiles.item(copyKey) Then
originalColl.Add originalKey
copyColl.Add copyKey
End If
Next
Next
For Each originalKey In originalColl
originalFiles.Remove originalKey
Next
For Each copyKey In copyColl
newFiles.Remove copyKey
Next
Debug.Print "Runtime to relative: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
'Report any files that have the same name but different location as an anomaly
prepDictionary originalFiles
prepDictionary newFiles
Set originalColl = New Collection
For Each originalKey In originalFiles.Keys
For Each copyKey In newFiles.Keys
If originalFiles.item(originalKey) = newFiles.item(copyKey) Then
originalColl.Add originalKey
End If
Next
Next
Debug.Print "Anomaly count: " & originalColl.count
Debug.Print "Runtime to Anomaly : " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
'Report missing files
Debug.Print "Original File Count: " & originalFiles.count
Debug.Print "Copy File Count: " & newFiles.count
End Sub
Private Sub prepDictionary(ByRef dict As Dictionary)
Dim dictKey As Variant
Dim slashLocation As Long
For Each dictKey In dict.Keys
slashLocation = InStrRev(dict.item(dictKey), "\")
dict.item(dictKey) = Right(dict.item(dictKey), Len(dict.item(dictKey)) - slashLocation)
Next
End Sub
Private Sub recurse(ByVal basePath As String, ByVal folderPath As String, ByRef fileDict As Dictionary)
Dim FSO As FileSystemObject
Dim currentFolder As Folder
Dim subFolder As Folder
Dim folderFile As File
Set FSO = New FileSystemObject
Set currentFolder = FSO.GetFolder(folderPath)
Dim fileHash As String
For Each folderFile In currentFolder.Files
'Need to ignore files that dont have a size and lock files
If FileLen(folderFile.path) > 0 And Left(folderFile.Name, 2) <> "~$" And folderFile.Name <> "Thumbs.db" Then
fileHash = FileToMD5Hex(folderFile.path)
If Not fileDict.Exists(fileHash) Then
fileDict.Add fileHash, Right(folderFile.path, Len(folderFile.path) - Len(basePath))
End If
End If
Next
For Each subFolder In currentFolder.SubFolders
recurse basePath, subFolder.path, fileDict
Next
End Sub
Public Function FileToMD5Hex(sFileName As String) As String
Dim enc
Dim bytes
Dim outstr As String
Dim pos As Integer
Set enc = CreateObject("System.Security.Cryptography.MD5CryptoServiceProvider")
'Convert the string to a byte array and hash it
bytes = GetFileBytes(sFileName)
bytes = enc.ComputeHash_2((bytes))
'Convert the byte array to a hex string
For pos = 1 To LenB(bytes)
outstr = outstr & LCase(Right("0" & Hex(AscB(MidB(bytes, pos, 1))), 2))
Next
FileToMD5Hex = outstr
Set enc = Nothing
End Function
Private Function GetFileBytes(ByVal path As String) As Byte()
Dim lngFileNum As Long
Dim bytRtnVal() As Byte
lngFileNum = FreeFile
If LenB(Dir(path, vbNormal + vbReadOnly + vbHidden)) Then ''// Does file exist?
Open path For Binary Access Read As lngFileNum
ReDim bytRtnVal(LOF(lngFileNum) - 1&) As Byte
Get lngFileNum, , bytRtnVal
Close lngFileNum
Else
Err.Raise 53
End If
GetFileBytes = bytRtnVal
Erase bytRtnVal
End Function
1 Answer 1
Get files
You can speed up retrieving the files by importing LibFileTools
module from VBA-FileTools. Your recurse
method then becomes:
Private Sub recurse(ByVal folderPath As String, ByRef fileDict As Dictionary)
Dim filePath As Variant
Dim fileHash As String
'
For Each filePath In GetFiles(folderPath, includeSubFolders:=True)
If (Not filePath Like "*~$*") And (Not filePath Like "*Thumbs.db") Then
fileHash = FileToMD5Hex(CStr(filePath))
If Not fileDict.Exists(fileHash) Then
fileDict.Add fileHash, Right(filePath, Len(filePath) - Len(folderPath))
End If
End If
Next
End Sub
Since you're ignoring duplicated hashes, and storing just first found, you could just store last found by updating to this:
Private Sub recurse(ByVal folderPath As String, ByRef fileDict As Dictionary)
Dim filePath As Variant
'
For Each filePath In GetFiles(folderPath, includeSubFolders:=True)
If (Not filePath Like "*~$*") And (Not filePath Like "*Thumbs.db") Then
fileDict(FileToMD5Hex(CStr(filePath))) = Right(filePath, Len(filePath) - Len(folderPath))
End If
Next
End Sub
Of course you will then need to update the main runRec
function and:
'Replace this:
recurse OriginalFolder, OriginalFolder, originalFiles
'with this:
recurse OriginalFolder, originalFiles
'And replace this:
recurse NewFolder, NewFolder, newFiles
'with this:
recurse NewFolder, newFiles
Finally, the GetFiles
function does the recursion for you so you might want to rename the method from recurse
to something like HashFilesMD5
.
Hashing
We can improve on the FileToMD5Hex
function by:
- using a map for the hex values - this avoid unnecessary function calls
- using a buffer for the result - because string concatenation is slow
Public Function FileToMD5Hex(sFileName As String) As String
Static enc As Object
Dim bytes() As Byte
If enc Is Nothing Then Set enc = CreateObject("System.Security.Cryptography.MD5CryptoServiceProvider")
ReadBytes sFileName, bytes
On Error Resume Next
bytes = enc.ComputeHash_2((bytes))
If Err.Number <> 0 Then
Err.Clear
Exit Function
End If
On Error GoTo 0
'Convert the byte array to a hex string
Static map(0 To 255) As String
Dim i As Long
'Cache hex values for bytes
If LenB(map(0)) = 0 Then
For i = 0 To 255
map(i) = LCase(Right$("0" & Hex$(i), 2))
Next i
End If
FileToMD5Hex = Space$((UBound(bytes) - LBound(bytes) + 1) * 2)
For i = LBound(bytes) To UBound(bytes)
Mid$(FileToMD5Hex, i * 2 + 1) = map(bytes(i))
Next i
End Function
Notice that ReadBytes
is already part of same library.
Mac
In case you want to use this on a Mac operating system, then you need to update the prepDictionary
method to something like this:
Private Sub prepDictionary(ByRef dict As Dictionary)
Dim dictKey As Variant
Dim dictItem As String
Dim separatorLocation As Long
For Each dictKey In dict
dictItem = dict.Item(dictKey)
separatorLocation = InStrRev(dictItem, PATH_SEPARATOR)
dict.Item(dictKey) = Right$(dictItem, Len(dictItem) - separatorLocation)
Next
End Sub
Notice that PATH_SEPARATOR
is a constant defined in the same library I linked to.
Dictionary
You can import the Dictionary
class from VBA-FastDictionary to make your code work on Mac. The added advantage is that this class is generally faster on Windows as well, especially for more than 32k key-item pairs - see benchmarking.
Final code
Notice that I removed all occurences of [dict].Keys
and left only [dict]
because a For Each..
loop on a dictionary will loop through the keys using a faster iterator object while looping .Keys
makes a copy of the internal array of keys first before the iteration happens on the actual array.
Option Explicit
Public Const OriginalFolder As String = "\\path\to\original\"
Public Const NewFolder As String = "\\path\to\the\copy\"
Public Sub runRec()
Dim StartTime As Double
StartTime = Timer
Dim originalFiles As Dictionary
Set originalFiles = New Dictionary
HashFilesMD5 OriginalFolder, originalFiles
Debug.Print "Runtime to first recurse: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
Debug.Print "Original File Count: " & originalFiles.Count
Dim newFiles As Dictionary
Set newFiles = New Dictionary
HashFilesMD5 NewFolder, newFiles
Debug.Print "Runtime to second recurse: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
Debug.Print "Copy File Count: " & newFiles.Count
'Remove matching md5 hashes
Dim originalKey As Variant
For Each originalKey In originalFiles
If newFiles.Exists(originalKey) Then
originalFiles.Remove originalKey
newFiles.Remove originalKey
End If
Next
Debug.Print "Runtime to md5: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
Dim originalColl As Collection
Set originalColl = New Collection
Dim copyColl As New Collection
Set copyColl = New Collection
'Remove any files with same filename and relative path
Dim copyKey As Variant
For Each originalKey In originalFiles
For Each copyKey In newFiles
If originalFiles.Item(originalKey) = newFiles.Item(copyKey) Then
originalColl.Add originalKey
copyColl.Add copyKey
End If
Next
Next
For Each originalKey In originalColl
originalFiles.Remove originalKey
Next
For Each copyKey In copyColl
newFiles.Remove copyKey
Next
Debug.Print "Runtime to relative: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
'Report any files that have the same name but different location as an anomaly
PrepDictionary originalFiles
PrepDictionary newFiles
Set originalColl = New Collection
For Each originalKey In originalFiles
For Each copyKey In newFiles
If originalFiles.Item(originalKey) = newFiles.Item(copyKey) Then
originalColl.Add originalKey
End If
Next
Next
Debug.Print "Anomaly count: " & originalColl.Count
Debug.Print "Runtime to Anomaly : " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
'Report missing files
Debug.Print "Original File Count: " & originalFiles.Count
Debug.Print "Copy File Count: " & newFiles.Count
End Sub
Private Sub PrepDictionary(ByRef dict As Dictionary)
Dim dictKey As Variant
Dim dictItem As String
Dim separatorLocation As Long
For Each dictKey In dict
dictItem = dict.Item(dictKey)
separatorLocation = InStrRev(dictItem, PATH_SEPARATOR)
dict.Item(dictKey) = Right$(dictItem, Len(dictItem) - separatorLocation)
Next
End Sub
Private Sub HashFilesMD5(ByVal folderPath As String, ByRef fileDict As Dictionary)
Dim filePath As Variant
'
For Each filePath In GetFiles(folderPath, includeSubFolders:=True)
If (Not filePath Like "*~$*") And (Not filePath Like "*Thumbs.db") Then
fileDict(FileToMD5Hex(CStr(filePath))) = Right(filePath, Len(filePath) - Len(folderPath))
End If
Next
End Sub
Public Function FileToMD5Hex(sFileName As String) As String
Static enc As Object
Dim bytes() As Byte
If enc Is Nothing Then Set enc = CreateObject("System.Security.Cryptography.MD5CryptoServiceProvider")
ReadBytes sFileName, bytes
On Error Resume Next
bytes = enc.ComputeHash_2((bytes))
If Err.Number <> 0 Then
Err.Clear
Exit Function
End If
On Error GoTo 0
'Convert the byte array to a hex string
Static map(0 To 255) As String
Dim i As Long
'Cache hex values for bytes
If LenB(map(0)) = 0 Then
For i = 0 To 255
map(i) = LCase(Right$("0" & Hex$(i), 2))
Next i
End If
FileToMD5Hex = Space$((UBound(bytes) - LBound(bytes) + 1) * 2)
For i = LBound(bytes) To UBound(bytes)
Mid$(FileToMD5Hex, i * 2 + 1) = map(bytes(i))
Next i
End Function
Final thoughts
I did not modify code like this:
For Each originalKey In originalFiles
For Each copyKey In newFiles
If originalFiles.Item(originalKey) = newFiles.Item(copyKey) Then
originalColl.Add originalKey
copyColl.Add copyKey
End If
Next
Next
but if I were to try to improve it, I would probably create a new dictionary with the keys being the actual paths in originalFiles
and then just use dict.Exists
while looping through the files in newFiles
.
Explore related questions
See similar questions with these tags.
Err
action is a red flag unless properly explained. \$\endgroup\$