I have this code that is supposed to compare two Excel sheets. The code is working fine for small comparisons. I did a test run with 7 rows and 2 columns.
The code itself works as follows: it compares the two sheets and copies the differences into a new workbook.
However, the code should be applied to files that have around 16 columns and a lot of rows (around 206700). It doesn't seem to scale very well to these larger sheets - the process is shown "Not Responding", for more than ten minutes when I gave up.
This is how my excel file columns looks likem, keeping in mind that most of them contain text and only few times numbers are used. enter image description here
I'd like to improve the performance with these larger files; any other suggestions for improvement are also welcome.
Sub Compare2WorkSheets(ws1 As Worksheet, ws2 As Worksheet)
Dim ws1row As Long, ws2row As Long, ws1col As Integer, ws2col As Integer
Dim maxrow As Long, maxcol As Integer, colval1 As String, colval2 As String
Dim report As Workbook, difference As Long
Dim row As Long, col As Integer
Set report = Workbooks.Add
With ws1.UsedRange
ws1row = .Rows.Count
ws1col = .Columns.Count
End With
With ws2.UsedRange
ws2row = .Rows.Count
ws2col = .Columns.Count
End With
maxrow = ws1row
maxcol = ws1col
If maxrow < ws2row Then maxrow = ws2row
If maxcol < ws2col Then maxcol = ws2col
difference = 0
For col = 1 To maxcol
For row = 1 To maxrow
colval1 = ""
colval2 = ""
colval1 = ws1.Cells(row, col).Formula
colval2 = ws2.Cells(row, col).Formula
If colval1 <> colval2 Then
difference = difference + 1
Cells(row, col).Formula = colval1 & "<> " & colval2
Cells(row, col).Interior.Color = 255
Cells(row, col).Font.ColorIndex = 2
Cells(row, col).Font.Bold = True
End If
Next row
Next col
Columns("A:B").ColumnWidth = 25
report.Saved = True
If difference = 0 Then
report.Close False
End If
Set report = Nothing
MsgBox difference & " cells contain different data! ", vbInformation, _
"Comparing Two Worksheets"
End Sub
-
\$\begingroup\$ "16 columns and a lot of rows (around 206700)" Have you tried scaling up more slowly? What happens with 3 columns and 400 rows? 6 columns and 800 rows? Now all you can tell us you don't have the patience to see whether it still works after 10 minutes, which doesn't tell us much about the current inefficiency. \$\endgroup\$Mast– Mast ♦2019年08月15日 09:49:45 +00:00Commented Aug 15, 2019 at 9:49
-
2\$\begingroup\$ Can you tell us more about how this code is used (executed in 1 of the spreadsheets, or in a 3rd), what kind of data (length) we're talking about and how the cells are formatted (are they text, numbers, currency, date)? This could all be relevant. \$\endgroup\$Mast– Mast ♦2019年08月15日 09:53:29 +00:00Commented Aug 15, 2019 at 9:53
-
\$\begingroup\$ @Mast I actually have waited almost 1 hour and the file was stil saying "Not responding" . I just tried it with 10k rows and all the columns . It took more than a 1min. If you look at the question I also have added a picture oh how my excel file approx looks like \$\endgroup\$Miriam List– Miriam List2019年08月15日 11:15:32 +00:00Commented Aug 15, 2019 at 11:15
-
\$\begingroup\$ It looks like you're comparing the formulas in each of the cells and not the values of the cells. Is this correct? \$\endgroup\$PeterT– PeterT2019年08月15日 13:53:47 +00:00Commented Aug 15, 2019 at 13:53
-
4\$\begingroup\$ @Close-Voters: Please read the vba tag wiki. Excel "freezing" and/or "(not responding)" is not broken code. \$\endgroup\$Mathieu Guindon– Mathieu Guindon2019年08月15日 13:56:26 +00:00Commented Aug 15, 2019 at 13:56
4 Answers 4
May try the modified code using Arrays to Compare. Tested with 250000 rows X 26 columns of random data and every 5th cells have value difference (Total 130000 differences). It takes around 18 secs to compare and another 22 secs to completes report generation with total 40 seconds only.
Sub Compare2WorkSheets(ws1 As Worksheet, ws2 As Worksheet)
Dim ws1row As Long, ws2row As Long, ws1col As Integer, ws2col As Integer
Dim maxrow As Long, maxcol As Integer, colval1 As String, colval2 As String
Dim Report As Workbook, difference As Long
Dim row As Long, col As Integer
Dim Arr1 As Variant, Arr2 As Variant, Arr3 As Variant, Rng As Range
Dim tm As Double
tm = Timer
'Application.ScreenUpdating = False
'Application.Calculation = xlCalculationManual
'Application.EnableEvents = False
With ws1.UsedRange
ws1row = .Rows.Count
ws1col = .Columns.Count
End With
With ws2.UsedRange
ws2row = .Rows.Count
ws2col = .Columns.Count
End With
maxrow = ws1row
maxcol = ws1col
If maxrow < ws2row Then maxrow = ws2row
If maxcol < ws2col Then maxcol = ws2col
Debug.Print maxrow, maxcol
Arr1 = ws1.Range(ws1.Cells(1, 1), ws1.Cells(maxrow, maxcol)).Formula
Arr2 = ws2.Range(ws2.Cells(1, 1), ws2.Cells(maxrow, maxcol)).Formula
ReDim Arr3(1 To UBound(Arr1, 1), 1 To UBound(Arr1, 2))
difference = 0
For col = 1 To maxcol
For row = 1 To maxrow
If Arr1(row, col) <> Arr2(row, col) Then
difference = difference + 1
Arr3(row, col) = Arr1(row, col) & "<> " & Arr2(row, col)
Else
Arr3(row, col) = ""
End If
Next row
Next col
Debug.Print " Calc secs " & Timer - tm
If difference > 0 Then
Set Report = Workbooks.Add
With Report.ActiveSheet
.Range("A1").Resize(UBound(Arr3, 1), UBound(Arr3, 2)).Value = Arr3
.Columns("A:B").ColumnWidth = 25
Set Rng = .Range(Report.ActiveSheet.Cells(1, 1), Report.ActiveSheet.Cells(UBound(Arr3, 1), UBound(Arr3, 2)))
End With
With Rng
.FormatConditions.Add Type:=xlCellValue, Operator:=xlGreater, Formula1:="=""""" '""""""""
.FormatConditions(Selection.FormatConditions.Count).SetFirstPriority
With .FormatConditions(1)
.Interior.Color = 255
.Font.Bold = True
.Font.ColorIndex = 2
End With
End With
Debug.Print "Report Generated secs " & Timer - tm
End If
'Set Report = Nothing
'Application.ScreenUpdating = True
'Application.Calculation = xlCalculationAutomatic
'Application.EnableEvents = True
MsgBox difference & " cells contain different data! ", vbInformation, "Comparing Two Worksheets"
End Sub
Since I personally don't prefer to keep calculations, event processing and screen updating off (in normal cases) i haven't used that standard lines. However you may use these standard techniques, depending on the working file condition.
-
\$\begingroup\$ thank a lot , works so smooth. I have some small features I want to add. I might need some help with that. Should I ask here or just edit my question and add those parts there ? \$\endgroup\$Miriam List– Miriam List2019年08月16日 06:50:55 +00:00Commented Aug 16, 2019 at 6:50
-
\$\begingroup\$ in case the row is identical, I want it to be copied on the report sheet (without any change of color of font). I tried iimplementing it on the code however, even the rows that are the same are canging color and getting the background color \$\endgroup\$Miriam List– Miriam List2019年08月16日 07:14:55 +00:00Commented Aug 16, 2019 at 7:14
-
\$\begingroup\$ It is not clear, do you want to copy & list values of only identical rows in the report sheet and difference is not to be calculated? \$\endgroup\$Ahmed AU– Ahmed AU2019年08月16日 15:57:48 +00:00Commented Aug 16, 2019 at 15:57
-
\$\begingroup\$ @ Ahmed AU Is it ok if I post another question about that because since reviewing the files some I have to add something and the logic behind it is unclear and I'm not managing to find a way. I would link you to the question \$\endgroup\$Miriam List– Miriam List2019年08月16日 16:06:20 +00:00Commented Aug 16, 2019 at 16:06
-
\$\begingroup\$ if it is not about efficiency and performance may post it it on SO. If this question solves your efficiency problem may pl accept the answer. \$\endgroup\$Ahmed AU– Ahmed AU2019年08月16日 16:09:31 +00:00Commented Aug 16, 2019 at 16:09
There are some issues with your code:
1) Try to avoid unqualified references, this means always specify the worksheet, when referencing a cell.
2) Try to avoid formatting a lot of single cells, rather format them all at once at the end. Formatting slows down Excel a lot!
3) When handeling a great deal of Ranges, Integer
can be insufficient, use Long
instead.
Sub Compare2WorkSheets(ws1 As Worksheet, ws2 As Worksheet)
Dim ws1row As Long, ws2row As Long, ws1col As Long, ws2col As Long
Dim maxrow As Long, maxcol As Integer, colval1 As String, colval2 As String
Dim report As Workbook, difference As Long
Dim row As Long, col As Long
Dim unionRng as Range
Set report = Workbooks.Add
With ws1.UsedRange
ws1row = .Rows.Count
ws1col = .Columns.Count
End With
With ws2.UsedRange
ws2row = .Rows.Count
ws2col = .Columns.Count
End With
maxrow = ws1row
maxcol = ws1col
If maxrow < ws2row Then maxrow = ws2row
If maxcol < ws2col Then maxcol = ws2col
difference = 0
With report.Sheets(1)
For col = 1 To maxcol
For row = 1 To maxrow
colval1 = ""
colval2 = ""
colval1 = ws1.Cells(row, col).Formula
colval2 = ws2.Cells(row, col).Formula
If colval1 <> colval2 Then
difference = difference + 1
.Cells(row, col).Value = colval1 & "<> " & colval2 'I guess you want to show, that the formulas used are not equal.
If unionRng is Nothing Then
Set unionRng = .Cells(row, col)
Else
Set unionRng = Application.Union(unionRng, .Cells(row, col))
End If
End If
Next row
Next col
.Columns("A:B").ColumnWidth = 25
End With
unionRng.Interior.Color = 255
unionRng.Font.ColorIndex = 2
unionRng.Font.Bold = True
report.SaveAs Filename:="report.xlsx"
If difference = 0 Then
report.Close False
End If
Set report = Nothing
MsgBox difference & " cells contain different data! ", vbInformation, _
"Comparing Two Worksheets"
End Sub
```
-
\$\begingroup\$ I'm getting error Invalid procedure call or invalid argument at Set unionRng = Application.Union(unionRng, .Cells(row, col)) \$\endgroup\$Miriam List– Miriam List2019年08月15日 14:44:39 +00:00Commented Aug 15, 2019 at 14:44
-
\$\begingroup\$ See the edit, forgot to check for the case, that
unionRng
is set for the first time. \$\endgroup\$MGP– MGP2019年08月15日 14:49:27 +00:00Commented Aug 15, 2019 at 14:49 -
\$\begingroup\$ It's been 40 mins since the code is running, as it for no it's showing Not Responding but I guess files are still being compared. The problem is, this is taking way to long and an end point doesn't look near. \$\endgroup\$Miriam List– Miriam List2019年08月15日 15:36:49 +00:00Commented Aug 15, 2019 at 15:36
-
My answer is very similar to Ahmed AU with a few exceptions.
- I didn't bother adding the Conditional formatting because everything on the new worksheet represents changes.
- The other main difference is that I match the Ranges using the Range addresses. These will automatically adjust for differences in column and rows count and starting cell.
Refactored Code
Sub Compare2WorkSheets(ByRef Worksheet1 As Worksheet, ByRef Worksheet2 As Worksheet)
Dim t As Double: t = Timer
Dim Range1 As Range, Range2 As Range
SetRanges Worksheet1, Worksheet2, Range1, Range2
Dim Values1, Values2, Results
Dim r As Long, c As Long, Count As Long
Values1 = Range1.Value
Values2 = Range2.Value
ReDim Results(1 To UBound(Values1), 1 To UBound(Values1, 2))
For r = 1 To UBound(Values1)
For c = 1 To UBound(Values1, 2)
If Values1(r, c) <> Values2(r, c) Then
Count = Count + 1
Results(r, c) = Values1(r, c) & vbNewLine & Values2(r, c)
End If
Next
Next
Workbooks.Add.Worksheets(1).Range(Range1.Address).Value = Results
Debug.Print "Compare2WorkSheets: ", Worksheet1.Name; " to "; Worksheet2.Name
Debug.Print "Runtime in Second(s):"; Round(Timer - t, 2)
Debug.Print "Number of Cells per Worksheet: "; Range1.CountLarge
Debug.Print "Number of Differences: "; Count
MsgBox "There were " & Count & " Differences"
End Sub
Sub SetRanges(ByRef Worksheet1 As Worksheet, ByRef Worksheet2 As Worksheet, ByRef Range1 As Range, ByRef Range2 As Range)
With Worksheet1
Set Range1 = Union(.UsedRange, .Range(Worksheet2.UsedRange.Address))
End With
With Worksheet2
Set Range2 = Union(.UsedRange, .Range(Worksheet1.UsedRange.Address))
End With
End Sub
Results
-
\$\begingroup\$ Again you win my respect, the way you matched the Ranges using the Range addresses. would you please throw some light on my recent serious apprehensions about using union range method with large non contiguous area count at least in excel 2007. May refer my to boring post] and may provide some more information. Actually while posting that I hoped for some info from you, @Tim William etc \$\endgroup\$Ahmed AU– Ahmed AU2019年08月16日 00:14:31 +00:00Commented Aug 16, 2019 at 0:14
-
\$\begingroup\$ @TinMan please have a look at my new post, it is regarding the code from question. [codereview.stackexchange.com/questions/226346/… \$\endgroup\$Miriam List– Miriam List2019年08月18日 20:09:08 +00:00Commented Aug 18, 2019 at 20:09
My answer makes the assumption that you mean to compare the values of the data and not the formulas, though much of the other comments here remain valid for your code.
- Identify your function parameters as
ByRef
orByVal
. As you may guess, passing a variable "by reference" generally allows you to reference the variable as held by the caller and make modifications to its value (there are some exceptions, but this is the concept). Passing a variable "by value" effectively copies the value of the variable to a new variable in your routine. When passing objects, such asWorksheet
, to a function, I nearly always pass these objectsByRef
so I'm thinking there is some deep copy action happening. - It's generally recommended to declare your variables as close to its first use as possible. This will save you (and anyone else reviewing your code) from having to scroll up/down to determine variable types and definitions.
So to declare references to the two source worksheets would look like this:
Dim area1 As Range
Dim area2 As Range
Set area1 = ws1.UsedRange
Set area2 = ws2.UsedRange
- You can simplify how you determine the maximum number of rows and columns using two statements with the
IIf
.
Notice how I'm overwriting the initial Set
of each area variable, possibly expanding it to cover the largest possible range.
'--- calculate the max-sized range for the data and expand
' the ranges as needed
Dim maxRows As Long
Dim maxCols As Long
maxRows = IIf(area1.Rows.Count > area2.Rows.Count, _
area1.Rows.Count, area2.Rows.Count)
maxCols = IIf(area1.Columns.Count > area2.Columns.Count, _
area1.Columns.Count, area2.Columns.Count)
Set area1 = area1.Resize(maxRows, maxCols)
Set area2 = area2.Resize(maxRows, maxCols)
- Here's where the real speed improvement kicks in - memory-based arrays (see #19 and #20). When you're working with a
Range
object, Excel incurs a lot of overhead to manage all the aspects of that range object. This is expensive in execution time, especially if all you want is the value of each cell in the range. It's a quick step to copy all the values into an array.
Note that each array is declared as a Variant
without array dimensions. The assignment of the range .Value
will cast the variant into an array:
'--- create memory-based arrays for the data in the ranges
Dim data1 As Variant
Dim data2 As Variant
data1 = area1.value
data2 = area2.value
- In keeping with the idea of speed, I'm creating a
Collection
of items that captures all of the differences/discrepancies discovered between the two worksheet areas. Because theCollection
is also a memory-based object, it will also be very fast.
Each entry in the Collection
is a comma separated value string, which we'll pull apart later on.
'--- we'll build up the report as a series of discrepancy
' entries in a Collection for now
Dim report As Collection
Set report = New Collection
Dim r As Long
Dim c As Long
For r = 1 To maxRows
For c = 1 To maxCols
If data1(r, c) <> data2(r, c) Then
'--- add a discrepancy entry to log the difference
' as a comma separated string:
' "row,column,value1 <> value2"
report.Add r & "," & c & "," & data1(r, c) & " <> " & data2(r, c)
End If
Next c
Next r
- The example here shows two different ways to present your results, depending on your requirements. The first simply reports the results as a virtual list (array) that is copied directly to a worksheet. (I didn't create a separate workbook, for simplicity of my example.)
This list does not mimic the dimensions of the data areas at all.
'--- results as a simple list
Dim reportData As Variant
ReDim reportData(1 To report.Count + 1, 1 To 3)
reportData(1, 1) = "Row"
reportData(1, 2) = "Column"
reportData(1, 3) = "Difference"
For r = 2 To report.Count + 1
Dim parts() As String
parts = Split(report.Item(r - 1), ",")
reportData(r, 1) = parts(0)
reportData(r, 2) = parts(1)
reportData(r, 3) = parts(2)
Next r
Set finalReport = reportWS.Range("A1").Resize(report.Count + 1, 3)
finalReport.value = reportData
An alternative solution is to report the discrepancies in a worksheet range that is dimensionally similar to your source worksheets. Your original post is trying to applying shading to cells with differences.
The example here works very fast for two reasons: 1) because of how we've collected the discrepancies earlier, there's no need to loop over every single cell in the range. We have the row and column of each discrepancy, so we can directly (and quickly) just set the highlight colors and be finished, and 2) by disabling/enabling Application.ScreenUpdating
we prevent Excel from interacting with the display, and that gains more speed.
'--- results as a data range with highlighted cells
Application.ScreenUpdating = False
Set reportWS = Sheet4
Set finalReport = reportWS.Range("A1").Resize(maxRows, maxCols)
Dim discrepancy As Variant
For Each discrepancy In report
'Dim parts() As String
parts = Split(discrepancy, ",")
With finalReport.Cells(CLng(parts(0)), CLng(parts(1)))
.value = parts(2)
.Interior.Color = 255
.Font.ColorIndex = 2
.Font.Bold = True
End With
Next discrepancy
Application.ScreenUpdating = True
Here is the full module you can use for testing:
Option Explicit
Sub test()
CompareData Sheet1, Sheet2
End Sub
Sub CompareData(ByRef ws1 As Worksheet, ByRef ws2 As Worksheet)
Dim area1 As Range
Dim area2 As Range
Set area1 = ws1.UsedRange
Set area2 = ws2.UsedRange
'--- calculate the max-sized range for the data and expand
' the ranges as needed
Dim maxRows As Long
Dim maxCols As Long
maxRows = IIf(area1.Rows.Count > area2.Rows.Count, _
area1.Rows.Count, area2.Rows.Count)
maxCols = IIf(area1.Columns.Count > area2.Columns.Count, _
area1.Columns.Count, area2.Columns.Count)
Set area1 = area1.Resize(maxRows, maxCols)
Set area2 = area2.Resize(maxRows, maxCols)
'--- create memory-based arrays for the data in the ranges
Dim data1 As Variant
Dim data2 As Variant
data1 = area1.value
data2 = area2.value
'--- we'll build up the report as a series of discrepancy
' entries in a Collection for now
Dim report As Collection
Set report = New Collection
Dim r As Long
Dim c As Long
For r = 1 To maxRows
For c = 1 To maxCols
If data1(r, c) <> data2(r, c) Then
'--- add a discrepancy entry to log the difference
' as a comma separated string:
' "row,column,value1 <> value2"
report.Add r & "," & c & "," & data1(r, c) & " <> " & data2(r, c)
End If
Next c
Next r
Dim reportWB As Workbook
Dim reportWS As Worksheet
Dim finalReport As Range
'Set reportWB = Workbooks.Add
'Set reportWS = reportWB.Sheets(1)
Set reportWS = Sheet3
'--- results as a simple list
Dim reportData As Variant
ReDim reportData(1 To report.Count + 1, 1 To 3)
reportData(1, 1) = "Row"
reportData(1, 2) = "Column"
reportData(1, 3) = "Difference"
For r = 2 To report.Count + 1
Dim parts() As String
parts = Split(report.Item(r - 1), ",")
reportData(r, 1) = parts(0)
reportData(r, 2) = parts(1)
reportData(r, 3) = parts(2)
Next r
Set finalReport = reportWS.Range("A1").Resize(report.Count + 1, 3)
finalReport.value = reportData
'--- results as a data range with highlighted cells
Application.ScreenUpdating = False
Set reportWS = Sheet4
Set finalReport = reportWS.Range("A1").Resize(maxRows, maxCols)
Dim discrepancy As Variant
For Each discrepancy In report
'Dim parts() As String
parts = Split(discrepancy, ",")
With finalReport.Cells(CLng(parts(0)), CLng(parts(1)))
.value = parts(2)
.Interior.Color = 255
.Font.ColorIndex = 2
.Font.Bold = True
End With
Next discrepancy
Application.ScreenUpdating = True
End Sub
-
\$\begingroup\$ thank you a lot for your detailed answer. I needed to add some new features to the code and unfortunately that is above my vba knowledge. can you please have a look at my new post ? [codereview.stackexchange.com/questions/226346/… \$\endgroup\$Miriam List– Miriam List2019年08月18日 11:11:24 +00:00Commented Aug 18, 2019 at 11:11