My code below find the duplicates based on 2 criteria:
- The first criteria is the Name in
Column A
- The second criteria is the Country in
Column D
Sheet("RawData")
:
CompanyName | Duns ID | Product ID | CountryName
Sheets("Conso")
after the macro:
CompanyNames | Duns ID | Product ID
What I am looking is to increase the speed of this code, because I am working with more than 150K rows and it takes hours.
Sub MDMDuplicates()
Dim WB As Workbook
Dim wsRawData As Worksheet, wsConso As Worksheet
Dim i As Long, j As Long, Lastrow As Long, LastrowConso as Long
Dim SupNameToCheck As String, ConsoSupplierDUNS As String, ConsoSupplierMDM As String, ConsoSupplierNAME As String
Set WB = ThisWorkbook
Set wsRawData = WB.Sheets("RawData")
Set wsConso = WB.Sheets("Conso")
Lastrow = wsRawData.Range("A" & Rows.Count).End(xlUp).Row
Application.ScreenUpdating = False
With wsRawData
For i = 2 To Lastrow
SupNameToCheck = .Cells(i, "A").Value
SupCountryToCheck = .Cells(i, "D")
ConsoSupplierDUNS = ""
ConsoSupplierMDM = ""
ConsoSupplierNAME = ""
j = Lastrow
Do
If i <> j And SupNameToCheck = .Cells(j, "A") And SupCountryToCheck = .Cells(j, "D") Then
If ConsoSupplierNAME = "" Then
ConsoSupplierDUNS = .Cells(i, "B") & "," & .Cells(j, "B")
ConsoSupplierMDM = .Cells(i, "C") & "," & .Cells(j, "C")
ConsoSupplierNAME = SupNameToCheck & "," & .Cells(j, "A")
Else
ConsoSupplierDUNS = .Cells(j, "B") & "," & ConsoSupplierDUNS
ConsoSupplierMDM = .Cells(j, "C") & "," & ConsoSupplierMDM
ConsoSupplierNAME = .Cells(j, "A") & "," & ConsoSupplierNAME
End If
.Cells(j, "A").EntireRow.Delete
End If
j = j - 1
Loop Until j = 1
LastrowConso = wsConso.Range("A" & Rows.Count).End(xlUp).Row + 1
If Not ConsoSupplierNAME = "" Then
wsConso.Cells(LastrowConso, "B") = ConsoSupplierDUNS
wsConso.Cells(LastrowConso, "C") = ConsoSupplierMDM
wsConso.Cells(LastrowConso, "A") = ConsoSupplierNAME
Else
wsConso.Cells(LastrowConso, "B") = .Cells(i, "B")
wsConso.Cells(LastrowConso, "C") = .Cells(i, "C")
wsConso.Cells(LastrowConso, "A") = SupNameToCheck
End If
Next i
Application.ScreenUpdating = True
End With
End Sub
-
\$\begingroup\$ I'm unclear on why the name has to be repeated in the result. It would seem to me that having AAA rather than AAA, AAA as a result would be more efficient and more functional for later processing (e.g. using a lookup). \$\endgroup\$user66882– user668822016年02月21日 02:53:00 +00:00Commented Feb 21, 2016 at 2:53
4 Answers 4
It looks to me that there is repeated loops cycling through the entire data matrix trying to identify duplicates. Putting the whole data block into a two-dimensional variant array would help. Faster lookups could be achieved with WorksheetFunction object's use of the MATCH function but a Scripting.Dictionary object with its unique collection index can greatly reduce the looping and the lookups.
I would heartily recommend either setting the VBE's Tools ► Options ► Editor ► Require variable declaration or manually putting Option Explicit
1 at the top of each module code sheet. Currently, the SupCountryToCheck var was left undeclared.
Sub MDM_Duplicates_Jeeped()
Dim wb As Workbook, wsRawData As Worksheet, wsConso As Worksheet
Dim sKey As String, tmp As Variant
'late binding of the dictionary object
Dim d As Long, vTMPs As Variant, dMDMs As Object
'early binding of the dictionary object (see footnote 2)
'Dim d As Long, vTMPs as variant, dMDMs As new Scripting.Dictionary
appTGGL bTGGL:=False 'turn off unnecessary environment overhead
Set wb = ThisWorkbook
Set wsRawData = wb.Worksheets("RawData")
Set wsConso = wb.Worksheets("Conso")
'late binding of the dictionary object
Set dMDMs = CreateObject("Scripting.Dictionary")
dMDMs.CompareMode = vbTextCompare
With wsRawData
'dump all of the values into a 2-D variant array
vTMPs = .Range(.Cells(2, 1), .Cells(Rows.Count, 4).End(xlUp)).Value2
'for testing purposes - first 50 rows
'vTMPs = .Range(.Cells(2, 1), .Cells(50, 4)).Value2
End With
'populate the dictionary with name/country pair keys and
'comma separated duns/product concatenated item pairs
For d = LBound(vTMPs, 1) To UBound(vTMPs, 1)
sKey = Join(Array(vTMPs(d, 1), vTMPs(d, 4)), ChrW(8203))
If dMDMs.Exists(sKey) Then
dMDMs.Item(sKey) = Join(Array(Split(dMDMs.Item(sKey), ChrW(8203))(0) & Chr(44) & Format(vTMPs(d, 2), "000000000"), _
Split(dMDMs.Item(sKey), ChrW(8203))(1) & Chr(44) & vTMPs(d, 3)), ChrW(8203))
Else
dMDMs.Add Key:=sKey, _
Item:=Join(Array(CStr(Format(vTMPs(d, 2), "'000000000")), Chr(39) & CStr(vTMPs(d, 3))), ChrW(8203))
End If
Next d
'put the dictionary's collated keys and items back into a redimmed vTMPs
'late binding needs to iterate through for each key
Erase vTMPs
ReDim vTMPs(1 To dMDMs.Count, 1 To 4)
d = 1
For Each tmp In dMDMs.Keys
vTMPs(d, 1) = Split(tmp, ChrW(8203))(0)
vTMPs(d, 2) = Split(dMDMs.Item(tmp), ChrW(8203))(0)
vTMPs(d, 3) = Split(dMDMs.Item(tmp), ChrW(8203))(1)
vTMPs(d, 4) = Split(tmp, ChrW(8203))(1)
d = d + 1
Next tmp
'dump all of the temp variant array's values back into the results worksheet
With wsConso
.Cells(2, 1).Resize(UBound(vTMPs, 1), UBound(vTMPs, 2)) = vTMPs
End With
'restore the environment
appTGGL
End Sub
Sub appTGGL(Optional bTGGL As Boolean = True)
With Application
.ScreenUpdating = bTGGL
.EnableEvents = bTGGL
.DisplayAlerts = bTGGL
.Calculation = IIf(bTGGL, xlCalculationAutomatic, xlCalculationManual)
End With
Debug.Print Timer
End Sub
150,000 rows of random sample data generated to mimic your own sample data3 (sorted or unsorted).
Late Binding: 13.94 seconds
Early Binding: 9.48 seconds
Sample XLSB workbook available temporarily at: MDMDuplicates.xlsb
I wasn't too sure on how to deal with the DUNS numbers. Typically for DUNS, I use true numbers formatted as 000000000
or 000000000-0000
(the latter for DUNS+4) as numerical lookups are faster and more versatile than text-based lookups. However, for this I used forced text with leading zeroes to make 9 digit placeholders. The productid numbers were similarly forced into text with a '
Range.PrefixCharacter property so you didn't end up with some as text-that-look-like-numbers (multiple) and some as true numbers (single).
I have also retained the country codes and augmented them to the three letter ISO 3166-1 alpha-3 standard. It didn't make sense to have the country codes as one of the unique criteria and then discard them from the results.
1 Setting Require Variable Declaration within the VBE's Tools ► Options ► Editor property page will put the Option Explicit statement at the top of each newly created code sheet. This will avoid silly coding mistakes like misspellings as well as influencing you to use the correct variable type in the variable declaration. Variables created on-the-fly without declaration are all of the variant/object type. Using Option Explicit is widely considered 'best practice'.
2 If you convert the late binding of the Scripting.Dictionary object to early binding, you must add Microsoft Scripting Runtime to the VBE's Tools ► References.
3 Tests were performed on a five-year-old business class i5 laptop with 8 Gbs of DRAM and Excel 2010 version 14.0.7166.5000 (32-bit). To my mind, that is the low end of the scale on which an operation like this should be performed.
-
\$\begingroup\$ Hi @Jeeped, i am gonna try this tomorrow. However i understood the logic and you are the only one who gave me such good and precious advise for what i was looking for. thanks again i will tell you tomorrow the result! you're awesome! \$\endgroup\$manu– manu2016年02月21日 18:09:01 +00:00Commented Feb 21, 2016 at 18:09
The first thing to do before making any changes to this code, is to improve its readability. I saw .Cells
and thought "oh we're in a With block" ...and then had to look three times to find the With
statement.
The keyword here, is indentation.
Sub DoSomething()
....
....With SomeObject
....|...
....|...If SomeCondition Then
....|...|...DoActionOne
....|...Else
....|...|...DoActionTwo
....|...End If
....|...
....|...Do
....|...|...DoActionThree
....|...Loop
....|...
....End With
....
....For i = 1 To 10
....|....
....|....DoActionFour
....|....
....Next
....
End Sub
When the Else
blocks don't line up with the corresponding If
statement, or when a Loop
keyword doesn't line up with its corresponding Do
keyword, or when nested blocks line up in column 1, you basically set yourself up for making a change that introduces a bug.
Proper indentation cannot be underestimated.
You have redundant object references:
- If
WB.Sheets("RawData")
has CodeNameSheet1
, name itRawDataSheet
and use that reference instead. - If
WB.Sheets("Conso")
has CodeNameSheet2
, name itConsoSheet
and use that reference instead.
"CodeName" is a property of all sheet objects in Excel VBA; there's a global object reference pointing to these, readily available for you to use - no need to fetch it from WB.Sheets
collection, which by the way could give you non-worksheet objects, since the Sheets
collection includes Charts, among other sheet types. You probably meant to use the WB.Worksheets
collection instead. But then again, you don't need it - just use the global object VBA gives you for free instead.
Application.ScreenUpdating = False
Whenever you set that value to False
, you need to handle runtime errors and make sure that whatever happens, the method can't exit without setting it back to True
. Otherwise Excel will look "frozen" when it's actually completely responsive, just not redrawing itself... because you told it not to.
Don't assume things won't blow up. They always do.
Sub DoSomething()
On Error GoTo CleanFail
Application.ScreenUpdating = False
'...
CleanExit:
Application.ScreenUpdating = True
Exit Sub
CleanFail:
'handle errors
Resume CleanExit
End Sub
Now that's great, but you're deleting rows, which should trigger a recalculation if calculation is set to xlAutomatic
. Consider not only turning off ScreenUpdating
, but also setting Calculation
to xlManual
while you're doing your thing.
I'm going to throw an answer out here as well. First things first, I agree with Mat's Mug about using codenames, but I'm going to leave it as is for my answer, out of simplicity. Same goes for indentation and mostly everything else.
Out of personal preference, I don't like variables all defined on the same row - it hinders my ability to go see the variables easily. I also like to define them as soon as I dim them.
I would turn this
Dim WB As Workbook
Dim wsRawData As Worksheet, wsConso As Worksheet
Dim i As Long, j As Long, Lastrow As Long, LastrowConso as Long
Dim SupNameToCheck As String, ConsoSupplierDUNS As String, ConsoSupplierMDM As String, ConsoSupplierNAME As String
Set WB = ThisWorkbook
Set wsRawData = WB.Sheets("RawData")
Set wsConso = WB.Sheets("Conso")
Lastrow = wsRawData.Range("A" & Rows.Count).End(xlUp).Row
Into this:
Dim WB As Workbook
Set WB = ThisWorkbook
Dim wsRawData As Worksheet
Set wsRawData = WB.Sheets("RawData")
Dim wsConso As Worksheet
Set wsConso = WB.Sheets("Conso")
Dim i As Long
Dim j As Long
Dim lastRow As Long
lastRow = wsRawData.Range("A" & Rows.Count).End(xlUp).Row
Dim LastrowConso As Long
Dim supNameToCheck As String
Dim consoSupplierDUNS As String
Dim consoSupplierMDM As String
Dim consoSupplierNAME As String
Now it's easier to read and you'll notice Standard VBA naming conventions have camelCase
for local variables and PascalCase
for other variables and names.
Speaking of variables names - give them meaning. You've done a good job of this except for i
and j
. Now, those are acceptable in most circles, so I can't say it's wrong, but why not use something like rowNumber
? And why use j
? You already have i
and lastRow
defined.
Going into your With
and Do
loops has me backtracking all over the place trying to figure out what's going on with the i
and the j
and the weird spacing. This first Do Loop
is just for populating your strings, right?
Seems to me that sorting your sheet would reduce your loop, so
wsRawData.Columns("A:Z").Sort key1:=Range("A:A"), order1:=xlAscending, key2:=Range("D:D"), order2:=xlAscending, Header:=xlYes
Now we're ordered by column A and D, so we just need to start at the bottom and go up:
For rowNumber = lastRow To 2 Step -1
Now you check for duplicates based on columns A and D? (削除) But your example is just finding A duplicates and combining the other columns, isn't it? (削除ここまで) An easier way to do that would be like this (unless I'm misunderstanding):
For rowNumber = lastRow To 2 Step -1
If Cells(rownumber, a) = Cells(rownumber - 1, a) And Cells(rownumber, 4) = Cells(rownumber - 1, 4) Then
Cells(lastRow - 1, 2) = Cells(lastRow, 2) & ", " & Cells(lastRow - 1, 2)
Cells(lastRow - 1, 3) = Cells(lastRow, 3) & ", " & Cells(lastRow - 1, 3)
Rows(rownumber).EntireRow.Delete
End If
Next
Loop through the rows only once and if two criteria match, combine the others. Now you only need 1 loop and you don't need those strings or anything. Cutting a loop out as well as what Mat spoke of will really help your speed.
I also got with of the With
block - just use your worksheet variables in your ranges.
I can't quite figure out what's happening at the bottom with the other worksheet, so I'm not addressing it. You could probably bring it into your single loop as well.
Have you considered pushing all this data into an Access database, then using a query to identify the duplicates? That seems to me to be the right tool for this job.