I wrote this to scrape album review data from AOTY into a spreadsheet. Check it out and let me know what I could've done better.
Option Explicit
Const classAlbum As String = "listLargeTitle"
Const classScore As String = "listScoreContainer"
Const classRating As String = "listScoreValueContainer"
Const classReview As String = "listScoreText"
Const classNext As String = "pageSelect"
Sub ScrapeAOTY()
Dim IE As MSXML2.XMLHTTP60
Dim HTMLDoc As MSHTML.HTMLDocument
Dim HTMLBody As MSHTML.HTMLBody
Dim Albums As Object
Dim Ratings As Object
Dim Reviews As Object
Dim MetaData As Object
Dim InnerMeta As Object
Dim nextPage As Object
Dim selection As Object
Dim sheet As Worksheet
Dim aotyYear As String
Dim url As String
Dim bookmark As String
Dim album As String
Dim review As String
Dim releaseDate As String
Dim genre As String
Dim test As String
Dim rating As Double
Dim index As Long
Dim row As Long
Dim page As Long
Dim pageLoading As Boolean
Dim exitFlag As Boolean
Application.ScreenUpdating = False
Application.Calculation = xlCalculationManual
aotyYear = InputBox("Input the year you're scraping")
If aotyYear < 1900 Then Exit Sub
url = "http://www.albumoftheyear.org/ratings/6-highest-rated/" & aotyYear & "/1"
Set sheet = ThisWorkbook.Worksheets("ScrapeSheet")
row = 2
exitFlag = False
page = 1
Do
Set IE = New MSXML2.XMLHTTP60
url = "http://www.albumoftheyear.org/ratings/6-highest-rated/" & aotyYear & "/" & page
IE.Open "GET", url, False
IE.send
Do While IE.ReadyState <> 4
DoEvents
Loop
Set HTMLDoc = New MSHTML.HTMLDocument
Set HTMLBody = HTMLDoc.body
HTMLBody.innerHTML = IE.responseText
Set Albums = HTMLDoc.getElementsByClassName(classAlbum)
Set Ratings = HTMLDoc.getElementsByClassName(classRating)
Set Reviews = HTMLDoc.getElementsByClassName(classReview)
Set MetaData = HTMLDoc.getElementsByClassName(classScore)
For index = 0 To Albums.Length - 1
Set InnerMeta = MetaData(index).parentElement.parentElement
Set InnerMeta = InnerMeta.getElementsByTagName("div")
album = Albums(index).innerText
rating = CDbl(Ratings(index).getAttribute("title"))
review = Reviews(index).innerText
releaseDate = InnerMeta(0).innerText
If InnerMeta.Length < 10 Then
genre = ""
Else
genre = InnerMeta(1).innerText
End If
sheet.Cells(row, 1) = Mid(album, InStr(1, album, " ") + 1, InStr(1, album, " - ") - InStr(1, album, ".") - 2)
sheet.Cells(row, 2) = Right(album, Len(album) - InStr(1, album, " - ") - 2)
sheet.Cells(row, 3) = rating
sheet.Cells(row, 4) = Left(review, Len(review) - 8)
sheet.Cells(row, 5) = releaseDate
sheet.Cells(row, 6) = genre
row = row + 1
Next index
Set nextPage = HTMLDoc.getElementsByClassName(classNext)
If nextPage.Length = 2 _
Or nextPage(0).innerText = "Next >" Then
page = page + 1
Else
exitFlag = True
End If
Set IE = Nothing
Loop Until exitFlag = True
MsgBox ("Done!")
Application.ScreenUpdating = True
Application.Calculation = xlCalculationAutomatic
End Sub
-
4\$\begingroup\$ I wouldn't name a variable "sheet" or "selection" \$\endgroup\$Raystafarian– Raystafarian2015年12月02日 16:49:42 +00:00Commented Dec 2, 2015 at 16:49
3 Answers 3
I can only point out cosmetic issues (I don't know sqat about VBA), I hope another reviewer will pass by and cover other aspects.
Unused variables
These variables seem to be unused, so why not remove them:
Dim selection As Object
Dim bookmark As String
Dim test As String
Dim pageLoading As Boolean
Magic string
This string appears twice:
"http://www.albumoftheyear.org/ratings/6-highest-rated/"
It would be better to define it one place, so you can change it one place if needed.
Readability
This block:
aotyYear = InputBox("Input the year you're scraping") If aotyYear < 1900 Then Exit Sub url = "http://www.albumoftheyear.org/ratings/6-highest-rated/" & aotyYear & "/1" Set sheet = ThisWorkbook.Worksheets("ScrapeSheet") row = 2 exitFlag = False page = 1
Would be more readable if you break a line after the early return:
aotyYear = InputBox("Input the year you're scraping")
If aotyYear < 1900 Then Exit Sub
url = "http://www.albumoftheyear.org/ratings/6-highest-rated/" & aotyYear & "/1"
Set sheet = ThisWorkbook.Worksheets("ScrapeSheet")
row = 2
exitFlag = False
page = 1
It might be a matter of taste, but I think this would be more readable if the condition was not split, on a single line.
If nextPage.Length = 2 _ Or nextPage(0).innerText = "Next >" Then page = page + 1 Else exitFlag = True End If
-
2\$\begingroup\$ Good catch with the unused/unassigned variables - the only thing Rubberduck picks up that you didn't (apart from a false positive with
sheet
being reported as not used), is that theScrapeAOTY
member is implicitly public. Note that theselection
local variable hides theSelection
object defined by the Excel object model. Nice answer! \$\endgroup\$Mathieu Guindon– Mathieu Guindon2015年12月02日 19:39:20 +00:00Commented Dec 2, 2015 at 19:39
First off, I just want to say that's a pretty solid program there. Compared to most reviews I do, most of these points are fairly minor. That said, let's begin.
Naming
Don't ever use protected names for your variables. Protected names are names that are already used by the VBA object model, such as selection
or sheet
. They can and will cause chaos and confusion down the road, not to mention the mistakes the compiler might make if it gets confused.
Your naming is not consistent. Some variables have leading capitals, some do not.
The general convention for VBA is:
camelCase
for procedure-Level Variables
PascalCase
for Module and Global-Level variables
SHOUTY_SNAKE_CASE
for constants
you don't have to use these specifically, so long as you do pick a convention and apply it consistently.
Other than that, there are a couple of variable names that could be even clearer, but your naming is pretty solid. The variables I think could be clearer: pageLoading
--> pageIsLoading
, page
--> pageNum
, IE
to something much clearer. I know, it's obvious here what it is, but variable names should always be able to stand on their own without additional context.
Other bits and pieces
I personally prefer to put my loop conditions at the start of my loops rather than the end. The end can be much easier to miss and this way, it means you know what to look out for while you're reading through it.
You may want to dim similar variables on the same line to make it less wall-of-text-like. Like so:
dim albums as object, ratings as object, reviews as object
-
2\$\begingroup\$ Good points, but determining where to put the condition on a loop very much comes down to how you want to use it. A loop that you know you want to run 1+ times, is naturally checked at the end. For loops that run 0+ times a check initially is natural. Since the OPs code actually determines whether or not to continue based upon data obtained within the loop a check at the end makes more sense. \$\endgroup\$Daniel– Daniel2015年12月04日 14:34:48 +00:00Commented Dec 4, 2015 at 14:34
All the major things that can be changed to give your scraper a better look have already been pointed out. However, few things you should take into consideration:
It doesn't require to use the same
Link
more than once; rather, put it in a variable and reuse it.If any number given in the InputBox is greater than 2017 then it will fail miserably or give the wrong results. Moreover, if anyone runs your code and decides instantly to quit or any bad data is given or
OK
button is pressed without feeding anything to the InputBox then it will throw error.You don't need to use
Do Loop
here because theFor Loop
is sufficient to accomplish the task.You didn't make your scraper in such a way so that it can handle errors if any data is missing.
And lastly, how about creating a header? If your scraper is made to run the headers will always be there.
Implementing all the things I've said here, your parser can get a different appearance like below:
Sub Aoty_Scraper()
Const main_link As String = "http://www.albumoftheyear.org/ratings/6-highest-rated/"
Dim http As New XMLHTTP60, html As New HTMLDocument, htm As New HTMLDocument
Dim post As HTMLHtmlElement, page As Object, next_page As String
Dim row As Long, inputyear As Variant
Application.ScreenUpdating = False
Application.Calculation = xlCalculationManual
inputyear = InputBox("Input any year between 1900 and this year")
If inputyear < 1900 Or inputyear > 2017 Then Exit Sub
[A1:D1] = [{"Album/Singer","Title","Score","Review"}]
With http
.Open "GET", main_link & inputyear & "/1", False
.send
html.body.innerHTML = .responseText
End With
For Each page In html.getElementsByClassName("smallBottomLink")
next_page = page.innerText
With http
.Open "GET", main_link & inputyear & "/" & next_page, False
.send
htm.body.innerHTML = .responseText
End With
For Each post In htm.getElementsByClassName("albumListRow")
With post.getElementsByClassName("listLargeTitle")(0).getElementsByTagName("a")
If .Length Then row = row + 1: Cells(row + 1, 1) = Split(.item(0).innerText, "-")(0)
End With
With post.getElementsByClassName("listLargeTitle")(0).getElementsByTagName("a")
If .Length Then Cells(row + 1, 2) = Split(.item(0).innerText, "-")(1)
End With
With post.getElementsByClassName("listScoreValue")
If .Length Then Cells(row + 1, 3) = .item(0).innerText
End With
With post.getElementsByClassName("listScoreText")
If .Length Then Cells(row + 1, 4) = Split(.item(0).innerText, " ")(0)
End With
Next post
Next page
MsgBox "It's done"
Application.ScreenUpdating = True
Application.Calculation = xlCalculationAutomatic
End Sub