1
1
import io .github .bonigarcia .wdm .WebDriverManager ;
2
+ import lombok .extern .slf4j .Slf4j ;
2
3
import model .ProblemStatement ;
3
4
import org .jsoup .Jsoup ;
4
5
import org .jsoup .nodes .Document ;
12
13
import java .util .ArrayList ;
13
14
import java .util .List ;
14
15
16
+ @ Slf4j
15
17
public class Scraper {
16
18
private static final String USERNAME = "" ; // Provide your LeetCode username/email
17
19
private static final String PASSWORD = "" ; // Provide your LeetCode password
18
- public static final int QUESTIONS_PAGE_WAIT_MILLIS = 10000 ;
19
- public static final int LOGIN_PAGE_WAIT_MILLIS = 2000 ;
20
+ public static final int QUESTIONS_PAGE_WAIT_MILLIS = 5000 ;
21
+ public static final int LOGIN_PAGE_WAIT_MILLIS = 30000 ;
20
22
WebDriver driver ;
21
- List <String > companyURLs = new ArrayList <>();
22
23
23
24
public void setup () throws InterruptedException , IOException {
24
25
WebDriverManager .edgedriver ().setup ();
@@ -37,98 +38,45 @@ public void setup() throws InterruptedException, IOException {
37
38
// List<WebElement> companies = driver.findElements(By.cssSelector(".mb-4.mr-3"));
38
39
// for (WebElement company : companies) {
39
40
// String link = company.getAttribute("href");
40
- // System.out.println (link);
41
+ // log.info (link);
41
42
// companyURLs.add(link);
42
43
// }
43
44
// for (String companyURL : companyURLs) {
44
- visitCompanies ("https://leetcode.com/company/amazon/?favoriteSlug=amazon-all" , driver );
45
+ visitCompanies ("https://leetcode.com/company/amazon/?favoriteSlug=amazon-all" , driver );
45
46
// }
46
47
}
47
48
48
- private void visitCompanies (String companyURL , WebDriver driver ) throws InterruptedException , IOException {
49
+ private void visitCompanies (String companyURL , WebDriver driver ) throws InterruptedException {
49
50
String companyName = companyURL .substring (companyURL .lastIndexOf ("/" ) + 1 );
50
- System . out . println ("Visiting " + companyURL );
51
- this . driver .get (companyURL );
51
+ log . info ("Visiting {}" , companyName );
52
+ driver .get (companyURL );
52
53
Thread .sleep (QUESTIONS_PAGE_WAIT_MILLIS );
53
54
loadAllProblems (driver );
54
- // Get the page source and parse with Jsoup
55
- String pageSource = this .driver .getPageSource ();
55
+ String pageSource = driver .getPageSource ();
56
56
Document doc = Jsoup .parse (pageSource );
57
57
List <ProblemStatement > problems = extractProblems (doc );
58
- // Print results
59
- System . out . println ("Extracted " + problems .size () + " problems:" );
58
+
59
+ log . info ("Extracted {} problems from" , problems .size ());
60
60
for (ProblemStatement problem : problems ) {
61
- System . out . println ( problem );
61
+ log . info ( "{}" , problem );
62
62
}
63
63
}
64
64
65
- // private void visitCompanies(String companyURL) throws InterruptedException, IOException {
66
- // String companyName = companyURL.substring(companyURL.lastIndexOf("/") + 1);
67
- // System.out.println("Visiting " + companyURL);
68
- // driver.get(companyURL);
69
- // Thread.sleep(QUESTIONS_PAGE_WAIT_MILLIS); // Wait for the page to load, for companies like Google/Amazon, it takes a lot of time
70
- // String table = "";
71
- // try {
72
- // table = "<table>" + driver.findElement(By.className("table")).getAttribute("innerHTML") + "</table>";
73
- // } catch (NoSuchElementException ex) {
74
- // Thread.sleep(30000);
75
- // driver.get(companyURL);
76
- // Thread.sleep(30000);
77
- // table = "<table>" + driver.findElement(By.className("table")).getAttribute("innerHTML") + "</table>";
78
- // }
79
- // Document doc = Jsoup.parse(table); // parse the table html content
80
- // List<String[]> result = new ArrayList<>();
81
- // String[] header = new String[]{"ID", "Title", "URL", "Is Premium", "Acceptance %", "Difficulty", "Frequency %"};
82
- // result.add(header);
83
- // for (Element row : doc.getElementsByTag("tr")) {
84
- // Elements cols = row.getElementsByTag("td");
85
- // int size = cols.size();
86
- // if (size != 0) { // for <th> size would be 0
87
- // String id = cols.get(1).text();
88
- // String title = cols.get(2).text();
89
- // Elements href = cols.get(2).getElementsByAttribute("href");
90
- // boolean isPremium = !cols.get(2).getElementsByTag("i").isEmpty();
91
- // String problemUrl = href.get(0).attr("href");
92
- // String acceptance = cols.get(3).text();
93
- // String difficulty = cols.get(4).getElementsByTag("span").text();
94
- // String frequency = cols.get(5).getElementsByClass("progress-bar").attr("style");
95
- // // sample response "width: 29.1345%";
96
- // frequency = frequency.substring(frequency.indexOf(" ") + 1); // get the value after the first whitespace
97
- // String[] res = new String[]{id, title, problemUrl, isPremium ? "Y" : "N", acceptance, difficulty, frequency};
98
- // result.add(res);
99
- // }
100
- // }
101
- // try (CSVWriter csvWriter = new CSVWriter(new FileWriter(companyName + ".csv"))) {
102
- // csvWriter.writeAll(result);
103
- // }
104
- // }
105
-
106
65
public static void loadAllProblems (WebDriver driver ) {
107
66
JavascriptExecutor js = (JavascriptExecutor ) driver ;
108
67
int maxScrolls = 30 ; // Maximum number of scrolls
109
- int scrollDelay = 10 ; // Delay between scrolls in seconds
110
68
int consecutiveNoChange = 0 ; // Counter for consecutive scrolls with no new content
111
69
int maxConsecutiveNoChange = 3 ; // Stop after 3 consecutive scrolls with no new content
112
70
113
- System . out . println ("Loading all problems by scrolling..." );
71
+ log . info ("Loading all problems by scrolling..." );
114
72
115
73
for (int i = 0 ; i < maxScrolls ; i ++) {
116
- // Get current number of problem links
117
74
int currentCount = driver .findElements (By .cssSelector ("a[href*='/problems/'][id]" )).size ();
118
75
119
- // Try multiple scrolling methods to ensure scrolling works
120
76
boolean scrolled = performScroll (driver , js );
121
77
122
78
if (!scrolled ) {
123
- System .out .println ("Scroll " + (i + 1 ) + ": Unable to scroll further, stopping." );
124
- break ;
125
- }
126
-
127
- // Wait for new content to load
128
- try {
129
- Thread .sleep (scrollDelay * 1000 );
130
- } catch (InterruptedException e ) {
131
- Thread .currentThread ().interrupt ();
79
+ log .info ("Scroll " + (i + 1 ) + ": Unable to scroll further, stopping." );
132
80
break ;
133
81
}
134
82
@@ -138,96 +86,43 @@ public static void loadAllProblems(WebDriver driver) {
138
86
if (newCount > currentCount ) {
139
87
// New content loaded
140
88
consecutiveNoChange = 0 ;
141
- System . out . println ("Scroll " + (i + 1 ) + ": Found " + newCount + " problems (+" + (newCount - currentCount ) + " new)" );
89
+ log . info ("Scroll " + (i + 1 ) + ": Found " + newCount + " problems (+" + (newCount - currentCount ) + " new)" );
142
90
} else {
143
91
// No new content
144
92
consecutiveNoChange ++;
145
- System . out . println ("Scroll " + (i + 1 ) + ": No new problems loaded (" + consecutiveNoChange + "/" + maxConsecutiveNoChange + ")" );
93
+ log . info ("Scroll " + (i + 1 ) + ": No new problems loaded (" + consecutiveNoChange + "/" + maxConsecutiveNoChange + ")" );
146
94
147
95
// If we've had consecutive scrolls with no new content, assume we've reached the end
148
96
if (consecutiveNoChange >= maxConsecutiveNoChange ) {
149
- System . out . println ("No new content loaded after " + maxConsecutiveNoChange + " consecutive scrolls. Assuming all problems are loaded." );
97
+ log . info ("No new content loaded after " + maxConsecutiveNoChange + " consecutive scrolls. Assuming all problems are loaded." );
150
98
break ;
151
99
}
152
100
}
153
101
}
154
-
155
- System .out .println ("Finished loading problems." );
102
+ log .info ("Finished loading problems." );
156
103
}
157
104
158
- /**
159
- * Performs scrolling using multiple methods to ensure it works
160
- * @param driver WebDriver instance
161
- * @param js JavascriptExecutor instance
162
- * @return true if scrolling was performed, false otherwise
163
- */
164
105
private static boolean performScroll (WebDriver driver , JavascriptExecutor js ) {
165
106
try {
166
- // Get current scroll position
167
- long currentScrollY = (Long ) js .executeScript ("return window.scrollY || window.pageYOffset;" );
168
-
169
- // Method 1: Scroll to document height
170
- js .executeScript ("window.scrollTo(0, document.body.scrollHeight);" );
171
- Thread .sleep (1000 ); // Short wait
107
+ // Find the specific element to scroll inside
108
+ WebElement scrollElement = driver .findElement (By .xpath ("/html/body/div[1]/div[1]/div[4]/div/div[2]" ));
172
109
173
- // Check if we actually scrolled
174
- long newScrollY = (Long ) js .executeScript ("return window.scrollY || window.pageYOffset;" );
175
- if (newScrollY > currentScrollY ) {
176
- return true ;
177
- }
110
+ // Scroll down by 5000 pixels inside the element
111
+ js .executeScript ("arguments[0].scrollTop = arguments[0].scrollHeight;" , scrollElement );
112
+ Thread .sleep (3000 );
178
113
179
- // Method 2: Try document.documentElement.scrollHeight
180
- js .executeScript ("window.scrollTo(0, document.documentElement.scrollHeight);" );
181
- Thread .sleep (1000 );
182
-
183
- newScrollY = (Long ) js .executeScript ("return window.scrollY || window.pageYOffset;" );
184
- if (newScrollY > currentScrollY ) {
185
- return true ;
186
- }
187
-
188
- // Method 3: Scroll by a large amount
189
- js .executeScript ("window.scrollBy(0, 3000);" );
190
- Thread .sleep (1000 );
191
-
192
- newScrollY = (Long ) js .executeScript ("return window.scrollY || window.pageYOffset;" );
193
- if (newScrollY > currentScrollY ) {
194
- return true ;
195
- }
196
-
197
- // Method 4: Try scrolling the main container (common in SPAs)
198
- js .executeScript (
199
- "const containers = document.querySelectorAll('[class*=\" scroll\" ], [class*=\" overflow\" ], main, .main-content, #main');" +
200
- "for (let container of containers) {" +
201
- " if (container.scrollHeight > container.clientHeight) {" +
202
- " container.scrollTop = container.scrollHeight;" +
203
- " break;" +
204
- " }" +
205
- "}"
206
- );
207
- Thread .sleep (1000 );
208
-
209
- newScrollY = (Long ) js .executeScript ("return window.scrollY || window.pageYOffset;" );
210
- if (newScrollY > currentScrollY ) {
211
- return true ;
212
- }
213
-
214
- // Method 5: End key simulation
215
- js .executeScript ("document.body.dispatchEvent(new KeyboardEvent('keydown', {key: 'End'}));" );
216
- Thread .sleep (1000 );
217
-
218
- newScrollY = (Long ) js .executeScript ("return window.scrollY || window.pageYOffset;" );
219
- return newScrollY > currentScrollY ;
114
+ return true ;
220
115
221
116
} catch (Exception e ) {
222
- System . err . println ("Error during scrolling: " + e . getMessage () );
117
+ log . error ("Error during scrolling: " , e );
223
118
return false ;
224
119
}
225
120
}
226
121
227
122
public static List <ProblemStatement > extractProblems (Document doc ) {
228
123
List <ProblemStatement > problems = new ArrayList <>();
229
124
230
- // Find all problem links (assuming they are anchor tags with specific pattern)
125
+ // Find all problem links
231
126
Elements problemLinks = doc .select ("a[href*='/problems/'][id]" );
232
127
233
128
for (Element link : problemLinks ) {
@@ -237,7 +132,7 @@ public static List<ProblemStatement> extractProblems(Document doc) {
237
132
238
133
// Extract URL from href attribute
239
134
String url = link .attr ("href" );
240
- // Remove query parameters if needed
135
+ // Remove query parameters
241
136
if (url .contains ("?" )) {
242
137
url = url .substring (0 , url .indexOf ("?" ));
243
138
}
@@ -255,9 +150,13 @@ public static List<ProblemStatement> extractProblems(Document doc) {
255
150
256
151
// Extract acceptance percentage
257
152
String acceptancePercentage = "" ;
258
- Element acceptanceElement = link .select ("div.text-sd-muted-foreground" ).first ();
259
- if (acceptanceElement != null ) {
260
- acceptancePercentage = acceptanceElement .text ().trim ();
153
+ Elements candidates = link .select ("div.text-sd-muted-foreground" );
154
+ for (Element candidate : candidates ) {
155
+ String text = candidate .text ().trim ();
156
+ if (text .contains ("%" )) {
157
+ acceptancePercentage = text ;
158
+ break ;
159
+ }
261
160
}
262
161
263
162
// Extract difficulty based on CSS class
@@ -276,7 +175,7 @@ public static List<ProblemStatement> extractProblems(Document doc) {
276
175
277
176
// Extract frequency percentage by counting orange divs (exclude those with opacity)
278
177
String frequencyPercentage = "" ;
279
- Elements orangeDivs = link .select ("div. bg-brand-orange. h-2. w-0\\ .5. rounded" );
178
+ Elements orangeDivs = link .select ("div[class*=' bg-brand-orange'][class*=' h-2'][class*=' w-0.5'][class*=' rounded'] " );
280
179
if (!orangeDivs .isEmpty ()) {
281
180
int validCount = 0 ;
282
181
for (Element div : orangeDivs ) {
@@ -298,8 +197,7 @@ public static List<ProblemStatement> extractProblems(Document doc) {
298
197
}
299
198
300
199
} catch (Exception e ) {
301
- System .err .println ("Error processing problem element: " + e .getMessage ());
302
- continue ;
200
+ log .error ("Error processing problem element: " , e );
303
201
}
304
202
}
305
203
0 commit comments