Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 0aaf085

Browse files
Make Scraper faster
1 parent b3463e2 commit 0aaf085

File tree

1 file changed

+14
-18
lines changed

1 file changed

+14
-18
lines changed

‎src/main/java/Scraper.java

Lines changed: 14 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
import org.openqa.selenium.edge.EdgeDriver;
1111

1212
import java.io.FileWriter;
13-
import java.io.IOException;
1413
import java.nio.file.Files;
1514
import java.nio.file.Path;
1615
import java.nio.file.Paths;
@@ -50,16 +49,26 @@ public void setup() throws InterruptedException {
5049
companyURLs.add(link);
5150
}
5251

53-
for (String companyURL : companyURLs) {
52+
for (int i = 0; i < companyURLs.size(); i++) {
53+
String companyURL = companyURLs.get(i);
5454
String companyName = extractCompanyNameWithRegex(companyURL);
5555
for (String recency : new String[]{"thirty-days", "three-months", "six-months", "more-than-six-months", "all"}) {
5656
visitCompanies(String.format("https://leetcode.com/company/%s/?favoriteSlug=%s-%s", companyName, companyName, recency), driver, recency);
5757
}
58+
log.info("Pending {} companies", companyURLs.size() - i - 1);
5859
}
5960
}
6061

6162
private void visitCompanies(String companyURL, WebDriver driver, String recency) throws InterruptedException {
6263
String companyName = extractCompanyNameWithRegex(companyURL);
64+
65+
// Skip this company/recency pair if its output file (companyName/<recency>.csv) already exists
66+
Path outputDir = Paths.get(companyName, String.format("%s.csv", recency));
67+
if (Files.exists(outputDir)) {
68+
log.info("File already exists: {}", outputDir.toAbsolutePath());
69+
return;
70+
}
71+
6372
log.info("Visiting {} with recency {}", companyName, recency);
6473
driver.get(companyURL);
6574
Thread.sleep(QUESTIONS_PAGE_WAIT_MILLIS);
@@ -89,38 +98,25 @@ private String extractCompanyNameWithRegex(String companyURL) {
8998
private static void loadAllProblems(WebDriver driver) {
9099
JavascriptExecutor js = (JavascriptExecutor) driver;
91100
int maxScrolls = 30; // Maximum number of scrolls
92-
int consecutiveNoChange = 0; // Counter for consecutive scrolls with no new content
93-
int maxConsecutiveNoChange = 3; // Stop after 3 consecutive scrolls with no new content
94101

95102
log.info("Loading all problems by scrolling...");
96103

97104
for (int i = 0; i < maxScrolls; i++) {
98105
int currentCount = driver.findElements(By.cssSelector("a[href*='/problems/'][id]")).size();
99-
100106
boolean scrolled = performScroll(driver, js);
101-
102107
if (!scrolled) {
103-
log.info("Scroll " + (i + 1) + ": Unable to scroll further, stopping.");
108+
log.info("Scroll {}: Unable to scroll further, stopping.", i + 1);
104109
break;
105110
}
106111

107112
// Check if new problems were loaded
108113
int newCount = driver.findElements(By.cssSelector("a[href*='/problems/'][id]")).size();
109-
110114
if (newCount > currentCount) {
111115
// New content loaded
112-
consecutiveNoChange = 0;
113-
log.info("Scroll " + (i + 1) + ": Found " + newCount + " problems (+" + (newCount - currentCount) + " new)");
116+
log.info("Scroll {}: Found {} problems (+{} new)", i + 1, newCount, newCount - currentCount);
114117
} else {
115118
// No new content
116-
consecutiveNoChange++;
117-
log.info("Scroll " + (i + 1) + ": No new problems loaded (" + consecutiveNoChange + "/" + maxConsecutiveNoChange + ")");
118-
119-
// If we've had consecutive scrolls with no new content, assume we've reached the end
120-
if (consecutiveNoChange >= maxConsecutiveNoChange) {
121-
log.info("No new content loaded after " + maxConsecutiveNoChange + " consecutive scrolls. Assuming all problems are loaded.");
122-
break;
123-
}
119+
break;
124120
}
125121
}
126122
log.info("Finished loading problems.");

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /