Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 8afa5dd

Browse files
Working draft for one company
1 parent 89574ef commit 8afa5dd

File tree

1 file changed

+37
-139
lines changed

1 file changed

+37
-139
lines changed

‎src/main/java/Scraper.java

Lines changed: 37 additions & 139 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import io.github.bonigarcia.wdm.WebDriverManager;
2+
import lombok.extern.slf4j.Slf4j;
23
import model.ProblemStatement;
34
import org.jsoup.Jsoup;
45
import org.jsoup.nodes.Document;
@@ -12,13 +13,13 @@
1213
import java.util.ArrayList;
1314
import java.util.List;
1415

16+
@Slf4j
1517
public class Scraper {
1618
private static final String USERNAME = ""; // Provide your LeetCode username/email
1719
private static final String PASSWORD = ""; // Provide your LeetCode password
18-
public static final int QUESTIONS_PAGE_WAIT_MILLIS = 10000;
19-
public static final int LOGIN_PAGE_WAIT_MILLIS = 2000;
20+
public static final int QUESTIONS_PAGE_WAIT_MILLIS = 5000;
21+
public static final int LOGIN_PAGE_WAIT_MILLIS = 30000;
2022
WebDriver driver;
21-
List<String> companyURLs = new ArrayList<>();
2223

2324
public void setup() throws InterruptedException, IOException {
2425
WebDriverManager.edgedriver().setup();
@@ -37,98 +38,45 @@ public void setup() throws InterruptedException, IOException {
3738
// List<WebElement> companies = driver.findElements(By.cssSelector(".mb-4.mr-3"));
3839
// for (WebElement company : companies) {
3940
// String link = company.getAttribute("href");
40-
// System.out.println(link);
41+
// log.info(link);
4142
// companyURLs.add(link);
4243
// }
4344
// for (String companyURL : companyURLs) {
44-
visitCompanies("https://leetcode.com/company/amazon/?favoriteSlug=amazon-all", driver);
45+
visitCompanies("https://leetcode.com/company/amazon/?favoriteSlug=amazon-all", driver);
4546
// }
4647
}
4748

48-
private void visitCompanies(String companyURL, WebDriver driver) throws InterruptedException, IOException {
49+
private void visitCompanies(String companyURL, WebDriver driver) throws InterruptedException {
4950
String companyName = companyURL.substring(companyURL.lastIndexOf("/") + 1);
50-
System.out.println("Visiting " + companyURL);
51-
this.driver.get(companyURL);
51+
log.info("Visiting {}", companyName);
52+
driver.get(companyURL);
5253
Thread.sleep(QUESTIONS_PAGE_WAIT_MILLIS);
5354
loadAllProblems(driver);
54-
// Get the page source and parse with Jsoup
55-
String pageSource = this.driver.getPageSource();
55+
String pageSource = driver.getPageSource();
5656
Document doc = Jsoup.parse(pageSource);
5757
List<ProblemStatement> problems = extractProblems(doc);
58-
// Print results
59-
System.out.println("Extracted " + problems.size() + " problems:");
58+
59+
log.info("Extracted {} problems from", problems.size());
6060
for (ProblemStatement problem : problems) {
61-
System.out.println(problem);
61+
log.info("{}", problem);
6262
}
6363
}
6464

65-
// private void visitCompanies(String companyURL) throws InterruptedException, IOException {
66-
// String companyName = companyURL.substring(companyURL.lastIndexOf("/") + 1);
67-
// System.out.println("Visiting " + companyURL);
68-
// driver.get(companyURL);
69-
// Thread.sleep(QUESTIONS_PAGE_WAIT_MILLIS); // Wait for the page to load, for companies like Google/Amazon, it takes a lot of time
70-
// String table = "";
71-
// try {
72-
// table = "<table>" + driver.findElement(By.className("table")).getAttribute("innerHTML") + "</table>";
73-
// } catch (NoSuchElementException ex) {
74-
// Thread.sleep(30000);
75-
// driver.get(companyURL);
76-
// Thread.sleep(30000);
77-
// table = "<table>" + driver.findElement(By.className("table")).getAttribute("innerHTML") + "</table>";
78-
// }
79-
// Document doc = Jsoup.parse(table); // parse the table html content
80-
// List<String[]> result = new ArrayList<>();
81-
// String[] header = new String[]{"ID", "Title", "URL", "Is Premium", "Acceptance %", "Difficulty", "Frequency %"};
82-
// result.add(header);
83-
// for (Element row : doc.getElementsByTag("tr")) {
84-
// Elements cols = row.getElementsByTag("td");
85-
// int size = cols.size();
86-
// if (size != 0) { // for <th> size would be 0
87-
// String id = cols.get(1).text();
88-
// String title = cols.get(2).text();
89-
// Elements href = cols.get(2).getElementsByAttribute("href");
90-
// boolean isPremium = !cols.get(2).getElementsByTag("i").isEmpty();
91-
// String problemUrl = href.get(0).attr("href");
92-
// String acceptance = cols.get(3).text();
93-
// String difficulty = cols.get(4).getElementsByTag("span").text();
94-
// String frequency = cols.get(5).getElementsByClass("progress-bar").attr("style");
95-
// // sample response "width: 29.1345%";
96-
// frequency = frequency.substring(frequency.indexOf(" ") + 1); // get the value after the first whitespace
97-
// String[] res = new String[]{id, title, problemUrl, isPremium ? "Y" : "N", acceptance, difficulty, frequency};
98-
// result.add(res);
99-
// }
100-
// }
101-
// try (CSVWriter csvWriter = new CSVWriter(new FileWriter(companyName + ".csv"))) {
102-
// csvWriter.writeAll(result);
103-
// }
104-
// }
105-
10665
public static void loadAllProblems(WebDriver driver) {
10766
JavascriptExecutor js = (JavascriptExecutor) driver;
10867
int maxScrolls = 30; // Maximum number of scrolls
109-
int scrollDelay = 10; // Delay between scrolls in seconds
11068
int consecutiveNoChange = 0; // Counter for consecutive scrolls with no new content
11169
int maxConsecutiveNoChange = 3; // Stop after 3 consecutive scrolls with no new content
11270

113-
System.out.println("Loading all problems by scrolling...");
71+
log.info("Loading all problems by scrolling...");
11472

11573
for (int i = 0; i < maxScrolls; i++) {
116-
// Get current number of problem links
11774
int currentCount = driver.findElements(By.cssSelector("a[href*='/problems/'][id]")).size();
11875

119-
// Try multiple scrolling methods to ensure scrolling works
12076
boolean scrolled = performScroll(driver, js);
12177

12278
if (!scrolled) {
123-
System.out.println("Scroll " + (i + 1) + ": Unable to scroll further, stopping.");
124-
break;
125-
}
126-
127-
// Wait for new content to load
128-
try {
129-
Thread.sleep(scrollDelay * 1000);
130-
} catch (InterruptedException e) {
131-
Thread.currentThread().interrupt();
79+
log.info("Scroll " + (i + 1) + ": Unable to scroll further, stopping.");
13280
break;
13381
}
13482

@@ -138,96 +86,43 @@ public static void loadAllProblems(WebDriver driver) {
13886
if (newCount > currentCount) {
13987
// New content loaded
14088
consecutiveNoChange = 0;
141-
System.out.println("Scroll " + (i + 1) + ": Found " + newCount + " problems (+" + (newCount - currentCount) + " new)");
89+
log.info("Scroll " + (i + 1) + ": Found " + newCount + " problems (+" + (newCount - currentCount) + " new)");
14290
} else {
14391
// No new content
14492
consecutiveNoChange++;
145-
System.out.println("Scroll " + (i + 1) + ": No new problems loaded (" + consecutiveNoChange + "/" + maxConsecutiveNoChange + ")");
93+
log.info("Scroll " + (i + 1) + ": No new problems loaded (" + consecutiveNoChange + "/" + maxConsecutiveNoChange + ")");
14694

14795
// If we've had consecutive scrolls with no new content, assume we've reached the end
14896
if (consecutiveNoChange >= maxConsecutiveNoChange) {
149-
System.out.println("No new content loaded after " + maxConsecutiveNoChange + " consecutive scrolls. Assuming all problems are loaded.");
97+
log.info("No new content loaded after " + maxConsecutiveNoChange + " consecutive scrolls. Assuming all problems are loaded.");
15098
break;
15199
}
152100
}
153101
}
154-
155-
System.out.println("Finished loading problems.");
102+
log.info("Finished loading problems.");
156103
}
157104

158-
/**
159-
* Performs scrolling using multiple methods to ensure it works
160-
* @param driver WebDriver instance
161-
* @param js JavascriptExecutor instance
162-
* @return true if scrolling was performed, false otherwise
163-
*/
164105
private static boolean performScroll(WebDriver driver, JavascriptExecutor js) {
165106
try {
166-
// Get current scroll position
167-
long currentScrollY = (Long) js.executeScript("return window.scrollY || window.pageYOffset;");
168-
169-
// Method 1: Scroll to document height
170-
js.executeScript("window.scrollTo(0, document.body.scrollHeight);");
171-
Thread.sleep(1000); // Short wait
107+
// Find the specific element to scroll inside
108+
WebElement scrollElement = driver.findElement(By.xpath("/html/body/div[1]/div[1]/div[4]/div/div[2]"));
172109

173-
// Check if we actually scrolled
174-
long newScrollY = (Long) js.executeScript("return window.scrollY || window.pageYOffset;");
175-
if (newScrollY > currentScrollY) {
176-
return true;
177-
}
110+
// Scroll to the bottom of the element (scrollTop = scrollHeight)
111+
js.executeScript("arguments[0].scrollTop = arguments[0].scrollHeight;", scrollElement);
112+
Thread.sleep(3000);
178113

179-
// Method 2: Try document.documentElement.scrollHeight
180-
js.executeScript("window.scrollTo(0, document.documentElement.scrollHeight);");
181-
Thread.sleep(1000);
182-
183-
newScrollY = (Long) js.executeScript("return window.scrollY || window.pageYOffset;");
184-
if (newScrollY > currentScrollY) {
185-
return true;
186-
}
187-
188-
// Method 3: Scroll by a large amount
189-
js.executeScript("window.scrollBy(0, 3000);");
190-
Thread.sleep(1000);
191-
192-
newScrollY = (Long) js.executeScript("return window.scrollY || window.pageYOffset;");
193-
if (newScrollY > currentScrollY) {
194-
return true;
195-
}
196-
197-
// Method 4: Try scrolling the main container (common in SPAs)
198-
js.executeScript(
199-
"const containers = document.querySelectorAll('[class*=\"scroll\"], [class*=\"overflow\"], main, .main-content, #main');" +
200-
"for (let container of containers) {" +
201-
" if (container.scrollHeight > container.clientHeight) {" +
202-
" container.scrollTop = container.scrollHeight;" +
203-
" break;" +
204-
" }" +
205-
"}"
206-
);
207-
Thread.sleep(1000);
208-
209-
newScrollY = (Long) js.executeScript("return window.scrollY || window.pageYOffset;");
210-
if (newScrollY > currentScrollY) {
211-
return true;
212-
}
213-
214-
// Method 5: End key simulation
215-
js.executeScript("document.body.dispatchEvent(new KeyboardEvent('keydown', {key: 'End'}));");
216-
Thread.sleep(1000);
217-
218-
newScrollY = (Long) js.executeScript("return window.scrollY || window.pageYOffset;");
219-
return newScrollY > currentScrollY;
114+
return true;
220115

221116
} catch (Exception e) {
222-
System.err.println("Error during scrolling: " + e.getMessage());
117+
log.error("Error during scrolling: ", e);
223118
return false;
224119
}
225120
}
226121

227122
public static List<ProblemStatement> extractProblems(Document doc) {
228123
List<ProblemStatement> problems = new ArrayList<>();
229124

230-
// Find all problem links (assuming they are anchor tags with specific pattern)
125+
// Find all problem links
231126
Elements problemLinks = doc.select("a[href*='/problems/'][id]");
232127

233128
for (Element link : problemLinks) {
@@ -237,7 +132,7 @@ public static List<ProblemStatement> extractProblems(Document doc) {
237132

238133
// Extract URL from href attribute
239134
String url = link.attr("href");
240-
// Remove query parameters if needed
135+
// Remove query parameters
241136
if (url.contains("?")) {
242137
url = url.substring(0, url.indexOf("?"));
243138
}
@@ -255,9 +150,13 @@ public static List<ProblemStatement> extractProblems(Document doc) {
255150

256151
// Extract acceptance percentage
257152
String acceptancePercentage = "";
258-
Element acceptanceElement = link.select("div.text-sd-muted-foreground").first();
259-
if (acceptanceElement != null) {
260-
acceptancePercentage = acceptanceElement.text().trim();
153+
Elements candidates = link.select("div.text-sd-muted-foreground");
154+
for (Element candidate : candidates) {
155+
String text = candidate.text().trim();
156+
if (text.contains("%")) {
157+
acceptancePercentage = text;
158+
break;
159+
}
261160
}
262161

263162
// Extract difficulty based on CSS class
@@ -276,7 +175,7 @@ public static List<ProblemStatement> extractProblems(Document doc) {
276175

277176
// Extract frequency percentage by counting orange divs (exclude those with opacity)
278177
String frequencyPercentage = "";
279-
Elements orangeDivs = link.select("div.bg-brand-orange.h-2.w-0\\.5.rounded");
178+
Elements orangeDivs = link.select("div[class*='bg-brand-orange'][class*='h-2'][class*='w-0.5'][class*='rounded']");
280179
if (!orangeDivs.isEmpty()) {
281180
int validCount = 0;
282181
for (Element div : orangeDivs) {
@@ -298,8 +197,7 @@ public static List<ProblemStatement> extractProblems(Document doc) {
298197
}
299198

300199
} catch (Exception e) {
301-
System.err.println("Error processing problem element: " + e.getMessage());
302-
continue;
200+
log.error("Error processing problem element: ", e);
303201
}
304202
}
305203

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /