Commit 8afa5dd

committed

Working draft for one company

1 parent 89574ef · commit 8afa5dd (Copy full SHA for 8afa5dd)

File tree

1 file changed

+37

-139

lines changed

src/main/java
- Scraper.java

1 file changed

+37

-139

lines changed

`src/main/java/Scraper.java`

Lines changed: 37 additions & 139 deletions

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`import io.github.bonigarcia.wdm.WebDriverManager;`
	`2`	`+import lombok.extern.slf4j.Slf4j;`
`2`	`3`	`import model.ProblemStatement;`
`3`	`4`	`import org.jsoup.Jsoup;`
`4`	`5`	`import org.jsoup.nodes.Document;`
`@@ -12,13 +13,13 @@`
`12`	`13`	`import java.util.ArrayList;`
`13`	`14`	`import java.util.List;`
`14`	`15`
	`16`	`+@Slf4j`
`15`	`17`	`public class Scraper {`
`16`	`18`	`private static final String USERNAME = ""; // Provide your LeetCode username/email`
`17`	`19`	`private static final String PASSWORD = ""; // Provide your LeetCode password`
`18`		`- public static final int QUESTIONS_PAGE_WAIT_MILLIS = 10000;`
`19`		`- public static final int LOGIN_PAGE_WAIT_MILLIS = 2000;`
	`20`	`+ public static final int QUESTIONS_PAGE_WAIT_MILLIS = 5000;`
	`21`	`+ public static final int LOGIN_PAGE_WAIT_MILLIS = 30000;`
`20`	`22`	`WebDriver driver;`
`21`		`- List<String> companyURLs = new ArrayList<>();`
`22`	`23`
`23`	`24`	`public void setup() throws InterruptedException, IOException {`
`24`	`25`	`WebDriverManager.edgedriver().setup();`
`@@ -37,98 +38,45 @@ public void setup() throws InterruptedException, IOException {`
`37`	`38`	`// List<WebElement> companies = driver.findElements(By.cssSelector(".mb-4.mr-3"));`
`38`	`39`	`// for (WebElement company : companies) {`
`39`	`40`	`// String link = company.getAttribute("href");`
`40`		`-// System.out.println(link);`
	`41`	`+// log.info(link);`
`41`	`42`	`// companyURLs.add(link);`
`42`	`43`	`// }`
`43`	`44`	`// for (String companyURL : companyURLs) {`
`44`		`- visitCompanies("https://leetcode.com/company/amazon/?favoriteSlug=amazon-all", driver);`
	`45`	`+ visitCompanies("https://leetcode.com/company/amazon/?favoriteSlug=amazon-all", driver);`
`45`	`46`	`// }`
`46`	`47`	`}`
`47`	`48`
`48`		`- private void visitCompanies(String companyURL, WebDriver driver) throws InterruptedException, IOException {`
	`49`	`+ private void visitCompanies(String companyURL, WebDriver driver) throws InterruptedException {`
`49`	`50`	`String companyName = companyURL.substring(companyURL.lastIndexOf("/") + 1);`
`50`		`- System.out.println("Visiting " + companyURL);`
`51`		`- this.driver.get(companyURL);`
	`51`	`+ log.info("Visiting {}", companyName);`
	`52`	`+ driver.get(companyURL);`
`52`	`53`	`Thread.sleep(QUESTIONS_PAGE_WAIT_MILLIS);`
`53`	`54`	`loadAllProblems(driver);`
`54`		`- // Get the page source and parse with Jsoup`
`55`		`- String pageSource = this.driver.getPageSource();`
	`55`	`+ String pageSource = driver.getPageSource();`
`56`	`56`	`Document doc = Jsoup.parse(pageSource);`
`57`	`57`	`List<ProblemStatement> problems = extractProblems(doc);`
`58`		`-// Print results`
`59`		`- System.out.println("Extracted " + problems.size() + " problems:");`
	`58`	`+`
	`59`	`+ log.info("Extracted {} problems from", problems.size());`
`60`	`60`	`for (ProblemStatement problem : problems) {`
`61`		`- System.out.println(problem);`
	`61`	`+ log.info("{}", problem);`
`62`	`62`	`}`
`63`	`63`	`}`
`64`	`64`
`65`		`-// private void visitCompanies(String companyURL) throws InterruptedException, IOException {`
`66`		`-// String companyName = companyURL.substring(companyURL.lastIndexOf("/") + 1);`
`67`		`-// System.out.println("Visiting " + companyURL);`
`68`		`-// driver.get(companyURL);`
`69`		`-// Thread.sleep(QUESTIONS_PAGE_WAIT_MILLIS); // Wait for the page to load, for companies like Google/Amazon, it takes a lot of time`
`70`		`-// String table = "";`
`71`		`-// try {`
`72`		`-// table = "<table>" + driver.findElement(By.className("table")).getAttribute("innerHTML") + "</table>";`
`73`		`-// } catch (NoSuchElementException ex) {`
`74`		`-// Thread.sleep(30000);`
`75`		`-// driver.get(companyURL);`
`76`		`-// Thread.sleep(30000);`
`77`		`-// table = "<table>" + driver.findElement(By.className("table")).getAttribute("innerHTML") + "</table>";`
`78`		`-// }`
`79`		`-// Document doc = Jsoup.parse(table); // parse the table html content`
`80`		`-// List<String[]> result = new ArrayList<>();`
`81`		`-// String[] header = new String[]{"ID", "Title", "URL", "Is Premium", "Acceptance %", "Difficulty", "Frequency %"};`
`82`		`-// result.add(header);`
`83`		`-// for (Element row : doc.getElementsByTag("tr")) {`
`84`		`-// Elements cols = row.getElementsByTag("td");`
`85`		`-// int size = cols.size();`
`86`		`-// if (size != 0) { // for <th> size would be 0`
`87`		`-// String id = cols.get(1).text();`
`88`		`-// String title = cols.get(2).text();`
`89`		`-// Elements href = cols.get(2).getElementsByAttribute("href");`
`90`		`-// boolean isPremium = !cols.get(2).getElementsByTag("i").isEmpty();`
`91`		`-// String problemUrl = href.get(0).attr("href");`
`92`		`-// String acceptance = cols.get(3).text();`
`93`		`-// String difficulty = cols.get(4).getElementsByTag("span").text();`
`94`		`-// String frequency = cols.get(5).getElementsByClass("progress-bar").attr("style");`
`95`		`-// // sample response "width: 29.1345%";`
`96`		`-// frequency = frequency.substring(frequency.indexOf(" ") + 1); // get the value after the first whitespace`
`97`		`-// String[] res = new String[]{id, title, problemUrl, isPremium ? "Y" : "N", acceptance, difficulty, frequency};`
`98`		`-// result.add(res);`
`99`		`-// }`
`100`		`-// }`
`101`		`-// try (CSVWriter csvWriter = new CSVWriter(new FileWriter(companyName + ".csv"))) {`
`102`		`-// csvWriter.writeAll(result);`
`103`		`-// }`
`104`		`-// }`
`105`		`-`
`106`	`65`	`public static void loadAllProblems(WebDriver driver) {`
`107`	`66`	`JavascriptExecutor js = (JavascriptExecutor) driver;`
`108`	`67`	`int maxScrolls = 30; // Maximum number of scrolls`
`109`		`- int scrollDelay = 10; // Delay between scrolls in seconds`
`110`	`68`	`int consecutiveNoChange = 0; // Counter for consecutive scrolls with no new content`
`111`	`69`	`int maxConsecutiveNoChange = 3; // Stop after 3 consecutive scrolls with no new content`
`112`	`70`
`113`		`- System.out.println("Loading all problems by scrolling...");`
	`71`	`+ log.info("Loading all problems by scrolling...");`
`114`	`72`
`115`	`73`	`for (int i = 0; i < maxScrolls; i++) {`
`116`		`- // Get current number of problem links`
`117`	`74`	`int currentCount = driver.findElements(By.cssSelector("a[href*='/problems/'][id]")).size();`
`118`	`75`
`119`		`- // Try multiple scrolling methods to ensure scrolling works`
`120`	`76`	`boolean scrolled = performScroll(driver, js);`
`121`	`77`
`122`	`78`	`if (!scrolled) {`
`123`		`- System.out.println("Scroll " + (i + 1) + ": Unable to scroll further, stopping.");`
`124`		`- break;`
`125`		`- }`
`126`		`-`
`127`		`- // Wait for new content to load`
`128`		`- try {`
`129`		`- Thread.sleep(scrollDelay * 1000);`
`130`		`- } catch (InterruptedException e) {`
`131`		`- Thread.currentThread().interrupt();`
	`79`	`+ log.info("Scroll " + (i + 1) + ": Unable to scroll further, stopping.");`
`132`	`80`	`break;`
`133`	`81`	`}`
`134`	`82`
`@@ -138,96 +86,43 @@ public static void loadAllProblems(WebDriver driver) {`
`138`	`86`	`if (newCount > currentCount) {`
`139`	`87`	`// New content loaded`
`140`	`88`	`consecutiveNoChange = 0;`
`141`		`- System.out.println("Scroll " + (i + 1) + ": Found " + newCount + " problems (+" + (newCount - currentCount) + " new)");`
	`89`	`+ log.info("Scroll " + (i + 1) + ": Found " + newCount + " problems (+" + (newCount - currentCount) + " new)");`
`142`	`90`	`} else {`
`143`	`91`	`// No new content`
`144`	`92`	`consecutiveNoChange++;`
`145`		`- System.out.println("Scroll " + (i + 1) + ": No new problems loaded (" + consecutiveNoChange + "/" + maxConsecutiveNoChange + ")");`
	`93`	`+ log.info("Scroll " + (i + 1) + ": No new problems loaded (" + consecutiveNoChange + "/" + maxConsecutiveNoChange + ")");`
`146`	`94`
`147`	`95`	`// If we've had consecutive scrolls with no new content, assume we've reached the end`
`148`	`96`	`if (consecutiveNoChange >= maxConsecutiveNoChange) {`
`149`		`- System.out.println("No new content loaded after " + maxConsecutiveNoChange + " consecutive scrolls. Assuming all problems are loaded.");`
	`97`	`+ log.info("No new content loaded after " + maxConsecutiveNoChange + " consecutive scrolls. Assuming all problems are loaded.");`
`150`	`98`	`break;`
`151`	`99`	`}`
`152`	`100`	`}`
`153`	`101`	`}`
`154`		`-`
`155`		`- System.out.println("Finished loading problems.");`
	`102`	`+ log.info("Finished loading problems.");`
`156`	`103`	`}`
`157`	`104`
`158`		`- /**`
`159`		`- * Performs scrolling using multiple methods to ensure it works`
`160`		`- * @param driver WebDriver instance`
`161`		`- * @param js JavascriptExecutor instance`
`162`		`- * @return true if scrolling was performed, false otherwise`
`163`		`- */`
`164`	`105`	`private static boolean performScroll(WebDriver driver, JavascriptExecutor js) {`
`165`	`106`	`try {`
`166`		`- // Get current scroll position`
`167`		`- long currentScrollY = (Long) js.executeScript("return window.scrollY \|\| window.pageYOffset;");`
`168`		`-`
`169`		`- // Method 1: Scroll to document height`
`170`		`- js.executeScript("window.scrollTo(0, document.body.scrollHeight);");`
`171`		`- Thread.sleep(1000); // Short wait`
	`107`	`+ // Find the specific element to scroll inside`
	`108`	`+ WebElement scrollElement = driver.findElement(By.xpath("/html/body/div[1]/div[1]/div[4]/div/div[2]"));`
`172`	`109`
`173`		`- // Check if we actually scrolled`
`174`		`- long newScrollY = (Long) js.executeScript("return window.scrollY \|\| window.pageYOffset;");`
`175`		`- if (newScrollY > currentScrollY) {`
`176`		`- return true;`
`177`		`- }`
	`110`	`+ // Scroll down by 5000 pixels inside the element`
	`111`	`+ js.executeScript("arguments[0].scrollTop = arguments[0].scrollHeight;", scrollElement);`
	`112`	`+ Thread.sleep(3000);`
`178`	`113`
`179`		`- // Method 2: Try document.documentElement.scrollHeight`
`180`		`- js.executeScript("window.scrollTo(0, document.documentElement.scrollHeight);");`
`181`		`- Thread.sleep(1000);`
`182`		`-`
`183`		`- newScrollY = (Long) js.executeScript("return window.scrollY \|\| window.pageYOffset;");`
`184`		`- if (newScrollY > currentScrollY) {`
`185`		`- return true;`
`186`		`- }`
`187`		`-`
`188`		`- // Method 3: Scroll by a large amount`
`189`		`- js.executeScript("window.scrollBy(0, 3000);");`
`190`		`- Thread.sleep(1000);`
`191`		`-`
`192`		`- newScrollY = (Long) js.executeScript("return window.scrollY \|\| window.pageYOffset;");`
`193`		`- if (newScrollY > currentScrollY) {`
`194`		`- return true;`
`195`		`- }`
`196`		`-`
`197`		`- // Method 4: Try scrolling the main container (common in SPAs)`
`198`		`- js.executeScript(`
`199`		`- "const containers = document.querySelectorAll('[class=\"scroll\"], [class=\"overflow\"], main, .main-content, #main');" +`
`200`		`- "for (let container of containers) {" +`
`201`		`- " if (container.scrollHeight > container.clientHeight) {" +`
`202`		`- " container.scrollTop = container.scrollHeight;" +`
`203`		`- " break;" +`
`204`		`- " }" +`
`205`		`- "}"`
`206`		`- );`
`207`		`- Thread.sleep(1000);`
`208`		`-`
`209`		`- newScrollY = (Long) js.executeScript("return window.scrollY \|\| window.pageYOffset;");`
`210`		`- if (newScrollY > currentScrollY) {`
`211`		`- return true;`
`212`		`- }`
`213`		`-`
`214`		`- // Method 5: End key simulation`
`215`		`- js.executeScript("document.body.dispatchEvent(new KeyboardEvent('keydown', {key: 'End'}));");`
`216`		`- Thread.sleep(1000);`
`217`		`-`
`218`		`- newScrollY = (Long) js.executeScript("return window.scrollY \|\| window.pageYOffset;");`
`219`		`- return newScrollY > currentScrollY;`
	`114`	`+ return true;`
`220`	`115`
`221`	`116`	`} catch (Exception e) {`
`222`		`- System.err.println("Error during scrolling: " + e.getMessage());`
	`117`	`+ log.error("Error during scrolling: ", e);`
`223`	`118`	`return false;`
`224`	`119`	`}`
`225`	`120`	`}`
`226`	`121`
`227`	`122`	`public static List<ProblemStatement> extractProblems(Document doc) {`
`228`	`123`	`List<ProblemStatement> problems = new ArrayList<>();`
`229`	`124`
`230`		`- // Find all problem links (assuming they are anchor tags with specific pattern)`
	`125`	`+ // Find all problem links`
`231`	`126`	`Elements problemLinks = doc.select("a[href*='/problems/'][id]");`
`232`	`127`
`233`	`128`	`for (Element link : problemLinks) {`
`@@ -237,7 +132,7 @@ public static List<ProblemStatement> extractProblems(Document doc) {`
`237`	`132`
`238`	`133`	`// Extract URL from href attribute`
`239`	`134`	`String url = link.attr("href");`
`240`		`- // Remove query parameters if needed`
	`135`	`+ // Remove query parameters`
`241`	`136`	`if (url.contains("?")) {`
`242`	`137`	`url = url.substring(0, url.indexOf("?"));`
`243`	`138`	`}`
`@@ -255,9 +150,13 @@ public static List<ProblemStatement> extractProblems(Document doc) {`
`255`	`150`
`256`	`151`	`// Extract acceptance percentage`
`257`	`152`	`String acceptancePercentage = "";`
`258`		`- Element acceptanceElement = link.select("div.text-sd-muted-foreground").first();`
`259`		`- if (acceptanceElement != null) {`
`260`		`- acceptancePercentage = acceptanceElement.text().trim();`
	`153`	`+ Elements candidates = link.select("div.text-sd-muted-foreground");`
	`154`	`+ for (Element candidate : candidates) {`
	`155`	`+ String text = candidate.text().trim();`
	`156`	`+ if (text.contains("%")) {`
	`157`	`+ acceptancePercentage = text;`
	`158`	`+ break;`
	`159`	`+ }`
`261`	`160`	`}`
`262`	`161`
`263`	`162`	`// Extract difficulty based on CSS class`
`@@ -276,7 +175,7 @@ public static List<ProblemStatement> extractProblems(Document doc) {`
`276`	`175`
`277`	`176`	`// Extract frequency percentage by counting orange divs (exclude those with opacity)`
`278`	`177`	`String frequencyPercentage = "";`
`279`		`- Elements orangeDivs = link.select("div.bg-brand-orange.h-2.w-0\\.5.rounded");`
	`178`	`+ Elements orangeDivs = link.select("div[class='bg-brand-orange'][class='h-2'][class='w-0.5'][class='rounded']");`
`280`	`179`	`if (!orangeDivs.isEmpty()) {`
`281`	`180`	`int validCount = 0;`
`282`	`181`	`for (Element div : orangeDivs) {`
`@@ -298,8 +197,7 @@ public static List<ProblemStatement> extractProblems(Document doc) {`
`298`	`197`	`}`
`299`	`198`
`300`	`199`	`} catch (Exception e) {`
`301`		`- System.err.println("Error processing problem element: " + e.getMessage());`
`302`		`- continue;`
	`200`	`+ log.error("Error processing problem element: ", e);`
`303`	`201`	`}`
`304`	`202`	`}`
`305`	`203`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 8afa5dd

File tree

1 file changed

1 file changed

`src/main/java/Scraper.java`

0 commit comments