11import io .github .bonigarcia .wdm .WebDriverManager ;
2+ import lombok .SneakyThrows ;
23import lombok .extern .slf4j .Slf4j ;
34import model .ProblemStatement ;
45import org .jsoup .Jsoup ;
89import org .openqa .selenium .*;
910import org .openqa .selenium .edge .EdgeDriver ;
1011
12+ import java .io .FileWriter ;
1113import java .io .IOException ;
14+ import java .nio .file .Files ;
15+ import java .nio .file .Path ;
16+ import java .nio .file .Paths ;
1217import java .time .Duration ;
1318import java .util .ArrayList ;
1419import java .util .List ;
20+ import java .util .regex .Matcher ;
21+ import java .util .regex .Pattern ;
1522
1623@ Slf4j
1724public class Scraper {
@@ -42,13 +49,15 @@ public void setup() throws InterruptedException, IOException {
4249// companyURLs.add(link);
4350// }
4451// for (String companyURL : companyURLs) {
45- visitCompanies ("https://leetcode.com/company/amazon/?favoriteSlug=amazon-all" , driver );
52+ for (String recency : new String []{"amazon-30-days" , "amazon-three-months" , "amazon-six-months" , "amazon-more-than-six-months" , "amazon-all" }) {
53+ visitCompanies (String .format ("https://leetcode.com/company/amazon/?favoriteSlug=%s" , recency ), driver , recency );
54+ }
4655// }
4756 }
4857
49- private void visitCompanies (String companyURL , WebDriver driver ) throws InterruptedException {
50- String companyName = companyURL . substring (companyURL . lastIndexOf ( "/" ) + 1 );
51- log .info ("Visiting {}" , companyName );
58+ private void visitCompanies (String companyURL , WebDriver driver , String recency ) throws InterruptedException {
59+ String companyName = extractCompanyNameWithRegex (companyURL );
60+ log .info ("Visiting {} with recency {} " , companyName , recency );
5261 driver .get (companyURL );
5362 Thread .sleep (QUESTIONS_PAGE_WAIT_MILLIS );
5463 loadAllProblems (driver );
@@ -60,6 +69,18 @@ private void visitCompanies(String companyURL, WebDriver driver) throws Interrup
6069 for (ProblemStatement problem : problems ) {
6170 log .info ("{}" , problem );
6271 }
72+ exportToCSV (problems , companyName , recency );
73+ }
74+ 75+ private String extractCompanyNameWithRegex (String companyURL ) {
76+ Pattern pattern = Pattern .compile ("/company/([^/?]+)" );
77+ Matcher matcher = pattern .matcher (companyURL );
78+ 79+ if (matcher .find ()) {
80+ return matcher .group (1 );
81+ }
82+ 83+ throw new IllegalArgumentException ("Invalid company URL format: " + companyURL );
6384 }
6485
6586 public static void loadAllProblems (WebDriver driver ) {
@@ -104,15 +125,12 @@ public static void loadAllProblems(WebDriver driver) {
104125
105126 private static boolean performScroll (WebDriver driver , JavascriptExecutor js ) {
106127 try {
107- // Find the specific element to scroll inside
108128 WebElement scrollElement = driver .findElement (By .xpath ("/html/body/div[1]/div[1]/div[4]/div/div[2]" ));
109129
110- // Scroll down by 5000 pixels inside the element
111130 js .executeScript ("arguments[0].scrollTop = arguments[0].scrollHeight;" , scrollElement );
112131 Thread .sleep (3000 );
113132
114133 return true ;
115- 116134 } catch (Exception e ) {
117135 log .error ("Error during scrolling: " , e );
118136 return false ;
@@ -203,4 +221,49 @@ public static List<ProblemStatement> extractProblems(Document doc) {
203221
204222 return problems ;
205223 }
224+ 225+ @ SneakyThrows
226+ private void exportToCSV (List <ProblemStatement > problems , String companyName , String recency ) {
227+ // Create directory structure: companyName/recency/
228+ Path outputDir = Paths .get (companyName , recency );
229+ if (!Files .exists (outputDir )) {
230+ Files .createDirectories (outputDir );
231+ }
232+ 233+ Path filePath = outputDir .resolve ("problems.csv" );
234+ 235+ try (FileWriter writer = new FileWriter (filePath .toFile ())) {
236+ writer .append ("ID,URL,Title,Difficulty,Acceptance %,Frequency %\n " );
237+ 238+ for (ProblemStatement problem : problems ) {
239+ writer .append (escapeCsvValue (problem .id ()))
240+ .append (',' )
241+ .append (escapeCsvValue ("https://leetcode.com" + problem .url ()))
242+ .append (',' )
243+ .append (escapeCsvValue (problem .title ()))
244+ .append (',' )
245+ .append (escapeCsvValue (problem .difficulty ()))
246+ .append (',' )
247+ .append (escapeCsvValue (problem .acceptancePercentage ()))
248+ .append (',' )
249+ .append (escapeCsvValue (problem .frequencyPercentage ()))
250+ .append ('\n' );
251+ }
252+ }
253+ 254+ log .info ("CSV file created: {}" , filePath .toAbsolutePath ());
255+ }
256+ 257+ private String escapeCsvValue (String value ) {
258+ if (value == null ) {
259+ return "" ;
260+ }
261+ 262+ // If value contains comma, double quote, or newline, wrap in quotes and escape internal quotes
263+ if (value .contains ("," ) || value .contains ("\" " ) || value .contains ("\n " ) || value .contains ("\r " )) {
264+ return "\" " + value .replace ("\" " , "\" \" " ) + "\" " ;
265+ }
266+ 267+ return value ;
268+ }
206269}
0 commit comments