1
1
import io.github.bonigarcia.wdm.WebDriverManager;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import model.ProblemStatement;
import org.jsoup.Jsoup;
import org.openqa.selenium.*;
import org.openqa.selenium.edge.EdgeDriver;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
15
22
16
23
@ Slf4j
17
24
public class Scraper {
@@ -42,13 +49,15 @@ public void setup() throws InterruptedException, IOException {
42
49
// companyURLs.add(link);
43
50
// }
44
51
// for (String companyURL : companyURLs) {
45
- visitCompanies ("https://leetcode.com/company/amazon/?favoriteSlug=amazon-all" , driver );
52
+ for (String recency : new String []{"amazon-30-days" , "amazon-three-months" , "amazon-six-months" , "amazon-more-than-six-months" , "amazon-all" }) {
53
+ visitCompanies (String .format ("https://leetcode.com/company/amazon/?favoriteSlug=%s" , recency ), driver , recency );
54
+ }
46
55
// }
47
56
}
48
57
49
- private void visitCompanies (String companyURL , WebDriver driver ) throws InterruptedException {
50
- String companyName = companyURL . substring (companyURL . lastIndexOf ( "/" ) + 1 );
51
- log .info ("Visiting {}" , companyName );
58
+ private void visitCompanies (String companyURL , WebDriver driver , String recency ) throws InterruptedException {
59
+ String companyName = extractCompanyNameWithRegex (companyURL );
60
+ log .info ("Visiting {} with recency {} " , companyName , recency );
52
61
driver .get (companyURL );
53
62
Thread .sleep (QUESTIONS_PAGE_WAIT_MILLIS );
54
63
loadAllProblems (driver );
@@ -60,6 +69,18 @@ private void visitCompanies(String companyURL, WebDriver driver) throws Interrup
60
69
for (ProblemStatement problem : problems ) {
61
70
log .info ("{}" , problem );
62
71
}
72
+ exportToCSV (problems , companyName , recency );
73
+ }
74
+
75
+ private String extractCompanyNameWithRegex (String companyURL ) {
76
+ Pattern pattern = Pattern .compile ("/company/([^/?]+)" );
77
+ Matcher matcher = pattern .matcher (companyURL );
78
+
79
+ if (matcher .find ()) {
80
+ return matcher .group (1 );
81
+ }
82
+
83
+ throw new IllegalArgumentException ("Invalid company URL format: " + companyURL );
63
84
}
64
85
65
86
public static void loadAllProblems (WebDriver driver ) {
@@ -104,15 +125,12 @@ public static void loadAllProblems(WebDriver driver) {
104
125
105
126
private static boolean performScroll (WebDriver driver , JavascriptExecutor js ) {
106
127
try {
107
- // Find the specific element to scroll inside
108
128
WebElement scrollElement = driver .findElement (By .xpath ("/html/body/div[1]/div[1]/div[4]/div/div[2]" ));
109
129
110
- // Scroll down by 5000 pixels inside the element
111
130
js .executeScript ("arguments[0].scrollTop = arguments[0].scrollHeight;" , scrollElement );
112
131
Thread .sleep (3000 );
113
132
114
133
return true ;
115
-
116
134
} catch (Exception e ) {
117
135
log .error ("Error during scrolling: " , e );
118
136
return false ;
@@ -203,4 +221,49 @@ public static List<ProblemStatement> extractProblems(Document doc) {
203
221
204
222
return problems ;
205
223
}
224
+
225
+ @ SneakyThrows
226
+ private void exportToCSV (List <ProblemStatement > problems , String companyName , String recency ) {
227
+ // Create directory structure: companyName/recency/
228
+ Path outputDir = Paths .get (companyName , recency );
229
+ if (!Files .exists (outputDir )) {
230
+ Files .createDirectories (outputDir );
231
+ }
232
+
233
+ Path filePath = outputDir .resolve ("problems.csv" );
234
+
235
+ try (FileWriter writer = new FileWriter (filePath .toFile ())) {
236
+ writer .append ("ID,URL,Title,Difficulty,Acceptance %,Frequency %\n " );
237
+
238
+ for (ProblemStatement problem : problems ) {
239
+ writer .append (escapeCsvValue (problem .id ()))
240
+ .append (',' )
241
+ .append (escapeCsvValue ("https://leetcode.com" + problem .url ()))
242
+ .append (',' )
243
+ .append (escapeCsvValue (problem .title ()))
244
+ .append (',' )
245
+ .append (escapeCsvValue (problem .difficulty ()))
246
+ .append (',' )
247
+ .append (escapeCsvValue (problem .acceptancePercentage ()))
248
+ .append (',' )
249
+ .append (escapeCsvValue (problem .frequencyPercentage ()))
250
+ .append ('\n' );
251
+ }
252
+ }
253
+
254
+ log .info ("CSV file created: {}" , filePath .toAbsolutePath ());
255
+ }
256
+
257
+ private String escapeCsvValue (String value ) {
258
+ if (value == null ) {
259
+ return "" ;
260
+ }
261
+
262
+ // If value contains comma, double quote, or newline, wrap in quotes and escape internal quotes
263
+ if (value .contains ("," ) || value .contains ("\" " ) || value .contains ("\n " ) || value .contains ("\r " )) {
264
+ return "\" " + value .replace ("\" " , "\" \" " ) + "\" " ;
265
+ }
266
+
267
+ return value ;
268
+ }
206
269
}
0 commit comments