Commit bc1d944 (1 parent: e6fee97)

images in the examples working

File tree: 3 files changed, +42 -14 lines


.gitignore

Lines changed: 4 additions & 0 deletions

@@ -33,3 +33,7 @@ env/
 
 # Project specific
 problems/
+leetcode_last_solution.py
+debug_soup.html
+debug_content_html.txt
+git_setup_commands.txt

leetcode_scraper.py

Lines changed: 37 additions & 13 deletions

@@ -97,9 +97,20 @@ def _process_problem_data(self, question):
         }
 
         # Process content with BeautifulSoup to extract description, examples, and constraints
+        import os
         content_html = question.get('content', '')
+        debug_dir = os.path.dirname(os.path.abspath(__file__))
+        debug_content_path = os.path.join(debug_dir, 'debug_content_html.txt')
+        debug_soup_path = os.path.join(debug_dir, 'debug_soup.html')
+        # Write content_html to a debug file for inspection
+        with open(debug_content_path, 'w', encoding='utf-8') as f:
+            f.write(content_html)
+        print(f"[DEBUG] Wrote content_html to {debug_content_path}")
         soup = BeautifulSoup(content_html, 'html.parser')
-
+        # Write soup prettified HTML to a debug file for inspection
+        with open(debug_soup_path, 'w', encoding='utf-8') as f:
+            f.write(soup.prettify())
+        print(f"[DEBUG] Wrote soup HTML to {debug_soup_path}")
         # Get description (text before the first <strong>Example</strong>)
         description = []
         current_element = soup.find()
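
The two debug dumps written here (debug_content_html.txt and debug_soup.html) land next to the script itself, which is why both files are added to .gitignore above.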
@@ -113,14 +124,23 @@ def _process_problem_data(self, question):
 
         problem_data['description'] = '\n'.join([d for d in description if d])
 
-        # Extract examples
+        # Extract examples and attach the closest preceding image to each
         examples = []
         example_blocks = soup.find_all('pre')
         for i, example in enumerate(example_blocks, 1):
-            examples.append({
+            example_dict = {
                 'example_num': i,
-                'example_text': example.get_text().strip()
-            })
+                'example_text': example.get_text().strip(),
+                'images': []
+            }
+            # Find the closest preceding <img> tag before this <pre>
+            prev = example.previous_element
+            while prev:
+                if getattr(prev, 'name', None) == 'img' and prev.has_attr('src'):
+                    example_dict['images'].append(prev['src'])
+                    break
+                prev = prev.previous_element
+            examples.append(example_dict)
         problem_data['examples'] = examples
 
         # Extract constraints
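
The hunk above walks backwards from each <pre> block via previous_element and keeps the first <img src> it encounters. A minimal standalone sketch of that technique, using an invented HTML snippet (the markup below is an assumption for illustration, not LeetCode's actual content):

from bs4 import BeautifulSoup

# Invented markup for illustration only; real problem content differs.
html = """
<p>Given head, determine if the linked list has a cycle in it.</p>
<img src="https://example.com/circularlinkedlist.png" alt="cycle diagram">
<p><strong>Example 1:</strong></p>
<pre>Input: head = [3,2,0,-4], pos = 1
Output: true</pre>
"""

soup = BeautifulSoup(html, 'html.parser')

for i, pre in enumerate(soup.find_all('pre'), 1):
    images = []
    # Walk backwards in document order until the nearest <img> with a src.
    prev = pre.previous_element
    while prev:
        if getattr(prev, 'name', None) == 'img' and prev.has_attr('src'):
            images.append(prev['src'])
            break
        prev = prev.previous_element
    print(f"Example {i} images: {images}")
    # Prints: Example 1 images: ['https://example.com/circularlinkedlist.png']

Note that previous_element visits every node (tags and strings) in reverse document order, so the break after the first match is what limits each example to its single closest image.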
@@ -212,18 +232,22 @@ def scrape_problem_list(self, limit=10):
 
         return problem_list
 
-if __name__ == "__main__":
-    scraper = LeetCodeScraper()
+# if __name__ == "__main__":
+#     scraper = LeetCodeScraper()
 
     # Option 1: Scrape a specific problem
     # problem_data = scraper.scrape_problem("two-sum")
     # print(json.dumps(problem_data, indent=2))
 
+if __name__ == "__main__":
+    scraper = LeetCodeScraper()
+    problem_data = scraper.scrape_problem("linked-list-cycle")
+    print(json.dumps(problem_data, indent=2))
     # Option 2: Scrape multiple problems from the list
-    problem_list = scraper.scrape_problem_list(limit=5)
+    # problem_list = scraper.scrape_problem_list(limit=5)
 
-    # Add a delay between requests to avoid being blocked
-    for problem in problem_list:
-        print(f"Scraping problem: {problem['title']} ({problem['slug']})")
-        scraper.scrape_problem(problem['slug'])
-        time.sleep(2)  # Wait 2 seconds between requests
+    # # Add a delay between requests to avoid being blocked
+    # for problem in problem_list:
+    #     print(f"Scraping problem: {problem['title']} ({problem['slug']})")
+    #     scraper.scrape_problem(problem['slug'])
+    #     time.sleep(2)  # Wait 2 seconds between requests

requirements.txt

Lines changed: 1 addition & 1 deletion

@@ -1,3 +1,3 @@
-requests==2.28.2
+requests>=2.25.1
 beautifulsoup4==4.12.2
 lxml==4.9.2
