Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 9a9ef9a

Browse files
Codeforces Problem Scraper Added
1 parent bc2de17 commit 9a9ef9a

File tree

1 file changed

+128
-27
lines changed

1 file changed

+128
-27
lines changed

‎Coderforces_Problem_Scrapper/Codeforces_problem_scrapper.py‎

Lines changed: 128 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,103 @@
22
import os

from selenium import webdriver  # Automated webdriver
from PIL import Image
from fpdf import FPDF  # For converting images to pdf

DRIVER_PATH = ''
67

7-
def getproblem():
8+
9+
def select_difficulty():
    """
    Prompt the user for a minimum and maximum problem difficulty.

    :return: two-element list [min, max] of the integers entered by the user
    """
    print("\nEnter the Range between 800 to 3500: ")
    minimum = int(input("Min: "))
    maximum = int(input("Max: "))
    return [minimum, maximum]
20+
21+
22+
def extracting_problem_links(diff_level):
    """
    Collect links to individual problem pages from the Codeforces problemset.

    Opens the problemset filtered by the requested difficulty range, walks
    the result pages and gathers problem URLs until the number of questions
    requested by the user has been reached or no pages remain.

    :param diff_level: [min, max] difficulty range entered by the user
    :return pblms_links: list of problem-page URLs (may be shorter than
        requested when fewer problems are available)
    """
    no_of_questions = int(input("\nHow many Questions you want to scrape: "))

    pblms_link_scraped = 0
    pblms_links = []
    page = 1
    options = webdriver.ChromeOptions()
    options.headless = True
    driver = webdriver.Chrome(DRIVER_PATH, options=options)
    print("\nRequesting URL ...")
    driver.get(f"https://codeforces.com/problemset/?tags={diff_level[0]}-{diff_level[1]}")

    # ===================Getting no. of Pages to Scrape=============================

    # It will give the total no. of pages present with that question from
    # which we are going to scrape
    page_links = []

    print("\nFinding available pages to scrape....")

    available_pages = driver.find_elements_by_css_selector("div.pagination a")
    for page_no in available_pages:
        page_links.append(page_no.get_attribute("href"))

    # The last pagination anchor is the "next page" arrow, not a page number.
    print(f"Available Pages to scrape are: {len(page_links[:-1])}")

    # ===================================================================================

    # BUG FIX: the original used `"td.id.dark.left a" and "td.id.left a"`,
    # which in Python evaluates to just the second string, silently dropping
    # every dark-row problem. A CSS comma selector matches the union of both
    # row styles in document order.
    selector = "td.id.dark.left a, td.id.left a"

    # Page 1 is already loaded (link=None); subsequent pages are fetched via
    # their pagination links. page_links[0] duplicates page 1 and
    # page_links[-1] is the "next" arrow, so both are skipped.
    try:
        for link in [None] + page_links[1:-1]:
            print(f"\nScraping Page {page}")
            if link is not None:
                # Going to next Page
                driver.get(link)

            for element in driver.find_elements_by_css_selector(selector):
                # Saving the link in pblms_links
                pblms_links.append(element.get_attribute("href"))
                pblms_link_scraped += 1

                # If we scraped required no. of questions then return
                if pblms_link_scraped == no_of_questions:
                    print(f"URLs of Question Scraped till now: {pblms_link_scraped}")
                    print(f"\nURLs Scrapped Successfully {pblms_link_scraped} out of {no_of_questions}")
                    return pblms_links

            print(f"URLs of Question Scraped till now: {pblms_link_scraped}")
            page += 1

        # scraped all the available questions but still the count is less
        print(f"\n{pblms_link_scraped} out of {no_of_questions} URLs able to scrapped !!!")
        return pblms_links
    finally:
        # BUG FIX: the driver was never closed, leaking a Chrome process.
        driver.quit()
99+
100+
101+
def getproblem(URLs):
    """
    Save each Codeforces problem page in *URLs* as a PDF file.

    For every URL, a headless Chrome driver captures a screenshot of the
    problem statement (the ``ttypography`` element), and the screenshot is
    wrapped into a single-page PDF (via fpdf) named after the problem title
    and stored under ./problems_pdf/.

    :param URLs: list of problem-page URLs to convert
    """
    path = 'image.png'  # temporary screenshot file, reused for every problem

    # Creating a Target Output Folder
    target_folder = './problems_pdf'
    if not os.path.exists(target_folder):
        os.makedirs(target_folder)

    options = webdriver.ChromeOptions()
    # Headless = True for taking a scrolling snapshot
    options.headless = True
    driver = webdriver.Chrome(DRIVER_PATH, options=options)
    file_counter = 1

    try:
        for url in URLs:
            driver.get(url)
            # Deciding height by tag: resize the window to the full document
            # height so the whole statement fits in one screenshot.
            required_height = driver.execute_script(
                'return document.body.parentNode.scrollHeight')
            driver.set_window_size(1366, required_height)

            # Title looks like "A. Problem Name"; drop the "A. " prefix.
            # NOTE(review): titles may contain characters invalid in file
            # names on some platforms — consider sanitizing.
            title = driver.find_element_by_class_name("title").text
            filename = title[3:] + '.pdf'

            # Taking SS of everything within the ttypography class
            driver.find_element_by_class_name('ttypography').screenshot(path)

            # Opening image with pillow so based to capture its height and width
            cover = Image.open(path)
            WIDTH, HEIGHT = cover.size
            MARGIN = 10
            # based on image's height and width we are adjusting the pdf margin and borders
            pdf = FPDF(unit='pt', format=[WIDTH + 2 * MARGIN, HEIGHT + 2 * MARGIN])
            pdf.add_page()  # Adding new page to the pdf
            pdf.image(path, MARGIN, MARGIN)

            pdf.output(os.path.join(target_folder, filename), "F")  # saving the pdf with the specified filename
            # BUG FIX: the progress message contained a broken placeholder;
            # report the actual file name that was written.
            print(f'File saved in your directory ./problems_pdf/{filename} ({file_counter}/{len(URLs)}) !')
            file_counter += 1
    finally:
        # BUG FIX: the driver was never closed, leaking a Chrome process.
        driver.quit()
49148

50149

51150
if __name__ == "__main__":
    DRIVER_PATH = input("Enter DRIVER PATH location: ")
    diff = select_difficulty()  # Accepting difficulty level from user
    problems_link = extracting_problem_links(diff)  # scraping the required the no. of links
    getproblem(problems_link)  # saving the Questions in PDF file.
    # BUG FIX: the temporary screenshot only exists when at least one
    # problem was captured; guard the cleanup so an empty run does not
    # crash with FileNotFoundError.
    if os.path.exists('image.png'):
        os.remove('image.png')

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /