Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 9a9ef9a

Browse files
Codeforces Problem Scraper Added
1 parent bc2de17 commit 9a9ef9a

File tree

1 file changed

+128
-27
lines changed

1 file changed

+128
-27
lines changed

‎Coderforces_Problem_Scrapper/Codeforces_problem_scrapper.py‎

Lines changed: 128 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,103 @@
22
import os

from selenium import webdriver  # Automated webdriver
from PIL import Image
from fpdf import FPDF  # For converting images to pdf

DRIVER_PATH = ''
67

7-
def getproblem():
8+
9+
def select_difficulty():
    """
    Prompt the user for a minimum and maximum problem difficulty.

    :return: two-element list [min, max] of the integers entered by the user
    """
    print("\nEnter the Range between 800 to 3500: ")
    minimum = int(input("Min: "))
    maximum = int(input("Max: "))
    return [minimum, maximum]
20+
21+
22+
def extracting_problem_links(diff_level):
    """
    Collect links to individual problem pages from the Codeforces problemset.

    Opens the problemset filtered by the requested difficulty range, walks
    the result pages and gathers problem URLs until the number of questions
    requested by the user has been reached or no pages remain.

    :param diff_level: [min, max] difficulty range entered by the user
    :return pblms_links: list of problem-page URLs (may be shorter than
        requested when fewer problems are available)
    """
    no_of_questions = int(input("\nHow many Questions you want to scrape: "))

    pblms_link_scraped = 0
    pblms_links = []
    page = 1
    options = webdriver.ChromeOptions()
    options.headless = True
    driver = webdriver.Chrome(DRIVER_PATH, options=options)
    print("\nRequesting URL ...")
    driver.get(f"https://codeforces.com/problemset/?tags={diff_level[0]}-{diff_level[1]}")

    # ===================Getting no. of Pages to Scrape=============================

    # It will give the total no. of pages present with that question from
    # which we are going to scrape
    page_links = []

    print("\nFinding available pages to scrape....")

    available_pages = driver.find_elements_by_css_selector("div.pagination a")
    for page_no in available_pages:
        page_links.append(page_no.get_attribute("href"))

    # The last pagination anchor is the "next page" arrow, not a page number.
    print(f"Available Pages to scrape are: {len(page_links[:-1])}")

    # ===================================================================================

    # BUG FIX: the original used `"td.id.dark.left a" and "td.id.left a"`,
    # which in Python evaluates to just the second string, silently dropping
    # every dark-row problem. A CSS comma selector matches the union of both
    # row styles in document order.
    selector = "td.id.dark.left a, td.id.left a"

    # Page 1 is already loaded (link=None); subsequent pages are fetched via
    # their pagination links. page_links[0] duplicates page 1 and
    # page_links[-1] is the "next" arrow, so both are skipped.
    try:
        for link in [None] + page_links[1:-1]:
            print(f"\nScraping Page {page}")
            if link is not None:
                # Going to next Page
                driver.get(link)

            for element in driver.find_elements_by_css_selector(selector):
                # Saving the link in pblms_links
                pblms_links.append(element.get_attribute("href"))
                pblms_link_scraped += 1

                # If we scraped required no. of questions then return
                if pblms_link_scraped == no_of_questions:
                    print(f"URLs of Question Scraped till now: {pblms_link_scraped}")
                    print(f"\nURLs Scrapped Successfully {pblms_link_scraped} out of {no_of_questions}")
                    return pblms_links

            print(f"URLs of Question Scraped till now: {pblms_link_scraped}")
            page += 1

        # scraped all the available questions but still the count is less
        print(f"\n{pblms_link_scraped} out of {no_of_questions} URLs able to scrapped !!!")
        return pblms_links
    finally:
        # BUG FIX: the driver was never closed, leaking a Chrome process.
        driver.quit()
99+
100+
101+
def getproblem(URLs):
    """
    Save each Codeforces problem page in *URLs* as a PDF file.

    For every URL, a headless Chrome driver captures a screenshot of the
    problem statement (the ``ttypography`` element), and the screenshot is
    wrapped into a single-page PDF (via fpdf) named after the problem title
    and stored under ./problems_pdf/.

    :param URLs: list of problem-page URLs to convert
    """
    path = 'image.png'  # temporary screenshot file, reused for every problem

    # Creating a Target Output Folder
    target_folder = './problems_pdf'
    if not os.path.exists(target_folder):
        os.makedirs(target_folder)

    options = webdriver.ChromeOptions()
    # Headless = True for taking a scrolling snapshot
    options.headless = True
    driver = webdriver.Chrome(DRIVER_PATH, options=options)
    file_counter = 1

    try:
        for url in URLs:
            driver.get(url)
            # Deciding height by tag: resize the window to the full document
            # height so the whole statement fits in one screenshot.
            required_height = driver.execute_script(
                'return document.body.parentNode.scrollHeight')
            driver.set_window_size(1366, required_height)

            # Title looks like "A. Problem Name"; drop the "A. " prefix.
            # NOTE(review): titles may contain characters invalid in file
            # names on some platforms — consider sanitizing.
            title = driver.find_element_by_class_name("title").text
            filename = title[3:] + '.pdf'

            # Taking SS of everything within the ttypography class
            driver.find_element_by_class_name('ttypography').screenshot(path)

            # Opening image with pillow so based to capture its height and width
            cover = Image.open(path)
            WIDTH, HEIGHT = cover.size
            MARGIN = 10
            # based on image's height and width we are adjusting the pdf margin and borders
            pdf = FPDF(unit='pt', format=[WIDTH + 2 * MARGIN, HEIGHT + 2 * MARGIN])
            pdf.add_page()  # Adding new page to the pdf
            pdf.image(path, MARGIN, MARGIN)

            pdf.output(os.path.join(target_folder, filename), "F")  # saving the pdf with the specified filename
            # BUG FIX: the progress message contained a broken placeholder;
            # report the actual file name that was written.
            print(f'File saved in your directory ./problems_pdf/{filename} ({file_counter}/{len(URLs)}) !')
            file_counter += 1
    finally:
        # BUG FIX: the driver was never closed, leaking a Chrome process.
        driver.quit()
49148

50149

51150
if __name__ == "__main__":
    DRIVER_PATH = input("Enter DRIVER PATH location: ")
    diff = select_difficulty()  # Accepting difficulty level from user
    problems_link = extracting_problem_links(diff)  # scraping the required the no. of links
    getproblem(problems_link)  # saving the Questions in PDF file.
    # BUG FIX: the temporary screenshot only exists when at least one
    # problem was captured; guard the cleanup so an empty run does not
    # crash with FileNotFoundError.
    if os.path.exists('image.png'):
        os.remove('image.png')

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /