Commit 2ac8f5e

Merge pull request avinashkranjan#837 from smriti26raina/issue-754
Added script for Codechef Scrapper
2 parents 91e787a + 127b4a6 commit 2ac8f5e

File tree

3 files changed: +161 −0 lines changed


Codechef Scrapper/README.md

Lines changed: 40 additions & 0 deletions
# Codechef Scraper

This Python script lets the user scrape 'n' CodeChef problems of any category/difficulty from https://www.codechef.com/, as chosen by the user. It collects the information for each problem and saves it as a separate PDF.

## Prerequisite Steps:

Download the required packages with the following command in your terminal (make sure you're in the project directory):

```
pip3 install -r requirements.txt
```
To run this script, you need Selenium installed and a Chrome WebDriver configured in your `$PATH`. You can download ChromeDriver directly from the link below:
https://chromedriver.chromium.org/downloads

Alternatively, you can set the path to ChromeDriver when creating the driver:

```
driver = webdriver.Chrome("/usr/lib/chromium-browser/chromedriver")
```
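For reference, codechef.py builds its driver headless with pageLoadStrategy set to "none" so it can stop page loads early once the needed elements appear. A minimal sketch of that setup with an explicit driver path (the path itself is an assumption; adjust it for your system):

```python
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

options = webdriver.ChromeOptions()
options.add_argument("--headless")  # no visible browser window

capa = DesiredCapabilities.CHROME
capa["pageLoadStrategy"] = "none"   # don't block until the full page loads

# The chromedriver path is an assumption; point it at your own install
driver = webdriver.Chrome("/usr/lib/chromium-browser/chromedriver",
                          desired_capabilities=capa, options=options)
```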
## Running the script:

After installing all the requirements, run this command in your terminal:

```
python3 codechef.py
```
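An illustrative session (the prompts come from codechef.py; the problem names and URLs shown are placeholders, not real output):

```
$ python3 codechef.py
Enter the difficulty level from the following
 Beginner
 Easy
 Medium
 Hard
 Challenge

Easy

 Enter the number of problems to be scraped:
2
<problem name>   <problem url>
<problem name>   <problem url>
```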
## Output:

This script generates 'n' separate PDFs in a folder, one per problem, each storing that problem's information (problem title, problem statement, test cases, problem link).

![image](https://user-images.githubusercontent.com/30191221/113629602-46a4ff80-9684-11eb-8938-c6e8f934d3ae.png)

![image](https://user-images.githubusercontent.com/30191221/113629697-64726480-9684-11eb-9d14-3b1ac515d40e.png)

Author:
[Smriti Raina](https://github.com/smriti26raina)

Codechef Scrapper/codechef.py

Lines changed: 117 additions & 0 deletions
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from fpdf import FPDF
import os

# Run Chrome headless so no browser window is opened
options = webdriver.ChromeOptions()
options.add_argument("--headless")

# pageLoadStrategy "none" returns control as soon as navigation starts;
# the explicit waits below decide when the page is ready to scrape
capa = DesiredCapabilities.CHROME
capa["pageLoadStrategy"] = "none"

driver = webdriver.Chrome(desired_capabilities=capa, options=options)
baseurl = "https://www.codechef.com/problems"
wait = WebDriverWait(driver, 15)

# map from problem difficulty to its URL path segment
problem_difficulty = {"Beginner": "school", "Easy": "easy", "Medium": "medium", "Hard": "hard", "Challenge": "challenge"}

# get_problems returns the names and links of the problems
def get_problems(category, no_of_problems):
    # A map to store problem name and problem url
    problem_info = {}
    try:
        driver.get(baseurl + '/' + category)
        # wait till the first row of the problems table is loaded
        wait.until(EC.element_to_be_clickable((By.XPATH, "//*[@id='primary-content']/div/div[2]/div/div[2]/table/tbody/tr[1]/td[1]/div/a/b")))
    except TimeoutException:
        print("Couldn't fetch problems. Network issue or page slow to render. Try again")
        os._exit(-1)

    for problem_index in range(1, no_of_problems + 1):
        problem_name = driver.find_element_by_xpath("//*[@id='primary-content']/div/div[2]/div/div[2]/table/tbody/tr[{}]/td[1]/div/a/b".format(problem_index)).text
        problem_url = driver.find_element_by_xpath("//*[@id='primary-content']/div/div[2]/div/div[2]/table/tbody/tr[{}]/td[1]/div/a".format(problem_index)).get_attribute('href')
        print(problem_name, " ", problem_url)
        problem_info[problem_name] = problem_url
    return problem_info

# get_problem_description returns the content of the problem
def get_problem_description(problem_url, problem_name):
    try:
        driver.get(problem_url)
        wait.until(EC.element_to_be_clickable((By.XPATH, "//*[@id='problem-statement']/p[1]")))
        problem_title = problem_name
        problem_statement = driver.find_element_by_xpath("//*[@id='problem-statement']/p[1]").text
        problem_test_cases = driver.find_element_by_xpath("//*[@id='problem-statement']/pre[1]").text

        # If the first <pre> block holds only the input, label it and append
        # the expected output from the second <pre> block
        if problem_test_cases.find("Output") == -1:
            problem_test_cases = "Input\n" + problem_test_cases
            problem_test_cases += "\nOutput\n"
            problem_test_cases += driver.find_element_by_xpath("//*[@id='problem-statement']/pre[2]").text

        # Stop loading the rest of the page; everything needed is scraped
        driver.execute_script("window.stop();")
        problem = {'title': problem_title, 'statement': problem_statement, 'test_case': problem_test_cases, 'url': problem_url}
        return problem

    # Handling exceptions
    except NoSuchElementException:
        print("Couldn't scrape the element, unable to locate it")
        return None
    except TimeoutException:
        print("Couldn't scrape the element, unable to locate it")
        return None

# storing the information in the pdf
def convert_to_pdf(problem):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=15)
    # Replace characters that aren't in the latin-1 character set,
    # since FPDF's built-in fonts only support latin-1
    title = problem["title"].encode('latin-1', 'replace').decode('latin-1')
    statement = problem["statement"].encode('latin-1', 'replace').decode('latin-1')
    test_case = problem["test_case"].encode('latin-1', 'replace').decode('latin-1')
    url = problem["url"]
    # add sections to the pdf: centered title, then statement, test cases, link
    pdf.cell(200, 10, txt=title, ln=1, align='C')
    pdf.multi_cell(200, 10, txt=statement, align='L')
    pdf.multi_cell(200, 10, txt=test_case, align='L')
    pdf.write(5, 'Problem_Link: ')
    pdf.write(5, url, url)

    pdf.output(title + ".pdf")

# main function
def main():
    category = input("Enter the difficulty level from the following \n Beginner \n Easy \n Medium \n Hard \n Challenge \n\n")
    no_of_problems = int(input("\n Enter the number of problems to be scraped: \n"))
    info = get_problems(problem_difficulty[category], no_of_problems)
    for name, url in info.items():
        problem = get_problem_description(url, name)
        if problem is not None:
            convert_to_pdf(problem)

if __name__ == '__main__':
    main()
    driver.close()
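Note: the find_element_by_xpath calls match the pinned selenium==3.141.0; that API was removed in Selenium 4. If you upgrade, each lookup would need the By form instead, roughly:

```python
from selenium.webdriver.common.by import By

# Selenium 4 equivalent of driver.find_element_by_xpath(xpath)
element = driver.find_element(By.XPATH, "//*[@id='problem-statement']/p[1]")
```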

Codechef Scrapper/requirements.txt

Lines changed: 4 additions & 0 deletions
fpdf==1.7.2
requests==2.24.0
selenium==3.141.0
urllib3==1.25.11
