Commit deb923b

authored

Merge pull request avinashkranjan#1595 from THEGAMECHANGER416/master

Added script to scrape coursera

2 parents 2a0cf5a + 6a6ec46 commit deb923bCopy full SHA for deb923b

File tree

3 files changed

+180

-0

lines changed

Coursera Scraper

3 files changed

+180

-0

lines changed

`‎Coursera Scraper/README.md‎`

Lines changed: 34 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,34 @@`
	`1`	`+# Coursera Courses Scraper`
	`2`	`+## Description`
	`3`	`+A simple scraping module that retrieves Coursera courses by web scraping. Its purpose is to provide an alternative way to get a list of courses from Coursera.`
	`4`	`+`
	`5`	`+### Language`
	`6`	`+- [X] Python`
	`7`	`+`
	`8`	`+### Usage`
	`9`	+To access the `courses`, this application imports the following modules.
	`10`	+```python
	`11`	`+import selenium`
	`12`	+```
	`13`	`+`
	`14`	`+### Instructions to run this application`
	`15`	`+`
	`16`	+ 1. Fork the repository and open `courses.py`
	`17`	`+ 2. Initialize the courses class with`
	`18`	+```python
	`19`	`+c = Courses("<Course_Name>", <No_of_pages>)  # <No_of_pages> is an integer, e.g. Courses("python", 5)`
	`20`	+```
	`21`	`+ 3. Use any of the functions to get required data like`
	`22`	+```python
	`23`	`+c.scrape_all()`
	`24`	+```
	`25`	`+ 4. It will return a dictionary containing the list of courses`
	`26`	`+`
	`27`	`+##### Example Output`
	`28`	`+The functions will return -`
	`29`	+```
	`30`	`+{`
	`31`	`+ data : [<List of Dictionaries>],`
	`32`	`+ message : Course Titles for <Keyword>`
	`33`	`+}`
	`34`	+```

`‎Coursera Scraper/courses.py‎`

Lines changed: 145 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,145 @@`
	`1`	`+from selenium import webdriver`
	`2`	`+from selenium.webdriver.common.by import By`
	`3`	`+from selenium.webdriver.chrome.service import Service`
	`4`	`+from selenium.webdriver.support.ui import WebDriverWait`
	`5`	`+from selenium.webdriver.support import expected_conditions as EC`
	`6`	`+from selenium.webdriver.chrome.options import Options`
	`7`	`+`
	`8`	`+"""`
	`9`	`+Example code :`
	`10`	`+ python_scraper = Courses("python",5)`
	`11`	`+ print(python_scraper.scrape_all())`
	`12`	`+"""`
	`13`	`+class Courses:`
	`14`	`+ def __init__(self, keyword, page_count):`
	`15`	`+ self.keyword = keyword`
	`16`	`+ self.page_count = page_count`
	`17`	`+`
	`18`	`+ def __scrape_page(self):`
	`19`	`+ chromedriver_path = ''`
	`20`	`+ options = Options()`
	`21`	`+ options.add_argument("--headless")`
	`22`	`+ driver = webdriver.Chrome(service=Service(chromedriver_path), options=options)`
	`23`	`+ wait = WebDriverWait(driver, 100)`
	`24`	`+ driver.get('https://www.coursera.org/search?query=' + self.keyword)`
	`25`	`+ return wait, driver`
	`26`	`+ def scrape_all(self):`
	`27`	`+ wait, driver = self.__scrape_page()`
	`28`	`+ courses_data = []`
	`29`	`+ try:`
	`30`	`+ j = 0`
	`31`	`+ for i in range(self.page_count):`
	`32`	`+ courses = wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'main ul>li')))`
	`33`	`+ for course in courses:`
	`34`	`+ title = driver.execute_script('return arguments[0].querySelector("h3")?.innerText',course)`
	`35`	`+ description = driver.execute_script('return arguments[0].querySelector("p>span")?.innerText', course)`
	`36`	`+ review = driver.execute_script('return arguments[0].querySelector("div:has(>svg)")?.innerText.replace("\\n\\n","⭐")', course)`
	`37`	`+ url = driver.execute_script('return String(arguments[0].querySelector("a")?.href)', course)`
	`38`	`+ data = {"id":j,"title":title,"description":description,"review":review,"url":url}`
	`39`	`+ courses_data += [data]`
	`40`	`+ j+=1`
	`41`	`+ next_btn = driver.find_element(By.CSS_SELECTOR, 'button[aria-label="Next Page"]')`
	`42`	`+ if 'disabled' in next_btn.get_attribute('class'):`
	`43`	`+ print('There are no more pages')`
	`44`	`+ break`
	`45`	`+ else:`
	`46`	`+ next_btn.click()`
	`47`	`+ return {`
	`48`	`+ "data": courses_data,`
	`49`	`+ "message": f"Course Titles for {self.keyword}"`
	`50`	`+ }`
	`51`	`+ except:`
	`52`	`+ return {`
	`53`	`+ "data": None,`
	`54`	`+ "message": f"No courses found for {self.keyword}"`
	`55`	`+ }`
	`56`	`+ def course_titles(self):`
	`57`	`+ wait, driver = self.__scrape_page()`
	`58`	`+ titles = []`
	`59`	`+ try:`
	`60`	`+ for i in range(self.page_count):`
	`61`	`+ courses = wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'main ul>li')))`
	`62`	`+ titles.extend([driver.execute_script('return arguments[0].querySelector("h3")?.innerText', course) for course in courses])`
	`63`	`+ next_btn = driver.find_element(By.CSS_SELECTOR, 'button[aria-label="Next Page"]')`
	`64`	`+ if 'disabled' in next_btn.get_attribute('class'):`
	`65`	`+ print('There are no more pages')`
	`66`	`+ break`
	`67`	`+ else:`
	`68`	`+ next_btn.click()`
	`69`	`+ return {`
	`70`	`+ "data": titles,`
	`71`	`+ "message": f"Course Titles for {self.keyword}"`
	`72`	`+ }`
	`73`	`+ except:`
	`74`	`+ return {`
	`75`	`+ "data": None,`
	`76`	`+ "message": f"No courses found for {self.keyword}"`
	`77`	`+ }`
	`78`	`+ def course_description(self):`
	`79`	`+ wait, driver = self.__scrape_page()`
	`80`	`+ descriptions = []`
	`81`	`+ try:`
	`82`	`+ for i in range(self.page_count):`
	`83`	`+ courses = wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'main ul>li')))`
	`84`	`+ descriptions.extend([driver.execute_script('return arguments[0].querySelector("p>span")?.innerText', course) for course in courses])`
	`85`	`+ next_btn = driver.find_element(By.CSS_SELECTOR, 'button[aria-label="Next Page"]')`
	`86`	`+ if 'disabled' in next_btn.get_attribute('class'):`
	`87`	`+ print('There are no more pages')`
	`88`	`+ break`
	`89`	`+ else:`
	`90`	`+ next_btn.click()`
	`91`	`+ return {`
	`92`	`+ "data": descriptions,`
	`93`	`+ "message": f"Course Titles for {self.keyword}"`
	`94`	`+ }`
	`95`	`+ except:`
	`96`	`+ return {`
	`97`	`+ "data": None,`
	`98`	`+ "message": f"No courses found for {self.keyword}"`
	`99`	`+ }`
	`100`	`+ def course_reviews(self):`
	`101`	`+ wait, driver = self.__scrape_page()`
	`102`	`+ reviews = []`
	`103`	`+ try:`
	`104`	`+ for i in range(self.page_count):`
	`105`	`+ courses = wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'main ul>li')))`
	`106`	`+ reviews.extend([driver.execute_script('return arguments[0].querySelector("div:has(>svg)")?.innerText.replace("\\n\\n","⭐")', course) for course in courses])`
	`107`	`+ next_btn = driver.find_element(By.CSS_SELECTOR, 'button[aria-label="Next Page"]')`
	`108`	`+ if 'disabled' in next_btn.get_attribute('class'):`
	`109`	`+ print('There are no more pages')`
	`110`	`+ break`
	`111`	`+ else:`
	`112`	`+ next_btn.click()`
	`113`	`+ return {`
	`114`	`+ "data": reviews,`
	`115`	`+ "message": f"Course Titles for {self.keyword}"`
	`116`	`+ }`
	`117`	`+ except:`
	`118`	`+ return {`
	`119`	`+ "data": None,`
	`120`	`+ "message": f"No courses found for {self.keyword}"`
	`121`	`+ }`
	`122`	`+ def course_urls(self):`
	`123`	`+ wait, driver = self.__scrape_page()`
	`124`	`+ urls = []`
	`125`	`+ try:`
	`126`	`+ for i in range(self.page_count):`
	`127`	`+ courses = wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'main ul>li')))`
	`128`	`+ urls.extend([driver.execute_script('return String(arguments[0].querySelector("a")?.href)', course) for course in courses])`
	`129`	`+ next_btn = driver.find_element(By.CSS_SELECTOR, 'button[aria-label="Next Page"]')`
	`130`	`+ if 'disabled' in next_btn.get_attribute('class'):`
	`131`	`+ print('There are no more pages')`
	`132`	`+ break`
	`133`	`+ else:`
	`134`	`+ next_btn.click()`
	`135`	`+ return {`
	`136`	`+ "data": urls,`
	`137`	`+ "message": f"Course Titles for {self.keyword}"`
	`138`	`+ }`
	`139`	`+ except:`
	`140`	`+ return {`
	`141`	`+ "data": None,`
	`142`	`+ "message": f"No courses found for {self.keyword}"`
	`143`	`+ }`
	`144`	`+python_scraper = Courses("python",5)`
	`145`	`+print(python_scraper.scrape_all())`

`‎Coursera Scraper/requirements.txt‎`

Lines changed: 1 addition & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+selenium==4.9.1`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit deb923b

File tree

3 files changed

3 files changed

`‎Coursera Scraper/README.md‎`

`‎Coursera Scraper/courses.py‎`

`‎Coursera Scraper/requirements.txt‎`

0 commit comments