Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit deb923b

Browse files
Merge pull request avinashkranjan#1595 from THEGAMECHANGER416/master
Added script to scrape coursera
2 parents 2a0cf5a + 6a6ec46 commit deb923b

File tree

3 files changed

+180
-0
lines changed

3 files changed

+180
-0
lines changed

‎Coursera Scraper/README.md‎

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# Coursera Courses Scraper
2+
## Description
3+
A simple module that retrieves Coursera courses by web scraping. Its purpose is to provide an alternative way to get a list of courses from Coursera.
4+
5+
### Language
6+
- [X] Python
7+
8+
### Usage
9+
To scrape the courses, this application imports the following module.
10+
```python
11+
import selenium
12+
```
13+
14+
### Instructions to run this application
15+
16+
1. Fork the repository and open `courses.py`
17+
2. Initialize the courses class with
18+
```python
19+
c = Courses("<Course_Name>", <No_of_pages>)  # <No_of_pages> is an integer, e.g. 5
20+
```
21+
3. Use any of the functions to get required data like
22+
```python
23+
c.scrape_all()
24+
```
25+
4. It will return a dictionary containing the list of courses
26+
27+
##### Example Output
28+
The functions will return -
29+
```
30+
{
31+
data : [<List of Dictionaries>],
32+
message : Course Titles for <Keyword>
33+
}
34+
```

‎Coursera Scraper/courses.py‎

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
from selenium import webdriver
2+
from selenium.webdriver.common.by import By
3+
from selenium.webdriver.chrome.service import Service
4+
from selenium.webdriver.support.ui import WebDriverWait
5+
from selenium.webdriver.support import expected_conditions as EC
6+
from selenium.webdriver.chrome.options import Options
7+
8+
"""
9+
Example code :
10+
python_scraper = Courses("python",5)
11+
print(python_scraper.scrape_all())
12+
"""
13+
class Courses:
    """Scrape Coursera search results for a keyword using headless Chrome.

    Every public method opens its own browser session, walks up to
    ``page_count`` result pages and returns a dictionary of the form
    ``{"data": [...], "message": "..."}``.  If anything goes wrong while
    scraping, ``data`` is ``None`` and the message says that no courses were
    found.  The browser is always closed before a method returns.

    Example::

        python_scraper = Courses("python", 5)
        print(python_scraper.scrape_all())
    """

    # Path to the chromedriver executable.  When left empty, ``Service()`` is
    # created without a path so Selenium falls back to its own default.
    chromedriver_path = ''

    # CSS selectors for the result cards and the pagination button.
    _COURSE_SELECTOR = 'main ul>li'
    _NEXT_SELECTOR = 'button[aria-label="Next Page"]'

    # JavaScript snippets evaluated against one result card (arguments[0]).
    _TITLE_JS = 'return arguments[0].querySelector("h3")?.innerText'
    _DESCRIPTION_JS = 'return arguments[0].querySelector("p>span")?.innerText'
    _REVIEW_JS = 'return arguments[0].querySelector("div:has(>svg)")?.innerText.replace("\\n\\n","⭐")'
    _URL_JS = 'return String(arguments[0].querySelector("a")?.href)'

    def __init__(self, keyword, page_count):
        """Store the search settings.

        Args:
            keyword: Search term sent to Coursera.
            page_count: Maximum number of result pages to visit (an integer,
                or anything ``int()`` accepts).
        """
        self.keyword = keyword
        self.page_count = page_count

    @staticmethod
    def _search_url(keyword):
        """Return the Coursera search URL for *keyword*, URL-encoded."""
        from urllib.parse import quote_plus

        # Encoding keeps spaces, '+', '&' and similar characters from
        # corrupting the query string.
        return 'https://www.coursera.org/search?query=' + quote_plus(str(keyword))

    def __scrape_page(self):
        """Start headless Chrome on the search page.

        Returns:
            A ``(wait, driver)`` tuple: a ``WebDriverWait`` with a 100 second
            timeout and the Chrome driver it is bound to.  The caller is
            responsible for calling ``driver.quit()``.
        """
        options = Options()
        options.add_argument("--headless")
        if self.chromedriver_path:
            service = Service(self.chromedriver_path)
        else:
            service = Service()
        driver = webdriver.Chrome(service=service, options=options)
        try:
            wait = WebDriverWait(driver, 100)
            driver.get(self._search_url(self.keyword))
        except Exception:
            # Do not leave an orphaned browser behind if navigation fails.
            driver.quit()
            raise
        return wait, driver

    def _collect(self, extract):
        """Walk the result pages and gather one item per course card.

        Args:
            extract: Callable ``extract(driver, course, index)`` returning the
                item to store for a card; ``index`` is the running count of
                items collected so far across all pages.

        Returns:
            ``{"data": [...], "message": ...}`` on success, or the same shape
            with ``data`` set to ``None`` when scraping raised an exception.
        """
        wait, driver = self.__scrape_page()
        items = []
        try:
            for _ in range(int(self.page_count)):
                courses = wait.until(
                    EC.visibility_of_all_elements_located(
                        (By.CSS_SELECTOR, self._COURSE_SELECTOR)
                    )
                )
                for course in courses:
                    items.append(extract(driver, course, len(items)))

                # find_elements returns an empty list instead of raising, so
                # a result set without pagination keeps its collected data.
                buttons = driver.find_elements(By.CSS_SELECTOR, self._NEXT_SELECTOR)
                next_btn = buttons[0] if buttons else None
                if (
                    next_btn is None
                    or 'disabled' in (next_btn.get_attribute('class') or '')
                    or not next_btn.is_enabled()
                ):
                    print('There are no more pages')
                    break
                # NOTE(review): the next wait may see cards from the previous
                # page if the site re-renders slowly - confirm against the
                # live page.
                next_btn.click()
            # The success message is the same for every kind of data; it is
            # kept verbatim so existing callers see unchanged output.
            return {
                "data": items,
                "message": f"Course Titles for {self.keyword}"
            }
        except Exception:
            return {
                "data": None,
                "message": f"No courses found for {self.keyword}"
            }
        finally:
            # Always release the browser, on success and on failure.
            driver.quit()

    def scrape_all(self):
        """Return id, title, description, review and url for every course."""
        def extract(driver, course, index):
            return {
                "id": index,
                "title": driver.execute_script(self._TITLE_JS, course),
                "description": driver.execute_script(self._DESCRIPTION_JS, course),
                "review": driver.execute_script(self._REVIEW_JS, course),
                "url": driver.execute_script(self._URL_JS, course),
            }

        return self._collect(extract)

    def course_titles(self):
        """Return the title text of every course card."""
        return self._collect(
            lambda driver, course, _index: driver.execute_script(self._TITLE_JS, course)
        )

    def course_description(self):
        """Return the description text of every course card."""
        return self._collect(
            lambda driver, course, _index: driver.execute_script(self._DESCRIPTION_JS, course)
        )

    def course_reviews(self):
        """Return the review text of every course card."""
        return self._collect(
            lambda driver, course, _index: driver.execute_script(self._REVIEW_JS, course)
        )

    def course_urls(self):
        """Return the link target of every course card as a string."""
        return self._collect(
            lambda driver, course, _index: driver.execute_script(self._URL_JS, course)
        )
144+
if __name__ == "__main__":
    # Demo run: scrape up to 5 result pages for "python" and print the result.
    # Guarded so that importing this module does not launch a browser.
    python_scraper = Courses("python", 5)
    print(python_scraper.scrape_all())

‎Coursera Scraper/requirements.txt‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
selenium==4.9.1

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /