Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit bfb1e89

Browse files
Merge pull request avinashkranjan#2635 from Mihan786Chistie/techCrunch
added Tech crunch Scraper
2 parents 57fe6d4 + 2dc2121 commit bfb1e89

File tree

3 files changed

+173
-0
lines changed

3 files changed

+173
-0
lines changed

‎TechCrunch-Scraper/README.md‎

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
## Tech Crunch
2+
3+
### Scrape articles with title, descriptions, images, author, date and link
4+
5+
Create an instance of `TechCrunch` class.
6+
7+
```python
8+
articles = TechCrunch()
9+
```
10+
11+
| Methods | Details |
12+
| ---------------- | ---------------------------------------------------------------------------------------------------------------------- |
13+
| `.get_articles(category)` | Returns the articles with title, descriptions, images, author, date and link regarding a category in JSON format |
14+
| `.search(topic)` | Returns the searched articles with title, descriptions, images, author, date and link regarding a topic in JSON format |
15+
16+
---

‎TechCrunch-Scraper/requirements.txt‎

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
beautifulsoup4
2+
requests
3+
# json is part of the Python standard library; it does not need to be (and cannot be) pip-installed

‎TechCrunch-Scraper/techCrunch.py‎

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
import requests
2+
from bs4 import BeautifulSoup
3+
import json
4+
5+
6+
class TechCrunch:
    """
    Class - `TechCrunch`

    Scraper for TechCrunch articles: title, description, image, author,
    date and link.

    Example:
    ```
    articles = TechCrunch()
    ```

    Methods:
    1. `.get_articles(category)` - Category articles as a JSON string.
    2. `.search(topic)` - Search results as a JSON string.
    """

    # Seconds to wait for a TechCrunch response before giving up;
    # without a timeout, requests.get() can hang indefinitely.
    _REQUEST_TIMEOUT = 10

    @staticmethod
    def _clean_text(element):
        """Return a tag's text, stripped and reduced to plain ASCII."""
        return element.getText().strip().encode("ascii", "ignore").decode()

    def get_articles(self, category):
        """
        Fetch articles for a TechCrunch category.

        Example:
        ```
        articles = TechCrunch()
        articles.get_articles("artificial-intelligence")
        ```
        Returns a JSON string shaped like:
        {
            "articles": [
                {
                    "title": Title of the article
                    "description": Description of the article
                    "image": Image of the article
                    "author": Author of the article
                    "date": Date the article was posted
                    "link": Link to the article
                },
                ...
            ]
        }
        On any network or parsing failure, returns a JSON string with a
        single "message" key describing the error.
        """
        # TechCrunch category slugs are lowercase and hyphen-separated.
        url = "https://techcrunch.com/category/" + category.replace(" ", "-").lower()
        try:
            res = requests.get(url, timeout=self._REQUEST_TIMEOUT)
            res.raise_for_status()  # surface HTTP errors (404, 500, ...) as failures
            soup = BeautifulSoup(res.text, "html.parser")

            articles_data = {"articles": []}

            articles = soup.find_all(
                "div", class_="post-block post-block--image post-block--unread"
            )
            for block in articles:
                name = self._clean_text(block.select_one(".post-block__title__link"))
                desc = self._clean_text(block.select_one(".post-block__content"))
                image = block.find_all("img", src=True)[0]["src"]
                author = self._clean_text(block.select_one(".river-byline__authors"))
                byline = block.find_all("div", class_="river-byline")
                date = self._clean_text(byline[0].select_one(".river-byline__time"))
                links = block.find_all(
                    "a", class_="post-block__title__link", href=True
                )
                articles_data["articles"].append(
                    {
                        "title": name,
                        "description": desc,
                        "image": image,
                        "author": author,
                        "date": date,
                        "link": links[0]["href"],
                    }
                )
            return json.dumps(articles_data)
        # requests raises RequestException subclasses for network/HTTP
        # problems; AttributeError/IndexError/TypeError occur when the page
        # layout changes and a selector finds nothing. The original
        # `except ValueError` could never match any of these, so the
        # error path was unreachable.
        except (
            requests.RequestException,
            AttributeError,
            IndexError,
            TypeError,
            ValueError,
        ):
            error_message = {
                "message": "Can't fetch any articles from the topic provided."
            }
            return json.dumps(error_message)

    def search(self, topic):
        """
        Search TechCrunch for articles about `topic`.

        Example:
        ```
        articles = TechCrunch()
        articles.search("github")
        ```
        Returns a JSON string with the same shape as `get_articles`.
        (Previously this method inconsistently returned a raw dict on
        success; it now always returns JSON, matching the documented
        contract and its sibling method.)
        On failure, returns a JSON string with a single "message" key.
        """
        url = "https://search.techcrunch.com/search?p=" + topic + "&fr=techcrunch"
        try:
            res = requests.get(url, timeout=self._REQUEST_TIMEOUT)
            res.raise_for_status()  # surface HTTP errors as failures
            soup = BeautifulSoup(res.text, "html.parser")

            articles_data = {"articles": []}

            results = soup.find_all("li", class_="ov-a mt-0 pt-26 pb-26 bt-dbdbdb")
            for item in results:
                # The title anchor carries both the display text and the href.
                title_link = item.find("a", class_="fz-20 lh-22 fw-b", href=True)
                image = item.find("img", class_="s-img mr-10 s-img-errchk", src=True)
                articles_data["articles"].append(
                    {
                        "title": item.find("a", class_="fz-20 lh-22 fw-b").getText(),
                        "description": item.find(
                            "p", class_="fz-14 lh-20 c-777"
                        ).getText(),
                        "image": image["src"],
                        "author": item.find("span", class_="mr-15").getText(),
                        "date": item.find("span", class_="pl-15 bl-1-666").getText(),
                        "link": title_link["href"],
                    }
                )
            # Serialize like get_articles instead of returning a bare dict.
            return json.dumps(articles_data)
        except (
            requests.RequestException,
            AttributeError,
            IndexError,
            TypeError,
            ValueError,
        ):
            error_message = {
                "message": "Can't fetch any articles from the topic provided."
            }
            return json.dumps(error_message)

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /