
Commit 77ae12d

Merge pull request avinashkranjan#1116 from RohiniRG/RohiniRG-scrapereddit
Reddit scraper without API
2 parents 482d48c + 9837f7e commit 77ae12d

File tree

4 files changed: +244 −0 lines changed


Reddit_Scraper_without_API/README.md

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
# Reddit Scraper

- Using BeautifulSoup, a Python library for web scraping, this script scrapes a desired subreddit and collects the relevant data about its posts.

- In `fetch_reddit.py`, the user is asked for the subreddit name, a tag, and the maximum number of posts to scrape; the script then fetches that information and stores it in a database file.

- In `display_reddit.py`, the stored results are read back from the database and displayed to the user (the query sketch below shows where the data lives).
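The posts end up in a local SQLite file named `SubredditDatabase.db`, in a table called `posts` (created by `fetch_reddit.py`). Once that file exists, it can also be inspected outside the scripts, for example with the `sqlite3` CLI; the subreddit value below is only an illustration:

```shell
$ sqlite3 SubredditDatabase.db \
    "SELECT TAG, TITLE, UPVOTES FROM posts WHERE SUBREDDIT = 'learnpython' LIMIT 5;"
```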
## Setup instructions

- The requirements can be installed as follows:

```shell
$ pip install -r requirements.txt
```
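After installing the requirements, a typical session (assuming both scripts are run from this directory) might look like:

```shell
$ python fetch_reddit.py    # scrape a subreddit and store its posts in SubredditDatabase.db
$ python display_reddit.py  # read the stored posts back from the database
```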
## Working screenshots

![Image](https://i.imgur.com/2jHHjCh.png)

![Image](https://i.imgur.com/XW8dkrQ.png)

## Author

[Rohini Rao](www.github.com/RohiniRG)
Reddit_Scraper_without_API/display_reddit.py

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
import sqlite3
import os


def sql_connection():
    """
    Establishes a connection to the SQLite database file
    :return: connection object
    """
    path = os.path.abspath('SubredditDatabase.db')
    con = sqlite3.connect(path)
    return con


def sql_fetcher(con):
    """
    Fetches all the posts stored for the given subreddit from our database
    :param con:
    :return:
    """
    subreddit = input("\nEnter subreddit to search: r/")
    count = 0
    cur = con.cursor()
    cur.execute('SELECT * FROM posts')  # SQL search query
    rows = cur.fetchall()

    for r in rows:
        if subreddit in r:
            count += 1
            print(f'\nTAG: {r[1]}\nPOST TITLE: {r[2]}\nAUTHOR: {r[3]}\n'
                  f'TIME STAMP: {r[4]}\nUPVOTES: {r[5]}\nCOMMENTS: {r[6]}'
                  f'\nURL: {r[7]}\n')

    if count:
        print(f'{count} posts fetched from database\n')
    else:
        print('\nNo posts stored for this subreddit\n')


con = sql_connection()

while True:
    sql_fetcher(con)

    ans = input('\nPress (y) to continue or any other key to exit: ').lower()
    if ans == 'y':
        continue
    else:
        print('\nExiting..\n')
        break
Reddit_Scraper_without_API/fetch_reddit.py

Lines changed: 159 additions & 0 deletions
@@ -0,0 +1,159 @@
import requests
import time
import sqlite3
from bs4 import BeautifulSoup


def sql_connection():
    """
    Establishes a connection to the SQLite database file
    :return: connection object
    """
    con = sqlite3.connect('SubredditDatabase.db')
    return con


def sql_table(con):
    """
    Creates a table in the database (if it does not exist already)
    to store the post info
    :param con:
    :return:
    """
    cur = con.cursor()
    cur.execute("CREATE TABLE IF NOT EXISTS posts(SUBREDDIT text, TAG text, "
                " TITLE text, AUTHOR text, TIMESTAMP text, UPVOTES int, "
                " COMMENTS text, URL text)")
    con.commit()


def sql_insert_table(con, entities):
    """
    Inserts the desired data into the table that stores the post info
    :param con:
    :param entities:
    :return:
    """
    cur = con.cursor()
    cur.execute('INSERT INTO posts(SUBREDDIT, TAG, TITLE, AUTHOR, '
                'TIMESTAMP, UPVOTES, COMMENTS, URL) '
                'VALUES(?, ?, ?, ?, ?, ?, ?, ?)', entities)
    con.commit()


def scraper():
    """
    Scrapes the post info from the desired subreddit and stores it
    in the database.
    :return:
    """
    con = sql_connection()
    sql_table(con)

    while True:
        subreddit = input('\n\nEnter the name of the subreddit: r/').lower()
        max_count = int(input('Enter the maximum number of entries to collect: '))
        select = int(input('Select tags to add for the search: \n1. hot\n2. new'
                           '\n3. rising\n4. controversial\n5. top\nMake your choice: '))

        if select == 1:
            tag = 'hot'
            tag_url = '/'
        elif select == 2:
            tag = 'new'
            tag_url = '/new/'
        elif select == 3:
            tag = 'rising'
            tag_url = '/rising/'
        elif select == 4:
            tag = 'controversial'
            tag_url = '/controversial/'
        elif select == 5:
            tag = 'top'
            tag_url = '/top/'
        else:
            print('Invalid choice, defaulting to hot')
            tag = 'hot'
            tag_url = '/'

        # URL for the desired subreddit and tag
        url = 'https://old.reddit.com/r/' + subreddit + tag_url

        # Using a user-agent to mimic browser activity
        headers = {'User-Agent': 'Mozilla/5.0'}

        req = requests.get(url, headers=headers)

        if req.status_code == 200:
            soup = BeautifulSoup(req.text, 'html.parser')
            print(f'\nCOLLECTING INFORMATION FOR r/{subreddit}....')

            attrs = {'class': 'thing'}
            counter = 1
            full = 0
            while True:
                for post in soup.find_all('div', attrs=attrs):
                    try:
                        # To obtain the post title
                        title = post.find('a', class_='title').text

                        # To get the username of the post author
                        author = post.find('a', class_='author').text

                        # To obtain the time of the post
                        time_stamp = post.time.attrs['title']

                        # To obtain the number of comments on the post
                        comments = post.find('a', class_='comments').text.split()[0]
                        if comments == 'comment':
                            comments = 0

                        # To get the number of upvotes on the post
                        upvotes = post.find('div', class_='score likes').text
                        if upvotes == '•':
                            upvotes = "None"

                        # To get the URL of the post
                        link = post.find('a', class_='title')['href']
                        link = 'www.reddit.com' + link

                        # Entering all the collected information into our database
                        entities = (subreddit, tag, title, author, time_stamp, upvotes,
                                    comments, link)
                        sql_insert_table(con, entities)

                        if counter == max_count:
                            full = 1
                            break

                        counter += 1
                    except AttributeError:
                        continue

                if full:
                    break

                try:
                    # To go to the next page
                    next_button = soup.find('span', class_='next-button')
                    next_page_link = next_button.find('a').attrs['href']

                    time.sleep(2)

                    req = requests.get(next_page_link, headers=headers)
                    soup = BeautifulSoup(req.text, 'html.parser')
                except Exception:
                    # No next page (or the request failed): stop paginating
                    break

            print('DONE!\n')
            ans = input('Press (y) to continue or any other key to exit: ').lower()
            if ans == 'y':
                continue
            else:
                print('Exiting..')
                break
        else:
            print('Error fetching results.. Try again!')


if __name__ == '__main__':
    scraper()
Reddit_Scraper_without_API/requirements.txt

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
beautifulsoup4==4.9.3
certifi==2020.12.5
chardet==4.0.0
idna==2.10
requests==2.25.1
soupsieve==2.2.1
urllib3==1.26.4

0 commit comments
