
Commit 77ae12d

Merge pull request avinashkranjan#1116 from RohiniRG/RohiniRG-scrapereddit
Reddit scraper without API
2 parents 482d48c + 9837f7e commit 77ae12d

File tree

4 files changed: +244 −0 lines changed


Reddit_Scraper_without_API/README.md

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
# Reddit Scraper

- Using BeautifulSoup, a Python library for web scraping, this script scrapes a desired subreddit and collects the relevant data about its posts.

- In `fetch_reddit.py`, the user is asked for the subreddit name, a tag, and the maximum number of posts to scrape; the script then fetches that information and stores it in a database file.

- In `display_reddit.py`, the stored results are read back from the database and displayed to the user (the query sketch below shows where the data lives).
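The posts end up in a local SQLite file named `SubredditDatabase.db`, in a table called `posts` (created by `fetch_reddit.py`). Once that file exists, it can also be inspected outside the scripts, for example with the `sqlite3` CLI; the subreddit value below is only an illustration:

```shell
$ sqlite3 SubredditDatabase.db \
    "SELECT TAG, TITLE, UPVOTES FROM posts WHERE SUBREDDIT = 'learnpython' LIMIT 5;"
```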
## Setup instructions

- The requirements can be installed as follows:

```shell
$ pip install -r requirements.txt
```
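After installing the requirements, a typical session (assuming both scripts are run from this directory) might look like:

```shell
$ python fetch_reddit.py    # scrape a subreddit and store its posts in SubredditDatabase.db
$ python display_reddit.py  # read the stored posts back from the database
```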
## Working screenshots

![Image](https://i.imgur.com/2jHHjCh.png)

![Image](https://i.imgur.com/XW8dkrQ.png)

## Author

[Rohini Rao](www.github.com/RohiniRG)
Reddit_Scraper_without_API/display_reddit.py

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
import sqlite3
import os


def sql_connection():
    """
    Establishes a connection to the SQLite database file
    :return: connection object
    """
    path = os.path.abspath('SubredditDatabase.db')
    con = sqlite3.connect(path)
    return con


def sql_fetcher(con):
    """
    Fetches all the posts stored for the given subreddit from our database
    :param con:
    :return:
    """
    subreddit = input("\nEnter subreddit to search: r/")
    count = 0
    cur = con.cursor()
    cur.execute('SELECT * FROM posts')  # SQL search query
    rows = cur.fetchall()

    for r in rows:
        if subreddit in r:
            count += 1
            print(f'\nTAG: {r[1]}\nPOST TITLE: {r[2]}\nAUTHOR: {r[3]}\n'
                  f'TIME STAMP: {r[4]}\nUPVOTES: {r[5]}\nCOMMENTS: {r[6]}'
                  f'\nURL: {r[7]}\n')

    if count:
        print(f'{count} posts fetched from database\n')
    else:
        print('\nNo posts stored for this subreddit\n')


con = sql_connection()

while True:
    sql_fetcher(con)

    ans = input('\nPress (y) to continue or any other key to exit: ').lower()
    if ans == 'y':
        continue
    else:
        print('\nExiting..\n')
        break
Reddit_Scraper_without_API/fetch_reddit.py

Lines changed: 159 additions & 0 deletions
@@ -0,0 +1,159 @@
import requests
import time
import sqlite3
from bs4 import BeautifulSoup


def sql_connection():
    """
    Establishes a connection to the SQLite database file
    :return: connection object
    """
    con = sqlite3.connect('SubredditDatabase.db')
    return con


def sql_table(con):
    """
    Creates a table in the database (if it does not exist already)
    to store the post info
    :param con:
    :return:
    """
    cur = con.cursor()
    cur.execute("CREATE TABLE IF NOT EXISTS posts(SUBREDDIT text, TAG text, "
                " TITLE text, AUTHOR text, TIMESTAMP text, UPVOTES int, "
                " COMMENTS text, URL text)")
    con.commit()


def sql_insert_table(con, entities):
    """
    Inserts the desired data into the table that stores the post info
    :param con:
    :param entities:
    :return:
    """
    cur = con.cursor()
    cur.execute('INSERT INTO posts(SUBREDDIT, TAG, TITLE, AUTHOR, '
                'TIMESTAMP, UPVOTES, COMMENTS, URL) '
                'VALUES(?, ?, ?, ?, ?, ?, ?, ?)', entities)
    con.commit()


def scraper():
    """
    Scrapes the post info from the desired subreddit and stores it
    in the database.
    :return:
    """
    con = sql_connection()
    sql_table(con)

    while True:
        subreddit = input('\n\nEnter the name of the subreddit: r/').lower()
        max_count = int(input('Enter the maximum number of entries to collect: '))
        select = int(input('Select tags to add for the search: \n1. hot\n2. new'
                           '\n3. rising\n4. controversial\n5. top\nMake your choice: '))

        if select == 1:
            tag = 'hot'
            tag_url = '/'
        elif select == 2:
            tag = 'new'
            tag_url = '/new/'
        elif select == 3:
            tag = 'rising'
            tag_url = '/rising/'
        elif select == 4:
            tag = 'controversial'
            tag_url = '/controversial/'
        elif select == 5:
            tag = 'top'
            tag_url = '/top/'
        else:
            print('Invalid choice, defaulting to hot')
            tag = 'hot'
            tag_url = '/'

        # URL for the desired subreddit and tag
        url = 'https://old.reddit.com/r/' + subreddit + tag_url

        # Using a user-agent to mimic browser activity
        headers = {'User-Agent': 'Mozilla/5.0'}

        req = requests.get(url, headers=headers)

        if req.status_code == 200:
            soup = BeautifulSoup(req.text, 'html.parser')
            print(f'\nCOLLECTING INFORMATION FOR r/{subreddit}....')

            attrs = {'class': 'thing'}
            counter = 1
            full = 0
            while True:
                for post in soup.find_all('div', attrs=attrs):
                    try:
                        # To obtain the post title
                        title = post.find('a', class_='title').text

                        # To get the username of the post author
                        author = post.find('a', class_='author').text

                        # To obtain the time of the post
                        time_stamp = post.time.attrs['title']

                        # To obtain the number of comments on the post
                        comments = post.find('a', class_='comments').text.split()[0]
                        if comments == 'comment':
                            comments = 0

                        # To get the number of upvotes on the post
                        upvotes = post.find('div', class_='score likes').text
                        if upvotes == '•':
                            upvotes = "None"

                        # To get the URL of the post
                        link = post.find('a', class_='title')['href']
                        link = 'www.reddit.com' + link

                        # Entering all the collected information into our database
                        entities = (subreddit, tag, title, author, time_stamp, upvotes,
                                    comments, link)
                        sql_insert_table(con, entities)

                        if counter == max_count:
                            full = 1
                            break

                        counter += 1
                    except AttributeError:
                        continue

                if full:
                    break

                try:
                    # To go to the next page
                    next_button = soup.find('span', class_='next-button')
                    next_page_link = next_button.find('a').attrs['href']

                    time.sleep(2)

                    req = requests.get(next_page_link, headers=headers)
                    soup = BeautifulSoup(req.text, 'html.parser')
                except Exception:
                    # No next page (or the request failed): stop paginating
                    break

            print('DONE!\n')
            ans = input('Press (y) to continue or any other key to exit: ').lower()
            if ans == 'y':
                continue
            else:
                print('Exiting..')
                break
        else:
            print('Error fetching results.. Try again!')


if __name__ == '__main__':
    scraper()
Reddit_Scraper_without_API/requirements.txt

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
beautifulsoup4==4.9.3
certifi==2020.12.5
chardet==4.0.0
idna==2.10
requests==2.25.1
soupsieve==2.2.1
urllib3==1.26.4

0 commit comments
