
Commit ce8fa75

Added script
1 parent 1a47b5c commit ce8fa75

File tree

2 files changed: +64 -0 lines changed


reddit-scraper/grabnews.py

Lines changed: 64 additions & 0 deletions

import json
import sqlite3

import requests


def handle(content):
    # Normalise falsy fields (empty strings, missing values) to None
    # so SQLite stores NULL instead of an empty value.
    return content if content else None


def dump(endpoint, toget):
    # Fetch the r/python listing for the given endpoint ('top', 'hot', ...)
    # as parsed JSON. Reddit rejects the default requests User-Agent,
    # so a custom one is sent.
    headers = {'User-agent': 'Chrome'}
    url = 'https://www.reddit.com/r/python/' + str(endpoint) + '/.json?limit=' + str(toget)
    response = requests.get(url, headers=headers)
    return json.loads(response.text)


def _news(c, data, toget, unique, table):
    # Upsert each returned post into `table`, keyed on the post id.
    # The listing may contain fewer posts than requested.
    children = data['data']['children']
    for i in range(min(toget, len(children))):
        post_data = children[i]['data']
        content_title = handle(post_data['title'].strip())
        content_text = handle(post_data['selftext'].strip())
        # author_fullname can be absent (e.g. deleted accounts).
        content_author = handle(post_data.get('author_fullname', '').strip())
        content_ups = handle(post_data['ups'])
        content_url = handle(post_data['url'].strip())
        content_id = handle(post_data['id'])

        post = (content_id, content_title, content_text, content_author, content_url, content_ups)
        update_post = (content_title, content_text, content_author, content_url, content_ups, content_id)

        if content_id in unique:
            c.execute("UPDATE " + table + " SET ptitle = ?, ptext = ?, pauthor = ?, purl = ?, pups = ? WHERE pid = ?", update_post)
            print("Updated")
        else:
            unique.add(content_id)
            c.execute("INSERT INTO " + table + " VALUES (?, ?, ?, ?, ?, ?)", post)
            print("Inserted")


def _known_ids(c, table):
    # Ids already stored in the table; seeding the de-duplication set with
    # these makes reruns take the UPDATE branch instead of violating the
    # primary key on INSERT.
    return {row[0] for row in c.execute("SELECT pid FROM " + table)}


def get_top_news(c, endpoint='top', toget=10):
    c.execute('''CREATE TABLE IF NOT EXISTS top_news
                 (pid text PRIMARY KEY, ptitle text, ptext text, pauthor text, purl text, pups int)''')
    data = dump(endpoint, toget)
    _news(c, data, toget, _known_ids(c, 'top_news'), 'top_news')


def get_hot_news(c, endpoint='hot', toget=10):
    c.execute('''CREATE TABLE IF NOT EXISTS hot_news
                 (pid text PRIMARY KEY, ptitle text, ptext text, pauthor text, purl text, pups int)''')
    data = dump(endpoint, toget)
    _news(c, data, toget, _known_ids(c, 'hot_news'), 'hot_news')


def reddit_get():
    # Open the database, fetch the top posts, and persist them.
    conn = sqlite3.connect('reddit_news.db')
    c = conn.cursor()
    get_top_news(c)
    conn.commit()
    conn.close()
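
A minimal usage sketch, not part of the commit: assuming grabnews.py is on the import path (e.g. the interpreter is started from the reddit-scraper directory) and the network is reachable, reddit_get() populates reddit_news.db, which can then be queried directly. The names reddit_get, top_news, and reddit_news.db come from the script above.

import sqlite3
from grabnews import reddit_get

reddit_get()  # fetches r/python 'top' posts and upserts them into reddit_news.db

conn = sqlite3.connect('reddit_news.db')
for pid, pups, ptitle in conn.execute(
        "SELECT pid, pups, ptitle FROM top_news ORDER BY pups DESC"):
    print(pid, pups, ptitle)
conn.close()

Because pid is the table's primary key, the update-or-insert branch in _news is safe to rerun: repeated invocations refresh existing rows rather than duplicating them.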

reddit-scraper/reddit_news.db

12 KB
Binary file not shown.

0 commit comments
