Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 5942824

Browse files
Merge pull request avinashkranjan#1011 from Ayushjain2205/Stackoverflow-Scraper
Stack overflow questions scraper
2 parents fdac7d5 + d41e8f1 commit 5942824

File tree

3 files changed

+221
-0
lines changed

3 files changed

+221
-0
lines changed

‎Stack-overflow-scraper/README.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Stack overflow question scraper
2+
Running this script lets the user scrape the top questions from Stack Overflow for a question tag of their choice (python, java, etc.). The question, summary, link, votes and views are stored in a local SQLite database.
3+
4+
## Setup instructions
5+
In order to run this script, you need to have Python and pip installed on your system. After you're done installing Python and pip, run the following command from your terminal to install the requirements from the same folder (directory) of the project.
6+
```
7+
pip install -r requirements.txt
8+
```
9+
After installing the requirements, open a terminal in the project folder and run
10+
```
11+
python scraper.py
12+
```
13+
or
14+
```
15+
python3 scraper.py
16+
```
17+
depending on how Python is installed on your system. Make sure that you run the command from the same virtual environment in which the required modules are installed.
18+
19+
## Output
20+
21+
The user can choose the question tag based on which they want to scrape top questions from Stack Overflow.
22+
23+
![Stack overflow question scraper](https://i.postimg.cc/d3FrwysV/stack.png)
24+
25+
## Author
26+
[Ayush Jain](https://github.com/Ayushjain2205)

‎Stack-overflow-scraper/requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
requests
2+
beautifulsoup4

‎Stack-overflow-scraper/scraper.py

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
import requests
2+
from bs4 import BeautifulSoup
3+
import tkinter as tk
4+
from tkinter import messagebox, simpledialog
5+
from tkinter import ttk
6+
from tkinter import font as tkFont
7+
import time
8+
import sqlite3
9+
from sqlite3 import Error
10+
11+
# Function to connect to the SQL Database
12+
def sql_connection():
    """Open the scraper's SQLite database, creating the file if needed.

    The database file ``stackoverflow.db`` is resolved relative to the
    directory containing this script, so the script works both when started
    from the repository root and when started from inside the project
    folder. When the script lives in ``Stack-overflow-scraper/`` this is the
    same file the previous cwd-relative path pointed at from the repository
    root.

    Returns:
        sqlite3.Connection on success, or None if the database could not be
        opened (the underlying error is printed).
    """
    import os

    try:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        db_path = os.path.join(script_dir, 'stackoverflow.db')
    except NameError:
        # __file__ is undefined (e.g. interactive session): fall back to the
        # historical cwd-relative location.
        db_path = './Stack-overflow-scraper/stackoverflow.db'

    try:
        return sqlite3.connect(db_path)
    except Error as err:
        # Print the actual failure, not the exception class.
        print(err)
        return None
18+
19+
# Function to create table
20+
def sql_table(con):
    """Ensure the ``questions`` table exists in the database behind *con*.

    The table has one row per scraped question: its text, summary, link,
    vote count and view count. The change is committed before returning.
    """
    create_stmt = "CREATE TABLE IF NOT EXISTS questions(question_text text, question_summary text, question_link text,votes integer, views integer )"
    cursor = con.cursor()
    cursor.execute(create_stmt)
    con.commit()
25+
26+
# Call functions to connect to database and create table
27+
con = sql_connection()
if con is None:
    # sql_connection() signals failure by returning None; stop here with a
    # clear message instead of crashing later with an AttributeError.
    raise SystemExit("Unable to open the SQLite database.")
sql_table(con)
29+
30+
# Function to insert into table
31+
def sql_insert(con, entities):
    """Store one scraped question in the ``questions`` table and commit.

    Args:
        con: open SQLite connection.
        entities: 5-item sequence in column order
            (question_text, question_summary, question_link, votes, views).
    """
    insert_stmt = 'INSERT INTO questions(question_text, question_summary, question_link, votes, views) VALUES(?, ?, ?, ?, ?)'
    con.cursor().execute(insert_stmt, entities)
    con.commit()
36+
37+
# Function to generate URL based on choice
38+
def get_URL():
    """Build the Stack Overflow "most voted" listing URL for the entered tag.

    Reads the tag from ``search_box``. Surrounding whitespace is ignored and
    the tag is percent-encoded so that tags containing URL-significant
    characters (for example ``c#``, whose ``#`` would otherwise start a
    fragment) end up in the path.

    Returns:
        The URL string, or None after showing an alert when no tag was
        entered.
    """
    from urllib.parse import quote

    tag = search_box.get().strip()
    if not tag:
        messagebox.showinfo("Alert", "Please Enter tag!")
        return None
    return 'https://stackoverflow.com/questions/tagged/{}?sort=MostVotes&edited=true'.format(
        quote(tag, safe=''))
45+
46+
def number_questions():
    """Return how many questions to scrape, as entered in ``questions_box``.

    Returns:
        The entered integer when it lies in 1..15. Any other input (empty,
        non-numeric, zero, negative, or above 15) yields the default of 15.
    """
    max_questions = 15
    try:
        requested = int(questions_box.get())
    except ValueError:
        # Empty or non-numeric entry: fall back to the default instead of
        # raising inside the button callback.
        return max_questions
    if requested < 1 or requested > max_questions:
        return max_questions
    return requested
51+
52+
def _parse_question(question):
    """Extract one question's fields from its summary element.

    Returns:
        A ``(text, summary, link, votes, views)`` tuple of strings, or None
        when any expected element is missing from the markup.
    """
    title_link = question.find('a', {'class': 'question-hyperlink'})
    excerpt = question.find('div', {'class': 'excerpt'})
    vote_count = question.find('span', {'class': 'vote-count-post'})
    view_count = question.find('div', {'class': 'views'})
    if any(tag is None for tag in (title_link, excerpt, vote_count, view_count)):
        return None

    href = title_link.get('href')
    view_parts = view_count.text.strip().split()
    if not href or not view_parts:
        return None

    question_text = title_link.text.strip()
    question_summary = excerpt.text.strip().replace('\n', ' ')
    question_link = 'https://stackoverflow.com{}'.format(href)
    votes = vote_count.text.strip()
    views = view_parts[0]  # first token of the views text
    return (question_text, question_summary, question_link, votes, views)


def scrape_questions():
    """Scrape the top questions for the entered tag and store them in the DB.

    Button callback. Advances the progress bar, fetches the tag's listing
    page, parses up to ``number_questions()`` question summaries and inserts
    each one through ``sql_insert``. The outcome is reported in a message
    box and the progress bar is reset on every exit path.
    """
    # Short cosmetic animation before the (blocking) request.
    for _ in range(5):
        progress['value'] += 15
        window.update_idletasks()
        time.sleep(0.10)

    # Validate the tag first so a missing tag is reported before anything
    # else is read from the form.
    url = get_URL()
    if not url:
        clear_progress()
        return

    question_count = number_questions()

    try:
        # A timeout keeps an unresponsive server from hanging the callback.
        page = requests.get(url, timeout=10)
    except requests.RequestException as err:
        messagebox.showerror("Error", "Could not fetch questions: {}".format(err))
        clear_progress()
        return

    # Start scraping resultant html data
    soup = BeautifulSoup(page.content, 'html.parser')
    questions = soup.find_all('div', {'class': 'question-summary'})
    if not questions:
        messagebox.showinfo("Invalid", "Invalid search tag")
        clear_progress()
        return ""

    saved = 0
    for question in questions:
        if saved >= question_count:
            break
        entities = _parse_question(question)
        if entities is None:
            # Skip entries whose markup lacks an expected element instead of
            # aborting the whole scrape.
            continue
        sql_insert(con, entities)
        saved += 1

    if saved:
        messagebox.showinfo("Success!", "Questions scrapped successfully!")
    else:
        messagebox.showinfo("Alert", "No questions could be parsed from the page.")
    clear_progress()
90+
91+
# Function to fetch stackoverflow questions from DB
92+
def sql_fetch(con):
    """Return the stored questions as a fixed-width text table.

    Distinct rows of the ``questions`` table are listed newest first (by
    rowid) under a header line. Long question texts, summaries and links are
    shortened and marked with a trailing ``...``.

    Args:
        con: open SQLite connection.

    Returns:
        The formatted table (header plus one line per row, each terminated by
        a newline). A single space is returned when the query fails or when
        the table is empty; in the latter case an alert is shown.
    """
    def _clip(text, limit):
        # Keep text up to `limit` characters; otherwise cut it so that the
        # result, including the "..." marker, is exactly `limit` long.
        return text if len(text) <= limit else text[:limit - 3] + "..."

    cursor = con.cursor()
    try:
        cursor.execute('SELECT DISTINCT * FROM questions ORDER BY rowid DESC')  # SQL search query
    except Error:
        print("Database empty... Scrape questions using GUI")
        return " "

    rows = cursor.fetchall()

    # Show messagebox in case of empty DB
    if not rows:
        messagebox.showinfo("Alert", "No questions scraped yet!")
        return " "

    header = (
        "{:^65}".format("Question")
        + "{:^65}".format("Summary")
        + "{:^40}".format("Link")
        + "{:^15}".format("Votes")
        + "{:^15}".format("Views")
        + '\n'
    )
    lines = [header]

    # Format rows
    for row in rows:
        lines.append(
            "{:<65}".format(_clip(row[0], 59))
            + "{:<65}".format(_clip(row[1], 59))
            + "{:<40}".format(_clip(row[2], 39))
            + "{:^15}".format(row[3])
            + "{:^15}".format(row[4])
            + '\n'
        )

    return "".join(lines)
124+
125+
def show_results():
    """Replace the contents of the results pane with the stored questions.

    Button callback. The text widget is kept read-only; it is made editable
    only for the duration of the update.
    """
    # sql_fetch() may return None when its query fails; show nothing then.
    display_text = sql_fetch(con) or ""
    query_label.config(state=tk.NORMAL)
    try:
        query_label.delete("1.0", "end")
        query_label.insert("1.0", display_text)
    finally:
        # Restore the read-only state even if the update raised.
        query_label.config(state=tk.DISABLED)
131+
132+
def clear_progress():
    """Fill the progress bar completely, then set it back to 0."""
    for level in (100, 0):
        progress['value'] = level
        window.update_idletasks()
138+
139+
# Creating tkinter window
140+
window = tk.Tk()
window.title('Stack overflow question scraper')
window.geometry('1200x1000')
window.configure(bg='white')

# ttk styles shared by the buttons and the results frame
style = ttk.Style()
style.theme_use('alt')
style.map('my.TButton', background=[('active', 'white')])
style.configure(
    'my.TButton',
    font=('Helvetica', 16, 'bold'),
    background='white',
    foreground='orange',
)
style.configure('my.TFrame', background='white')

# Title
ttk.Label(
    window,
    text="Stack overflow question scraper",
    background='white',
    foreground="Orange",
    font=("Helvetica", 30, 'bold'),
).grid(row=0, column=1)

# Prompts for the two input fields
ttk.Label(
    window,
    text="Enter tag (ex - python):",
    background='white',
    font=("Helvetica", 15),
).grid(column=0, row=5, padx=10, pady=25)

ttk.Label(
    window,
    text="No of questions to scrape:",
    background='white',
    font=("Helvetica", 15),
).grid(column=0, row=6, padx=10, pady=5)

# Action buttons
scrape_btn = ttk.Button(
    window,
    text="Scrape questions!",
    style='my.TButton',
    command=scrape_questions,
)
scrape_btn.grid(row=5, column=2, pady=5, padx=15, ipadx=5)

display_btn = ttk.Button(
    window,
    text="Display from DB",
    style='my.TButton',
    command=show_results,
)
display_btn.grid(row=6, column=2, pady=5, padx=15, ipadx=5)

# Input fields: tag to search for and number of questions to scrape
search_box = tk.Entry(window, font=("Helvetica 15"), bd=2, width=60)
search_box.grid(row=5, column=1, pady=5, padx=15, ipadx=5)

questions_box = tk.Entry(window, font=("Helvetica 15"), bd=2, width=60)
questions_box.grid(row=6, column=1, pady=5, padx=15, ipadx=5)

# Container for the results pane
frame = ttk.Frame(window, style='my.TFrame')
frame.place(relx=0.50, rely=0.18, relwidth=0.98, relheight=0.90, anchor="n")

# Progress bar
progress = ttk.Progressbar(
    window,
    orient="horizontal",
    length=200,
    mode="determinate",
)
progress.grid(row=5, column=5, pady=5, padx=15, ipadx=5)

# Text widget that displays the questions data
query_label = tk.Text(frame, height="52", width="500", bg="alice blue")
query_label.grid(row=10, columnspan=2)

window.mainloop()

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /