1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import tkinter as tk
4
+ from tkinter import messagebox , simpledialog
5
+ from tkinter import ttk
6
+ from tkinter import font as tkFont
7
+ import time
8
+ import sqlite3
9
+ from sqlite3 import Error
10
+
11
# Function to connect to the SQL Database
def sql_connection():
    """Open the scraper's SQLite database file and return the connection.

    Returns:
        The sqlite3 connection on success, or None when the database could
        not be opened (the error is printed to stdout).
    """
    try:
        return sqlite3.connect('./Stack-overflow-scraper/stackoverflow.db')
    except Error as err:
        # Print the caught exception itself; printing the Error class would
        # only show its repr and hide the reason for the failure.
        print(err)
        return None
18
+
19
# Function to create table
def sql_table(con):
    """Create the ``questions`` table on *con* if it does not exist yet.

    Args:
        con: An open sqlite3 connection.
    """
    # Using the connection as a context manager commits on success and
    # rolls back if the statement raises.
    with con:
        con.execute(
            "CREATE TABLE IF NOT EXISTS questions(question_text text, question_summary text, question_link text,votes integer, views integer )")
25
+
26
# Open the module-wide database connection, then make sure the
# questions table is present before any GUI callback uses it
con = sql_connection()
sql_table(con)
29
+
30
# Function to insert into table
def sql_insert(con, entities):
    """Insert one question row into the ``questions`` table.

    Args:
        con: An open sqlite3 connection.
        entities: Sequence of five values in the order
            (question_text, question_summary, question_link, votes, views).

    Raises:
        sqlite3.Error: If the insert fails; the transaction is rolled back
            before the exception propagates.
    """
    # The connection context manager commits on success and rolls back on
    # failure, so a bad row never leaves a transaction open.
    with con:
        con.execute(
            'INSERT INTO questions(question_text, question_summary, question_link, votes, views) VALUES(?, ?, ?, ?, ?)',
            entities)
36
+
37
# Function to generate URL based on choice
def get_URL():
    """Build the most-voted listing URL for the tag typed into ``search_box``.

    Returns:
        The URL string, or None (after showing an alert) when the box is
        empty or contains only whitespace.
    """
    # Local import so this function does not depend on the file's import block.
    from urllib.parse import quote

    tag = search_box.get().strip()
    if not tag:
        messagebox.showinfo("Alert", "Please Enter tag!")
        return None
    # Percent-encode the tag: characters such as '#' (as in "c#") would
    # otherwise be parsed as a URL fragment instead of part of the path.
    return 'https://stackoverflow.com/questions/tagged/{}?sort=MostVotes&edited=true'.format(
        quote(tag, safe=''))
45
+
46
def number_questions():
    """Return how many questions to scrape, read from ``questions_box``.

    Returns:
        The entered whole number when it lies between 1 and 15; otherwise
        15 (empty, non-numeric, non-positive or too-large input).
    """
    max_questions = 15
    try:
        requested = int(questions_box.get())
    except ValueError:
        # Empty or non-numeric input falls back to the maximum instead of
        # raising inside the button callback.
        return max_questions
    if requested < 1 or requested > max_questions:
        return max_questions
    return requested
51
+
52
def scrape_questions():
    """Fetch the most-voted questions for the entered tag and store them.

    Uses get_URL() for the listing URL and number_questions() for the row
    limit, inserts each parsed question through sql_insert(), and reports
    the outcome in a message box. The progress bar is reset on every
    return path.

    Returns:
        "" when the page contains no question summaries, otherwise None.
    """
    # Cosmetic: advance the bar in five steps before the blocking request.
    for _ in range(5):
        progress['value'] += 15
        window.update_idletasks()
        time.sleep(0.10)

    question_count = number_questions()

    url = get_URL()
    if not url:
        clear_progress()
        return

    try:
        # A timeout keeps the GUI from hanging forever on a stalled connection.
        page = requests.get(url, timeout=10)
    except requests.RequestException as err:
        messagebox.showerror("Error", "Could not fetch questions: {}".format(err))
        clear_progress()
        return

    # Start scraping resultant html data
    soup = BeautifulSoup(page.content, 'html.parser')
    questions = soup.find_all('div', {'class': 'question-summary'})
    if not questions:
        messagebox.showinfo("Invalid", "Invalid search tag")
        clear_progress()
        return ""

    saved = 0
    for question in questions:
        if saved >= question_count:
            break

        link_node = question.find('a', {'class': 'question-hyperlink'})
        excerpt_node = question.find('div', {'class': 'excerpt'})
        votes_node = question.find('span', {'class': 'vote-count-post'})
        views_node = question.find('div', {'class': 'views'})
        nodes = (link_node, excerpt_node, votes_node, views_node)
        if any(node is None for node in nodes):
            # Skip entries that lack an expected element instead of crashing.
            continue

        views_words = views_node.text.split()
        if not views_words:
            continue

        question_text = link_node.text.strip()
        # Collapse all whitespace (including newlines) so each summary stays
        # on a single line when it is displayed later.
        question_summary = ' '.join(excerpt_node.text.split())
        question_link = 'https://stackoverflow.com{}'.format(link_node['href'])
        votes = votes_node.text.strip()
        views = views_words[0]

        sql_insert(con, (question_text, question_summary, question_link, votes, views))
        saved += 1

    messagebox.showinfo("Success!", "Questions scrapped successfully!")
    clear_progress()
90
+
91
# Function to fetch stackoverflow questions from DB
def sql_fetch(con):
    """Return the stored questions formatted as a fixed-width text table.

    Args:
        con: An open sqlite3 connection containing the ``questions`` table.

    Returns:
        A header line followed by one line per distinct row, ordered by
        descending rowid, each terminated by a newline. Returns " " (after
        showing an alert) when the table has no rows, and "" when the
        query fails.
    """
    def _clip(text, limit, keep):
        """Return *text* unchanged if shorter than *limit*, else its first *keep* chars plus "..."."""
        return text if len(text) < limit else text[:keep] + "..."

    cursorObj = con.cursor()
    try:
        cursorObj.execute('SELECT DISTINCT * FROM questions ORDER BY rowid DESC')  # SQL search query
    except Error:
        print("Database empty... Fetch questions using GUI")
        # Return a string so callers can always insert the result as text.
        return ""

    rows = cursorObj.fetchall()

    # Show messagebox incase of empty DB
    if not rows:
        messagebox.showinfo("Alert", "No questions scraped yet!")
        return " "

    lines = ["{:^65}{:^65}{:^40}{:^15}{:^15}".format(
        "Question", "Summary", "Link", "Votes", "Views")]

    # Format rows; every column is clipped below its width so that at least
    # one space separates it from the next column.
    for row in rows:
        lines.append("{:<65}{:<65}{:<40}{:^15}{:^15}".format(
            _clip(row[0], 60, 56),
            _clip(row[1], 60, 56),
            _clip(row[2], 40, 36),
            row[3],
            row[4]))

    # Plain "\n" separators keep the data rows aligned with the header.
    return "\n".join(lines) + "\n"
124
+
125
def show_results():
    """Replace the contents of ``query_label`` with the table from sql_fetch()."""
    # Guard against a None result so the widget always receives a string.
    display_text = sql_fetch(con) or ""
    # Enable the widget for the edit and disable it again afterwards, even
    # if the edit itself raises.
    query_label.config(state=tk.NORMAL)
    try:
        query_label.delete(1.0, "end")
        query_label.insert(1.0, display_text)
    finally:
        query_label.config(state=tk.DISABLED)
131
+
132
def clear_progress():
    """Fill the progress bar completely, then reset it to empty."""
    # Redraw after each step so both the full and the empty state are painted
    for level in (100, 0):
        progress['value'] = level
        window.update_idletasks()
138
+
139
# Creating tkinter window
window = tk.Tk()
window.title('Stack overflow question scraper')
window.geometry('1200x1000')
window.configure(bg='white')

# ttk styles shared by the buttons and the results frame
style = ttk.Style()
style.theme_use('alt')
style.map('my.TButton', background=[('active', 'white')])
style.configure(
    'my.TButton',
    font=('Helvetica', 16, 'bold'),
    background='white',
    foreground='orange',
)
style.configure('my.TFrame', background='white')

# label text for title
title_label = ttk.Label(
    window,
    text="Stack overflow question scraper",
    background='white',
    foreground="Orange",
    font=("Helvetica", 30, 'bold'),
)
title_label.grid(row=0, column=1)

# label texts
tag_label = ttk.Label(
    window,
    text="Enter tag (ex - python):",
    background='white',
    font=("Helvetica", 15),
)
tag_label.grid(column=0, row=5, padx=10, pady=25)

count_label = ttk.Label(
    window,
    text="No of questions to scrape:",
    background='white',
    font=("Helvetica", 15),
)
count_label.grid(column=0, row=6, padx=10, pady=5)

# Button creation
scrape_btn = ttk.Button(
    window,
    text="Scrape questions!",
    style='my.TButton',
    command=scrape_questions,
)
scrape_btn.grid(row=5, column=2, pady=5, padx=15, ipadx=5)

display_btn = ttk.Button(
    window,
    text="Display from DB",
    style='my.TButton',
    command=show_results,
)
display_btn.grid(row=6, column=2, pady=5, padx=15, ipadx=5)

# Search Box
search_box = tk.Entry(window, font=("Helvetica 15"), bd=2, width=60)
search_box.grid(row=5, column=1, pady=5, padx=15, ipadx=5)

# Entry for the number of questions to scrape
questions_box = tk.Entry(window, font=("Helvetica 15"), bd=2, width=60)
questions_box.grid(row=6, column=1, pady=5, padx=15, ipadx=5)

# Frame that hosts the results text widget
frame = ttk.Frame(window, style='my.TFrame')
frame.place(relx=0.50, rely=0.18, relwidth=0.98, relheight=0.90, anchor="n")

# Progress bar
progress = ttk.Progressbar(
    window,
    orient="horizontal",
    length=200,
    mode="determinate",
)
progress.grid(row=5, column=5, pady=5, padx=15, ipadx=5)

# To display questions data
query_label = tk.Text(frame, height="52", width="500", bg="alice blue")
query_label.grid(row=10, columnspan=2)

window.mainloop()
0 commit comments