Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 0b68c56

Browse files
authored
Create crawler.py
1 parent 00450aa commit 0b68c56

File tree

1 file changed

+66
-0
lines changed

1 file changed

+66
-0
lines changed

‎crawler.py‎

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
import os
2+
import requests
3+
from ruamel.yaml import YAML
4+
from bs4 import BeautifulSoup
5+
6+
# URL fragments for the StackOverflow "c" tag listing; a page URL is assembled
# as BASE_URL + SORT + PAGE + <page number> + PAGE_SIZE_URL + <page size>.
BASE_URL = 'https://stackoverflow.com/questions/tagged/c'
SORT = '?sort=votes'
PAGE = '&page='
PAGE_SIZE_URL = '&pageSize='

PAGE_SIZE = 15    # questions requested (and processed) per listing page
NUM_ANSWERS = 3   # at most this many answers are printed per question

# Browser-like User-Agent sent with every request — presumably to avoid
# being served a bot/blocked response; verify against the target site.
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
}
17+
18+
def crawl_pages(num_pages):
    """Crawl ``num_pages`` listing pages of the StackOverflow 'c' tag.

    Pages are fetched starting at page 1, sorted by votes. Every question
    link found on a page (up to PAGE_SIZE per page) is handed to
    ``parse_question``. A network failure on one page is reported and the
    crawl continues with the next page; Ctrl-C / EOF / SystemExit stop the
    crawl cleanly.

    Args:
        num_pages: number of listing pages to crawl.
    """
    # range() replaces the original manual while-loop counter; the counter
    # increment can no longer be skipped by an exception (infinite-loop hazard).
    for current_page in range(1, 1 + num_pages):
        page_url = BASE_URL + SORT + PAGE + str(current_page) + PAGE_SIZE_URL + str(PAGE_SIZE)
        try:
            source_code = requests.get(page_url, headers=headers, timeout=10).text
            soup = BeautifulSoup(source_code, 'html.parser')
            print('crawling page ' + str(current_page) + ': ' + page_url)
            for q_no, link in enumerate(soup.find_all('a', {'class': 'question-hyperlink'})):
                if q_no == PAGE_SIZE:
                    break
                # hrefs are site-relative ('/questions/...'); urljoin avoids the
                # double slash (and http downgrade) that naive concatenation
                # with 'http://stackoverflow.com/' produced.
                url = urljoin(BASE_URL, link.get('href'))
                title = link.get_text()
                print("------------------------------")
                print(title)
                parse_question(url, title)
        except requests.RequestException as exc:
            # Best-effort: one bad page should not abort the whole crawl.
            print('failed to crawl page ' + str(current_page) + ': ' + str(exc))
        except (KeyboardInterrupt, EOFError, SystemExit):
            print("\nStopped by user!")
            break
42+
43+
def parse_question(url, title):
    """Fetch one question page and print its body plus up to NUM_ANSWERS answers.

    Args:
        url: absolute URL of the question page.
        title: question title, echoed before each printed answer.
    """
    try:
        page = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        # Best-effort: skip questions whose page cannot be fetched.
        print('failed to fetch ' + url + ': ' + str(exc))
        return
    soup = BeautifulSoup(page.content, 'html.parser')
    question = soup.find('div', class_='postcell')
    if question is not None:
        # Second child of the postcell div holds the question body text
        # — presumably stable in the site markup; verify if output looks wrong.
        question = list(question)[1].get_text()
    answers = soup.find_all('div', class_='answercell')
    # Clamp to NUM_ANSWERS (min() replaces the original if/assign pair).
    end = min(len(answers), NUM_ANSWERS)
    print(question)
    print("List of answers:\n\n")
    for i in range(end):
        body = answers[i].find('div', class_='post-text')
        if body is None:
            # Guard: markup changes would otherwise raise AttributeError here.
            continue
        answer = body.get_text()
        print("===>")
        print(title)
        print(answer)
61+
62+
def main(num_pages=2):
    """Entry point: crawl ``num_pages`` listing pages (default 2) and report completion.

    The page count was previously hard-coded; the default keeps existing
    callers' behavior unchanged.
    """
    crawl_pages(num_pages)
    print('\nDone!')
65+
66+
main()

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /