|  | 
|  | 1 | +import os | 
|  | 2 | +import requests | 
|  | 3 | +from ruamel.yaml import YAML | 
|  | 4 | +from bs4 import BeautifulSoup | 
|  | 5 | + | 
# --- Crawl configuration -------------------------------------------------

# Pieces used to assemble a paged listing URL for the StackOverflow "c" tag,
# sorted by votes: BASE_URL + SORT + PAGE + <page#> + PAGE_SIZE_URL + <size>.
BASE_URL = 'https://stackoverflow.com/questions/tagged/c'
SORT = '?sort=votes'
PAGE = '&page='
PAGE_SIZE_URL = '&pageSize='

PAGE_SIZE = 15    # questions requested per listing page
NUM_ANSWERS = 3   # maximum answers printed per question

# Desktop-browser user agent so StackOverflow serves the regular HTML page.
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
}
|  | 17 | + | 
def crawl_pages(num_pages):
    """Crawl `num_pages` listing pages of top-voted StackOverflow "c"
    questions (starting at page 1) and print each question's title, body,
    and top answers via `parse_question`.

    A network failure on one page is reported and skipped instead of
    aborting the crawl; Ctrl-C (or EOF/SystemExit) stops it cleanly.

    Parameters:
        num_pages: number of listing pages to fetch.
    """
    for current_page in range(1, num_pages + 1):
        try:
            page_url = BASE_URL + SORT + PAGE + str(current_page) + PAGE_SIZE_URL + str(PAGE_SIZE)
            source_code = requests.get(page_url, headers=headers, timeout=10).text
            soup = BeautifulSoup(source_code, 'html.parser')
            print('crawling page ' + str(current_page) + ': ' + page_url)
            for q_no, link in enumerate(soup.find_all('a', {'class': 'question-hyperlink'})):
                # The listing can contain extra question-hyperlink anchors
                # (e.g. sidebar); only take the first PAGE_SIZE of them.
                if q_no == PAGE_SIZE:
                    break
                # BUG FIX: link hrefs are site-relative (start with '/');
                # the original prefixed 'http://stackoverflow.com/' which
                # produced a doubled slash and an http/https mismatch with
                # BASE_URL.
                url = 'https://stackoverflow.com' + link.get('href')
                title = link.get_text()
                print("------------------------------")
                print(title)
                parse_question(url, title)
        except requests.RequestException as err:
            # BUG FIX: timeouts/connection errors previously escaped the
            # except clause and killed the whole crawl; skip just this page.
            print('network error on page ' + str(current_page) + ': ' + str(err))
        except (KeyboardInterrupt, EOFError, SystemExit):
            print("\nStopped by user!")
            break
|  | 42 | + | 
def parse_question(url, title):
    """Fetch a single question page and print the question body followed by
    up to NUM_ANSWERS answer bodies, each prefixed with the question title.

    Parameters:
        url: absolute URL of the question page.
        title: question title, reprinted before each answer.
    """
    page = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(page.content, 'html.parser')
    question = soup.find('div', class_='postcell')
    if question is None:
        # Question removed or markup changed; nothing to print.
        return
    # The second child of the post cell holds the question body.
    # NOTE(review): index-based child access is fragile — confirm against
    # the current StackOverflow markup.
    question_text = list(question)[1].get_text()
    print(question_text)
    print("List of answers:\n\n")
    answers = soup.find_all('div', class_='answercell')
    for answer_cell in answers[:NUM_ANSWERS]:
        body = answer_cell.find('div', class_='post-text')
        if body is None:
            # BUG FIX: .find() returns None when the answer markup differs;
            # the original raised AttributeError on .get_text().
            continue
        print("===>")
        print(title)
        print(body.get_text())
|  | 61 | + | 
def main():
    """Entry point: crawl the first two listing pages of top-voted C
    questions and print their contents."""
    crawl_pages(2)
    print('\nDone!')

# BUG FIX: guard the entry point so importing this module does not
# immediately start a network crawl.
if __name__ == '__main__':
    main()
0 commit comments