Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Update NewsSpider.py #18

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
mikeurl wants to merge 1 commit into lining0806:master from mikeurl:master
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 32 additions & 57 deletions NewsSpider/NewsSpider.py
View file Open in desktop
Original file line number Diff line number Diff line change
@@ -1,61 +1,36 @@
# -*- coding: utf-8 -*-
import os
import sys
import urllib2
import requests
import re
from lxml import etree


def StringListSave(save_path, filename, slist):
    """Save a sequence of (text, url) pairs to <save_path>/<filename>.txt.

    Creates save_path if it does not exist. Each pair is written as
    "<first>\t\t<second>\n", UTF-8 encoded.

    Parameters:
        save_path: directory to write into (created if missing).
        filename: base name of the output file, without extension.
        slist: iterable of 2-item sequences of strings.
    """
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    path = os.path.join(save_path, filename + ".txt")
    # Open in text mode with an explicit encoding. The original Python 2
    # code called .encode("utf8") on each field; under Python 3 that would
    # write the bytes repr (b'...') into the text file instead of the text.
    with open(path, "w", encoding="utf-8") as fp:
        for s in slist:
            fp.write("%s\t\t%s\n" % (s[0], s[1]))

def Page_Info(myPage):
    """Extract (section-title, more-url) pairs from the ranking page HTML.

    Scans the raw markup with a regex; returns a list of 2-tuples,
    empty when nothing matches.
    """
    section_pattern = (
        r'<div class="titleBar" id=".*?"><h2>(.*?)</h2>'
        r'<div class="more"><a href="(.*?)">.*?</a></div></div>'
    )
    return re.findall(section_pattern, myPage, re.S)

def New_Page_Info(new_page):
    """Extract (link-text, href) pairs from a news-list page via XPath.

    Parameters:
        new_page: HTML source of the page (str).

    Returns:
        List of (title, url) tuples, one per <tr><td><a> link.

    Raises:
        ValueError: if the number of link texts and hrefs disagree
            (e.g. an anchor with an href but no text).
    """
    dom = etree.HTML(new_page)
    new_items = dom.xpath('//tr/td/a/text()')
    new_urls = dom.xpath('//tr/td/a/@href')
    # Explicit check rather than `assert`: asserts are stripped under -O,
    # and a mismatch here means the page layout changed.
    if len(new_items) != len(new_urls):
        raise ValueError("mismatched title/url counts: %d vs %d"
                         % (len(new_items), len(new_urls)))
    # Materialize the pairs so callers can iterate more than once.
    return list(zip(new_items, new_urls))

def Spider(url):
    """Crawl the NetEase news ranking page and save each section's items.

    Downloads `url` (GBK-encoded), saves the (section, more-url) pairs
    via StringListSave, then downloads each section page and saves its
    (title, url) items. Output files go under the directory named by
    `save_path`, numbered in download order.

    Parameters:
        url: ranking-page URL to start from.
    """
    # Python 3 print() calls: the original Python 2 `print x` statements
    # are syntax errors alongside the f-strings used elsewhere in this file.
    i = 0
    print("downloading ", url)
    myPage = requests.get(url).content.decode("gbk")
    myPageResults = Page_Info(myPage)
    save_path = u"网易新闻抓取"
    filename = str(i) + "_" + u"新闻排行榜"
    StringListSave(save_path, filename, myPageResults)
    i += 1
    # `section_url` instead of `url`: the original loop variable shadowed
    # the function parameter.
    for item, section_url in myPageResults:
        print("downloading ", section_url)
        new_page = requests.get(section_url).content.decode("gbk")
        newPageResults = New_Page_Info(new_page)
        filename = str(i) + "_" + item
        StringListSave(save_path, filename, newPageResults)
        i += 1
from bs4 import BeautifulSoup


def save_to_file(file_path, file_name, data):
    """Write each element of `data` on its own line to <file_path>/<file_name>.txt.

    The directory is created if it does not already exist; the file is
    written (or overwritten) as UTF-8 text.
    """
    os.makedirs(file_path, exist_ok=True)
    target = os.path.join(file_path, file_name + '.txt')
    with open(target, "w", encoding='utf-8') as out:
        out.writelines(line + '\n' for line in data)


def get_page_info(page_content):
    """Extract the section title and its 'more' link from page HTML.

    Parses `page_content` with BeautifulSoup and returns a
    (title, more_link) tuple: the stripped text of the first
    <h2 class="titleBar"> and the href of the first <a> inside the
    first <div class="more">.
    """
    soup = BeautifulSoup(page_content, 'html.parser')
    heading = soup.find('h2', class_='titleBar')
    more_anchor = soup.find('div', class_='more').find('a')
    return heading.get_text(strip=True), more_anchor['href']


def spider(url):
    """Fetch `url` as GBK text, extract its headline info, and save it.

    Prints the extracted title and 'more' link, then appends both to
    news/news_info.txt via save_to_file.
    """
    resp = requests.get(url)
    resp.encoding = 'gbk'
    title, more_link = get_page_info(resp.text)
    print(f"Title: {title}")
    print(f"More Link: {more_link}")
    # Persist both values for later inspection.
    save_to_file('news', 'news_info', [title, more_link])


if __name__ == '__main__':
    # Entry point. The diff view had merged the old Python 2 main
    # (`print "start"`, Spider) with the new one, leaving a syntax error
    # and two conflicting start URLs; this is the coherent Python 3 form,
    # kept on the real ranking URL (the GBK decoding in spider() targets
    # NetEase pages, not the example.com placeholder).
    start_url = "http://news.163.com/rank/"
    print("Start")
    spider(start_url)
    print("End")

AltStyle によって変換されたページ (->オリジナル) /