
Learning Notes CB005: Keyword and Corpus Extraction

清醒疯子 · Posted 2018-03-06 | Updated 2018-03-06

Keyword extraction. The pynlpir library (a Python wrapper around the NLPIR Chinese-language toolkit) implements keyword extraction.

# coding:utf-8
import pynlpir

# Initialize the NLPIR engine (loads its data files and license).
pynlpir.open()

s = '怎么才能把电脑里的垃圾文件删除'
# weighted=True makes get_key_words return (keyword, weight) tuples.
key_words = pynlpir.get_key_words(s, weighted=True)
for key_word in key_words:
    print(key_word[0], '\t', key_word[1])

pynlpir.close()
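
Note: NLPIR ships with a time-limited license, so pynlpir.open() can fail with a license error on an older install; the pynlpir package provides a pynlpir update command that downloads a fresh license file.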

Baidu search interface: https://www.baidu.com/s?wd=机器学习 数据挖掘 信息检索 (multiple keywords, separated by spaces, go in the wd parameter).
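
A minimal sketch (not from the original post) of building that URL in Python, so the Chinese keywords are percent-encoded properly:

# Sketch: assemble the Baidu search URL from a keyword list.
from urllib.parse import quote

keywords = ['机器学习', '数据挖掘', '信息检索']
# Keywords are space-separated in the wd parameter; quote() percent-encodes them.
url = 'https://www.baidu.com/s?wd=' + quote(' '.join(keywords))
print(url)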

Install Scrapy with pip install scrapy, then create a project with scrapy startproject baidusearch. To build the crawler, create the file baidusearch/baidusearch/spiders/baidusearch.py:

# coding:utf-8
import scrapy

class BaiduSearchSpider(scrapy.Spider):
    name = "baidu_search"
    allowed_domains = ["baidu.com"]
    start_urls = [
        "https://www.baidu.com/s?wd=电脑 垃圾 文件 删除"
    ]

    def parse(self, response):
        # Dump the raw search-results page to disk to verify the fetch.
        filename = "result.html"
        with open(filename, 'wb') as f:
            f.write(response.body)
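
parse() is the default callback Scrapy invokes for every response downloaded from start_urls; writing response.body to disk is just a sanity check that the request succeeded before any real parsing is added.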

Modify settings.py: set ROBOTSTXT_OBEY = False (so robots.txt does not block the crawl), USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36' (a browser user agent, so Baidu serves the normal results page), and DOWNLOAD_TIMEOUT = 5.
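
For reference, a sketch of how those three settings look in baidusearch/baidusearch/settings.py:

# settings.py (only the three settings named above)
ROBOTSTXT_OBEY = False   # don't let robots.txt veto the requests
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
DOWNLOAD_TIMEOUT = 5     # seconds before a hung download is abandoned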

Change into the baidusearch/baidusearch/ directory and run scrapy crawl baidu_search. This generates result.html, confirming the page is fetched correctly.

Corpus extraction. The search results are only an index; the real content lies behind each result link. Inspecting the fetched page shows each link embedded in the href attribute of the a tag inside the h3 of a div with class=c-container. Each such URL is added to the crawl queue. To extract the body text, strip the HTML tags; keep the abstract as well. When extracting a URL, also extract its title and abstract and pass them via scrapy.Request's meta to the handler parse_url, so that once the page is fetched those two values arrive together with it and the content can be extracted. The complete record is: url, title, abstract, content.

# coding:utf-8
import scrapy
from scrapy.utils.markup import remove_tags

class BaiduSearchSpider(scrapy.Spider):
    name = "baidu_search"
    allowed_domains = ["baidu.com"]
    start_urls = [
        "https://www.baidu.com/s?wd=电脑 垃圾 文件 删除"
    ]

    def parse(self, response):
        # Each organic result sits in a div with class "c-container";
        # its link and title are in h3/a, the abstract in a nested
        # div with class "c-abstract".
        containers = response.selector.xpath('//div[contains(@class, "c-container")]')
        for container in containers:
            hrefs = container.xpath('h3/a/@href').extract()
            if not hrefs:
                continue  # skip containers without a result link
            href = hrefs[0]
            title = remove_tags(container.xpath('h3/a').extract()[0])
            c_abstract = container.xpath('div/div/div[contains(@class, "c-abstract")]').extract()
            abstract = ""
            if len(c_abstract) > 0:
                abstract = remove_tags(c_abstract[0])
            # Follow the result link; carry title and abstract in meta
            # so parse_url can assemble the full record.
            request = scrapy.Request(href, callback=self.parse_url)
            request.meta['title'] = title
            request.meta['abstract'] = abstract
            yield request

    def parse_url(self, response):
        print(len(response.body))
        print("url:", response.url)
        print("title:", response.meta['title'])
        print("abstract:", response.meta['abstract'])
        # Strip all tags from the page body to get plain-text content.
        content = remove_tags(response.selector.xpath('//body').extract()[0])
        print("content_len:", len(content))

References:

《Python 自然语言处理》 (Natural Language Processing with Python)

http://www.shareditor.com/blogshow/?blogId=43

http://www.shareditor.com/blogshow?blogId=76

Recommendations for machine learning positions in Shanghai are welcome; my WeChat: qingxingfengzi
