Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

m2lan/micro-scraper

Folders and files

Name
Last commit message
Last commit date

Latest commit

History

4 Commits

Repository files navigation

micro-scraper

Node.js 爬虫示例 (for:百度百科)

app.js

var request = require('request'),
 cheerio = require('cheerio'),
 http = require('http'),
 url = require('url');
// Baidu Baike entry to scrape; may be changed to any other Baike entry URL.
var host = 'http://baike.baidu.com/view/39744.htm';
// Cache of rendered HTML pages, indexed by section number.
// Filled by scraper() and read by the HTTP server below.
var html = [];
// Scrape once at startup so the cache is populated, then refresh every
// 15 minutes. setInterval must be given a function: passing `scraper(host)`
// would call it immediately and hand its return value (undefined) to the
// timer, so no periodic refresh would ever be scheduled.
scraper(host);
setInterval(function () {
  scraper(host);
}, 1000*60*15);
/**
 * Escapes the characters that are significant in HTML so that scraped text
 * can be embedded in markup (including single-quoted attributes) safely.
 *
 * @param {*} text - Value to escape; it is converted to a string first.
 * @returns {string} The escaped text.
 */
function escapeHtml (text) {
  return String(text)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}

/**
 * Fetches the encyclopedia entry at `host`, splits it into sections at each
 * `.headline-1` element and stores one rendered HTML page per section in the
 * module-level `html` array (index = section number).
 *
 * The work is asynchronous; nothing is returned. On a request error or a
 * non-200 response the failure is logged and the existing cache is kept.
 *
 * @param {string} host - URL of the entry to scrape.
 */
function scraper (host) {
  request(host, function (error, response, data) {
    if (error) {
      console.error('Scrape of ' + host + ' failed:', error);
      return;
    }
    if (response.statusCode !== 200) {
      console.error('Scrape of ' + host + ' returned HTTP ' + response.statusCode);
      return;
    }
    var $ = cheerio.load(data);
    // The title is read before the `.title` elements are removed below.
    var title = escapeHtml($('.title').first().text()),
      header = [],
      nav = [],
      body = [];
    // Remove elements that carry no useful content
    $('.title').remove();
    $('.pic-info').remove();
    $('.count').remove();
    $('sup').remove();
    // Collect the useful content, one section per top-level headline
    $('#lemmaContent-0 .headline-1').each(function (i) {
      var str = '',
        $next = $(this).next();
      // Walk the following siblings until the next top-level headline or the
      // element just before a `.clear` marker. The length check stops the
      // walk when the siblings run out; without it an empty selection fails
      // both hasClass tests and the loop never terminates.
      while ($next.length > 0 && !$next.hasClass('headline-1') && !$next.next().hasClass('clear')) {
        var text = escapeHtml($next.text());
        if ($next.hasClass('headline-2')) {
          str += "<p><strong>" + text + "</strong></p>";
        } else {
          str += "<p>" + text + "</p>";
        }
        $next = $next.next();
      }
      header.push(escapeHtml($(this).find('.headline-content').text()));
      nav.push("<span><a href='/" + i + "'>" + header[i] + "</a></span>");
      body.push(str);
    });
    // Number of entries in the table of contents
    var catalogCount = $('#catalog-holder-0 .catalog-item').length;
    // Never render more pages than sections were actually parsed; otherwise
    // header[i]/body[i] are undefined and the page shows "undefined".
    var pageCount = Math.min(catalogCount, header.length);
    var navHtml = nav.join('');
    for (var i = 0; i < pageCount; i++) {
      html[i] = "" +
        "<!DOCTYPE html>" +
        "<html>" +
        "<head>" +
        "<meta charset='UTF-8' />" +
        "<title>" + title + "</title>" +
        "<style type='text/css'>" +
        "body{width:600px;margin:2em auto;font-family:'Microsoft YaHei';}" +
        "p{line-height:24px;margin:1em 0;}" +
        "header{border-bottom:1px solid #cccccc;font-size:2em;font-weight:bold;padding-bottom:.2em;}" +
        "nav{float:left;font-family:'Microsoft YaHei';margin-left:-12em;width:9em;text-align:right;}" +
        "nav a{display:block;text-decoration:none;padding:.7em 1em;color:#000000;}" +
        "nav a:hover{background-color:#003f00;color:#f9f9f9;-webkit-transition:color .2s linear;}" +
        "</style>" +
        "</head>" +
        "<body>" +
        "<header>" + header[i] + "</header>" +
        "<nav>" + navHtml + "</nav>" +
        "<article>" + body[i] + "</article>" +
        "</body>" +
        "</html>";
    }
    // Drop pages left over from an earlier scrape that had more sections.
    // If nothing was parsed, keep the previous cache rather than emptying it.
    if (pageCount > 0) {
      html.length = pageCount;
    }
  });
}
// Serves the cached pages: "/" maps to page 0 and "/<n>" to page n.
http.createServer(function (req, res) {
  var pathname = url.parse(req.url).pathname;
  // Always pass the radix so the segment is parsed as a decimal number.
  var index = pathname === '/' ? 0 : parseInt(pathname.slice(1), 10);
  var page = html[index];
  // A non-numeric path, an out-of-range index or a cache that has not been
  // filled yet all yield no page: answer 404 instead of an empty 200.
  if (typeof page !== 'string') {
    res.writeHead(404, {"Content-Type":"text/plain; charset=utf-8"});
    res.end('Not Found');
    return;
  }
  // The pages declare UTF-8 in their <meta> tag and res.end() encodes
  // strings as UTF-8, so state the charset in the header as well.
  res.writeHead(200, {"Content-Type":"text/html; charset=utf-8"});
  res.end(page);
}).listen(3000);
console.log('Server running at localhost:3000');

About

Node.js 爬虫示例 (for:百度百科)

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published

AltStyle によって変換されたページ (->オリジナル) /