I would like to scrape a URL:
1 request to get a list of elements
1 request on each result to get details
Here is what I have:
var request = require('request')
, cheerio = require('cheerio')
, async = require('async')
, format = require('util').format;
// Root of the site being scraped; every request path below is appended to it.
var baseurl = 'http://magiccards.info';
// Two steps run in order: the first collects the sets, the second receives them.
async.waterfall([
// Step 1: fetch the sitemap and build one record per link whose href ends in "/en.html".
function (callback) {
request(baseurl + '/sitemap.html', function (err, response, body) {
// `err` is not inspected here; `body` is handed straight to cheerio.
var sets = [];
var $ = cheerio.load(body);
$('a[href$="/en.html"]').each(function () {
// `code` is the regex capture taken from between the slashes of the href;
// `translations` starts out empty.
sets.push({"name": $(this).text(), "code":$(this).attr('href').match(/\/([^)]+)\//)[1], "path": $(this).attr('href'), "translations":[]});
});
// Pass the collected sets on to the next step.
callback(null, sets);
});
},
// Step 2: for each set, request its page and log the links found on it.
function (sets, callback) {
console.log(sets);
// The iterator's `callback` parameter shadows this step's `callback`.
async.eachSeries(sets, function (set, callback) {
console.log('SET ' + set.code.toUpperCase());
request(baseurl + set.path, function (err, response, body) {
var $ = cheerio.load(body);
// Only anchors that are direct children of <body> and whose href starts with "/<code>/".
$('body > a[href^="/' + set.code + '/"]').each(function () {
console.log(' %s (%s)', $(this).text(), $(this).attr('href'));
});
});
});
}
// Final waterfall callback: logs 'ERR' unconditionally; `err` and `result` are not read.
], function (err, result) {
console.log('ERR');
// result now equals 'done'
});
The problem is that the 2nd waterfall function runs only once; if I replace the eachSeries with an each, the loop does run X times (but I need to wait for the result).
What am I missing?
asked Apr 17, 2014 at 12:49
kitensei
2,5402 gold badges44 silver badges70 bronze badges
1 Answer 1
You need to call the eachSeries callback function. Otherwise async won't know that you are done. (1)
You also need to tell the waterfall function that you are done with that step, also by calling the callback function. (2)
function (sets, waterfallCallback) {
async.eachSeries(sets, function (set, seriesCallback) {
console.log('SET ' + set.code.toUpperCase());
request(baseurl + set.path, function (err, response, body) {
var $ = cheerio.load(body);
$('body > a[href^="/' + set.code + '/"]').each(function () {
console.log(' %s (%s)', $(this).text(), $(this).attr('href'));
});
seriesCallback(null); /* 1 */
});
}, waterfallCallback /* 2 */);
}
answered Apr 17, 2014 at 13:00
Linus Unnebäck
24.4k16 gold badges79 silver badges91 bronze badges
Sign up to request clarification or add additional context in comments.
2 Comments
kitensei
Works great, but without the 2nd callback (I don't understand why you would use a callback down there). When I put a callback on eachSeries, it tells me that callback is undefined (no function)
Linus Unnebäck
I've modified the answer to better explain the two different callbacks. Without the second one, the last function will never run. (The one with
console.log('ERR'); and // result now equals 'done')