Open source module for crawling web pages and extracting data. This module
is extremely extensible and pluggable; feel free to play with it. [Visit
crawlerjs.org for more info][0].
Examples
In this example we will search Stack Overflow for the most recent
questions about Node.js; other examples can be found here.
var Crawler = require('crawler-js');

var manifest = {
  target: {
    url: 'https://stackoverflow.com/questions/tagged/node.js',
    headers: ['User-Agent', 'CrawlerJS']
  },
  extractors: [{
    name: 'questions',
    root: '//div[@class="question-summary"]',
    fields: {
      question: '//a[1]/text()',
      link: '//a[1]/@href'
    }
  }, {
    name: 'count',
    fields: {
      count: '//*[@id="sidebar"]/div[1]/div[1]/text()'
    }
  }]
};

var job = new Crawler(manifest);

job.on('data', function(data) {
  console.log('Data extracted for %s:', data.name);
  console.log(data.data);
});

job.on('error', function(err) {
  throw err;
});

job.on('end', function() {
  console.log('The job is done');
});

job.start();
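
Because the crawler emits a separate 'data' event for each extractor, a handy pattern
is to collect everything in memory and process it once the 'end' event fires. The
sketch below builds on the example above (it reuses the same manifest object);
writing the results to a results.json file is just an illustrative choice, not part
of the module's API.

var fs = require('fs');
var Crawler = require('crawler-js');

// `manifest` is the same object defined in the example above.
var job = new Crawler(manifest);
var results = {};

// Each 'data' event carries the output of one extractor ('questions' or 'count').
job.on('data', function(data) {
  results[data.name] = data.data;
});

job.on('error', function(err) {
  console.error('Crawl failed:', err);
});

// Once the job is done, persist everything collected so far.
job.on('end', function() {
  fs.writeFileSync('results.json', JSON.stringify(results, null, 2));
  console.log('Results written to results.json');
});

job.start();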
Testing
Testing is easy: you need to have grunt-cli installed globally. Clone this
repository, run npm install inside the folder, and then run grunt test.
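
For reference, the commands described above look roughly like this (assuming a
standard npm/grunt setup; the repository URL is omitted here):

npm install -g grunt-cli   # one-time global install of the Grunt CLI
npm install                # run inside the cloned repository folder
grunt test                 # run the test suite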
Todos
- Add inline comments inside the code.
- Proper documentation.
- API reference.
- More tests.
License
Copyright (c) 2014, Rodrigo Matheus <rodrigorizando@gmail.com>
Permission to use, copy, modify, and/or distribute this software for any purpose
with or without fee is hereby granted, provided that the above copyright notice
and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
THIS SOFTWARE.