simplecrawler
Comparing version 0.6.2 to 0.7.0
@@ -0,0 +0,0 @@ /* |
// CLI module for crawling. | ||
// Not yet built. |
@@ -0,0 +0,0 @@ /* |
@@ -85,2 +85,9 @@ /* | ||
// Should we update crawler.host if the first response is a redirect to another domain. | ||
crawler.allowInitialDomainChange = false; | ||
// Set Accept-Encoding header and automatically decompress HTTP responses | ||
// based on Content-Encoding header | ||
crawler.decompressResponses = true; | ||
// Decode HTTP responses based on their Content-Type header or any | ||
@@ -175,2 +182,10 @@ // inline charset definition | ||
// Find srcset links | ||
function (string) { | ||
var result = /\ssrcset\s*=\s*(["'])(.*)\1/.exec(string); | ||
return Array.isArray(result) ? String(result[2]).split(",").map(function (string) { | ||
return string.replace(/\s?\w*$/, "").trim(); | ||
}) : ""; | ||
}, | ||
// Find resources in <meta> redirects. We need to wrap these RegExp's in | ||
@@ -199,2 +214,12 @@ // functions because we only want to return the first capture group, not | ||
// Matching MIME-types will be allowed to fetch further than max depth | ||
crawler.whitelistedMimeTypes = [ | ||
/^text\/(css|javascript|ecmascript)/i, | ||
/^application\/javascript/i, | ||
/^application\/x-font/i, | ||
/^application\/font/i, | ||
/^image\//i, | ||
/^font\//i | ||
]; | ||
// Whether to allow 'resources' greater than the max depth to be downloaded | ||
@@ -212,2 +237,3 @@ crawler.fetchWhitelistedMimeTypesBelowMaxDepth = false; | ||
var hiddenProps = { | ||
_isFirstRequest: true, | ||
_openRequests: 0, | ||
@@ -250,2 +276,6 @@ _fetchConditions: [], | ||
if (crawler.running) { | ||
return crawler; | ||
} | ||
// only if we haven't already got stuff in our queue... | ||
@@ -322,5 +352,5 @@ crawler.queue.getLength(function(err, length) { | ||
return crawler.allowedProtocols.reduce(function(prev, protocolCheck) { | ||
return prev || !!protocolCheck.exec(protocol); | ||
}, false); | ||
return crawler.allowedProtocols.some(function(protocolCheck) { | ||
return protocolCheck.test(protocol); | ||
}); | ||
}; | ||
@@ -344,5 +374,5 @@ | ||
return crawler.supportedMimeTypes.reduce(function(prev, mimeCheck) { | ||
return prev || !!mimeCheck.exec(MIMEType); | ||
}, false); | ||
return crawler.supportedMimeTypes.some(function(mimeCheck) { | ||
return mimeCheck.test(MIMEType); | ||
}); | ||
}; | ||
@@ -357,6 +387,2 @@ | ||
If the queue item is a CSS or JS file, it will always be fetched (we need | ||
all images in CSS files, even if max depth is already reached). If it's an | ||
HTML page, we will check if max depth is reached or not. | ||
queueItem - Queue item object to check | ||
@@ -370,19 +396,16 @@ | ||
// Items matching this pattern will always be fetched, even if max depth | ||
// is reached | ||
var mimeTypesWhitelist = [ | ||
/^text\/(css|javascript|ecmascript)/i, | ||
/^application\/javascript/i, | ||
/^application\/x-font/i, | ||
/^application\/font/i, | ||
/^image\//i, | ||
/^font\//i | ||
]; | ||
var belowMaxDepth = crawler.fetchWhitelistedMimeTypesBelowMaxDepth; | ||
if (typeof belowMaxDepth === "boolean") { | ||
belowMaxDepth = belowMaxDepth === false ? 0 : Infinity; | ||
} | ||
var whitelistedDepth = queueItem.depth - belowMaxDepth; | ||
return crawler.maxDepth === 0 || | ||
queueItem.depth <= crawler.maxDepth || | ||
crawler.fetchWhitelistedMimeTypesBelowMaxDepth && | ||
mimeTypesWhitelist.reduce(function(prev, mimeCheck) { | ||
return prev || !!mimeCheck.exec(queueItem.stateData.contentType); | ||
}, false); | ||
whitelistedDepth <= crawler.maxDepth && | ||
crawler.whitelistedMimeTypes.some(function(mimeCheck) { | ||
return mimeCheck.test(queueItem.stateData.contentType); | ||
}); | ||
}; | ||
@@ -534,5 +557,5 @@ | ||
// Does the item already exist in the list? | ||
if (list.reduce(function(prev, current) { | ||
return prev || current === URL; | ||
}, false)) { | ||
if (list.some(function(entry) { | ||
return entry === URL; | ||
})) { | ||
return list; | ||
@@ -633,23 +656,14 @@ } | ||
// Otherwise, scan through it. | ||
return !!crawler.domainWhitelist.reduce(function(prev, cur) { | ||
// If we already located the relevant domain in the whitelist... | ||
if (prev) { | ||
return prev; | ||
} | ||
return !!crawler.domainWhitelist.some(function(entry) { | ||
// If the domain is just equal, return true. | ||
if (host === cur) { | ||
if (host === entry) { | ||
return true; | ||
} | ||
// If we're ignoring WWW subdomains, and both domains, | ||
// less www. are the same, return true. | ||
if (crawler.ignoreWWWDomain && host === cur.replace(/^www\./i, "")) { | ||
if (crawler.ignoreWWWDomain && host === entry.replace(/^www\./i, "")) { | ||
return true; | ||
} | ||
// Otherwise, sorry. No dice. | ||
return false; | ||
}, false); | ||
}); | ||
} | ||
@@ -701,9 +715,7 @@ | ||
resourceData - Text document containing linked resource URLs. | ||
queueItem - Queue item from which the resource document was derived. | ||
decompressed - Content is already decompressed (default: false) | ||
resourceData - Text document containing linked resource URLs. | ||
queueItem - Queue item from which the resource document was derived. | ||
Emits | ||
gziperr | ||
discoverycomplete | ||
@@ -718,23 +730,7 @@ | ||
*/ | ||
Crawler.prototype.queueLinkedItems = function(resourceData, queueItem, decompressed) { | ||
var crawler = this, | ||
resources = []; | ||
Crawler.prototype.queueLinkedItems = function(resourceData, queueItem) { | ||
var crawler = this; | ||
if (!decompressed && | ||
queueItem.stateData && | ||
queueItem.stateData.headers["content-encoding"] && ( | ||
queueItem.stateData.headers["content-encoding"].match(/gzip/) || | ||
queueItem.stateData.headers["content-encoding"].match(/deflate/))) { | ||
var resources = crawler.discoverResources(resourceData, queueItem); | ||
return zlib.unzip(resourceData, function(err, newData) { | ||
if (err) { | ||
return crawler.emit("gziperror", queueItem, err, resourceData); | ||
} | ||
crawler.queueLinkedItems(newData, queueItem, true); | ||
}); | ||
} | ||
resources = crawler.discoverResources(resourceData, queueItem); | ||
// Emit discovered resources. ie: might be useful in building a graph of | ||
@@ -783,6 +779,5 @@ // page relationships. | ||
// Pass this URL past fetch conditions to ensure the user thinks it's valid | ||
var fetchDenied = false; | ||
fetchDenied = crawler._fetchConditions.reduce(function(prev, callback) { | ||
return prev || !callback(parsedURL, queueItem); | ||
}, false); | ||
var fetchDenied = crawler._fetchConditions.some(function(callback) { | ||
return !callback(parsedURL, queueItem); | ||
}); | ||
@@ -877,2 +872,5 @@ if (fetchDenied) { | ||
var isStandardHTTPPort = queueItem.protocol === "http" && queueItem.port !== 80, | ||
isStandardHTTPSPort = queueItem.protocol === "https" && queueItem.port !== 443; | ||
// Load in request options | ||
@@ -886,8 +884,6 @@ requestOptions = { | ||
headers: { | ||
"User-Agent": crawler.userAgent, | ||
"Host": queueItem.host + ( | ||
queueItem.port !== 80 ? | ||
":" + queueItem.port : | ||
"" | ||
) | ||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", | ||
"Accept-Encoding": "gzip, deflate", | ||
"User-Agent": crawler.userAgent, | ||
"Host": queueItem.host + (isStandardHTTPPort || isStandardHTTPSPort ? ":" + queueItem.port : "") | ||
} | ||
@@ -901,3 +897,3 @@ }; | ||
// If port is one of the HTTP/HTTPS defaults, delete the option to avoid conflicts | ||
if (requestOptions.port === 80 || requestOptions.port === 443) { | ||
if (requestPort === 80 || requestPort === 443) { | ||
delete requestOptions.port; | ||
@@ -934,7 +930,5 @@ } | ||
for (var header in crawler.customHeaders) { | ||
if (!crawler.customHeaders.hasOwnProperty(header)) { | ||
continue; | ||
if (crawler.customHeaders.hasOwnProperty(header)) { | ||
requestOptions.headers[header] = crawler.customHeaders[header]; | ||
} | ||
requestOptions.headers[header] = crawler.customHeaders[header]; | ||
} | ||
@@ -1064,8 +1058,8 @@ } | ||
// Save timing and content some header information into queue | ||
stateData.requestLatency = timeHeadersReceived - timeCommenced; | ||
stateData.requestTime = timeHeadersReceived - timeCommenced; | ||
stateData.contentLength = responseLength; | ||
stateData.contentType = contentType = response.headers["content-type"]; | ||
stateData.code = response.statusCode; | ||
stateData.headers = response.headers; | ||
stateData.requestLatency = timeHeadersReceived - timeCommenced; | ||
stateData.requestTime = timeHeadersReceived - timeCommenced; | ||
stateData.contentLength = responseLength; | ||
stateData.contentType = contentType = response.headers["content-type"]; | ||
stateData.code = response.statusCode; | ||
stateData.headers = response.headers; | ||
@@ -1085,7 +1079,16 @@ // Do we need to save cookies? Were we sent any? | ||
// Ensure response length is reasonable... | ||
responseLength = | ||
responseLength > 0 ? responseLength : crawler.maxResourceSize; | ||
responseLength = responseLength > 0 ? responseLength : crawler.maxResourceSize; | ||
queueItem.stateData.contentLength = responseLength; | ||
function emitFetchComplete(responseBody, decompressedBuffer) { | ||
responseBody = crawler.decodeResponses ? crawler.decodeBuffer(responseBody, stateData.contentType) : responseBody; | ||
crawler.emit("fetchcomplete", queueItem, responseBody, response); | ||
// We only process the item if it's of a valid mimetype | ||
// and only if the crawler is set to discover its own resources | ||
if (crawler.mimeTypeSupported(contentType) && crawler.discoverResources) { | ||
crawler.queueLinkedItems(decompressedBuffer || responseBody, queueItem); | ||
} | ||
} | ||
// Function for dealing with 200 responses | ||
@@ -1116,13 +1119,19 @@ function processReceivedData() { | ||
// Is the item allowed by depth conditions ? | ||
if (crawler.depthAllowed(queueItem)) { | ||
var responseBody = | ||
crawler.decodeResponses ? crawler.decodeBuffer(responseBuffer, stateData.contentType) : responseBuffer; | ||
crawler.emit("fetchcomplete", queueItem, responseBody, response); | ||
// We only process the item if it's of a valid mimetype | ||
// and only if the crawler is set to discover its own resources | ||
if (crawler.mimeTypeSupported(contentType) && crawler.discoverResources) { | ||
crawler.queueLinkedItems(responseBuffer, queueItem); | ||
// No matter the value of `crawler.decompressResponses`, we still | ||
// decompress the response if it's gzipped or deflated. This is | ||
// because we always provide the discoverResources method with a | ||
// decompressed buffer | ||
if (/(gzip|deflate)/.test(stateData.headers["content-encoding"])) { | ||
zlib.unzip(responseBuffer, function(error, decompressedBuffer) { | ||
if (error) { | ||
crawler.emit("gziperror", queueItem, error, responseBuffer); | ||
emitFetchComplete(responseBuffer); | ||
} else { | ||
var responseBody = crawler.decompressResponses ? decompressedBuffer : responseBuffer; | ||
emitFetchComplete(responseBody, decompressedBuffer); | ||
} | ||
}); | ||
} else { | ||
emitFetchComplete(responseBuffer); | ||
} | ||
@@ -1196,2 +1205,4 @@ } | ||
crawler._isFirstRequest = false; | ||
// We've got a not-modified response back | ||
@@ -1211,2 +1222,4 @@ } else if (response.statusCode === 304) { | ||
crawler._isFirstRequest = false; | ||
// If we should queue a redirect | ||
@@ -1225,5 +1238,13 @@ } else if (response.statusCode >= 300 && response.statusCode < 400 && | ||
if (crawler._isFirstRequest) { | ||
parsedURL.depth = 1; | ||
} | ||
if (crawler.allowInitialDomainChange && crawler._isFirstRequest) { | ||
crawler.host = parsedURL.host; | ||
} | ||
// Clean URL, add to queue... | ||
crawler.queueURL(parsedURL, queueItem); | ||
response.socket.end(); | ||
response.socket.destroy(); | ||
@@ -1238,7 +1259,9 @@ crawler._openRequests--; | ||
// Emit 404 event | ||
crawler.emit("fetch404", queueItem, response); | ||
response.socket.end(); | ||
crawler.emit("fetch" + response.statusCode, queueItem, response); | ||
response.socket.destroy(); | ||
crawler._openRequests--; | ||
crawler._isFirstRequest = false; | ||
// And oh dear. Handle this one as well. (other 400s, 500s, etc) | ||
@@ -1254,2 +1277,4 @@ } else { | ||
crawler._openRequests--; | ||
crawler._isFirstRequest = false; | ||
} | ||
@@ -1256,0 +1281,0 @@ |
{ | ||
"name": "simplecrawler", | ||
"description": "Very straightforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.", | ||
"version": "0.6.2", | ||
"version": "0.7.0", | ||
"homepage": "https://github.com/cgiffard/node-simplecrawler", | ||
@@ -6,0 +6,0 @@ "author": "Christopher Giffard <christopher.giffard@cgiffard.com>", |
README.md
@@ -1,49 +0,57 @@ | ||
# Simple web-crawler for Node.js | ||
# Simple web crawler for node.js | ||
[![NPM version](https://img.shields.io/npm/v/simplecrawler.svg)](https://www.npmjs.com/package/simplecrawler) | ||
[![Build Status](https://img.shields.io/travis/cgiffard/node-simplecrawler/master.svg)](https://travis-ci.org/cgiffard/node-simplecrawler) | ||
[![Linux Build Status](https://img.shields.io/travis/cgiffard/node-simplecrawler/master.svg)](https://travis-ci.org/cgiffard/node-simplecrawler) | ||
[![Windows Build Status](https://img.shields.io/appveyor/ci/cgiffard/node-simplecrawler/master.svg?label=Windows%20build)](https://ci.appveyor.com/project/cgiffard/node-simplecrawler/branch/master) | ||
[![Dependency Status](https://img.shields.io/david/cgiffard/node-simplecrawler.svg)](https://david-dm.org/cgiffard/node-simplecrawler) | ||
[![devDependency Status](https://img.shields.io/david/dev/cgiffard/node-simplecrawler.svg)](https://david-dm.org/cgiffard/node-simplecrawler#info=devDependencies) | ||
Simplecrawler is designed to provide the most basic possible API for crawling | ||
websites, while being as flexible and robust as possible. I wrote simplecrawler | ||
to archive, analyse, and search some very large websites. It has happily chewed | ||
through 50,000 pages and written tens of gigabytes to disk without issue. | ||
simplecrawler is designed to provide a basic, flexible and robust API for | ||
crawling websites. I wrote simplecrawler to archive, analyse, and search some | ||
very large websites. It has happily chewed through hundreds of thousands of | ||
pages and written tens of gigabytes to disk without issue. | ||
#### Example (simple mode) | ||
## What does simplecrawler do? | ||
```js | ||
var Crawler = require("simplecrawler"); | ||
Crawler.crawl("http://example.com/") | ||
.on("fetchcomplete", function(queueItem) { | ||
console.log("Completed fetching resource:", queueItem.url); | ||
}); | ||
``` | ||
### What does simplecrawler do? | ||
* Provides a very simple event driven API using `EventEmitter` | ||
* Extremely configurable base for writing your own crawler | ||
* Provides some simple logic for auto-detecting linked resources - which you can | ||
replace or augment | ||
replace or augment | ||
* Has a flexible queue system which can be frozen to disk and defrosted | ||
* Provides basic statistics on network performance | ||
* Uses buffers for fetching and managing data, preserving binary data (except | ||
when discovering links) | ||
when discovering links) | ||
### Installation | ||
## Documentation | ||
``` | ||
npm install simplecrawler | ||
``` | ||
- [Getting started](#getting-started) | ||
- [Simplified mode](#simplified-mode) | ||
- [Regular mode](#regular-mode) | ||
- [Events](#events) | ||
- [A note about HTTP error conditions](#a-note-about-http-error-conditions) | ||
- [Waiting for asynchronous event listeners](#waiting-for-asynchronous-event-listeners) | ||
- [Configuration](#configuration) | ||
- [Fetch conditions](#fetch-conditions) | ||
- [The queue](#the-queue) | ||
- [Manually adding to the queue](#manually-adding-to-the-queue) | ||
- [Queue items](#queue-items) | ||
- [Queue statistics and reporting](#queue-statistics-and-reporting) | ||
- [Saving and reloading the queue (freeze/defrost)](#saving-and-reloading-the-queue-freezedefrost) | ||
- [Cookies](#cookies) | ||
- [Cookie events](#cookie-events) | ||
- [Link Discovery](#link-discovery) | ||
- [FAQ/Troubleshooting](#faqtroubleshooting) | ||
- [Current Maintainers](#current-maintainers) | ||
- [Contributors](#contributors) | ||
- [License](#license) | ||
### Getting Started | ||
## Getting Started | ||
There are two ways of instantiating a new crawler - a simple but less flexible | ||
method inspired by [anemone](http://anemone.rubyforge.org), and the traditional | ||
method which provides a little more room to configure crawl parameters. | ||
There are two ways of instantiating a new crawler - a simplified but less | ||
flexible method inspired by [anemone](http://anemone.rubyforge.org), and the | ||
traditional method which provides a little more room to configure crawl | ||
parameters. | ||
Regardless of whether you use the simple or traditional methods of instantiation, | ||
you'll need to require simplecrawler: | ||
Regardless of whether you use the simplified or regular method of instantiation, | ||
you'll need to require simplecrawler first: | ||
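That is, at the top of your script (as in the examples throughout this README):

```js
var Crawler = require("simplecrawler");
```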
@@ -54,9 +62,11 @@ ```js | ||
#### Simple Mode | ||
### Simplified Mode | ||
Simple mode generates a new crawler for you, preconfigures it based on a URL you | ||
provide, and returns the crawler to you for further configuration and so you can | ||
attach event handlers. | ||
If all you need is a quick crawl of a small website, the simplified mode of | ||
initiating the crawler provides a slightly quicker way of getting started. It | ||
generates a new crawler for you, preconfigures it based on a URL you provide, | ||
starts the crawl and returns the crawler instance for further configuration and | ||
so that you can attach event handlers. | ||
Simply call `Crawler.crawl`, with a URL first parameter, and two optional | ||
Simply call `Crawler.crawl` with a URL as the first parameter, and two optional | ||
functions that will be added as event listeners for `fetchcomplete` and | ||
@@ -85,21 +95,20 @@ `fetcherror` respectively. | ||
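For instance, a minimal sketch of that call shape (the listener signatures follow the [events](#events) list):

```js
Crawler.crawl("http://example.com/",
    function(queueItem, responseBuffer, response) {
        console.log("Fetched", queueItem.url);
    },
    function(queueItem, response) {
        console.log("Failed to fetch", queueItem.url);
    });
```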
#### Advanced Mode | ||
### Regular Mode | ||
The alternative method of creating a crawler is to call the `simplecrawler` | ||
constructor yourself, and to initiate the crawl manually. | ||
The standard way of creating a crawler is to call the `simplecrawler` | ||
constructor yourself and initiate the crawl manually. | ||
```js | ||
var myCrawler = new Crawler("www.example.com"); | ||
var crawler = new Crawler("www.example.com"); | ||
``` | ||
Nonstandard port? HTTPS? Want to start archiving a specific path? No problem: | ||
Non-standard port? HTTPS? Want to start crawling at a specific path? No problem: | ||
```js | ||
myCrawler.initialPath = "/archive"; | ||
myCrawler.initialPort = 8080; | ||
myCrawler.initialProtocol = "https"; | ||
crawler.initialPath = "/archive"; | ||
crawler.initialPort = 8080; | ||
crawler.initialProtocol = "https"; | ||
// Or: | ||
var myCrawler = new Crawler("www.example.com", "/archive", 8080); | ||
var crawler = new Crawler("www.example.com", "/archive", 8080); | ||
``` | ||
@@ -112,36 +121,30 @@ | ||
```js | ||
myCrawler.interval = 10000; // Ten seconds | ||
myCrawler.maxConcurrency = 1; | ||
crawler.interval = 10000; // Ten seconds | ||
crawler.maxConcurrency = 1; | ||
``` | ||
You can also define a max depth for links to fetch: | ||
```js | ||
myCrawler.maxDepth = 1; // Only first page is fetched (with linked CSS & images) | ||
crawler.maxDepth = 1; // Only first page is fetched (with linked CSS & images) | ||
// Or: | ||
myCrawler.maxDepth = 2; // First page and discovered links from it are fetched | ||
crawler.maxDepth = 2; // First page and discovered links from it are fetched | ||
// Or: | ||
myCrawler.maxDepth = 3; // Etc. | ||
crawler.maxDepth = 3; // Etc. | ||
``` | ||
For brevity, you may also specify the initial path and request interval when | ||
creating the crawler: | ||
For a full list of configurable properties, see the | ||
[configuration section](#configuration). | ||
```js | ||
var myCrawler = new Crawler("www.example.com", "/", 8080, 300); | ||
``` | ||
You'll also need to set up event listeners for the [events](#events) you want to | ||
listen to. `fetchcomplete` and `complete` are a good place to start. | ||
### Running the crawler | ||
First, you'll need to set up an event listener to get the fetched data: | ||
```js | ||
myCrawler.on("fetchcomplete", function(queueItem, responseBuffer, response) { | ||
crawler.on("fetchcomplete", function(queueItem, responseBuffer, response) { | ||
console.log("I just received %s (%d bytes)", queueItem.url, responseBuffer.length); | ||
console.log("It was a resource of type %s", response.headers['content-type']); | ||
// Do something with the data in responseBuffer | ||
}); | ||
``` | ||
Then, when you're satisfied you're ready to go, start the crawler! It'll run | ||
Then, when you're satisfied and ready to go, start the crawler! It'll run | ||
through its queue finding linked resources on the domain to download, until it | ||
@@ -151,3 +154,3 @@ can't find any more. | ||
```js | ||
myCrawler.start(); | ||
crawler.start(); | ||
``` | ||
@@ -188,3 +191,5 @@ | ||
* `fetch404` (queueItem, response) | ||
Fired when a 404 or 410 HTTP status code is returned for a request. | ||
Fired when a 404 HTTP status code is returned for a request. | ||
* `fetch410` (queueItem, response) | ||
Fired when a 410 HTTP status code is returned for a request. | ||
* `fetcherror` (queueItem, response) | ||
@@ -213,8 +218,4 @@ Fired when an alternate 400 or 500 series HTTP status code is returned for a | ||
If this is annoying, and you'd really like to retain error pages by default, let | ||
me know. I didn't include it because I didn't need it - but if it's important to | ||
people I might put it back in. :) | ||
### Waiting for asynchronous event listeners | ||
#### Waiting for Asynchronous Event Listeners | ||
Sometimes, you might want simplecrawler to wait for you while you
@@ -225,3 +226,3 @@ perform some asynchronous tasks in an event listener, instead of having it | ||
Simplecrawler provides a `wait` method you can call at any time. It is available | ||
simplecrawler provides a `wait` method you can call at any time. It is available | ||
via `this` from inside listeners, and on the crawler object itself. It returns | ||
@@ -234,3 +235,3 @@ a callback function. | ||
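A minimal sketch of the pattern, where `doSomethingAsync` stands in for any asynchronous task of your own:

```js
crawler.on("fetchcomplete", function(queueItem, data, response) {
    // Ask the crawler to wait; it hands back a callback to call when we're done
    var resume = this.wait();

    // doSomethingAsync is hypothetical - substitute your own asynchronous work
    doSomethingAsync(data, function() {
        resume();
    });
});
```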
##### Example Asynchronous Event Listener | ||
#### Example asynchronous event listener | ||
@@ -247,156 +248,167 @@ ```js | ||
### Configuring the crawler | ||
## Configuration | ||
Here's a complete list of what you can stuff with at this stage: | ||
simplecrawler is highly configurable and there's a long list of settings you can | ||
change to adapt it to your specific needs. | ||
* `crawler.host` - | ||
* `crawler.host` - | ||
The domain to scan. By default, simplecrawler will restrict all requests to | ||
this domain. | ||
* `crawler.initialPath` - | ||
* `crawler.initialPath="/"` - | ||
The initial path with which the crawler will formulate its first request. | ||
Does not restrict subsequent requests. | ||
* `crawler.initialPort` - | ||
* `crawler.initialPort=80` - | ||
The initial port with which the crawler will formulate its first request. | ||
Does not restrict subsequent requests. | ||
* `crawler.initialProtocol` - | ||
The initial protocol with which the crawler will formulate its first request. | ||
Does not restrict subsequent requests. | ||
* `crawler.interval` - | ||
* `crawler.initialProtocol="http"` - | ||
The initial protocol with which the crawler will formulate its first | ||
request. Does not restrict subsequent requests. | ||
* `crawler.interval=250` - | ||
The interval with which the crawler will spool up new requests (one per | ||
tick.) Defaults to 250 ms. | ||
* `crawler.maxConcurrency` - | ||
tick). | ||
* `crawler.maxConcurrency=5` - | ||
The maximum number of requests the crawler will run simultaneously. Defaults | ||
to 5 - the default number of http agents node will run. | ||
* `crawler.timeout` - | ||
* `crawler.timeout=300000` - | ||
The maximum time in milliseconds the crawler will wait for headers before | ||
aborting the request. | ||
* `crawler.listenerTTL` - | ||
* `crawler.listenerTTL=10000` - | ||
The maximum time in milliseconds the crawler will wait for async listeners. | ||
* `crawler.userAgent` - | ||
The user agent the crawler will report. Defaults to | ||
`Node/SimpleCrawler <version> (https://github.com/cgiffard/node-simplecrawler)`. | ||
* `crawler.decodeResponses` - | ||
The response bodies will be intelligently character converted to standard | ||
JavaScript strings using the `iconv-lite` module. The character encoding | ||
is interpreted from the Content-Type header firstly, and secondly from any | ||
<meta charset="xxx" /> tags. | ||
* `crawler.queue` - | ||
* `crawler.userAgent="Node/simplecrawler <version> (https://github.com/cgiffard/node-simplecrawler)"` - | ||
The user agent the crawler will report. | ||
* `crawler.decompressResponses=true` - | ||
Response bodies that are compressed will be automatically decompressed | ||
before they're emitted in the `fetchcomplete` event. Even if this is falsy, | ||
compressed responses will be decompressed before they're passed to the | ||
`discoverResources` method. | ||
* `crawler.decodeResponses=false` - | ||
Response bodies will be intelligently character converted to standard | ||
JavaScript strings using the | ||
[iconv-lite](https://www.npmjs.com/package/iconv-lite) module. The character | ||
encoding is interpreted from the Content-Type header firstly, and secondly | ||
from any `<meta charset="xxx" />` tags. | ||
* `crawler.queue` - | ||
The queue in use by the crawler (Must implement the `FetchQueue` interface) | ||
* `crawler.filterByDomain` - | ||
* `crawler.allowInitialDomainChange=false` - | ||
If the response for the initial URL is a redirect to another domain (e.g. | ||
from github.net to github.com), update `crawler.host` to continue the | ||
crawling on that domain. | ||
* `crawler.filterByDomain=true` - | ||
Specifies whether the crawler will restrict queued requests to a given | ||
domain/domains. | ||
* `crawler.scanSubdomains` - | ||
Enables scanning subdomains (other than www) as well as the specified domain. | ||
Defaults to false. | ||
* `crawler.ignoreWWWDomain` - | ||
* `crawler.scanSubdomains=false` - | ||
Enables scanning subdomains (other than www) as well as the specified | ||
domain. | ||
* `crawler.ignoreWWWDomain=true` - | ||
Treats the `www` domain the same as the originally specified domain. | ||
Defaults to true. | ||
* `crawler.stripWWWDomain` - | ||
* `crawler.stripWWWDomain=false` - | ||
Or go even further and strip WWW subdomain from requests altogether! | ||
* `crawler.stripQuerystring` - | ||
Specify to strip querystring parameters from URLs. Defaults to false. | ||
* `crawler.discoverResources` - | ||
Use simplecrawler's internal resource discovery function. You can replace it | ||
with your own function, which must accept a buffer and a queueItem, and add | ||
the discovered resources to the crawler queue: | ||
* `crawler.stripQuerystring=false` - | ||
Specify to strip querystring parameters from URLs.
* `crawler.discoverResources` - | ||
simplecrawler's default resource discovery function - | ||
which, given a buffer containing a resource, returns an array of URLs. | ||
For more details about link discovery, see [Link Discovery](#link-discovery) | ||
* `crawler.discoverRegex` - | ||
Array of regular expressions and functions that simplecrawler uses to | ||
discover resources. Functions in this array are expected to return an array. | ||
* `crawler.cache` - | ||
Specify a cache architecture to use when crawling. Must implement | ||
`SimpleCache` interface. You can save the site to disk using the built in | ||
file system cache like this: | ||
```js | ||
crawler.discoverResources = function(buf, queueItem) { | ||
// scan buffer for URLs, and then: | ||
... | ||
crawler.queueURL(aDiscoveredURL, queueItem); | ||
... | ||
}; | ||
crawler.cache = new Crawler.cache('pathToCacheDirectory'); | ||
``` | ||
* `crawler.discoverRegex` - | ||
Array of regular expressions and functions that simplecrawler uses to | ||
discover resources. Functions in this array are expected to return an array. | ||
* `crawler.cache` - | ||
Specify a cache architecture to use when crawling. Must implement | ||
`SimpleCache` interface. You can save the site to disk using the built in file | ||
system cache like this: `crawler.cache = new Crawler.cache('pathToCacheDirectory');` | ||
* `crawler.useProxy` - | ||
* `crawler.useProxy=false` - | ||
The crawler should use an HTTP proxy to make its requests. | ||
* `crawler.proxyHostname` - | ||
* `crawler.proxyHostname="127.0.0.1"` - | ||
The hostname of the proxy to use for requests. | ||
* `crawler.proxyPort` - | ||
* `crawler.proxyPort=8123` - | ||
The port of the proxy to use for requests. | ||
* `crawler.proxyUser` - | ||
The username for HTTP/Basic proxy authentication (leave unset for unauthenticated proxies.) | ||
* `crawler.proxyPass` - | ||
The password for HTTP/Basic proxy authentication (leave unset for unauthenticated proxies.) | ||
* `crawler.domainWhitelist` - | ||
An array of domains the crawler is permitted to crawl from. If other settings | ||
are more permissive, they will override this setting. | ||
* `crawler.supportedMimeTypes` - | ||
* `crawler.proxyUser=null` - | ||
The username for HTTP/Basic proxy authentication (leave unset for | ||
unauthenticated proxies.) | ||
* `crawler.proxyPass=null` - | ||
The password for HTTP/Basic proxy authentication (leave unset for | ||
unauthenticated proxies.) | ||
* `crawler.domainWhitelist` - | ||
An array of domains the crawler is permitted to crawl from. If other | ||
settings are more permissive, they will override this setting. | ||
* `crawler.supportedMimeTypes` - | ||
An array of RegEx objects used to determine supported MIME types (types of | ||
data simplecrawler will scan for links.) If you're not using simplecrawler's | ||
resource discovery function, this won't have any effect. | ||
* `crawler.allowedProtocols` - | ||
An array of RegEx objects used to determine whether a URL protocol is supported. | ||
This is to deal with nonstandard protocol handlers that regular HTTP is | ||
sometimes given, like `feed:`. It does not provide support for non-http | ||
protocols (and why would it!?) | ||
* `crawler.maxResourceSize` - | ||
The maximum resource size, in bytes, which will be downloaded. Defaults to 16MB. | ||
* `crawler.downloadUnsupported` - | ||
Simplecrawler will download files it can't parse. Defaults to true, but if | ||
data simplecrawler will scan for links.) If you're not using | ||
simplecrawler's resource discovery function, this won't have any effect. | ||
* `crawler.allowedProtocols` - | ||
An array of RegExp objects used to determine whether a URL protocol is | ||
supported. This is to deal with nonstandard protocol handlers that regular | ||
HTTP is sometimes given, like `feed:`. It does not provide support for | ||
non-http protocols (and why would it!?) | ||
* `crawler.maxResourceSize=16777216` - | ||
The maximum resource size that will be downloaded, in bytes. Defaults to | ||
16MB. | ||
* `crawler.downloadUnsupported=true` - | ||
simplecrawler will download files it can't parse. Defaults to true, but if | ||
you'd rather save the RAM and GC lag, switch it off. When false, it closes | ||
sockets for unsupported resources. | ||
* `crawler.needsAuth` - | ||
Flag to specify if the domain you are hitting requires basic authentication | ||
* `crawler.authUser` - | ||
Username provided for needsAuth flag | ||
* `crawler.authPass` - | ||
Password provided for needsAuth flag | ||
* `crawler.customHeaders` - | ||
* `crawler.needsAuth=false` - | ||
Flag to specify if the domain you are hitting requires basic authentication. | ||
* `crawler.authUser=""` - | ||
Username provided for `needsAuth` flag. | ||
* `crawler.authPass=""` - | ||
Password provided for `needsAuth` flag. | ||
* `crawler.customHeaders` - | ||
An object specifying a number of custom headers simplecrawler will add to | ||
every request. These override the default headers simplecrawler sets, so | ||
be careful with them. If you want to tamper with headers on a per-request basis, | ||
see the `fetchqueue` event. | ||
* `crawler.acceptCookies` - | ||
Flag to indicate if the crawler should hold on to cookies | ||
* `crawler.urlEncoding` - | ||
Set this to `iso8859` to trigger URIjs' re-encoding of iso8859 URLs to unicode. | ||
Defaults to `unicode`. | ||
* `crawler.parseHTMLComments` - | ||
Whether to scan for URLs inside HTML comments. | ||
Defaults to `true`. | ||
* `crawler.parseScriptTags` - | ||
Whether to scan for URLs inside script tags. | ||
Defaults to `true`. | ||
* `crawler.maxDepth` - | ||
every request. These override the default headers simplecrawler sets, so be | ||
careful with them. If you want to tamper with headers on a per-request | ||
basis, see the `fetchqueue` event. | ||
* `crawler.acceptCookies=true` - | ||
Flag to indicate if the crawler should hold on to cookies. | ||
* `crawler.urlEncoding="unicode"` - | ||
Set this to `iso8859` to trigger | ||
[URI.js](https://medialize.github.io/URI.js/)' re-encoding of iso8859 URLs
to unicode. | ||
* `crawler.parseHTMLComments=true` - | ||
Whether to scan for URLs inside HTML comments.
* `crawler.parseScriptTags=true` - | ||
Whether to scan for URLs inside script tags.
* `crawler.maxDepth=0` - | ||
Defines a maximum distance from the original request at which resources will | ||
be downloaded. Asset files are excluded from this distance condition if | ||
`crawler.fetchWhitelistedMimeTypesBelowMaxDepth` is `true`. Defaults to `0` | ||
— no max depth. | ||
* `crawler.fetchWhitelistedMimeTypesBelowMaxDepth` — Defaults to `false`. If | ||
`true`, then resources (fonts, images, CSS) will be excluded from `maxDepth` | ||
checks. (And therefore downloaded regardless of their depth.) | ||
* `crawler.ignoreInvalidSSL` - | ||
`crawler.fetchWhitelistedMimeTypesBelowMaxDepth` is `true`. Defaults to `0` — | ||
no max depth. | ||
* `crawler.whitelistedMimeTypes` - | ||
An array of RegEx objects used to determine whitelisted MIME types (types of
data simplecrawler will fetch regardless of the `maxDepth` checks).
Defaults to common resource types like styles, fonts, scripts and images. | ||
* `crawler.fetchWhitelistedMimeTypesBelowMaxDepth=false` - | ||
Controls how much deeper than `maxDepth` whitelisted resources may be fetched. If `true`,
resources matching `whitelistedMimeTypes` are always fetched regardless of depth, while
`false` limits them to `maxDepth` like everything else. A numeric value specifies a
concrete offset beyond `maxDepth` (e.g. 1 for one additional depth layer).
* `crawler.ignoreInvalidSSL=false` - | ||
Treat self-signed SSL certificates as valid. SSL certificates will not be | ||
validated against known CAs. Only applies to https requests. You may also have | ||
to set the environment variable NODE_TLS_REJECT_UNAUTHORIZED to '0'. | ||
validated against known CAs. Only applies to https requests. You may also | ||
have to set the environment variable NODE_TLS_REJECT_UNAUTHORIZED to '0'. | ||
For example: `process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0';` | ||
Defaults to false. | ||
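As a quick illustration, a handful of these settings combined (the values here are only examples):

```js
var crawler = new Crawler("example.com");

crawler.interval = 500;        // Half a second between spooled requests
crawler.maxConcurrency = 2;    // At most two requests in flight at once
crawler.maxDepth = 3;          // Don't wander more than three hops from the start page
crawler.customHeaders = {
    "Accept-Language": "en-GB" // Merged into the headers of every request
};
```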
#### Excluding certain resources from downloading | ||
## Fetch conditions | ||
Simplecrawler has a mechanism you can use to prevent certain resources from being | ||
fetched, based on the URL, called **Fetch Conditions**. A fetch condition is just | ||
a function, which, when given a parsed URL object, will return a boolean that | ||
indicates whether a given resource should be downloaded. | ||
simplecrawler has a mechanism you can use to prevent certain resources from | ||
being fetched, based on the URL, called fetch conditions. A fetch condition is a | ||
function that, when given a parsed URL object, returns a value that indicates | ||
whether a given resource should be downloaded. | ||
You may add as many fetch conditions as you like, and remove them at runtime. | ||
Simplecrawler will evaluate every single condition against every queued URL, and | ||
should just one of them return a falsy value (this includes `null` and `undefined`, | ||
so remember to always return a value!) then the resource in question will not be | ||
fetched. | ||
simplecrawler will evaluate every fetch condition until one is
encountered that returns a falsy value. If that happens, the resource in | ||
question will not be fetched. | ||
##### Adding a fetch condition | ||
### Adding a fetch condition | ||
This example fetch condition prevents URLs ending in `.pdf` from being downloaded. | ||
Adding a fetch condition assigns it an ID, which the `addFetchCondition` function | ||
returns. You can use this ID to remove the condition later. | ||
This example fetch condition prevents URLs ending in `.pdf` from being
downloaded. Adding a fetch condition assigns it an ID, which the | ||
`addFetchCondition` function returns. You can use this ID to remove the | ||
condition later. | ||
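A rough sketch of such a condition:

```js
// Skip any URL whose path ends in ".pdf"
var conditionID = crawler.addFetchCondition(function(parsedURL, queueItem) {
    return !parsedURL.path.match(/\.pdf$/i);
});
```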
@@ -410,3 +422,4 @@ ```js | ||
Fetch conditions are called with two arguments: `parsedURL` and `queueItem`. | ||
`parsedURL` is the resource to be fetched (or not) and has the following structure: | ||
`parsedURL` represents the resource to be fetched (or not) and has the following | ||
structure: | ||
@@ -424,26 +437,12 @@ ```js | ||
`queueItem` is a representation of the page where this resource was found, it | ||
looks like this: | ||
`queueItem` is a representation of the page where this resource was found. See | ||
the [queue item documentation](#queue-items) for details on its structure. | ||
```js | ||
{ | ||
url: "http://example.com/index.php", | ||
protocol: "http", | ||
host: "example.com", | ||
port: 80, | ||
path: "/index.php", | ||
depth: 1, | ||
fetched: true, | ||
status: "downloaded", | ||
stateData: {...} | ||
} | ||
``` | ||
This information enables you to write sophisticated logic for which pages to | ||
fetch and which to avoid. You could, for example, implement a link checker that | ||
not only checks your site, but also links to external sites, but doesn't continue | ||
crawling those sites by setting `filterByDomain` to false and checking that | ||
With this information, you can write sophisticated logic for determining which | ||
pages to fetch and which to avoid. For example, you could write a link checker | ||
that checks both internal and external links, yet doesn't continue crawling | ||
other domains by setting `filterByDomain` to false and checking that | ||
`queueItem.host` is the same as `crawler.host`. | ||
##### Removing a fetch condition | ||
### Removing a fetch condition | ||
@@ -457,48 +456,41 @@ If you stored the ID of the fetch condition you added earlier, you can remove it | ||
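Roughly:

```js
// conditionID is the value addFetchCondition returned earlier
crawler.removeFetchCondition(conditionID);
```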
##### Excluding resources based on robots.txt | ||
## The queue | ||
Simplecrawler [purposely](https://github.com/cgiffard/node-simplecrawler/issues/153) | ||
doesn't come with any built in support for parsing robots.txt rules. Adding | ||
support manually is very straightforward using fetch conditions however, and | ||
in `examples/robots-txt-example.js` you'll find an example that makes use of | ||
the [robots-parser](https://www.npmjs.com/package/robots-parser) module to do | ||
just that. | ||
Like any other web crawler, simplecrawler has a queue. It can be directly | ||
accessed through `crawler.queue` and is by default only backed by an array, | ||
which means items in the queue can be accessed through array notation. However, | ||
since simplecrawler also supports different backing stores for the queue, the | ||
recommended way of accessing items is through the (pseudo) asynchronous | ||
`crawler.queue.get` method. | ||
### The Simplecrawler Queue | ||
Simplecrawler has a queue like any other web crawler. It can be directly accessed | ||
at `crawler.queue` (assuming you called your `Crawler()` object `crawler`.) It | ||
provides array access, so you can get to queue items just with array notation | ||
and an index. | ||
```js | ||
crawler.queue[5]; | ||
crawler.queue.get(5, function (queueItem) { | ||
// Do something with the queueItem | ||
}); | ||
``` | ||
For compatibility with different backing stores, it now provides an alternate | ||
interface which the crawler core makes use of: | ||
Even though this operation is actually synchronous when the default backing | ||
store is used, this method helps maintain compatibility with asynchronous | ||
backing stores that would let you, for example, store the queue in a database.
```js | ||
crawler.queue.get(5); | ||
``` | ||
### Manually adding to the queue | ||
It's not just an array though. | ||
#### Adding to the queue | ||
The simplest way to add to the queue is to use the crawler's own method, | ||
The simplest way of manually adding to the queue is to use the crawler's method | ||
`crawler.queueURL`. This method takes a complete URL, validates and deconstructs | ||
it, and adds it to the queue. | ||
If you instead want to add a resource by its components, you may call the | ||
`queue.add` method directly: | ||
```js | ||
var customQueueItem = { | ||
url: "http://example.com", | ||
depth: 2 | ||
}; | ||
```js | ||
crawler.queue.add(protocol, hostname, port, path); | ||
crawler.queueURL("/example.html", customQueueItem); | ||
``` | ||
That's it! It's basically just a URL, but comma separated (that's how you can | ||
remember the order.) | ||
If you instead want to add a resource by its components, you may call the | ||
`queue.add` method directly with the signature `protocol`, `hostname`, `port`, | ||
`path`. | ||
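A sketch following the signature given above:

```js
crawler.queue.add("http", "example.com", 80, "/resource.html");
```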
#### Queue items | ||
### Queue items | ||
@@ -509,44 +501,52 @@ Because when working with simplecrawler, you'll constantly be handed queue items, | ||
* `url` - The complete, canonical URL of the resource. | ||
* `url` - The complete, canonical URL of the resource | ||
* `protocol` - The protocol of the resource (http, https) | ||
* `host` - The full domain/hostname of the resource | ||
* `port` - The port of the resource | ||
* `path` - The bit of the URL after the domain - includes the query string. | ||
* `fetched` - Has the request for this item been completed? You can monitor this as requests are processed. | ||
* `status` - The internal status of the item, always a string. This can be one of: | ||
* `queued` - The resource is in the queue to be fetched, but nothing's happened to it yet. | ||
* `spooled` - A request has been made to the remote server, but we're still waiting for a response. | ||
* `headers` - The headers for the resource have been received. | ||
* `downloaded` - The item has been entirely downloaded. | ||
* `redirected` - The resource request returned a 300 series response, with a Location header and a new URL. | ||
* `notfound` - The resource could not be found. (404) | ||
* `failed` - An error occurred when attempting to fetch the resource. | ||
* `stateData` - An object containing state data and other information about the request: | ||
* `requestLatency` - The time taken for headers to be received after the request was made. | ||
* `requestTime` - The total time taken for the request (including download time.) | ||
* `path` - The URL path, including the query string | ||
* `uriPath` - The URL path, excluding the query string | ||
* `depth` - How many steps simplecrawler has taken from the initial page (which | ||
is depth 1) to this resource. | ||
* `fetched` - Has the request for this item been completed? You can monitor this | ||
as requests are processed. | ||
* `status` - The internal status of the item, always a string. This can be one | ||
of: | ||
* `"queued"` - The resource is in the queue to be fetched, but nothing's | ||
happened to it yet. | ||
* `"spooled"` - A request has been made to the remote server, but we're | ||
still waiting for a response. | ||
* `"headers"` - The headers for the resource have been received. | ||
* `"downloaded"` - The item has been entirely downloaded. | ||
* `"redirected"` - The resource request returned a 300 series response, with | ||
a Location header and a new URL. | ||
* `"notfound"` - The resource could not be found. (404) | ||
* `"failed"` - An error occurred when attempting to fetch the resource. | ||
* `stateData` - An object containing state data and other information about the | ||
request: | ||
* `requestLatency` - The time taken for headers to be received after the | ||
request was made. | ||
* `requestTime` - The total time taken for the request (including download | ||
time.) | ||
* `downloadTime` - The total time taken for the resource to be downloaded. | ||
* `contentLength` - The length (in bytes) of the returned content. Calculated based on the `content-length` header. | ||
* `contentLength` - The length (in bytes) of the returned content. | ||
Calculated based on the `content-length` header. | ||
* `contentType` - The MIME type of the content. | ||
* `code` - The HTTP status code returned for the request. | ||
* `headers` - An object containing the header information returned by the server. This is the object node returns as part of the `response` object. | ||
* `actualDataSize` - The length (in bytes) of the returned content. Calculated based on what is actually received, not the `content-length` header. | ||
* `sentIncorrectSize` - True if the data length returned by the server did not match what we were told to expect by the `content-length` header. | ||
* `headers` - An object containing the header information returned by the | ||
server. This is the object node returns as part of the `response` object. | ||
* `actualDataSize` - The length (in bytes) of the returned content. | ||
Calculated based on what is actually received, not the `content-length` | ||
header. | ||
* `sentIncorrectSize` - True if the data length returned by the server did | ||
not match what we were told to expect by the `content-length` header. | ||
You can address these properties like you would any other object: | ||
As you can see, you can get a lot of meta-information out about each request. | ||
This has been put to use by providing some convenient methods for getting simple | ||
aggregate data about the queue. | ||
```js | ||
crawler.queue[52].url; | ||
queueItem.stateData.contentLength; | ||
queueItem.status === "queued"; | ||
``` | ||
### Queue statistics and reporting | ||
As you can see, you can get a lot of meta-information out about each request. The | ||
upside is, the queue actually has some convenient functions for getting simple | ||
aggregate data about the queue... | ||
#### Queue Statistics and Reporting | ||
First of all, the queue can provide some basic statistics about the network | ||
performance of your crawl (so far.) This is done live, so don't check it thirty | ||
times a second. You can test the following properties: | ||
performance of your crawl so far. This is done live, so don't check it 30 times | ||
a second. You can test the following properties: | ||
@@ -559,27 +559,37 @@ * `requestTime` | ||
And you can get the maximum, minimum, and average values for each with the | ||
You can get the maximum, minimum, and average values for each with the | ||
`crawler.queue.max`, `crawler.queue.min`, and `crawler.queue.avg` functions | ||
respectively. Like so: | ||
respectively. Like the `crawler.queue.get` method, these methods are pseudo | ||
asynchronous to support different backing stores for the queue. That means they | ||
will provide both a return value and a callback. | ||
```js | ||
console.log("The maximum request latency was %dms.", crawler.queue.max("requestLatency")); | ||
console.log("The minimum download time was %dms.", crawler.queue.min("downloadTime")); | ||
console.log("The average resource size received is %d bytes.", crawler.queue.avg("actualDataSize")); | ||
crawler.queue.max("requestLatency", function (max) { | ||
console.log("The maximum request latency was %dms.", max); | ||
}); | ||
crawler.queue.min("downloadTime", function (min) { | ||
console.log("The minimum download time was %dms.", min); | ||
}); | ||
crawler.queue.avg("actualDataSize", function (avg) { | ||
console.log("The average resource size received is %d bytes.", avg); | ||
}); | ||
``` | ||
You'll probably often need to determine how many items in the queue have a given | ||
status at any one time, and/or retrieve them. That's easy with | ||
You'll probably often need to determine how many queue items have a given status | ||
and/or retrieve them. That's easily done with the methods | ||
`crawler.queue.countWithStatus` and `crawler.queue.getWithStatus`. | ||
`crawler.queue.countWithStatus` returns the number of queued items with a given | ||
`crawler.queue.countWithStatus` provides the number of queued items with a given | ||
status, while `crawler.queue.getWithStatus` returns an array of the queue items | ||
themselves. | ||
themselves. Again, by default, these methods both return and accept callbacks. | ||
```js | ||
var redirectCount = crawler.queue.countWithStatus("redirected"); | ||
crawler.queue.countWithStatus("redirected", function (redirectCount) { | ||
console.log("The redirect count is %d", redirectCount); | ||
}); | ||
crawler.queue.getWithStatus("failed").forEach(function(queueItem) { | ||
console.log("Whoah, the request for %s failed!", queueItem.url); | ||
// do something... | ||
crawler.queue.getWithStatus("failed", function (failedItems) { | ||
failedItems.forEach(function(queueItem) { | ||
console.log("Whoah, the request for %s failed!", queueItem.url); | ||
}); | ||
}); | ||
@@ -590,31 +600,29 @@ ``` | ||
* `crawler.queue.complete` - returns the number of queue items which have been | ||
completed (marked as fetched) | ||
* `crawler.queue.errors` - returns the number of requests which have failed | ||
(404s and other 400/500 errors, as well as client errors) | ||
* `crawler.queue.complete` - provides the number of queue items which have been | ||
completed (marked as fetched). | ||
* `crawler.queue.errors` - provides the number of requests which have failed | ||
(404s and other 400/500 errors, as well as client errors). | ||
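Like the other queue methods, these are pseudo-asynchronous; a sketch using the callback form shown in the FAQ section:

```js
crawler.queue.complete(function(err, completeCount) {
    if (err) {
        throw err;
    }
    console.log("%d queue items have been fetched so far", completeCount);
});
```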
#### Saving and reloading the queue (freeze/defrost) | ||
### Saving and reloading the queue (freeze/defrost) | ||
You'll probably want to be able to save your progress and reload it later, if | ||
your application fails or you need to abort the crawl for some reason. (Perhaps | ||
you just want to finish off for the night and pick it up tomorrow!) The | ||
`crawler.queue.freeze` and `crawler.queue.defrost` functions perform this task. | ||
It can be convenient to be able to save the crawl progress and later be able to | ||
reload it if your application fails or you need to abort the crawl for some | ||
reason. The `crawler.queue.freeze` and `crawler.queue.defrost` methods will let | ||
you do this. | ||
**A word of warning though** - they are not CPU friendly as they rely on | ||
JSON.parse and JSON.stringify. Use them only when you need to save the queue - | ||
don't call them every request or your application's performance will be incredibly | ||
poor - they block like *crazy*. That said, using them when your crawler commences | ||
and stops is perfectly reasonable. | ||
**A word of warning** - they are not CPU friendly as they rely on `JSON.parse` | ||
and `JSON.stringify`. Use them only when you need to save the queue - don't call | ||
them after every request or your application's performance will be incredibly | ||
poor - they block like *crazy*. That said, using them when your crawler | ||
commences and stops is perfectly reasonable. | ||
Note that the methods themselves are asynchronous, so if you are going to exit the | ||
process after you do the freezing, make sure you wait for callback - otherwise | ||
you'll get an empty file. | ||
Note that the methods themselves are asynchronous, so if you are going to exit | ||
the process after you do the freezing, make sure you wait for callback - | ||
otherwise you'll get an empty file. | ||
```js | ||
// Freeze queue | ||
crawler.queue.freeze("mysavedqueue.json", function() { | ||
crawler.queue.freeze("mysavedqueue.json", function () { | ||
process.exit(); | ||
}); | ||
// Defrost queue | ||
crawler.queue.defrost("mysavedqueue.json"); | ||
@@ -625,53 +633,177 @@ ``` | ||
Simplecrawler now has an internal cookie jar, which collects and resends cookies | ||
automatically, and by default. | ||
simplecrawler has an internal cookie jar, which collects and resends cookies | ||
automatically and by default. If you want to turn this off, set the | ||
`crawler.acceptCookies` option to `false`. The cookie jar is accessible via | ||
`crawler.cookies`, and is an event emitter itself. | ||
If you want to turn this off, set the `crawler.acceptCookies` option to `false`. | ||
### Cookie events | ||
The cookie jar is accessible via `crawler.cookies`, and is an event emitter itself: | ||
* `addcookie` (cookie) - Fired when a new cookie is added to the jar. | ||
* `removecookie` (cookie array) - Fired when one or more cookies are removed from the jar. | ||
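For example, a minimal sketch that logs every cookie the crawler picks up:

```js
crawler.cookies.on("addcookie", function(cookie) {
    console.log("Stored a new cookie:", cookie);
});
```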
### Cookie Events | ||
## Link Discovery | ||
* `addcookie` (cookie) | ||
Fired when a new cookie is added to the jar. | ||
* `removecookie` (cookie array) | ||
Fired when one or more cookies are removed from the jar. | ||
simplecrawler's discovery function is made to be replaceable — you can | ||
easily write your own that discovers only the links you're interested in. | ||
## Contributors | ||
The method must accept a buffer and a [`queueItem`](#queue-items), and | ||
return the resources that are to be added to the queue. | ||
I'd like to extend sincere thanks to: | ||
It is quite common to pair simplecrawler with a module like | ||
[cheerio](https://npmjs.com/package/cheerio) that can correctly parse | ||
HTML and provide a DOM like API for querying — or even a whole headless | ||
browser, like phantomJS. | ||
* [Nick Crohn](https://github.com/ncrohn) for the HTTP Basic auth support, and | ||
initial cookie support. | ||
* [Mike Moulton](https://github.com/mmoulton) for | ||
[fixing a bug in the URL discovery mechanism] | ||
(https://github.com/cgiffard/node-simplecrawler/pull/3), as well as | ||
[adding the `discoverycomplete` event] | ||
(https://github.com/cgiffard/node-simplecrawler/pull/10), | ||
* [Mike Iannacone](https://github.com/mikeiannacone) for correcting a keyword | ||
naming collision with node 0.8's EventEmitter. | ||
* [Greg Molnar](https://github.com/gregmolnar) for | ||
[adding a querystring-free path parameter to parsed URL objects.] | ||
(https://github.com/cgiffard/node-simplecrawler/pull/31) | ||
* [Breck Yunits](https://github.com/breck7) for contributing a useful code | ||
sample demonstrating using simplecrawler for caching a website to disk! | ||
* [Luke Plaster](https://github.com/notatestuser) for enabling protocol-agnostic | ||
link discovery | ||
* [Zeus](https://github.com/distracteddev) for fixing a bug where [default port | ||
info was wrongly specified in requests] | ||
(https://github.com/cgiffard/node-simplecrawler/pull/40) | ||
and for fixing the missing request timeout handling! | ||
* [Graham Hutchinson](https://github.com/ghhutch) for adding | ||
querystring-stripping option | ||
* [Jellyfrog](https://github.com/jellyfrog) for assisting in diagnosing some | ||
nasty EventEmitter issues. | ||
* [Brian Moeskau](https://github.com/bmoeskau) for helping to fix the confusing | ||
'async' events API, and providing invaluable feedback. | ||
The example below demonstrates how one might achieve basic HTML-correct | ||
discovery of only link tags using cheerio. | ||
And everybody else who has helped out in some way! :) | ||
```js | ||
crawler.discoverResources = function(buffer, queueItem) { | ||
var $ = cheerio.load(buffer.toString("utf8")); | ||
## Licence | ||
return $("a[href]").map(function () { | ||
return $(this).attr("href"); | ||
}).get(); | ||
}; | ||
``` | ||
Copyright (c) 2013, Christopher Giffard. | ||
## FAQ/Troubleshooting | ||
There are a couple of questions that pop up more often than others in the issue | ||
tracker. If you're having trouble with simplecrawler, please have a look at the | ||
list below before submitting an issue. | ||
- **Q: Why does simplecrawler discover so many invalid URLs?** | ||
A: simplecrawler's built-in discovery method is purposefully naive - it's a | ||
brute force approach intended to find everything: URLs in comments, binary files, | ||
scripts, image EXIF data, inside CSS documents, and more — useful for archiving | ||
and use cases where it's better to have false positives than fail to discover a | ||
resource. | ||
It's definitely not a solution for every case, though — if you're | ||
writing a link checker or validator, you don't want erroneous 404s | ||
throwing errors. Therefore, simplecrawler allows you to tune discovery in a few | ||
key ways: | ||
- You can either add to (or remove from) the `discoverRegex` array, tweaking | ||
the search patterns to meet your requirements; or | ||
- Swap out the `discoverResources` method. Parsing HTML pages is beyond the | ||
scope of simplecrawler, but it is very common to combine simplecrawler with | ||
a module like [cheerio](https://npmjs.com/package/cheerio) for more | ||
sophisticated resource discovery. | ||
Further documentation is available in the [link discovery](#link-discovery) | ||
section. | ||
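A rough sketch of the first approach, relying on the documented contract that functions in `discoverRegex` return an array (the `data-src` attribute is only an illustration):

```js
// Illustrative only: also discover URLs found in data-src attributes
crawler.discoverRegex.push(function(string) {
    var match = /\sdata-src\s*=\s*(["'])([^"']+)\1/i.exec(String(string));
    return match ? [match[2]] : [];
});
```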
- **Q: Why did simplecrawler complete without fetching any resources?** | ||
A: When this happens, it is usually because the initial request was redirected | ||
to a different domain that wasn't in the `domainWhitelist`. | ||
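If following that initial redirect is actually what you want, the `allowInitialDomainChange` option documented in the [configuration section](#configuration) covers it:

```js
// Let the crawler switch to whatever domain the first response redirects to
crawler.allowInitialDomainChange = true;
```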
- **Q: How do I crawl a site that requires a login?** | ||
A: Logging in to a site is usually fairly simple and only requires an
exchange of credentials over HTTP as well as the storing of a cookie that
allows the client's session to be maintained between requests to the
server. Simplecrawler doesn't have a built-in method for this entire | ||
procedure, but it does have an internal cookie jar that can be used to | ||
store the cookie that's returned from a manual HTTP request. | ||
Here's an example of how to perform a manual login HTTP request with the | ||
[request](https://npmjs.com/package/request) module and then store the | ||
returned cookie in simplecrawler's cookie jar. | ||
```js | ||
var Crawler = require("simplecrawler"), | ||
request = require("request"); | ||
var crawler = new Crawler("example.com", "/"); | ||
crawler.initialProtocol = "https"; | ||
request.post("https://example.com/login", { | ||
form: { | ||
username: "iamauser", | ||
password: "supersecurepw" | ||
} | ||
}, function (error, response, body) { | ||
crawler.cookies.addFromHeaders(response.headers["set-cookie"]); | ||
crawler.start(); | ||
}); | ||
crawler.on("fetchcomplete", function (queueItem, responseBuffer, response) { | ||
console.log("Fetched", queueItem.url); | ||
}); | ||
``` | ||
- **Q: What does it mean that events are asynchronous?** | ||
A: One of the core concepts of node.js is its asynchronous nature. I/O | ||
operations (like network requests) take place outside of the main thread | ||
(which is where your code is executed). This is what makes node fast, the | ||
fact that it can continue executing code while there are multiple HTTP | ||
requests in flight, for example. But to be able to get back the result of | ||
the HTTP request, we need to register a function that will be called when | ||
the result is ready. This is what *asynchronous* means in node - the fact | ||
that code can continue executing while I/O operations are in progress - and | ||
it's the same concept as with AJAX requests in the browser. | ||
- **Q: Promises are nice, can I use them with simplecrawler?** | ||
A: No, not really. Promises are meant as a replacement for callbacks, but | ||
simplecrawler is event driven, not callback driven. Using promises to any
greater extent in simplecrawler wouldn't make much sense, since you normally | ||
need to react more than once to what happens in simplecrawler. | ||
- **Q: Something's happening and I don't see the output I'm expecting!** | ||
Before filing an issue, check to see that you're not just missing something by | ||
logging *all* crawler events with the code below: | ||
```js | ||
var originalEmit = crawler.emit; | ||
crawler.emit = function(evtName, queueItem) { | ||
crawler.queue.complete(function(err, completeCount) { | ||
if (err) { | ||
throw err; | ||
} | ||
crawler.queue.getLength(function(err, length) { | ||
if (err) { | ||
throw err; | ||
} | ||
console.log("fetched %d of %d — %d open requests, %d open listeners", | ||
completeCount, | ||
length, | ||
crawler._openRequests, | ||
crawler._openListeners); | ||
}); | ||
}); | ||
console.log(evtName, queueItem ? queueItem.url ? queueItem.url : queueItem : null); | ||
originalEmit.apply(crawler, arguments); | ||
}; | ||
``` | ||
If you don't see what you need after inserting that code block, and you still need help, | ||
please attach the output of all the events fired with your email/issue. | ||
## Current Maintainers | ||
* [Christopher Giffard](https://github.com/cgiffard) | ||
* [Fredrik Ekelund](https://github.com/fredrikekelund) | ||
* [XhmikosR](https://github.com/XhmikosR) | ||
## Contributors | ||
simplecrawler has benefited from the kind efforts of dozens of contributors, to | ||
whom we are incredibly grateful. We originally listed their individual | ||
contributions but it became pretty unwieldy - the | ||
[full list can be found here.](https://github.com/cgiffard/node-simplecrawler/graphs/contributors) | ||
## License | ||
Copyright (c) 2016, Christopher Giffard. | ||
All rights reserved. | ||
@@ -678,0 +810,0 @@ |