simplecrawler
Advanced tools
Comparing version 1.1.6 to 1.1.7
/* | ||
* Simplecrawler - FS cache backend | ||
* https://github.com/cgiffard/node-simplecrawler | ||
* https://github.com/simplecrawler/simplecrawler | ||
* | ||
@@ -139,3 +139,7 @@ * Copyright (c) 2011-2015, Christopher Giffard | ||
FSBackend.prototype.saveCache = function(callback) { | ||
fs.writeFile(this.location + "cacheindex.json", JSON.stringify(this.index), callback); | ||
if (callback) { | ||
fs.writeFile(this.location + "cacheindex.json", JSON.stringify(this.index), callback); | ||
} else { | ||
fs.writeFileSync(this.location + "cacheindex.json", JSON.stringify(this.index)); | ||
} | ||
}; | ||
@@ -142,0 +146,0 @@ |
/* | ||
* Simplecrawler - cache module | ||
* https://github.com/cgiffard/node-simplecrawler | ||
* https://github.com/simplecrawler/simplecrawler | ||
* | ||
@@ -5,0 +5,0 @@ * Copyright (c) 2011-2015, Christopher Giffard |
@@ -197,4 +197,3 @@ /** | ||
} | ||
}) | ||
.map(function(cookie) { | ||
}).map(function(cookie) { | ||
return cookie.toOutboundString(); | ||
@@ -304,3 +303,3 @@ }); | ||
string = string.replace(/^\s*set\-cookie\s*:\s*/i, ""); | ||
string = string.replace(/^\s*set-cookie\s*:\s*/i, ""); | ||
@@ -307,0 +306,0 @@ var parts = string.split(/\s*;\s*/i), |
/* | ||
* Simplecrawler - Export interfaces | ||
* https://github.com/cgiffard/node-simplecrawler | ||
* https://github.com/simplecrawler/simplecrawler | ||
* | ||
@@ -5,0 +5,0 @@ * Copyright (c) 2011-2015, Christopher Giffard |
166
lib/queue.js
@@ -57,3 +57,29 @@ /** | ||
/** | ||
* Creates a new queue | ||
* QueueItems represent resources in the queue that have been fetched, or will be eventually. | ||
* @typedef {Object} QueueItem | ||
* @property {Number} id A unique ID assigned by the queue when the queue item is added | ||
* @property {String} url The complete, canonical URL of the resource | ||
* @property {String} protocol The protocol of the resource (http, https) | ||
* @property {String} host The full domain/hostname of the resource | ||
* @property {Number} port The port of the resource | ||
* @property {String} path The URL path, including the query string | ||
* @property {String} uriPath The URL path, excluding the query string | ||
* @property {Number} depth How many steps simplecrawler has taken from the initial page (which is depth 1) to this resource. | ||
* @property {String} referrer The URL of the resource where the URL of this queue item was discovered | ||
* @property {Boolean} fetched Has the request for this item been completed? You can monitor this as requests are processed. | ||
* @property {'queued'|'spooled'|'headers'|'downloaded'|'redirected'|'notfound'|'failed'} status The internal status of the item. | ||
* @property {Object} stateData An object containing state data and other information about the request. | ||
* @property {Number} stateData.requestLatency The time (in ms) taken for headers to be received after the request was made. | ||
* @property {Number} stateData.requestTime The total time (in ms) taken for the request (including download time.) | ||
* @property {Number} stateData.downloadTime The total time (in ms) taken for the resource to be downloaded. | ||
* @property {Number} stateData.contentLength The length (in bytes) of the returned content. Calculated based on the `content-length` header. | ||
* @property {String} stateData.contentType The MIME type of the content. | ||
* @property {Number} stateData.code The HTTP status code returned for the request. Note that this code is `600` if an error occurred in the client and a fetch operation could not take place successfully. | ||
* @property {Object} stateData.headers An object containing the header information returned by the server. This is the object node returns as part of the `response` object. | ||
* @property {Number} stateData.actualDataSize The length (in bytes) of the returned content. Calculated based on what is actually received, not the `content-length` header. | ||
* @property {Boolean} stateData.sentIncorrectSize True if the data length returned by the server did not match what we were told to expect by the `content-length` header. | ||
*/ | ||
/** | ||
* FetchQueue handles {@link QueueItem}s and provides a few utility methods for querying them | ||
* @class | ||
@@ -113,13 +139,6 @@ */ | ||
/** | ||
* Called when {@link FetchQueue#add} returns a result | ||
* @callback FetchQueue~addCallback | ||
* @param {Error} [error] If the operation was successful, this will be `null`. Otherwise it will be the error that was encountered. | ||
* @param {QueueItem} [queueItem] The queue item that was added to the queue. Its status property will have changed to `"queued"`. | ||
*/ | ||
/** | ||
* Adds an item to the queue | ||
* @param {QueueItem} queueItem Queue item that is to be added to the queue | ||
* @param {Boolean} [force=false] If true, the queue item will be added regardless of whether it already exists in the queue | ||
* @param {FetchQueue~addCallback} callback | ||
* @param {QueueItem} queueItem Queue item that is to be added to the queue | ||
* @param {Boolean} [force=false] If true, the queue item will be added regardless of whether it already exists in the queue | ||
* @param {Function} callback Gets two parameters, `error` and `queueItem`. If the operation was successful, `error` will be `null` and `queueItem` will be the item that was added to the queue. Its status property will have changed to `"queued"`. | ||
*/ | ||
@@ -157,13 +176,6 @@ FetchQueue.prototype.add = function(queueItem, force, callback) { | ||
/** | ||
* Called when {@link FetchQueue#exists} returns a result | ||
* @callback FetchQueue~existsCallback | ||
* @param {Error} [error] If the operation was successful, this will be `null`. Otherwise it will be the error that was encountered. | ||
* @param {Number} [count] The number of occurrences in the queue of the provided URL. | ||
*/ | ||
/** | ||
* Checks if a URL already exists in the queue. Returns the number of occurrences | ||
* of that URL. | ||
* @param {String} url URL to check the existence of in the queue | ||
* @param {FetchQueue~existsCallback} callback | ||
* @param {String} url URL to check the existence of in the queue | ||
* @param {Function} callback Gets two parameters, `error` and `count`. If the operation was successful, `error` will be `null`. | ||
*/ | ||
@@ -179,12 +191,5 @@ FetchQueue.prototype.exists = function(url, callback) { | ||
/** | ||
* Called when {@link FetchQueue#get} returns a result | ||
* @callback FetchQueue~getCallback | ||
* @param {Error} [error] If the operation was successful, this will be `null`. Otherwise it will be the error that was encountered. | ||
* @param {QueueItem} [queueItem] The queue item found at that index in the queue. | ||
*/ | ||
/** | ||
* Get a queue item by index | ||
* @param {Number} index The index of the queue item in the queue | ||
* @param {FetchQueue~getCallback} callback | ||
* @param {Number} index The index of the queue item in the queue | ||
* @param {Function} callback Gets two parameters, `error` and `queueItem`. If the operation was successful, `error` will be `null`. | ||
*/ | ||
@@ -206,13 +211,6 @@ FetchQueue.prototype.get = function(index, callback) { | ||
/** | ||
* Called when {@link FetchQueue#update} returns a result | ||
* @callback FetchQueue~updateCallback | ||
* @param {Error} [error] If the operation was successful, this will be `null`. Otherwise it will be the error that was encountered. | ||
* @param {QueueItem} [queueItem] The updated queue item | ||
*/ | ||
/** | ||
* Updates a queue item in the queue. | ||
* @param {Number} id ID of the queue item that is to be updated | ||
* @param {Object} updates Object that will be deeply assigned (as in `Object.assign`) to the queue item. That means that nested objects will also be recursively assigned. | ||
* @param {FetchQueue~updateCallback} callback | ||
* @param {Number} id ID of the queue item that is to be updated | ||
* @param {Object} updates Object that will be deeply assigned (as in `Object.assign`) to the queue item. That means that nested objects will also be recursively assigned. | ||
* @param {Function} callback Gets two parameters, `error` and `queueItem`. If the operation was successful, `error` will be `null`. | ||
*/ | ||
@@ -239,11 +237,4 @@ FetchQueue.prototype.update = function (id, updates, callback) { | ||
/** | ||
* Called when {@link FetchQueue#oldestUnfetchedItem} returns a result | ||
* @callback FetchQueue~oldestUnfetchedItemCallback | ||
* @param {Error} [error] If the operation was successful, this will be `null`. Otherwise it will be the error that was encountered. | ||
* @param {QueueItem} [queueItem] If there are unfetched queue items left, this will be the oldest one found. If not, this will be `null`. | ||
*/ | ||
/** | ||
* Gets the first unfetched item in the queue | ||
* @param {FetchQueue~oldestUnfetchedItemCallback} callback | ||
* @param {Function} callback Gets two parameters, `error` and `queueItem`. If the operation was successful, `error` will be `null`. If there are unfetched queue items left, `queueItem` will be the oldest one found. If not, `queueItem` will be `null`. | ||
*/ | ||
@@ -268,14 +259,7 @@ FetchQueue.prototype.oldestUnfetchedItem = function(callback) { | ||
/** | ||
* Called when {@link FetchQueue#max} returns a result | ||
* @callback FetchQueue~maxCallback | ||
* @param {Error} [error] If the operation was successful, this will be `null`. Otherwise it will be the error that was encountered. | ||
* @param {Number} [max] The maximum value of the property that was initially provided | ||
*/ | ||
/** | ||
* Gets the maximum value of a stateData property from all the items in the | ||
* queue. This means you can eg. get the maximum request time, download size | ||
* etc. | ||
* @param {String} statisticName Can be any of the strings in {@link FetchQueue._allowedStatistics} | ||
* @param {FetchQueue~maxCallback} callback | ||
* @param {String} statisticName Can be any of the strings in {@link FetchQueue._allowedStatistics} | ||
* @param {Function} callback Gets two parameters, `error` and `max`. If the operation was successful, `error` will be `null`. | ||
*/ | ||
@@ -300,14 +284,8 @@ FetchQueue.prototype.max = function(statisticName, callback) { | ||
/** | ||
* Called when {@link FetchQueue#min} returns a result | ||
* @callback FetchQueue~minCallback | ||
* @param {Error} [error] If the operation was successful, this will be `null`. Otherwise it will be the error that was encountered. | ||
* @param {Number} [min] The minimum value of the property that was initially provided | ||
*/ | ||
/** | ||
* Gets the minimum value of a stateData property from all the items in the | ||
* queue. This means you can eg. get the minimum request time, download size | ||
* etc. | ||
* @param {String} statisticName Can be any of the strings in {@link FetchQueue._allowedStatistics} | ||
* @param {FetchQueue~minCallback} callback | ||
* @param {String} statisticName Can be any of the strings in {@link FetchQueue._allowedStatistics} | ||
* @param {Function} callback Gets two parameters, `error` and `min`. If the operation was successful, `error` will be `null`. | ||
*/ | ||
@@ -332,14 +310,7 @@ FetchQueue.prototype.min = function(statisticName, callback) { | ||
/** | ||
* Called when {@link FetchQueue#avg} returns a result | ||
* @callback FetchQueue~avgCallback | ||
* @param {Error} [error] If the operation was successful, this will be `null`. Otherwise it will be the error that was encountered. | ||
* @param {Number} [avg] The average value of the property that was initially provided | ||
*/ | ||
/** | ||
* Gets the average value of a stateData property from all the items in the | ||
* queue. This means you can eg. get the average request time, download size | ||
* etc. | ||
* @param {String} statisticName Can be any of the strings in {@link FetchQueue._allowedStatistics} | ||
* @param {FetchQueue~avgCallback} callback | ||
* @param {String} statisticName Can be any of the strings in {@link FetchQueue._allowedStatistics} | ||
* @param {Function} callback Gets two parameters, `error` and `avg`. If the operation was successful, `error` will be `null`. | ||
*/ | ||
@@ -366,12 +337,6 @@ FetchQueue.prototype.avg = function(statisticName, callback) { | ||
/** | ||
* Called when {@link FetchQueue#countItems} returns a result | ||
* @callback FetchQueue~countItemsCallback | ||
* @param {Error} [error] If the operation was successful, this will be `null`. Otherwise it will be the error that was encountered. | ||
* @param {Number} [count] The number of items that matched the provided selector | ||
*/ | ||
/** | ||
* Counts the items in the queue that match a selector | ||
* @param {Object} comparator Comparator object used to filter items. Queue items that are counted need to match all the properties of this object. | ||
* @param {FetchQueue~countItemsCallback} callback | ||
* @param {Function} callback Gets two parameters, `error` and `count`. If the operation was successful, `error` will be `null` and `count` will be the number of items that matched the provided selector. | ||
*/ | ||
@@ -389,12 +354,5 @@ FetchQueue.prototype.countItems = function(comparator, callback) { | ||
/** | ||
* Called when {@link FetchQueue#filterItems} returns a result | ||
* @callback FetchQueue~filterItemsCallback | ||
* @param {Error} [error] If the operation was successful, this will be `null`. Otherwise it will be the error that was encountered. | ||
* @param {QueueItem[]} [items] The items that matched the provided selector | ||
*/ | ||
/** | ||
* Filters and returns the items in the queue that match a selector | ||
* @param {Object} comparator Comparator object used to filter items. Queue items that are returned need to match all the properties of this object. | ||
* @param {FetchQueue~filterItemsCallback} callback | ||
* @param {Object} comparator Comparator object used to filter items. Queue items that are returned need to match all the properties of this object. | ||
* @param {Function} callback Gets two parameters, `error` and `items`. If the operation was successful, `error` will be `null` and `items` will be an array of QueueItems. | ||
*/ | ||
@@ -410,11 +368,5 @@ FetchQueue.prototype.filterItems = function(comparator, callback) { | ||
/** | ||
* Called when {@link FetchQueue#getLength} returns a result | ||
* @callback FetchQueue~getLengthCallback | ||
* @param {Error} [error] If the operation was successful, this will be `null`. Otherwise it will be the error that was encountered. | ||
* @param {Number} [items] The total number of items in the queue | ||
*/ | ||
/** | ||
* Gets the total number of queue items in the queue | ||
* @param {FetchQueue~getLengthCallback} callback | ||
* @param {Function} callback Gets two parameters, `error` and `length`. If the operation was successful, `error` will be `null`. | ||
*/ | ||
@@ -426,12 +378,6 @@ FetchQueue.prototype.getLength = function(callback) { | ||
/** | ||
* Called when {@link FetchQueue#freeze} returns a result | ||
* @callback FetchQueue~freezeCallback | ||
* @param {Error} [error] If the operation was successful, this will be `null`. Otherwise it will be the error that was encountered. | ||
*/ | ||
/** | ||
* Writes the queue to disk in a JSON file. This file can later be imported | ||
* using {@link FetchQueue#defrost} | ||
* @param {String} filename Filename passed directly to [fs.writeFile]{@link https://nodejs.org/api/fs.html#fs_fs_writefile_file_data_options_callback} | ||
* @param {FetchQueue~freezeCallback} callback | ||
* @param {String} filename Filename passed directly to [fs.writeFile]{@link https://nodejs.org/api/fs.html#fs_fs_writefile_file_data_options_callback} | ||
* @param {Function} callback Gets a single `error` parameter. If the operation was successful, this parameter will be `null`. | ||
*/ | ||
@@ -454,11 +400,5 @@ FetchQueue.prototype.freeze = function(filename, callback) { | ||
/** | ||
* Called when {@link FetchQueue#defrost} returns a result | ||
* @callback FetchQueue~defrostCallback | ||
* @param {Error} [error] If the operation was successful, this will be `null`. Otherwise it will be the error that was encountered. | ||
*/ | ||
/** | ||
* Import the queue from a frozen JSON file on disk. | ||
* @param {String} filename Filename passed directly to [fs.readFile]{@link https://nodejs.org/api/fs.html#fs_fs_readfile_file_options_callback} | ||
* @param {FetchQueue~defrostCallback} callback | ||
* @param {String} filename Filename passed directly to [fs.readFile]{@link https://nodejs.org/api/fs.html#fs_fs_readfile_file_options_callback} | ||
* @param {Function} callback Gets a single `error` parameter. If the operation was successful, this parameter will be `null`. | ||
*/ | ||
@@ -465,0 +405,0 @@ FetchQueue.prototype.defrost = function(filename, callback) { |
{ | ||
"name": "simplecrawler", | ||
"description": "Very straightforward, event driven web crawler. Features a flexible queue interface and a basic cache mechanism with extensible backend.", | ||
"version": "1.1.6", | ||
"homepage": "https://github.com/cgiffard/node-simplecrawler", | ||
"version": "1.1.7", | ||
"homepage": "https://github.com/simplecrawler/simplecrawler", | ||
"author": "Christopher Giffard <christopher.giffard@cgiffard.com>", | ||
@@ -14,6 +14,6 @@ "maintainers": [ | ||
"type": "git", | ||
"url": "git+https://github.com/cgiffard/node-simplecrawler.git" | ||
"url": "git+https://github.com/simplecrawler/simplecrawler.git" | ||
}, | ||
"bugs": { | ||
"url": "https://github.com/cgiffard/node-simplecrawler/issues" | ||
"url": "https://github.com/simplecrawler/simplecrawler/issues" | ||
}, | ||
@@ -31,5 +31,5 @@ "keywords": [ | ||
"lint": "eslint example/ lib/ test/", | ||
"mocha": "mocha -R spec -t 5000", | ||
"mocha": "mocha -t 5000 --exit", | ||
"test": "npm run lint && npm run mocha", | ||
"docs": "jsdoc -c jsdoc.json" | ||
"docs": "jsdoc2md -t jsdoc2md/README.hbs --partial jsdoc2md/*.hbs --files lib/*.js > README.md" | ||
}, | ||
@@ -41,15 +41,15 @@ "bin": { | ||
"dependencies": { | ||
"async": "^2.1.4", | ||
"iconv-lite": "^0.4.13", | ||
"robots-parser": "^1.0.0", | ||
"urijs": "^1.18.11" | ||
"async": "^2.6.2", | ||
"iconv-lite": "^0.4.24", | ||
"robots-parser": "^2.1.1", | ||
"urijs": "^1.19.1" | ||
}, | ||
"devDependencies": { | ||
"chai": "^3.2.0", | ||
"eslint": "^2.0.0", | ||
"jsdoc": "^3.4.0", | ||
"mocha": "^3.0.0" | ||
"chai": "^4.2.0", | ||
"eslint": "^5.16.0", | ||
"jsdoc-to-markdown": "^4.0.1", | ||
"mocha": "^6.1.1" | ||
}, | ||
"engines": { | ||
"node": ">=0.10.0" | ||
"node": ">=6.13.0" | ||
}, | ||
@@ -56,0 +56,0 @@ "files": [ |
1381
README.md
# Simple web crawler for node.js | ||
[![NPM version](https://img.shields.io/npm/v/simplecrawler.svg)](https://www.npmjs.com/package/simplecrawler) | ||
[![Linux Build Status](https://img.shields.io/travis/cgiffard/node-simplecrawler/master.svg)](https://travis-ci.org/cgiffard/node-simplecrawler) | ||
[![Windows Build Status](https://img.shields.io/appveyor/ci/cgiffard/node-simplecrawler/master.svg?label=Windows%20build)](https://ci.appveyor.com/project/cgiffard/node-simplecrawler/branch/master) | ||
[![Dependency Status](https://img.shields.io/david/cgiffard/node-simplecrawler.svg)](https://david-dm.org/cgiffard/node-simplecrawler) | ||
[![devDependency Status](https://img.shields.io/david/dev/cgiffard/node-simplecrawler.svg)](https://david-dm.org/cgiffard/node-simplecrawler#info=devDependencies) | ||
[![Linux Build Status](https://img.shields.io/travis/simplecrawler/simplecrawler/master.svg)](https://travis-ci.org/simplecrawler/simplecrawler) | ||
[![Windows Build Status](https://img.shields.io/appveyor/ci/fredrikekelund/simplecrawler.svg?label=Windows%20build)](https://ci.appveyor.com/project/fredrikekelund/simplecrawler/branch/master) | ||
[![Dependency Status](https://img.shields.io/david/simplecrawler/simplecrawler.svg)](https://david-dm.org/simplecrawler/simplecrawler) | ||
[![devDependency Status](https://img.shields.io/david/dev/simplecrawler/simplecrawler.svg)](https://david-dm.org/simplecrawler/simplecrawler?type=dev) | ||
[![Greenkeeper badge](https://badges.greenkeeper.io/simplecrawler/simplecrawler.svg)](https://greenkeeper.io/) | ||
simplecrawler is designed to provide a basic, flexible and robust API for | ||
crawling websites. It was written to archive, analyse, and search some | ||
very large websites and has happily chewed through hundreds of thousands of | ||
pages and written tens of gigabytes to disk without issue. | ||
simplecrawler is designed to provide a basic, flexible and robust API for crawling websites. It was written to archive, analyse, and search some very large websites and has happily chewed through hundreds of thousands of pages and written tens of gigabytes to disk without issue. | ||
@@ -18,9 +16,7 @@ ## What does simplecrawler do? | ||
* Extremely configurable base for writing your own crawler | ||
* Provides some simple logic for auto-detecting linked resources - which you can | ||
replace or augment | ||
* Provides some simple logic for auto-detecting linked resources - which you can replace or augment | ||
* Automatically respects any robots.txt rules | ||
* Has a flexible queue system which can be frozen to disk and defrosted | ||
* Provides basic statistics on network performance | ||
* Uses buffers for fetching and managing data, preserving binary data (except | ||
when discovering links) | ||
* Uses buffers for fetching and managing data, preserving binary data (except when discovering links) | ||
@@ -60,10 +56,5 @@ ## Documentation | ||
Initializing simplecrawler is a simple process. First, you require the module | ||
and instantiate it with a single argument. You then configure the properties you | ||
like (eg. the request interval), register a few event listeners, and call the | ||
start method. Let's walk through the process! | ||
Initializing simplecrawler is a simple process. First, you require the module and instantiate it with a single argument. You then configure the properties you like (eg. the request interval), register a few event listeners, and call the start method. Let's walk through the process! | ||
After requiring the crawler, we create a new instance of it. We supply the | ||
constructor with a URL that indicates which domain to crawl and which resource | ||
to fetch first. | ||
After requiring the crawler, we create a new instance of it. We supply the constructor with a URL that indicates which domain to crawl and which resource to fetch first. | ||
@@ -76,4 +67,3 @@ ```js | ||
You can initialize the crawler with or without the `new` operator. Being able to | ||
skip it comes in handy when you want to chain API calls. | ||
You can initialize the crawler with or without the `new` operator. Being able to skip it comes in handy when you want to chain API calls. | ||
@@ -87,10 +77,5 @@ ```js | ||
By default, the crawler will only fetch resources on the same domain as that in | ||
the URL passed to the constructor. But this can be changed through the | ||
`crawler.domainWhitelist` property. | ||
By default, the crawler will only fetch resources on the same domain as that in the URL passed to the constructor. But this can be changed through the <code><a href="#Crawler+domainWhitelist">crawler.domainWhitelist</a></code> property. | ||
Now, let's configure some more things before we start crawling. Of course, | ||
you're probably wanting to ensure you don't take down your web server. Decrease | ||
the concurrency from five simultaneous requests - and increase the request | ||
interval from the default 250 ms like this: | ||
Now, let's configure some more things before we start crawling. Of course, you're probably wanting to ensure you don't take down your web server. Decrease the concurrency from five simultaneous requests - and increase the request interval from the default 250 ms like this: | ||
@@ -112,7 +97,5 @@ ```js | ||
For a full list of configurable properties, see the | ||
[configuration section](#configuration). | ||
For a full list of configurable properties, see the [configuration section](#configuration). | ||
You'll also need to set up event listeners for the [events](#events) you want to | ||
listen to. `fetchcomplete` and `complete` are a good place to start. | ||
You'll also need to set up event listeners for the [events](#events) you want to listen to. <code>crawler.fetchcomplete</code> and <code>crawler.complete</code> are good places to start. | ||
@@ -126,5 +109,3 @@ ```js | ||
Then, when you're satisfied and ready to go, start the crawler! It'll run | ||
through its queue finding linked resources on the domain to download, until it | ||
can't find any more. | ||
Then, when you're satisfied and ready to go, start the crawler! It'll run through its queue finding linked resources on the domain to download, until it can't find any more. | ||
@@ -137,100 +118,297 @@ ```js | ||
simplecrawler's API is event driven, and there are plenty of events emitted | ||
during the different stages of the crawl. Arguments passed to events are written | ||
in parentheses. | ||
simplecrawler's API is event driven, and there are plenty of events emitted during the different stages of the crawl. | ||
* `crawlstart` - | ||
Fired when the crawl begins or is restarted. | ||
* `queueadd` (queueItem, referrerQueueItem) - | ||
Fired when a new item is added to the queue. | ||
* `queueduplicate` (URLData) - | ||
Fired when an item cannot be added to the queue because it is already | ||
present in the queue. Frequent firing of this event is normal and expected. | ||
* `queueerror` (error, URLData) - | ||
Fired when an item cannot be added to the queue due to an error. | ||
* `robotstxterror` (error) - | ||
Fired when robots.txt couldn't be fetched. | ||
* `invaliddomain` (queueItem) - | ||
Fired when a resource wasn't queued because it had an invalid domain. See | ||
`crawler.filterByDomain`, `crawler.ignoreWWWDomain`, | ||
`crawler.scanSubdomains` and `crawler.domainWhitelist` for different ways to | ||
configure which domains are considered valid. | ||
* `fetchdisallowed` (queueItem) - | ||
Fired when a resource wasn't queued because of robots.txt rules. See | ||
`respectRobotsTxt` option. | ||
* `fetchprevented` (queueItem) - | ||
Fired when a resource wasn't queued because of a [fetch | ||
condition](#fetch-conditions). | ||
* `fetchconditionerror` (queueItem, error) - | ||
Fired when one of the fetch conditions returns an error. Provides the queue | ||
item that was processed when the error was encountered as well as the error | ||
itself. | ||
* `downloadconditionerror` (queueItem, error) - | ||
Fired when one of the download conditions returns an error. Provides the | ||
queue item that was processed when the error was encountered as well as the | ||
error itself. | ||
* `fetchstart` (queueItem, requestOptions) - | ||
Fired when an item is spooled for fetching. If your event handler is | ||
synchronous, you can modify the crawler request options (including headers | ||
and request method.) | ||
* `fetchheaders` (queueItem, responseObject) - | ||
Fired when the headers for a resource are received from the server. The node | ||
`http` response object is returned for your perusal. | ||
* `cookieerror` (queueItem, error, setCookieHeader) - | ||
Fired when an error was caught trying to add a cookie to the cookie jar. | ||
`setCookieHeader` is the Set-Cookie header that was provided in the HTTP | ||
response. | ||
* `fetchredirect` (oldQueueItem, redirectQueueItem, responseObject) - | ||
Fired when a redirect header is encountered. The new URL is processed and | ||
passed as `redirectQueueItem`. | ||
* `fetch404` (queueItem, responseObject) - | ||
Fired when a 404 HTTP status code is returned for a request. | ||
* `fetch410` (queueItem, responseObject) - | ||
Fired when a 410 HTTP status code is returned for a request. | ||
* `fetchdataerror` (queueItem, responseObject) - | ||
Fired when a resource can't be downloaded, because it exceeds the maximum | ||
size we're prepared to receive (16MB by default.) | ||
* `fetchtimeout` (queueItem, crawlerTimeoutValue) - | ||
Fired when a request time exceeds the internal crawler threshold. | ||
* `fetchcomplete` (queueItem, responseBody, responseObject) - | ||
Fired after a resource has been completely downloaded and the server | ||
returned an HTTP status code between 200 and 300. The response body is | ||
provided as a Buffer per default, unless `decodeResponses` is truthy, in | ||
which case it's a decoded string representation of the body. | ||
* `fetcherror` (queueItem, responseObject) - | ||
Fired when an alternate 400 or 500 series HTTP status code is returned for a | ||
request. | ||
* `gziperror` (queueItem, error, responseBuffer) - | ||
Fired when a gzipped resource cannot be unzipped. | ||
* `fetchclienterror` (queueItem, error) - | ||
Fired when a request dies locally for some reason. The error data is | ||
returned as the second parameter. | ||
* `discoverycomplete` (queueItem, resources) - | ||
Fired when linked resources have been discovered. Passes an array of | ||
resources (as URLs) as the second parameter. | ||
* `complete` - | ||
Fired when the crawler completes processing all the items in its queue, and | ||
does not find any more to add. This event returns no arguments. | ||
<a name="Crawler+event_crawlstart"></a> | ||
#### "crawlstart" | ||
Fired when the crawl starts. This event gives you the opportunity to | ||
adjust the crawler's configuration, since the crawl won't actually start | ||
until the next processor tick. | ||
<a name="Crawler+event_discoverycomplete"></a> | ||
#### "discoverycomplete" (queueItem, resources) | ||
Fired when the discovery of linked resources has completed | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| queueItem | [<code>QueueItem</code>](#QueueItem) | The queue item that represents the document for the discovered resources | | ||
| resources | <code>Array</code> | An array of discovered and cleaned URLs | | ||
<a name="Crawler+event_invaliddomain"></a> | ||
#### "invaliddomain" (queueItem) | ||
Fired when a resource wasn't queued because of an invalid domain name | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| queueItem | [<code>QueueItem</code>](#QueueItem) | The queue item representing the disallowed URL | | ||
<a name="Crawler+event_fetchdisallowed"></a> | ||
#### "fetchdisallowed" (queueItem) | ||
Fired when a resource wasn't queued because it was disallowed by the | ||
site's robots.txt rules | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| queueItem | [<code>QueueItem</code>](#QueueItem) | The queue item representing the disallowed URL | | ||
<a name="Crawler+event_fetchconditionerror"></a> | ||
#### "fetchconditionerror" (queueItem, error) | ||
Fired when a fetch condition returns an error | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| queueItem | [<code>QueueItem</code>](#QueueItem) | The queue item that was processed when the error was encountered | | ||
| error | <code>\*</code> | | | ||
<a name="Crawler+event_fetchprevented"></a> | ||
#### "fetchprevented" (queueItem, fetchCondition) | ||
Fired when a fetch condition prevented the queueing of a URL | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| queueItem | [<code>QueueItem</code>](#QueueItem) | The queue item that didn't pass the fetch conditions | | ||
| fetchCondition | <code>function</code> | The first fetch condition that returned false | | ||
<a name="Crawler+event_queueduplicate"></a> | ||
#### "queueduplicate" (queueItem) | ||
Fired when a new queue item was rejected because another | ||
queue item with the same URL was already in the queue | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| queueItem | [<code>QueueItem</code>](#QueueItem) | The queue item that was rejected | | ||
<a name="Crawler+event_queueerror"></a> | ||
#### "queueerror" (error, queueItem) | ||
Fired when an error was encountered while updating a queue item | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| error | <code>Error</code> | The error that was returned by the queue | | ||
| queueItem | [<code>QueueItem</code>](#QueueItem) | The queue item that the crawler tried to update when it encountered the error | | ||
<a name="Crawler+event_queueadd"></a> | ||
#### "queueadd" (queueItem, referrer) | ||
Fired when an item was added to the crawler's queue | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| queueItem | [<code>QueueItem</code>](#QueueItem) | The queue item that was added to the queue | | ||
| referrer | [<code>QueueItem</code>](#QueueItem) | The queue item representing the resource where the new queue item was found | | ||
<a name="Crawler+event_fetchtimeout"></a> | ||
#### "fetchtimeout" (queueItem, timeout) | ||
Fired when a request times out | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| queueItem | [<code>QueueItem</code>](#QueueItem) | The queue item for which the request timed out | | ||
| timeout | <code>Number</code> | The delay in milliseconds after which the request timed out | | ||
<a name="Crawler+event_fetchclienterror"></a> | ||
#### "fetchclienterror" (queueItem, error) | ||
Fired when a request encounters an unknown error | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| queueItem | [<code>QueueItem</code>](#QueueItem) | The queue item for which the request has errored | | ||
| error | <code>Object</code> | The error supplied to the `error` event on the request | | ||
<a name="Crawler+event_fetchstart"></a> | ||
#### "fetchstart" (queueItem, requestOptions) | ||
Fired just after a request has been initiated | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| queueItem | [<code>QueueItem</code>](#QueueItem) | The queue item for which the request has been initiated | | ||
| requestOptions | <code>Object</code> | The options generated for the HTTP request | | ||
<a name="Crawler+event_cookieerror"></a> | ||
#### "cookieerror" (queueItem, error, cookie) | ||
Fired when an error was encountered while trying to add a | ||
cookie to the cookie jar | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| queueItem | [<code>QueueItem</code>](#QueueItem) | The queue item representing the resource that returned the cookie | | ||
| error | <code>Error</code> | The error that was encountered | | ||
| cookie | <code>String</code> | The Set-Cookie header value that was returned from the request | | ||
<a name="Crawler+event_fetchheaders"></a> | ||
#### "fetchheaders" (queueItem, response) | ||
Fired when the headers for a request have been received | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| queueItem | [<code>QueueItem</code>](#QueueItem) | The queue item for which the headers have been received | | ||
| response | <code>http.IncomingMessage</code> | The [http.IncomingMessage](https://nodejs.org/api/http.html#http_class_http_incomingmessage) for the request's response | | ||
<a name="Crawler+event_downloadconditionerror"></a> | ||
#### "downloadconditionerror" (queueItem, error) | ||
Fired when a download condition returns an error | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| queueItem | [<code>QueueItem</code>](#QueueItem) | The queue item that was processed when the error was encountered | | ||
| error | <code>\*</code> | | | ||
<a name="Crawler+event_downloadprevented"></a> | ||
#### "downloadprevented" (queueItem, response) | ||
Fired when the downloading of a resource was prevented | ||
by a download condition | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| queueItem | [<code>QueueItem</code>](#QueueItem) | The queue item representing the resource that was halfway fetched | | ||
| response | <code>http.IncomingMessage</code> | The [http.IncomingMessage](https://nodejs.org/api/http.html#http_class_http_incomingmessage) for the request's response | | ||
<a name="Crawler+event_notmodified"></a> | ||
#### "notmodified" (queueItem, response, cacheObject) | ||
Fired when the crawler's cache was enabled and the server responded with a 304 Not Modified status for the request | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| queueItem | [<code>QueueItem</code>](#QueueItem) | The queue item for which the request returned a 304 status | | ||
| response | <code>http.IncomingMessage</code> | The [http.IncomingMessage](https://nodejs.org/api/http.html#http_class_http_incomingmessage) for the request's response | | ||
| cacheObject | <code>CacheObject</code> | The CacheObject returned from the cache backend | | ||
<a name="Crawler+event_fetchredirect"></a> | ||
#### "fetchredirect" (queueItem, redirectQueueItem, response) | ||
Fired when the server returned a redirect HTTP status for the request | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| queueItem | [<code>QueueItem</code>](#QueueItem) | The queue item for which the request was redirected | | ||
| redirectQueueItem | [<code>QueueItem</code>](#QueueItem) | The queue item for the redirect target resource | | ||
| response | <code>http.IncomingMessage</code> | The [http.IncomingMessage](https://nodejs.org/api/http.html#http_class_http_incomingmessage) for the request's response | | ||
<a name="Crawler+event_fetch404"></a> | ||
#### "fetch404" (queueItem, response) | ||
Fired when the server returned a 404 Not Found status for the request | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| queueItem | [<code>QueueItem</code>](#QueueItem) | The queue item for which the request returned a 404 status | | ||
| response | <code>http.IncomingMessage</code> | The [http.IncomingMessage](https://nodejs.org/api/http.html#http_class_http_incomingmessage) for the request's response | | ||
<a name="Crawler+event_fetch410"></a> | ||
#### "fetch410" (queueItem, response) | ||
Fired when the server returned a 410 Gone status for the request | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| queueItem | [<code>QueueItem</code>](#QueueItem) | The queue item for which the request returned a 410 status | | ||
| response | <code>http.IncomingMessage</code> | The [http.IncomingMessage](https://nodejs.org/api/http.html#http_class_http_incomingmessage) for the request's response | | ||
<a name="Crawler+event_fetcherror"></a> | ||
#### "fetcherror" (queueItem, response) | ||
Fired when the server returned a status code above 400 that isn't 404 or 410 | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| queueItem | [<code>QueueItem</code>](#QueueItem) | The queue item for which the request failed | | ||
| response | <code>http.IncomingMessage</code> | The [http.IncomingMessage](https://nodejs.org/api/http.html#http_class_http_incomingmessage) for the request's response | | ||
<a name="Crawler+event_fetchcomplete"></a> | ||
#### "fetchcomplete" (queueItem, responseBody, response) | ||
Fired when the request has completed | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| queueItem | [<code>QueueItem</code>](#QueueItem) | The queue item for which the request has completed | | ||
| responseBody | <code>String</code> \| <code>Buffer</code> | If [decodeResponses](#Crawler+decodeResponses) is true, this will be the decoded HTTP response. Otherwise it will be the raw response buffer. | | ||
| response | <code>http.IncomingMessage</code> | The [http.IncomingMessage](https://nodejs.org/api/http.html#http_class_http_incomingmessage) for the request's response | | ||
<a name="Crawler+event_gziperror"></a> | ||
#### "gziperror" (queueItem, responseBody, response) | ||
Fired when an error was encountered while unzipping the response data | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| queueItem | [<code>QueueItem</code>](#QueueItem) | The queue item for which the unzipping failed | | ||
| responseBody | <code>String</code> \| <code>Buffer</code> | If [decodeResponses](#Crawler+decodeResponses) is true, this will be the decoded HTTP response. Otherwise it will be the raw response buffer. | | ||
| response | <code>http.IncomingMessage</code> | The [http.IncomingMessage](https://nodejs.org/api/http.html#http_class_http_incomingmessage) for the request's response | | ||
<a name="Crawler+event_fetchdataerror"></a> | ||
#### "fetchdataerror" (queueItem, response) | ||
Fired when a resource couldn't be downloaded because it exceeded the maximum allowed size | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| queueItem | [<code>QueueItem</code>](#QueueItem) | The queue item for which the request failed | | ||
| response | <code>http.IncomingMessage</code> | The [http.IncomingMessage](https://nodejs.org/api/http.html#http_class_http_incomingmessage) for the request's response | | ||
<a name="Crawler+event_robotstxterror"></a> | ||
#### "robotstxterror" (error) | ||
Fired when an error was encountered while retrieving a robots.txt file | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| error | <code>Error</code> | The error returned from [getRobotsTxt](#Crawler+getRobotsTxt) | | ||
<a name="Crawler+event_complete"></a> | ||
#### "complete" | ||
Fired when the crawl has completed - all resources in the queue have been dealt with | ||
### A note about HTTP error conditions | ||
By default, simplecrawler does not download the response body when it encounters | ||
an HTTP error status in the response. If you need this information, you can listen | ||
to simplecrawler's error events, and through node's native `data` event | ||
(`response.on("data",function(chunk) {...})`) you can save the information yourself. | ||
By default, simplecrawler does not download the response body when it encounters an HTTP error status in the response. If you need this information, you can listen to simplecrawler's error events, and through node's native `data` event (`response.on("data",function(chunk) {...})`) you can save the information yourself. | ||
### Waiting for asynchronous event listeners | ||
Sometimes, you might want simplecrawler to wait for you while you | ||
perform some asynchronous tasks in an event listener, instead of having it | ||
racing off and firing the `complete` event, halting your crawl. For example, | ||
if you're doing your own link discovery using an asynchronous library method. | ||
Sometimes, you might want simplecrawler to wait for you while you perform some asynchronous tasks in an event listener, instead of having it racing off and firing the `complete` event, halting your crawl. For example, if you're doing your own link discovery using an asynchronous library method. | ||
simplecrawler provides a `wait` method you can call at any time. It is available | ||
via `this` from inside listeners, and on the crawler object itself. It returns | ||
a callback function. | ||
simplecrawler provides a `wait` method you can call at any time. It is available via `this` from inside listeners, and on the crawler object itself. It returns a callback function. | ||
Once you've called this method, simplecrawler will not fire the `complete` event | ||
until either you execute the callback it returns, or a timeout is reached | ||
(configured in `crawler.listenerTTL`, by default 10000 ms.) | ||
Once you've called this method, simplecrawler will not fire the `complete` event until either you execute the callback it returns, or a timeout is reached (configured in `crawler.listenerTTL`, by default 10000 ms.) | ||
@@ -242,4 +420,8 @@ #### Example asynchronous event listener | ||
var continue = this.wait(); | ||
doSomeDiscovery(data, function(foundURLs) { | ||
foundURLs.forEach(crawler.queueURL.bind(crawler)); | ||
foundURLs.forEach(function(url) { | ||
crawler.queueURL(url, queueItem); | ||
}); | ||
continue(); | ||
@@ -252,366 +434,441 @@ }); | ||
simplecrawler is highly configurable and there's a long list of settings you can | ||
change to adapt it to your specific needs. | ||
simplecrawler is highly configurable and there's a long list of settings you can change to adapt it to your specific needs. | ||
* `crawler.host` - | ||
The domain to scan. By default, simplecrawler will restrict all requests to | ||
this domain. | ||
* `crawler.interval=250` - | ||
The interval with which the crawler will spool up new requests (one per | ||
tick). | ||
* `crawler.maxConcurrency=5` - | ||
The maximum number of requests the crawler will run simultaneously. Defaults | ||
to 5 - the default number of http agents node will run. | ||
* `crawler.timeout=300000` - | ||
The maximum time in milliseconds the crawler will wait for headers before | ||
aborting the request. | ||
* `crawler.listenerTTL=10000` - | ||
The maximum time in milliseconds the crawler will wait for async listeners. | ||
* `crawler.userAgent="Node/simplecrawler <version> (https://github.com/cgiffard/node-simplecrawler)"` - | ||
The user agent the crawler will report. | ||
* `crawler.decompressResponses=true` - | ||
Response bodies that are compressed will be automatically decompressed | ||
before they're emitted in the `fetchcomplete` event. Even if this is falsy, | ||
compressed responses will be decompressed before they're passed to the | ||
`discoverResources` method. | ||
* `crawler.decodeResponses=false` - | ||
Response bodies will be intelligently character converted to standard | ||
JavaScript strings using the | ||
[iconv-lite](https://www.npmjs.com/package/iconv-lite) module. The character | ||
encoding is interpreted from the Content-Type header firstly, and secondly | ||
from any `<meta charset="xxx" />` tags. | ||
* `crawler.respectRobotsTxt=true` - | ||
Controls whether the crawler should respect rules in robots.txt (if such a | ||
file is present). The | ||
[robots-parser](https://www.npmjs.com/package/robots-parser) module is used | ||
to do the actual parsing. This property will also make the default | ||
`crawler.discoverResources` method respect | ||
`<meta name="robots" content="nofollow">` tags - meaning that no resources | ||
will be extracted from pages that include such a tag. | ||
* `crawler.queue` - | ||
The queue in use by the crawler (Must implement the `FetchQueue` interface) | ||
* `crawler.allowInitialDomainChange=false` - | ||
If the response for the initial URL is a redirect to another domain (e.g. | ||
from github.net to github.com), update `crawler.host` to continue the | ||
crawling on that domain. | ||
* `crawler.filterByDomain=true` - | ||
Specifies whether the crawler will restrict queued requests to a given | ||
domain/domains. | ||
* `crawler.scanSubdomains=false` - | ||
Enables scanning subdomains (other than www) as well as the specified | ||
domain. | ||
* `crawler.ignoreWWWDomain=true` - | ||
Treats the `www` domain the same as the originally specified domain. | ||
* `crawler.stripWWWDomain=false` - | ||
Or go even further and strip WWW subdomain from requests altogether! | ||
* `crawler.stripQuerystring=false` - | ||
Specify to strip querystring parameters from URL's. | ||
* `crawler.sortQueryParameters=false` - | ||
Specify to sort the querystring parameters before queueing URL's. This is | ||
to canonicalize URLs so that foo?a=1&b=2 is considered same as foo?b=2&a=1. | ||
* `crawler.discoverResources` - | ||
simplecrawler's default resource discovery function - | ||
which, given a buffer containing a resource, returns an array of URLs. | ||
For more details about link discovery, see [Link Discovery](#link-discovery) | ||
* `crawler.discoverRegex` - | ||
Array of regular expressions and functions that simplecrawler uses to | ||
discover resources. Functions in this array are expected to return an array. | ||
*Only applicable if the default `discoverResources` function is used.* | ||
* `crawler.parseHTMLComments=true` - | ||
Whether to scan for URL's inside HTML comments. *Only applicable if the | ||
default `discoverResources` function is used.* | ||
* `crawler.parseScriptTags=true` - | ||
Whether to scan for URL's inside script tags. *Only applicable if the | ||
default `discoverResources` function is used.* | ||
* `crawler.cache` - | ||
Specify a cache architecture to use when crawling. Must implement | ||
`SimpleCache` interface. You can save the site to disk using the built in | ||
file system cache like this: | ||
<a name="Crawler+initialURL"></a> | ||
```js | ||
crawler.cache = new Crawler.cache('pathToCacheDirectory'); | ||
``` | ||
#### crawler.initialURL : <code>String</code> | ||
Controls which URL to request first | ||
* `crawler.useProxy=false` - | ||
The crawler should use an HTTP proxy to make its requests. | ||
* `crawler.proxyHostname="127.0.0.1"` - | ||
The hostname of the proxy to use for requests. | ||
* `crawler.proxyPort=8123` - | ||
The port of the proxy to use for requests. | ||
* `crawler.proxyUser=null` - | ||
The username for HTTP/Basic proxy authentication (leave unset for | ||
unauthenticated proxies.) | ||
* `crawler.proxyPass=null` - | ||
The password for HTTP/Basic proxy authentication (leave unset for | ||
unauthenticated proxies.) | ||
* `crawler.domainWhitelist` - | ||
An array of domains the crawler is permitted to crawl from. If other | ||
settings are more permissive, they will override this setting. | ||
* `crawler.allowedProtocols` - | ||
An array of RegExp objects used to determine whether a URL protocol is | ||
supported. This is to deal with nonstandard protocol handlers that regular | ||
HTTP is sometimes given, like `feed:`. It does not provide support for | ||
non-http protocols (and why would it!?) | ||
* `crawler.maxResourceSize=16777216` - | ||
The maximum resource size that will be downloaded, in bytes. Defaults to | ||
16MB. | ||
* `crawler.supportedMimeTypes` - | ||
An array of RegExp objects and/or strings used to determine what MIME types | ||
simplecrawler should look for resources in. If `crawler.downloadUnsupported` | ||
is false, this also restricts what resources are downloaded. | ||
* `crawler.downloadUnsupported=true` - | ||
simplecrawler will download files it can't parse (determined by | ||
`crawler.supportedMimeTypes`). Defaults to true, but if you'd rather save | ||
the RAM and GC lag, switch it off. When false, it closes sockets for | ||
unsupported resources. | ||
* `crawler.needsAuth=false` - | ||
Flag to specify if the domain you are hitting requires basic authentication. | ||
* `crawler.authUser=""` - | ||
Username provided for `needsAuth` flag. | ||
* `crawler.authPass=""` - | ||
Password provided for `needsAuth` flag. | ||
* `crawler.customHeaders` - | ||
An object specifying a number of custom headers simplecrawler will add to | ||
every request. These override the default headers simplecrawler sets, so be | ||
careful with them. If you want to tamper with headers on a per-request | ||
basis, see the `fetchqueue` event. | ||
* `crawler.acceptCookies=true` - | ||
Flag to indicate if the crawler should hold on to cookies. | ||
* `crawler.urlEncoding="unicode"` - | ||
Set this to `iso8859` to trigger | ||
[URI.js](https://medialize.github.io/URI.js/)' re-encoding of iso8859 URL's | ||
to unicode. | ||
* `crawler.maxDepth=0` - | ||
Defines a maximum distance from the original request at which resources will | ||
be downloaded. | ||
* `crawler.ignoreInvalidSSL=false` - | ||
Treat self-signed SSL certificates as valid. SSL certificates will not be | ||
validated against known CAs. Only applies to https requests. You may also | ||
have to set the environment variable NODE_TLS_REJECT_UNAUTHORIZED to '0'. | ||
For example: `process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0';` | ||
<a name="Crawler+host"></a> | ||
## Fetch conditions | ||
#### crawler.host : <code>String</code> | ||
Determines what hostname the crawler should limit requests to (so long as | ||
[filterByDomain](#Crawler+filterByDomain) is true) | ||
simplecrawler has a concept called fetch conditions that offers a flexible API | ||
for filtering discovered resources before they're put in the queue. A fetch | ||
condition is a function that takes a queue item candidate and evaluates | ||
(synchronously or asynchronously) whether it should be added to the queue or | ||
not. *Please note: with the next major release, all fetch conditions will be | ||
asynchronous.* | ||
<a name="Crawler+interval"></a> | ||
You may add as many fetch conditions as you like, and remove them at runtime. | ||
simplecrawler will evaluate every fetch condition in parallel until one is | ||
encountered that returns a falsy value. If that happens, the resource in | ||
question will not be fetched. | ||
#### crawler.interval : <code>Number</code> | ||
Determines the interval at which new requests are spawned by the crawler, | ||
as long as the number of open requests is under the | ||
[maxConcurrency](#Crawler+maxConcurrency) cap. | ||
This API is complemented by [download conditions](#download-conditions) that | ||
determine whether a resource's body data should be downloaded. | ||
<a name="Crawler+maxConcurrency"></a> | ||
### Adding a fetch condition | ||
#### crawler.maxConcurrency : <code>Number</code> | ||
Maximum request concurrency. If necessary, simplecrawler will increase | ||
node's http agent maxSockets value to match this setting. | ||
This example fetch condition prevents URL's ending in `.pdf` from being | ||
downloaded. Adding a fetch condition assigns it an ID, which the | ||
`addFetchCondition` function returns. You can use this ID to remove the | ||
condition later. | ||
<a name="Crawler+timeout"></a> | ||
#### crawler.timeout : <code>Number</code> | ||
Maximum time we'll wait for headers | ||
<a name="Crawler+listenerTTL"></a> | ||
#### crawler.listenerTTL : <code>Number</code> | ||
Maximum time we'll wait for async listeners | ||
<a name="Crawler+userAgent"></a> | ||
#### crawler.userAgent : <code>String</code> | ||
Crawler's user agent string | ||
**Default**: <code>"Node/simplecrawler &lt;version&gt; (https://github.com/simplecrawler/simplecrawler)"</code> | ||
<a name="Crawler+queue"></a> | ||
#### crawler.queue : [<code>FetchQueue</code>](#FetchQueue) | ||
Queue for requests. The crawler can use any implementation so long as it | ||
uses the same interface. The default queue is simply backed by an array. | ||
<a name="Crawler+respectRobotsTxt"></a> | ||
#### crawler.respectRobotsTxt : <code>Boolean</code> | ||
Controls whether the crawler respects the robots.txt rules of any domain. | ||
This is done both with regards to the robots.txt file, and `<meta>` tags | ||
that specify a `nofollow` value for robots. The latter only applies if | ||
the default [discoverResources](#Crawler+discoverResources) method is used, though. | ||
<a name="Crawler+allowInitialDomainChange"></a> | ||
#### crawler.allowInitialDomainChange : <code>Boolean</code> | ||
Controls whether the crawler is allowed to change the | ||
[host](#Crawler+host) setting if the first response is a redirect to | ||
another domain. | ||
<a name="Crawler+decompressResponses"></a> | ||
#### crawler.decompressResponses : <code>Boolean</code> | ||
Controls whether HTTP responses are automatically decompressed based on | ||
their Content-Encoding header. If true, it will also assign the | ||
appropriate Accept-Encoding header to requests. | ||
<a name="Crawler+decodeResponses"></a> | ||
#### crawler.decodeResponses : <code>Boolean</code> | ||
Controls whether HTTP responses are automatically character converted to | ||
standard JavaScript strings using the [iconv-lite](https://www.npmjs.com/package/iconv-lite) | ||
module before being emitted in the [fetchcomplete](#Crawler+event_fetchcomplete) event. | ||
The character encoding is interpreted from the Content-Type header | ||
firstly, and secondly from any `<meta charset="xxx" />` tags. | ||
<a name="Crawler+filterByDomain"></a> | ||
#### crawler.filterByDomain : <code>Boolean</code> | ||
Controls whether the crawler fetches only URL's where the hostname | ||
matches [host](#Crawler+host). Unless you want to be crawling the entire | ||
internet, I would recommend leaving this on! | ||
<a name="Crawler+scanSubdomains"></a> | ||
#### crawler.scanSubdomains : <code>Boolean</code> | ||
Controls whether URL's that point to a subdomain of [host](#Crawler+host) | ||
should also be fetched. | ||
<a name="Crawler+ignoreWWWDomain"></a> | ||
#### crawler.ignoreWWWDomain : <code>Boolean</code> | ||
Controls whether to treat the www subdomain as the same domain as | ||
[host](#Crawler+host). So if [http://example.com/example](http://example.com/example) has | ||
already been fetched, [http://www.example.com/example](http://www.example.com/example) won't be | ||
fetched also. | ||
<a name="Crawler+stripWWWDomain"></a> | ||
#### crawler.stripWWWDomain : <code>Boolean</code> | ||
Controls whether to strip the www subdomain entirely from URL's at queue | ||
item construction time. | ||
<a name="Crawler+cache"></a> | ||
#### crawler.cache : <code>SimpleCache</code> | ||
Internal cache store. Must implement `SimpleCache` interface. You can | ||
save the site to disk using the built in file system cache like this: | ||
```js | ||
var conditionID = myCrawler.addFetchCondition(function(queueItem, referrerQueueItem, callback) { | ||
callback(null, !queueItem.path.match(/\.pdf$/i)); | ||
}); | ||
crawler.cache = new Crawler.cache('pathToCacheDirectory'); | ||
``` | ||
Fetch conditions are called with three arguments: `queueItem`, | ||
`referrerQueueItem` and `callback`. `queueItem` represents the resource to be | ||
fetched (or not), and `referrerQueueItem` represents the resource where the new | ||
`queueItem` was discovered. See the [queue item documentation](#queue-items) for | ||
details on their structure. The `callback` argument is optional, but if your | ||
function takes 3 arguments, simplecrawler will consider it asynchronous and wait | ||
for the `callback` to be called. If your function takes 2 arguments or less, | ||
simplecrawler will consider it synchronous and look at its return value instead. | ||
**Please note** however, that this flexibility in sync and async behavior is due | ||
to change with the next major release when all fetch conditions will need to use | ||
the asynchronous API. | ||
<a name="Crawler+useProxy"></a> | ||
With this information, you can write sophisticated logic for determining which | ||
pages to fetch and which to avoid. For example, you could write a program that | ||
ensures all links on a website - both internal and external - return good HTTP | ||
statuses. Here's an example: | ||
#### crawler.useProxy : <code>Boolean</code> | ||
Controls whether an HTTP proxy should be used for requests | ||
```js | ||
var crawler = new Crawler("http://example.com"); | ||
crawler.filterByDomain = false; | ||
<a name="Crawler+proxyHostname"></a> | ||
crawler.addFetchCondition(function(queueItem, referrerQueueItem, callback) { | ||
// We only ever want to move one step away from example.com, so if the | ||
// referrer queue item reports a different domain, don't proceed | ||
callback(null, referrerQueueItem.host === crawler.host); | ||
}); | ||
#### crawler.proxyHostname : <code>String</code> | ||
If [useProxy](#Crawler+useProxy) is true, this setting controls what hostname | ||
to use for the proxy | ||
crawler.start(); | ||
``` | ||
<a name="Crawler+proxyPort"></a> | ||
### Removing a fetch condition | ||
#### crawler.proxyPort : <code>Number</code> | ||
If [useProxy](#Crawler+useProxy) is true, this setting controls what port to | ||
use for the proxy | ||
With the ID of the fetch condition you added earlier, or with a reference to the | ||
callback function you registered, you can remove the fetch condition using the | ||
`crawler.removeFetchCondition` method: | ||
<a name="Crawler+proxyUser"></a> | ||
```js | ||
function listener(queueItem, stateData) { | ||
// Do something | ||
} | ||
#### crawler.proxyUser : <code>String</code> | ||
If [useProxy](#Crawler+useProxy) is true, this setting controls what username | ||
to use for the proxy | ||
var conditionID = myCrawler.addFetchCondition(listener); | ||
<a name="Crawler+proxyPass"></a> | ||
// By id... | ||
myCrawler.removeFetchCondition(conditionID); | ||
// or by reference | ||
myCrawler.removeFetchCondition(listener); | ||
``` | ||
#### crawler.proxyPass : <code>String</code> | ||
If [useProxy](#Crawler+useProxy) is true, this setting controls what password | ||
to use for the proxy | ||
<a name="Crawler+needsAuth"></a> | ||
#### crawler.needsAuth : <code>Boolean</code> | ||
Controls whether to use HTTP Basic Auth | ||
<a name="Crawler+authUser"></a> | ||
#### crawler.authUser : <code>String</code> | ||
If [needsAuth](#Crawler+needsAuth) is true, this setting controls what username | ||
to send with HTTP Basic Auth | ||
<a name="Crawler+authPass"></a> | ||
#### crawler.authPass : <code>String</code> | ||
If [needsAuth](#Crawler+needsAuth) is true, this setting controls what password | ||
to send with HTTP Basic Auth | ||
<a name="Crawler+acceptCookies"></a> | ||
#### crawler.acceptCookies : <code>Boolean</code> | ||
Controls whether to save and send cookies or not | ||
<a name="Crawler+cookies"></a> | ||
#### crawler.cookies : [<code>CookieJar</code>](#CookieJar) | ||
The module used to store cookies | ||
<a name="Crawler+customHeaders"></a> | ||
#### crawler.customHeaders : <code>Object</code> | ||
Controls what headers (besides the default ones) to include with every | ||
request. | ||
<a name="Crawler+domainWhitelist"></a> | ||
#### crawler.domainWhitelist : <code>Array</code> | ||
Controls what domains the crawler is allowed to fetch from, regardless of | ||
[host](#Crawler+host) or [filterByDomain](#Crawler+filterByDomain) settings. | ||
<a name="Crawler+allowedProtocols"></a> | ||
#### crawler.allowedProtocols : <code>Array.<RegExp></code> | ||
Controls what protocols the crawler is allowed to fetch from | ||
<a name="Crawler+maxResourceSize"></a> | ||
#### crawler.maxResourceSize : <code>Number</code> | ||
Controls the maximum allowed size in bytes of resources to be fetched | ||
**Default**: <code>16777216</code> | ||
<a name="Crawler+supportedMimeTypes"></a> | ||
#### crawler.supportedMimeTypes : <code>Array.<(RegExp\|string)></code> | ||
Controls what mimetypes the crawler will scan for new resources. If | ||
[downloadUnsupported](#Crawler+downloadUnsupported) is false, this setting will also | ||
restrict what resources are downloaded. | ||
<a name="Crawler+downloadUnsupported"></a> | ||
#### crawler.downloadUnsupported : <code>Boolean</code> | ||
Controls whether to download resources with unsupported mimetypes (as | ||
specified by [supportedMimeTypes](#Crawler+supportedMimeTypes)) | ||
<a name="Crawler+urlEncoding"></a> | ||
#### crawler.urlEncoding : <code>String</code> | ||
Controls what URL encoding to use. Can be either "unicode" or "iso8859" | ||
<a name="Crawler+stripQuerystring"></a> | ||
#### crawler.stripQuerystring : <code>Boolean</code> | ||
Controls whether to strip query string parameters from URLs at queue | ||
item construction time. | ||
<a name="Crawler+sortQueryParameters"></a> | ||
#### crawler.sortQueryParameters : <code>Boolean</code> | ||
Controls whether to sort query string parameters from URLs at queue | ||
item construction time. | ||
<a name="Crawler+discoverRegex"></a> | ||
#### crawler.discoverRegex : <code>Array.<(RegExp\|function())></code> | ||
Collection of regular expressions and functions that are applied in the | ||
default [discoverResources](#Crawler+discoverResources) method. | ||
<a name="Crawler+parseHTMLComments"></a> | ||
#### crawler.parseHTMLComments : <code>Boolean</code> | ||
Controls whether the default [discoverResources](#Crawler+discoverResources) should | ||
scan for new resources inside of HTML comments. | ||
<a name="Crawler+parseScriptTags"></a> | ||
#### crawler.parseScriptTags : <code>Boolean</code> | ||
Controls whether the default [discoverResources](#Crawler+discoverResources) should | ||
scan for new resources inside of `<script>` tags. | ||
<a name="Crawler+maxDepth"></a> | ||
#### crawler.maxDepth : <code>Number</code> | ||
Controls the max depth of resources that the crawler fetches. 0 means | ||
that the crawler won't restrict requests based on depth. The initial | ||
resource, as well as manually queued resources, are at depth 1. From | ||
there, every discovered resource adds 1 to its referrer's depth. | ||
<a name="Crawler+ignoreInvalidSSL"></a> | ||
#### crawler.ignoreInvalidSSL : <code>Boolean</code> | ||
Controls whether to proceed anyway when the crawler encounters an invalid | ||
SSL certificate. | ||
<a name="Crawler+httpAgent"></a> | ||
#### crawler.httpAgent : <code>HTTPAgent</code> | ||
Controls what HTTP agent to use. This is useful if you want to configure | ||
eg. a SOCKS client. | ||
<a name="Crawler+httpsAgent"></a> | ||
#### crawler.httpsAgent : <code>HTTPAgent</code> | ||
Controls what HTTPS agent to use. This is useful if you want to configure | ||
eg. a SOCKS client. | ||
## Fetch conditions | ||
simplecrawler has a concept called fetch conditions that offers a flexible API for filtering discovered resources before they're put in the queue. A fetch condition is a function that takes a queue item candidate and evaluates (synchronously or asynchronously) whether it should be added to the queue or not. *Please note: with the next major release, all fetch conditions will be asynchronous.* | ||
You may add as many fetch conditions as you like, and remove them at runtime. simplecrawler will evaluate every fetch condition in parallel until one is encountered that returns a falsy value. If that happens, the resource in question will not be fetched. | ||
This API is complemented by [download conditions](#download-conditions) that determine whether a resource's body data should be downloaded. | ||
<a name="Crawler+addFetchCondition"></a> | ||
#### crawler.addFetchCondition(callback) ⇒ <code>Number</code> | ||
Adds a callback to the fetch conditions array. simplecrawler will evaluate | ||
all fetch conditions for every discovered URL, and if any of the fetch | ||
conditions returns a falsy value, the URL won't be queued. | ||
**Returns**: <code>Number</code> - The index of the fetch condition in the fetch conditions array. This can later be used to remove the fetch condition. | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| callback | [<code>addFetchConditionCallback</code>](#Crawler..addFetchConditionCallback) | Function to be called after resource discovery that's able to prevent queueing of resource | | ||
<a name="Crawler..addFetchConditionCallback"></a> | ||
#### Crawler~addFetchConditionCallback : <code>function</code> | ||
Evaluated for every discovered URL to determine whether to put it in the | ||
queue. | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| queueItem | [<code>QueueItem</code>](#QueueItem) | The resource to be queued (or not) | | ||
| referrerQueueItem | [<code>QueueItem</code>](#QueueItem) | The resource where `queueItem` was discovered | | ||
| callback | <code>function</code> | | | ||
<a name="Crawler+removeFetchCondition"></a> | ||
#### crawler.removeFetchCondition(id) ⇒ <code>Boolean</code> | ||
Removes a fetch condition from the fetch conditions array. | ||
**Returns**: <code>Boolean</code> - If the removal was successful, the method will return true. Otherwise, it will throw an error. | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| id | <code>Number</code> \| <code>function</code> | The numeric ID of the fetch condition, or a reference to the fetch condition itself. This was returned from [addFetchCondition](#Crawler+addFetchCondition) | | ||
## Download conditions | ||
While fetch conditions let you determine which resources to put in the queue, | ||
download conditions offer the same kind of flexible API for determining which | ||
resources' data to download. Download conditions support both a synchronous and | ||
an asynchronous API, but *with the next major release, all download conditions | ||
will be asynchronous.* | ||
While fetch conditions let you determine which resources to put in the queue, download conditions offer the same kind of flexible API for determining which resources' data to download. Download conditions support both a synchronous and an asynchronous API, but *with the next major release, all download conditions will be asynchronous.* | ||
Download conditions are evaluated after the headers of a resource have been | ||
downloaded, if that resource returned an HTTP status between 200 and 299. This | ||
lets you inspect the content-type and content-length headers, along with all | ||
other properties on the queue item, before deciding if you want this resource's | ||
data or not. | ||
Download conditions are evaluated after the headers of a resource have been downloaded, if that resource returned an HTTP status between 200 and 299. This lets you inspect the content-type and content-length headers, along with all other properties on the queue item, before deciding if you want this resource's data or not. | ||
### Adding a download condition | ||
<a name="Crawler+addDownloadCondition"></a> | ||
Download conditions are added in much the same way as fetch conditions, with the | ||
`crawler.addDownloadCondition` method. This method returns an ID that can be | ||
used to remove the condition later. | ||
#### crawler.addDownloadCondition(callback) ⇒ <code>Number</code> | ||
Adds a callback to the download conditions array. simplecrawler will evaluate | ||
all download conditions for every fetched resource after the headers of that | ||
resource have been received. If any of the download conditions returns a | ||
falsy value, the resource data won't be downloaded. | ||
```js | ||
var conditionID = myCrawler.addDownloadCondition(function(queueItem, response, callback) { | ||
callback(null, | ||
queueItem.stateData.contentType === "image/png" && | ||
queueItem.stateData.contentLength < 5 * 1000 * 1000 | ||
); | ||
}); | ||
``` | ||
**Returns**: <code>Number</code> - The index of the download condition in the download conditions array. This can later be used to remove the download condition. | ||
Download conditions are called with three arguments: `queueItem`, `response` and | ||
`callback`. `queueItem` represents the resource that's being fetched ([queue | ||
item structure](#queue-items)) and `response` is an instance of | ||
`http.IncomingMessage`. Please see the [node | ||
documentation](https://nodejs.org/api/http.html#http_class_http_incomingmessage) | ||
for that class for more details on what it looks like. | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| callback | [<code>addDownloadConditionCallback</code>](#Crawler..addDownloadConditionCallback) | Function to be called when the headers of the resource represented by the queue item have been downloaded | | ||
### Removing a download condition | ||
<a name="Crawler..addDownloadConditionCallback"></a> | ||
Just like with fetch conditions, download conditions can be removed with the ID | ||
returned from the `addDownloadCondition` method, or with a reference to the same | ||
callback function. `crawler.removeDownloadCondition` is the method you'll use: | ||
#### Crawler~addDownloadConditionCallback : <code>function</code> | ||
Evaluated for every fetched resource after its headers have been received to | ||
determine whether to fetch the resource body. | ||
```js | ||
function listener(queueItem, response, callback) { | ||
// Do something | ||
} | ||
var conditionID = myCrawler.addDownloadCondition(listener); | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| queueItem | [<code>QueueItem</code>](#QueueItem) | The resource to be downloaded (or not) | | ||
| response | <code>http.IncomingMessage</code> | The response object as returned by node's `http` API | | ||
| callback | <code>function</code> | | | ||
// By id... | ||
myCrawler.removeDownloadCondition(conditionID); | ||
// or by reference | ||
myCrawler.removeDownloadCondition(listener); | ||
``` | ||
<a name="Crawler+removeDownloadCondition"></a> | ||
#### crawler.removeDownloadCondition(id) ⇒ <code>Boolean</code> | ||
Removes a download condition from the download conditions array. | ||
**Returns**: <code>Boolean</code> - If the removal was successful, the method will return true. Otherwise, it will throw an error. | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| id | <code>Number</code> \| <code>function</code> | The numeric ID of the download condition, or a reference to the download condition itself. The ID was returned from [addDownloadCondition](#Crawler+addDownloadCondition) | | ||
## The queue | ||
Like any other web crawler, simplecrawler has a queue. It can be directly | ||
accessed through `crawler.queue` and implements an asynchronous interface for | ||
accessing queue items and statistics. There are several methods for interacting | ||
with the queue, the simplest being `crawler.queue.get`, which lets you get a | ||
queue item at a specific index in the queue. | ||
Like any other web crawler, simplecrawler has a queue. It can be directly accessed through <code><a href="#Crawler+queue">crawler.queue</a></code> and implements an asynchronous interface for accessing queue items and statistics. There are several methods for interacting with the queue, the simplest being <code><a href="#FetchQueue+get">crawler.queue.get</a></code>, which lets you get a queue item at a specific index in the queue. | ||
```js | ||
crawler.queue.get(5, function (error, queueItem) { | ||
// Do something with the queueItem | ||
}); | ||
``` | ||
<a name="FetchQueue+get"></a> | ||
*All queue methods are in reality synchronous by default, but simplecrawler is | ||
built to be able to use different queues that implement the same interface, and | ||
those implementations can be asynchronous - which means they could e.g. be backed | ||
by a database.* | ||
#### fetchQueue.get(index, callback) | ||
Get a queue item by index | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| index | <code>Number</code> | The index of the queue item in the queue | | ||
| callback | <code>function</code> | Gets two parameters, `error` and `queueItem`. If the operation was successful, `error` will be `null`. | | ||
*All queue methods are in reality synchronous by default, but simplecrawler is built to be able to use different queues that implement the same interface, and those implementations can be asynchronous - which means they could e.g. be backed by a database.* | ||
### Manually adding to the queue | ||
To add items to the queue, use `crawler.queueURL`. This method takes 3 | ||
arguments: a URL to queue, a referrer queue item and a boolean that indicates | ||
whether the URL should be queued regardless of whether it already exists in the | ||
queue or not. | ||
To add items to the queue, use <code><a href="#Crawler+queueURL">crawler.queueURL</a></code>. | ||
```js | ||
crawler.queueURL("/example.html", referrerQueueItem, false); | ||
``` | ||
<a name="Crawler+queueURL"></a> | ||
#### crawler.queueURL(url, [referrer], [force]) ⇒ <code>Boolean</code> | ||
Queues a URL for fetching after cleaning, validating and constructing a queue | ||
item from it. If you're queueing a URL manually, use this method rather than | ||
[Crawler#queue#add](Crawler#queue#add) | ||
**Returns**: <code>Boolean</code> - The return value used to indicate whether the URL passed all fetch conditions and robots.txt rules. With the advent of async fetch conditions, the return value will no longer take fetch conditions into account. | ||
**Emits**: [<code>invaliddomain</code>](#Crawler+event_invaliddomain), [<code>fetchdisallowed</code>](#Crawler+event_fetchdisallowed), [<code>fetchconditionerror</code>](#Crawler+event_fetchconditionerror), [<code>fetchprevented</code>](#Crawler+event_fetchprevented), [<code>queueduplicate</code>](#Crawler+event_queueduplicate), [<code>queueerror</code>](#Crawler+event_queueerror), [<code>queueadd</code>](#Crawler+event_queueadd) | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| url | <code>String</code> | An absolute or relative URL. If relative, [processURL](#Crawler+processURL) will make it absolute to the referrer queue item. | | ||
| [referrer] | [<code>QueueItem</code>](#QueueItem) | The queue item representing the resource where this URL was discovered. | | ||
| [force] | <code>Boolean</code> | If true, the URL will be queued regardless of whether it already exists in the queue or not. | | ||
### Queue items | ||
Because you'll constantly be handed queue items when working with simplecrawler, | ||
it helps to know what's inside them. These are the properties every queue item | ||
is expected to have: | ||
Because you'll constantly be handed queue items when working with simplecrawler, it helps to know what's inside them. Here's the formal documentation of the properties that they contain. | ||
* `id` - A unique ID assigned by the queue when the queue item is added | ||
* `url` - The complete, canonical URL of the resource | ||
* `protocol` - The protocol of the resource (http, https) | ||
* `host` - The full domain/hostname of the resource | ||
* `port` - The port of the resource | ||
* `path` - The URL path, including the query string | ||
* `uriPath` - The URL path, excluding the query string | ||
* `depth` - How many steps simplecrawler has taken from the initial page (which | ||
is depth 1) to this resource. | ||
* `fetched` - Has the request for this item been completed? You can monitor this | ||
as requests are processed. | ||
* `status` - The internal status of the item, always a string. This can be one | ||
of: | ||
* `"queued"` - The resource is in the queue to be fetched, but nothing's | ||
happened to it yet. | ||
* `"spooled"` - A request has been made to the remote server, but we're | ||
still waiting for a response. | ||
* `"headers"` - The headers for the resource have been received. | ||
* `"downloaded"` - The item has been entirely downloaded. | ||
* `"redirected"` - The resource request returned a 300 series response, with | ||
a Location header and a new URL. | ||
* `"notfound"` - The resource could not be found, ie. returned a 404 or 410 | ||
HTTP status. | ||
* `"failed"` - An error occurred when attempting to fetch the resource. | ||
* `stateData` - An object containing state data and other information about the | ||
request: | ||
* `requestLatency` - The time taken for headers to be received after the | ||
request was made. | ||
* `requestTime` - The total time taken for the request (including download | ||
time.) | ||
* `downloadTime` - The total time taken for the resource to be downloaded. | ||
* `contentLength` - The length (in bytes) of the returned content. | ||
Calculated based on the `content-length` header. | ||
* `contentType` - The MIME type of the content. | ||
* `code` - The HTTP status code returned for the request. Note that this | ||
code is `600` if an error occurred in the client and a fetch operation | ||
could not take place successfully. | ||
* `headers` - An object containing the header information returned by the | ||
server. This is the object node returns as part of the `response` object. | ||
* `actualDataSize` - The length (in bytes) of the returned content. | ||
Calculated based on what is actually received, not the `content-length` | ||
header. | ||
* `sentIncorrectSize` - True if the data length returned by the server did | ||
not match what we were told to expect by the `content-length` header. | ||
<a name="QueueItem"></a> | ||
As you can see, you can get a lot of meta-information about each request. | ||
This has been put to use by providing some convenient methods for getting simple | ||
aggregate data about the queue. | ||
#### QueueItem : <code>Object</code> | ||
QueueItems represent resources in the queue that have been fetched, or will be eventually. | ||
**Properties** | ||
| Name | Type | Description | | ||
| --- | --- | --- | | ||
| id | <code>Number</code> | A unique ID assigned by the queue when the queue item is added | | ||
| url | <code>String</code> | The complete, canonical URL of the resource | | ||
| protocol | <code>String</code> | The protocol of the resource (http, https) | | ||
| host | <code>String</code> | The full domain/hostname of the resource | | ||
| port | <code>Number</code> | The port of the resource | | ||
| path | <code>String</code> | The URL path, including the query string | | ||
| uriPath | <code>String</code> | The URL path, excluding the query string | | ||
| depth | <code>Number</code> | How many steps simplecrawler has taken from the initial page (which is depth 1) to this resource. | | ||
| referrer | <code>String</code> | The URL of the resource where the URL of this queue item was discovered | | ||
| fetched | <code>Boolean</code> | Has the request for this item been completed? You can monitor this as requests are processed. | | ||
| status | <code>'queued'</code> \| <code>'spooled'</code> \| <code>'headers'</code> \| <code>'downloaded'</code> \| <code>'redirected'</code> \| <code>'notfound'</code> \| <code>'failed'</code> | The internal status of the item. | | ||
| stateData | <code>Object</code> | An object containing state data and other information about the request. | | ||
| stateData.requestLatency | <code>Number</code> | The time (in ms) taken for headers to be received after the request was made. | | ||
| stateData.requestTime | <code>Number</code> | The total time (in ms) taken for the request (including download time.) | | ||
| stateData.downloadTime | <code>Number</code> | The total time (in ms) taken for the resource to be downloaded. | | ||
| stateData.contentLength | <code>Number</code> | The length (in bytes) of the returned content. Calculated based on the `content-length` header. | | ||
| stateData.contentType | <code>String</code> | The MIME type of the content. | | ||
| stateData.code | <code>Number</code> | The HTTP status code returned for the request. Note that this code is `600` if an error occurred in the client and a fetch operation could not take place successfully. | | ||
| stateData.headers | <code>Object</code> | An object containing the header information returned by the server. This is the object node returns as part of the `response` object. | | ||
| stateData.actualDataSize | <code>Number</code> | The length (in bytes) of the returned content. Calculated based on what is actually received, not the `content-length` header. | | ||
| stateData.sentIncorrectSize | <code>Boolean</code> | True if the data length returned by the server did not match what we were told to expect by the `content-length` header. | | ||
### Queue statistics and reporting | ||
First of all, the queue can provide some basic statistics about the network | ||
performance of your crawl so far. This is done live, so don't check it 30 times | ||
a second. You can test the following properties: | ||
First of all, the queue can provide some basic statistics about the network performance of your crawl so far. This is done live, so don't check it 30 times a second. You can test the following properties: | ||
@@ -624,35 +881,72 @@ * `requestTime` | ||
You can get the maximum, minimum, and average values for each with the | ||
`crawler.queue.max`, `crawler.queue.min`, and `crawler.queue.avg` functions | ||
respectively. | ||
You can get the maximum, minimum, and average values for each with the <code><a href="#FetchQueue+max">crawler.queue.max</a></code>, <code><a href="#FetchQueue+min">crawler.queue.min</a></code>, and <code><a href="#FetchQueue+avg">crawler.queue.avg</a></code> functions respectively. | ||
```js | ||
crawler.queue.max("requestLatency", function(error, max) { | ||
console.log("The maximum request latency was %dms.", max); | ||
}); | ||
crawler.queue.min("downloadTime", function(error, min) { | ||
console.log("The minimum download time was %dms.", min); | ||
}); | ||
crawler.queue.avg("actualDataSize", function(error, avg) { | ||
console.log("The average resource size received is %d bytes.", avg); | ||
}); | ||
``` | ||
<a name="FetchQueue+max"></a> | ||
For general filtering or counting of queue items, there are two methods: | ||
`crawler.queue.filterItems` and `crawler.queue.countItems`. Both take an object | ||
comparator and a callback. | ||
#### fetchQueue.max(statisticName, callback) | ||
Gets the maximum value of a stateData property from all the items in the | ||
queue. This means you can eg. get the maximum request time, download size | ||
etc. | ||
```js | ||
crawler.queue.countItems({ fetched: true }, function(error, count) { | ||
console.log("The number of completed items is %d", count); | ||
}); | ||
crawler.queue.filterItems({ status: "notfound" }, function(error, items) { | ||
console.log("These items returned 404 or 410 HTTP statuses", items); | ||
}); | ||
``` | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| statisticName | <code>String</code> | Can be any of the strings in [_allowedStatistics](#FetchQueue._allowedStatistics) | | ||
| callback | <code>function</code> | Gets two parameters, `error` and `max`. If the operation was successful, `error` will be `null`. | | ||
The object comparator can also contain other objects, so you may filter queue | ||
items based on properties in their `stateData` object as well. | ||
<a name="FetchQueue+min"></a> | ||
#### fetchQueue.min(statisticName, callback) | ||
Gets the minimum value of a stateData property from all the items in the | ||
queue. This means you can eg. get the minimum request time, download size | ||
etc. | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| statisticName | <code>String</code> | Can be any of the strings in [_allowedStatistics](#FetchQueue._allowedStatistics) | | ||
| callback | <code>function</code> | Gets two parameters, `error` and `min`. If the operation was successful, `error` will be `null`. | | ||
<a name="FetchQueue+avg"></a> | ||
#### fetchQueue.avg(statisticName, callback) | ||
Gets the average value of a stateData property from all the items in the | ||
queue. This means you can eg. get the average request time, download size | ||
etc. | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| statisticName | <code>String</code> | Can be any of the strings in [_allowedStatistics](#FetchQueue._allowedStatistics) | | ||
| callback | <code>function</code> | Gets two parameters, `error` and `avg`. If the operation was successful, `error` will be `null`. | | ||
For general filtering or counting of queue items, there are two methods: <code><a href="#FetchQueue+filterItems">crawler.queue.filterItems</a></code> and <code><a href="#FetchQueue+countItems">crawler.queue.countItems</a></code>. Both take an object comparator and a callback. | ||
<a name="FetchQueue+filterItems"></a> | ||
#### fetchQueue.filterItems(comparator, callback) | ||
Filters and returns the items in the queue that match a selector | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| comparator | <code>Object</code> | Comparator object used to filter items. Queue items that are returned need to match all the properties of this object. | | ||
| callback | <code>function</code> | Gets two parameters, `error` and `items`. If the operation was successful, `error` will be `null` and `items` will be an array of QueueItems. | | ||
<a name="FetchQueue+countItems"></a> | ||
#### fetchQueue.countItems(comparator, callback) | ||
Counts the items in the queue that match a selector | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| comparator | <code>Object</code> | Comparator object used to filter items. Queue items that are counted need to match all the properties of this object. | | ||
| callback | <code>FetchQueue~countItemsCallback</code> | | | ||
| callback | <code>function</code> | Gets two parameters, `error` and `count`. If the operation was successful, `error` will be `null` and `count` will be the number of matching QueueItems. | ||
The object comparator can also contain other objects, so you may filter queue items based on properties in their `stateData` object as well. | ||
```js | ||
@@ -668,52 +962,68 @@ crawler.queue.filterItems({ | ||
It can be convenient to be able to save the crawl progress and later be able to | ||
reload it if your application fails or you need to abort the crawl for some | ||
reason. The `crawler.queue.freeze` and `crawler.queue.defrost` methods will let | ||
you do this. | ||
It can be convenient to be able to save the crawl progress and later be able to reload it if your application fails or you need to abort the crawl for some reason. The `crawler.queue.freeze` and `crawler.queue.defrost` methods will let you do this. | ||
**A word of warning** - they are not CPU friendly as they rely on `JSON.parse` | ||
and `JSON.stringify`. Use them only when you need to save the queue - don't call | ||
them after every request or your application's performance will be incredibly | ||
poor - they block like *crazy*. That said, using them when your crawler | ||
commences and stops is perfectly reasonable. | ||
**A word of warning** - they are not CPU friendly as they rely on `JSON.parse` and `JSON.stringify`. Use them only when you need to save the queue - don't call them after every request or your application's performance will be incredibly poor - they block like *crazy*. That said, using them when your crawler commences and stops is perfectly reasonable. | ||
Note that the methods themselves are asynchronous, so if you are going to exit | ||
the process after you do the freezing, make sure you wait for the callback - | ||
otherwise you'll get an empty file. | ||
Note that the methods themselves are asynchronous, so if you are going to exit the process after you do the freezing, make sure you wait for the callback - otherwise you'll get an empty file. | ||
```js | ||
crawler.queue.freeze("mysavedqueue.json", function () { | ||
process.exit(); | ||
}); | ||
<a name="FetchQueue+freeze"></a> | ||
crawler.queue.defrost("mysavedqueue.json"); | ||
``` | ||
#### fetchQueue.freeze(filename, callback) | ||
Writes the queue to disk in a JSON file. This file can later be imported | ||
using [defrost](#FetchQueue+defrost) | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| filename | <code>String</code> | Filename passed directly to [fs.writeFile](https://nodejs.org/api/fs.html#fs_fs_writefile_file_data_options_callback) | | ||
| callback | <code>function</code> | Gets a single `error` parameter. If the operation was successful, this parameter will be `null`. | | ||
<a name="FetchQueue+defrost"></a> | ||
#### fetchQueue.defrost(filename, callback) | ||
Import the queue from a frozen JSON file on disk. | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| filename | <code>String</code> | Filename passed directly to [fs.readFile](https://nodejs.org/api/fs.html#fs_fs_readfile_file_options_callback) | | ||
| callback | <code>function</code> | Gets a single `error` parameter. If the operation was successful, this parameter will be `null`. | | ||
## Cookies | ||
simplecrawler has an internal cookie jar, which collects and resends cookies | ||
automatically and by default. If you want to turn this off, set the | ||
`crawler.acceptCookies` option to `false`. The cookie jar is accessible via | ||
`crawler.cookies`, and is an event emitter itself. | ||
simplecrawler has an internal cookie jar, which collects and resends cookies automatically and by default. If you want to turn this off, set the <code><a href="#Crawler+acceptCookies">crawler.acceptCookies</a></code> option to `false`. The cookie jar is accessible via <code><a href="#Crawler+cookies">crawler.cookies</a></code>, and is an event emitter itself. | ||
### Cookie events | ||
* `addcookie` (cookie) - Fired when a new cookie is added to the jar. | ||
* `removecookie` (cookie array) - Fired when one or more cookies are removed from the jar. | ||
<a name="CookieJar+event_addcookie"></a> | ||
#### "addcookie" (cookie) | ||
Fired when a cookie has been added to the jar | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| cookie | [<code>Cookie</code>](#Cookie) | The cookie that has been added | | ||
<a name="CookieJar+event_removecookie"></a> | ||
#### "removecookie" (cookie) | ||
Fired when one or more cookies have been removed from the jar | ||
| Param | Type | Description | | ||
| --- | --- | --- | | ||
| cookie | [<code>Array.&lt;Cookie&gt;</code>](#Cookie) | The cookies that have been removed | | ||
## Link Discovery | ||
simplecrawler's discovery function is made to be replaceable — you can | ||
easily write your own that discovers only the links you're interested in. | ||
simplecrawler's discovery function is made to be replaceable — you can easily write your own that discovers only the links you're interested in. | ||
The method must accept a buffer and a [`queueItem`](#queue-items), and | ||
return the resources that are to be added to the queue. | ||
The method must accept a buffer and a [`queueItem`](#queue-items), and return the resources that are to be added to the queue. | ||
It is quite common to pair simplecrawler with a module like | ||
[cheerio](https://npmjs.com/package/cheerio) that can correctly parse | ||
HTML and provide a DOM like API for querying — or even a whole headless | ||
browser, like phantomJS. | ||
It is quite common to pair simplecrawler with a module like [cheerio](https://npmjs.com/package/cheerio) that can correctly parse HTML and provide a DOM-like API for querying — or even a whole headless browser, like PhantomJS. | ||
The example below demonstrates how one might achieve basic HTML-correct | ||
discovery of only link tags using cheerio. | ||
The example below demonstrates how one might achieve basic HTML-correct discovery of only link tags using cheerio. | ||
@@ -732,40 +1042,22 @@ ```js | ||
There are a couple of questions that pop up more often than others in the issue | ||
tracker. If you're having trouble with simplecrawler, please have a look at the | ||
list below before submitting an issue. | ||
There are a couple of questions that pop up more often than others in the issue tracker. If you're having trouble with simplecrawler, please have a look at the list below before submitting an issue. | ||
- **Q: Why does simplecrawler discover so many invalid URLs?** | ||
A: simplecrawler's built-in discovery method is purposefully naive - it's a | ||
brute force approach intended to find everything: URLs in comments, binary files, | ||
scripts, image EXIF data, inside CSS documents, and more — useful for archiving | ||
and use cases where it's better to have false positives than fail to discover a | ||
resource. | ||
A: simplecrawler's built-in discovery method is purposefully naive - it's a brute force approach intended to find everything: URLs in comments, binary files, scripts, image EXIF data, inside CSS documents, and more — useful for archiving and use cases where it's better to have false positives than fail to discover a resource. | ||
It's definitely not a solution for every case, though — if you're | ||
writing a link checker or validator, you don't want erroneous 404s | ||
throwing errors. Therefore, simplecrawler allows you to tune discovery in a few | ||
key ways: | ||
It's definitely not a solution for every case, though — if you're writing a link checker or validator, you don't want erroneous 404s throwing errors. Therefore, simplecrawler allows you to tune discovery in a few key ways: | ||
- You can either add to (or remove from) the `discoverRegex` array, tweaking | ||
the search patterns to meet your requirements; or | ||
- Swap out the `discoverResources` method. Parsing HTML pages is beyond the | ||
scope of simplecrawler, but it is very common to combine simplecrawler with | ||
a module like [cheerio](https://npmjs.com/package/cheerio) for more | ||
sophisticated resource discovery. | ||
- You can either add to (or remove from) the <code><a href="#Crawler+discoverRegex">crawler.discoverRegex</a></code> array, tweaking the search patterns to meet your requirements; or | ||
- Swap out the `discoverResources` method. Parsing HTML pages is beyond the scope of simplecrawler, but it is very common to combine simplecrawler with a module like [cheerio](https://npmjs.com/package/cheerio) for more sophisticated resource discovery. | ||
Further documentation is available in the [link discovery](#link-discovery) | ||
section. | ||
Further documentation is available in the [link discovery](#link-discovery) section. | ||
- **Q: Why did simplecrawler complete without fetching any resources?** | ||
A: When this happens, it is usually because the initial request was redirected | ||
to a different domain that wasn't in the `domainWhitelist`. | ||
A: When this happens, it is usually because the initial request was redirected to a different domain that wasn't in the <code><a href="#Crawler+domainWhitelist">crawler.domainWhitelist</a></code>. | ||
- **Q: How do I crawl a site that requires a login?** | ||
A: Logging in to a site is usually fairly simple and most login procedures | ||
look alike. We've included an example that covers a lot of situations, but | ||
sadly, there isn't a one true solution for how to deal with logins, so | ||
there's no guarantee that this code works right off the bat. | ||
A: Logging in to a site is usually fairly simple and most login procedures look alike. We've included an example that covers a lot of situations, but sadly, there isn't one true solution for how to deal with logins, so there's no guarantee that this code works right off the bat. | ||
@@ -847,23 +1139,11 @@ What we do here is: | ||
A: One of the core concepts of node.js is its asynchronous nature. I/O | ||
operations (like network requests) take place outside of the main thread | ||
(which is where your code is executed). This is what makes node fast, the | ||
fact that it can continue executing code while there are multiple HTTP | ||
requests in flight, for example. But to be able to get back the result of | ||
the HTTP request, we need to register a function that will be called when | ||
the result is ready. This is what *asynchronous* means in node - the fact | ||
that code can continue executing while I/O operations are in progress - and | ||
it's the same concept as with AJAX requests in the browser. | ||
A: One of the core concepts of node.js is its asynchronous nature. I/O operations (like network requests) take place outside of the main thread (which is where your code is executed). This is what makes node fast: it can continue executing code while, for example, multiple HTTP requests are in flight. But to be able to get back the result of the HTTP request, we need to register a function that will be called when the result is ready. This is what *asynchronous* means in node - the fact that code can continue executing while I/O operations are in progress - and it's the same concept as with AJAX requests in the browser. | ||
- **Q: Promises are nice, can I use them with simplecrawler?** | ||
A: No, not really. Promises are meant as a replacement for callbacks, but | ||
simplecrawler is event driven, not callback driven. Using callbacks to any | ||
greater extent in simplecrawler wouldn't make much sense, since you normally | ||
need to react more than once to what happens in simplecrawler. | ||
A: No, not really. Promises are meant as a replacement for callbacks, but simplecrawler is event driven, not callback driven. Using callbacks to any greater extent in simplecrawler wouldn't make much sense, since you normally need to react more than once to what happens in simplecrawler. | ||
- **Q: Something's happening and I don't see the output I'm expecting!** | ||
Before filing an issue, check to see that you're not just missing something by | ||
logging *all* crawler events with the code below: | ||
A: Before filing an issue, check to see that you're not just missing something by logging *all* crawler events with the code below: | ||
@@ -896,16 +1176,13 @@ ```js | ||
If you don't see what you need after inserting that code block, and you still need help, | ||
please attach the output of all the events fired with your email/issue. | ||
If you don't see what you need after inserting that code block, and you still need help, please attach the output of all the events fired with your email/issue. | ||
## Node Support Policy | ||
Simplecrawler will officially support stable and LTS versions of Node which are | ||
currently supported by the Node Foundation. | ||
Simplecrawler will officially support stable and LTS versions of Node which are currently supported by the Node Foundation. | ||
Currently supported versions: | ||
- 4.x | ||
- 5.x | ||
- 6.x | ||
- 7.x | ||
- 8.x | ||
- 10.x | ||
@@ -916,2 +1193,3 @@ ## Current Maintainers | ||
* [Fredrik Ekelund](https://github.com/fredrikekelund) | ||
* [Konstantin Bläsi](https://github.com/konstantinblaesi) | ||
* [XhmikosR](https://github.com/XhmikosR) | ||
@@ -921,16 +1199,11 @@ | ||
Please see the [contributor guidelines](https://github.com/cgiffard/node-simplecrawler/blob/master/CONTRIBUTING.md) | ||
before submitting a pull request to ensure that your contribution is able to be | ||
accepted quickly and easily! | ||
Please see the [contributor guidelines](https://github.com/simplecrawler/simplecrawler/blob/master/CONTRIBUTING.md) before submitting a pull request to ensure that your contribution is able to be accepted quickly and easily! | ||
## Contributors | ||
simplecrawler has benefited from the kind efforts of dozens of contributors, to | ||
whom we are incredibly grateful. We originally listed their individual | ||
contributions but it became pretty unwieldy - the | ||
[full list can be found here.](https://github.com/cgiffard/node-simplecrawler/graphs/contributors) | ||
simplecrawler has benefited from the kind efforts of dozens of contributors, to whom we are incredibly grateful. We originally listed their individual contributions but it became pretty unwieldy - the [full list can be found here.](https://github.com/simplecrawler/simplecrawler/graphs/contributors) | ||
## License | ||
Copyright (c) 2016, Christopher Giffard. | ||
Copyright (c) 2017, Christopher Giffard. | ||
@@ -937,0 +1210,0 @@ All rights reserved. |
Sorry, the diff of this file is too big to display
New author
Supply chain risk: A new npm collaborator published a version of the package for the first time. New collaborators are usually benign additions to a project, but do indicate a change to the security surface area of a package.
Found 1 instance in 1 package
174535
1215
2676
3
+ Added robots-parser@2.4.0 (transitive)
- Removed robots-parser@1.0.2 (transitive)
Updated async@^2.6.2
Updated iconv-lite@^0.4.24
Updated robots-parser@^2.1.1
Updated urijs@^1.19.1