simplecrawler
Comparing version 0.1.4 to 0.1.5
// Simplecrawler
-// Christopher Giffard, 2011
+// Christopher Giffard, 2011 - 2013+
//
@@ -16,3 +16,20 @@ // http://www.github.com/cgiffard/node-simplecrawler
// Crawler Constructor
+/*
+	Public: Constructor for the crawler.
+
+	host        - Initial hostname/domain to begin crawling from. By
+	              default, the crawl will be locked to this hostname.
+	initialPath - Initial path to begin crawling from.
+	initialPort - Port to begin crawling from.
+	interval    - Request interval for the crawler. Defaults to 250ms.
+
+	Examples
+
+		var crawler = new Crawler("example.com","/",80,500);
+		var crawler = new Crawler("example.com");
+
+	Returns the crawler object which has now been constructed.
+*/
var Crawler = function(host,initialPath,initialPort,interval) {
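A minimal construction sketch for the above (requiring the module by its npm package name is an assumption; adjust the path for a local checkout):

    var Crawler = require("simplecrawler");

    // Host only: path, port and request interval fall back to their
    // defaults (the interval defaults to 250ms per the doc comment).
    var crawler = new Crawler("example.com");

    // Fully specified: crawl example.com:80 from "/" at a 500ms interval.
    var politeCrawler = new Crawler("example.com", "/", 80, 500);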
@@ -115,2 +132,16 @@ // Data integrity checks
+/*
+	Public: Starts or resumes the crawl. If the queue is empty, it adds a new
+	queue item from which to begin crawling based on the initial configuration
+	of the crawler itself. The crawler waits for process.nextTick to begin, so
+	handlers and other properties can be altered or addressed before the crawl
+	commences.
+
+	Examples
+
+		crawler.start();
+
+	Returns the crawler object, to enable chaining.
+*/
Crawler.prototype.start = function() {
@@ -144,5 +175,19 @@ var crawler = this;
	});
	return this;
};
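Because start() defers via process.nextTick and returns the crawler for chaining, handlers attached immediately after the call are still in time for the first events. A sketch (treating the handler's queue-item-first argument order as an assumption):

    var crawler = new Crawler("example.com").start();

    // Attached after start(), but still early enough thanks to nextTick.
    crawler.on("fetchcomplete", function(queueItem) {
        console.log("Completed fetching:", queueItem.url);
    });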
-// Determines whether the protocol is supported, given a URL
+/*
+	Public: Determines whether the protocol is supported, given a URL.
+
+	URL - URL with a protocol, for testing.
+
+	Examples
+
+		crawler.protocolSupported("http://google.com/") // true, by default
+		crawler.protocolSupported("wss://google.com/")  // false, by default
+
+	Returns a boolean, true if the protocol is supported - false if not.
+*/
Crawler.prototype.protocolSupported = function(URL) {
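One plausible use is pre-filtering candidate links before queueing them; a sketch (candidateURL is a hypothetical variable):

    var candidateURL = "ftp://example.com/archive.tar.gz";

    // Only queue what the crawler is actually willing to fetch.
    if (crawler.protocolSupported(candidateURL)) {
        crawler.queueURL(candidateURL);
    }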
@@ -164,3 +209,15 @@ var protocol;
-// Determines whether the mimetype is supported, given a... mimetype
+/*
+	Public: Determines whether the mimetype is supported, given a mimetype.
+
+	MIMEType - String containing MIME type to test.
+
+	Examples
+
+		crawler.mimeTypeSupported("text/html")                // true, by default
+		crawler.mimeTypeSupported("application/octet-stream") // false, by default
+
+	Returns a boolean, true if the MIME type is supported - false if not.
+*/
Crawler.prototype.mimeTypeSupported = function(MIMEType) {
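A sketch testing a raw Content-Type header; stripping parameters such as charset before the test is a defensive assumption, since only the bare MIME type is documented here:

    // A raw Content-Type header, e.g. taken from a response object.
    var contentType = "text/html; charset=utf-8";

    // Strip any ";charset=..." parameter before testing.
    var bareType = contentType.split(";")[0].trim();

    if (crawler.mimeTypeSupported(bareType)) {
        // Safe to treat the body as a linked-resource document.
    }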
@@ -175,3 +232,15 @@
-// Takes a URL, and extracts the protocol, host, port, and resource
+/*
+	Public: Extracts protocol, host, port and resource (path) given a URL string.
+
+	URL     - String containing URL to process.
+	context - Queue item the URL was found in, against which relative
+	          URLs are resolved (optional).
+
+	Examples
+
+		var URLInfo = crawler.processURL("http://www.google.com/fish");
+
+	Returns an object containing keys and values for "protocol", "host", "port",
+	and "path".
+*/
Crawler.prototype.processURL = function(URL,context) {
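The doc comment names the returned keys; a sketch of the shape (the concrete values shown are illustrative assumptions):

    var URLInfo = crawler.processURL("http://www.google.com/fish");

    // Illustrative shape - keys per the doc comment, values assumed:
    // { protocol: "http", host: "www.google.com", port: 80, path: "/fish" }
    console.log(URLInfo.protocol, URLInfo.host, URLInfo.port, URLInfo.path);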
@@ -202,5 +271,17 @@ var newURL;
-// Input some text/html and this function will return a bunch of URLs for queueing
-// (if there are actually any in the resource, otherwise it'll return an empty array)
-Crawler.prototype.discoverResources = function(resourceData,queueItem) {
+/*
+	Public: Discovers linked resources in an HTML, XML or text document.
+
+	resourceData - String containing document with linked resources.
+
+	Examples
+
+		crawler.discoverResources("http://www.google.com")
+		crawler.discoverResources("<a href='...'>test</a>")
+
+	Returns an array of the (string) resource URLs found in the document. If none
+	were found, the array will be empty.
+*/
+Crawler.prototype.discoverResources = function(resourceData) {
	var resources = [],
@@ -219,2 +300,3 @@ resourceText = resourceData.toString("utf8"),
	// or preview windows, which would otherwise be unavailable to us.
+	// Worst case scenario is we make some junky requests.
	/^javascript\:[a-z0-9]+\(['"][^'"\s]+/ig
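Per the doc comment, both raw markup and plain text containing bare URLs are valid inputs (the regex above is one of several patterns scanned). A sketch:

    var found = crawler.discoverResources(
        "<a href='/about'>About</a> - see also http://example.com/faq");

    // found is an array of URL strings; empty if nothing matched.
    found.forEach(function(url) {
        console.log("Discovered:", url);
    });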
@@ -269,3 +351,17 @@ ];
-// Checks to see whether domain is valid for crawling.
+/*
+	Public: Determines based on crawler state whether a domain is valid for
+	crawling.
+
+	host - String containing the hostname of the resource to be fetched.
+
+	Examples
+
+		crawler.domainValid("127.0.0.1");
+		crawler.domainValid("google.com");
+		crawler.domainValid("test.example.com");
+
+	Returns true if the domain is valid for crawling, false if not.
+*/
Crawler.prototype.domainValid = function(host) {
@@ -281,3 +377,4 @@ var crawler = this,
-		// If there's no whitelist, or the whitelist is of zero length, just return false.
+		// If there's no whitelist, or the whitelist is of zero length,
+		// just return false.
		if (!crawler.domainWhitelist ||
@@ -288,8 +385,14 @@ !crawler.domainWhitelist.length) return false;
		return !!crawler.domainWhitelist.reduce(function(prev,cur,index,array) {
			// If we already located the relevant domain in the whitelist...
			if (prev) return prev;
			// If the domain is just equal, return true.
			if (host === cur) return true;
-			// If we're ignoring WWW subdomains, and both domains, less www. are the same, return true.
-			if (crawler.ignoreWWWDomain && host === cur.replace(/^www\./i,"")) return true;
+			// If we're ignoring WWW subdomains, and both domains,
+			// less www. are the same, return true.
+			if (crawler.ignoreWWWDomain && host === cur.replace(/^www\./i,""))
+				return true;
			// Otherwise, sorry. No dice.
@@ -307,3 +410,4 @@ return false;
-		// If we're ignoring www, remove it from both (if www is the first domain component...)
+		// If we're ignoring www, remove it from both
+		// (if www is the first domain component...)
		if (crawler.ignoreWWWDomain) {
@@ -324,12 +428,35 @@ subdomain.replace(/^www./ig,"");
			(host === crawler.host) ||
-			// Or if we're ignoring WWW subdomains, and both domains, less www. are the same, return true.
-			(crawler.ignoreWWWDomain && crawler.host.replace(/^www\./i,"") === host.replace(/^www\./i,"")) ||
-			// Or if the domain in question exists in the domain whitelist, return true.
+			// Or if we're ignoring WWW subdomains, and both domains,
+			// less www. are the same, return true.
+			(
+				crawler.ignoreWWWDomain &&
+				crawler.host.replace(/^www\./i,"") ===
+					host.replace(/^www\./i,"")
+			) ||
+			// Or if the domain in question exists in the domain whitelist,
+			// return true.
			domainInWhitelist(host) ||
-			// Or if we're scanning subdomains, and this domain is a subdomain of the crawler's set domain, return true.
+			// Or if we're scanning subdomains, and this domain is a subdomain
+			// of the crawler's set domain, return true.
			(crawler.scanSubdomains && isSubdomainOf(host,crawler.host)));
};
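The checks above combine several pieces of crawler state; a sketch of how they interact, assuming a crawler constructed for example.com:

    var crawler = new Crawler("example.com");
    crawler.scanSubdomains = true;
    crawler.domainWhitelist = ["cdn.example.net"];

    crawler.domainValid("example.com");        // true - matches crawler.host
    crawler.domainValid("assets.example.com"); // true - subdomain scanning on
    crawler.domainValid("cdn.example.net");    // true - whitelisted
    crawler.domainValid("elsewhere.org");      // false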
-// Input some text/html and this function will delegate resource discovery, check link validity
-// and queue up resources for downloading!
+/*
+	Public: Given a text or HTML document, initiates discovery of linked
+	resources in the text, and queues the resources if applicable. Emits
+	"discoverycomplete". Not to be confused with `crawler.discoverResources`:
+	although `discoverResources` is the main component of this function, this
+	one queues the resources in addition to discovering them.
+
+	resourceData - Text document containing linked resource URLs.
+	queueItem    - Queue item from which the resource document was derived.
+
+	Examples
+
+		crawler.queueLinkedItems("<a href='...'>test</a>",queueItem);
+
+	Returns the crawler object for chaining.
+*/
Crawler.prototype.queueLinkedItems = function(resourceData,queueItem) {
@@ -344,8 +471,26 @@ var resources = this.discoverResources(resourceData,queueItem),
	resources.forEach(function(url){ crawler.queueURL(url,queueItem); });
	return this;
};
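The crawler invokes this itself during fetching (see fetchQueueItem below), so manual calls only make sense in custom pipelines; a sketch feeding a fetched body back through it (treating the data argument as a Buffer is an assumption):

    crawler.on("fetchcomplete", function(queueItem, data) {
        // Re-run discovery and queueing over the fetched body. Normally
        // redundant - the fetch cycle already does this for supported
        // MIME types.
        crawler.queueLinkedItems(data.toString("utf8"), queueItem);
    });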
-// Clean and queue a single URL...
+/*
+	Public: Given a single URL, this function cleans, validates, parses it and
+	adds it to the queue. This is the best and simplest way to add an item to
+	the queue.
+
+	url       - URL to be queued.
+	queueItem - Queue item from which the resource was linked.
+
+	Examples
+
+		crawler.queueURL("http://www.google.com/",queueItem);
+
+	Returns a boolean value indicating whether the URL was successfully queued
+	or not.
+*/
Crawler.prototype.queueURL = function(url,queueItem) {
	var crawler = this;
-	var parsedURL = typeof(url) === "object" ? url : crawler.processURL(url,queueItem);
+	var parsedURL =
+		typeof(url) === "object" ? url : crawler.processURL(url,queueItem);
@@ -389,7 +534,34 @@ // URL Parser decided this URL was junky. Next please!
			crawler.emit("queueerror",error,parsedURL);
			return false;
		}
	}
	return true;
};
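An absolute URL needs no context item to resolve against; a sketch seeding an extra start point before the crawl begins (treating queueItem as optional for absolute URLs is an assumption):

    // Seed an additional absolute URL; the boolean result reports success.
    var queued = crawler.queueURL("http://www.example.com/sitemap");

    if (!queued) {
        console.log("URL was rejected (see any emitted queueerror event).");
    }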
-// Fetch a queue item
+/*
+	Public: The guts of the crawler: takes a queue item and spools a request for
+	it, downloads, caches, and fires events based on the result of the request.
+	It kicks off resource discovery and queues any new resources found.
+
+	queueItem - Queue item to be fetched.
+
+	Emits
+
+		fetchstart
+		fetchheaders
+		fetchcomplete
+		fetchdataerror
+		notmodified
+		fetchredirect
+		fetch404
+		fetcherror
+		fetchclienterror
+
+	Examples
+
+		crawler.fetchQueueItem(queueItem);
+
+	Returns the crawler object for chaining.
+*/
Crawler.prototype.fetchQueueItem = function(queueItem) {
@@ -625,5 +797,19 @@ var crawler = this;
	});
	return crawler;
};
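In practice this method is driven by the runloop, and consumers mostly subscribe to the events it emits. A sketch wiring up two of the events listed above (the handler argument order, queue item first, is an assumption):

    crawler.on("fetchheaders", function(queueItem, response) {
        console.log(response.statusCode, "for", queueItem.url);
    });

    crawler.on("fetch404", function(queueItem) {
        console.log("Dead link:", queueItem.url);
    });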
-// Crawl init
+/*
+	Public: The main crawler runloop. Fires at the interval specified in the
+	crawler configuration, when the crawl is running. May be manually fired.
+	This function initiates fetching of a queue item if there are enough workers
+	to do so and there are unfetched items in the queue.
+
+	Examples
+
+		crawler.crawl();
+
+	Returns the crawler object for chaining.
+*/
Crawler.prototype.crawl = function() {
@@ -646,9 +832,43 @@ var crawler = this;
	});
	return crawler;
};
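Since the doc comment says crawl() may be fired manually, one can nudge the crawler without waiting for the next interval tick; a sketch (urls is a hypothetical array):

    // After bulk-seeding the queue, kick the runloop once by hand.
    urls.forEach(function(url) { crawler.queueURL(url); });
    crawler.crawl();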
+/*
+	Public: Stops the crawler, terminating the crawl runloop.
+
+	Examples
+
+		crawler.stop();
+
+	Returns the crawler object for chaining.
+*/
Crawler.prototype.stop = function() {
	clearInterval(this.crawlIntervalID);
	this.running = false;
	return this;
};
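Because start() "starts or resumes" the crawl, stop() and start() pair up naturally for pausing; a sketch:

    // Pause the crawl, then resume from the existing queue a minute later.
    crawler.stop();

    setTimeout(function() {
        crawler.start();
    }, 60000);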
+/*
+	Public: Given a function, this method adds it to an internal list maintained
+	by the crawler to be executed against each URL to determine whether it should
+	be fetched or not.
+
+	callback - Function to be called when evaluating a URL. This function is
+	           passed an object containing the protocol, hostname, port, and path
+	           of a resource to be fetched. It can determine whether it should
+	           be requested or not by returning a boolean - false for no, true
+	           for yes.
+
+	Examples
+
+		crawler.addFetchCondition(function(parsedURL) {
+			return (parsedURL.host !== "evildomain.com");
+		});
+
+	Returns the ID of the fetch condition - used for removing it from the crawler
+	later.
+*/
Crawler.prototype.addFetchCondition = function(callback) {
@@ -663,2 +883,16 @@ if (callback instanceof Function) {
+/*
+	Public: Given the ID of an existing fetch condition, this function removes
+	it from the crawler's internal list of conditions.
+
+	index - ID of the fetch condition to be removed.
+
+	Examples
+
+		crawler.removeFetchCondition(3);
+
+	Returns true if the fetch condition was removed, and throws an error if it
+	could not be found.
+*/
Crawler.prototype.removeFetchCondition = function(index) {
@@ -665,0 +899,0 @@ if (this.fetchConditions[index] &&
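The ID returned by addFetchCondition round-trips into removeFetchCondition; a sketch (the .path test relies on the parsed-URL shape documented for processURL):

    // Temporarily skip PDF resources.
    var conditionID = crawler.addFetchCondition(function(parsedURL) {
        return !/\.pdf$/i.test(parsedURL.path);
    });

    // Later, lift the restriction; throws if the ID is unknown.
    crawler.removeFetchCondition(conditionID);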
@@ -35,3 +35,3 @@ var Crawler = require("./crawler.js"),
-	Returns the new Vixen object which has now been constructed.
+	Returns the crawler object which has now been constructed.
@@ -38,0 +38,0 @@ */
{
	"name": "simplecrawler",
	"description": "Very straightforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
-	"version": "0.1.4",
+	"version": "0.1.5",
	"homepage": "http://github.com/cgiffard/node-simplecrawler",
@@ -6,0 +6,0 @@ "author": "Christopher Giffard <christopher.giffard@cgiffard.com>",