supercrawler
Comparing version 0.2.2 to 0.2.3
@@ -341,5 +341,12 @@ var Crawler,
-  // robots.txt doesn't exist in the cache, so we have to hit the
-  // server to get it.
-  return this._downloadUrl(robotsUrl).catch(error.HttpError, function (err) {
+  // We want to add /robots.txt to the crawl queue. This is because we may
+  // parse the robots.txt file with a content handler, in order to extract
+  // its Sitemap: directives. (And then we'll crawl those sitemaps too!)
+  return this.getUrlList().insertIfNotExists(new Url({
+    url: robotsUrl
+  })).then(function () {
+    // robots.txt doesn't exist in the cache, so we have to hit the
+    // server to get it.
+    return self._downloadUrl(robotsUrl);
+  }).catch(error.HttpError, function (err) {
     var robotsStatusCode = err.statusCode;
@@ -346,0 +353,0 @@
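The hunk above queues /robots.txt like any other URL before it is fetched, so the file later flows through the registered content handlers (which is what lets robotsParser pick out its Sitemap: directives). For readers skimming the diff, here is the same control flow as a standalone sketch. It is illustrative only: the function name, its parameters, the fallback return value and the Bluebird dependency are assumptions (HttpError is assumed to subclass Error, as Bluebird's typed catch requires); only the interfaces it calls — insertIfNotExists, a Url constructor taking { url: ... }, and a download step that rejects with HttpError — come from the hunk itself.

    // Standalone sketch of the new flow; names below are illustrative.
    var Promise = require("bluebird");

    function fetchRobotsTxt(urlList, Url, download, HttpError, robotsUrl) {
      // Queue /robots.txt itself so that content handlers (such as
      // robotsParser) later get a chance to extract its Sitemap: directives.
      return Promise.resolve(urlList.insertIfNotExists(new Url({
        url: robotsUrl
      }))).then(function () {
        // Not cached: hit the server for the robots.txt body.
        return download(robotsUrl);
      }).catch(HttpError, function (err) {
        // A missing or forbidden robots.txt is not fatal; hand the status
        // code back so the caller can decide what to do next.
        return { statusCode: err.statusCode, body: null };
      });
    }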
 var Crawler = require("./Crawler"),
   Url = require("./Url"),
   DbUrlList = require("./DbUrlList"),
-  htmlLinkParser = require("./handlers/htmlLinkParser");
+  htmlLinkParser = require("./handlers/htmlLinkParser"),
+  robotsParser = require("./handlers/robotsParser");
@@ -11,4 +12,5 @@ module.exports = {
   handlers: {
-    htmlLinkParser: htmlLinkParser
+    htmlLinkParser: htmlLinkParser,
+    robotsParser: robotsParser
   }
 };
 {
   "name": "supercrawler",
   "description": "A web crawler. Supercrawler automatically crawls websites. Define custom handlers to parse content. Obeys robots.txt, rate limits and concurrency limits.",
-  "version": "0.2.2",
+  "version": "0.2.3",
   "homepage": "https://github.com/brendonboshell/supercrawler",
@@ -6,0 +6,0 @@ "author": "Brendon Boshell <brendonboshell@gmail.com>",
@@ -9,2 +9,15 @@ # Supercrawler - Node.js Web Crawler
+## Features
+
+* **Link Detection**. Supercrawler will parse crawled HTML documents, identify
+  links and add them to the queue.
+* **Robots Parsing**. Supercrawler will request robots.txt and check the rules
+  before crawling. It will also identify any sitemaps.
+* **Sitemaps Parsing**. Supercrawler will read links from XML sitemap files,
+  and add links to the queue.
+* **Concurrency Limiting**. Supercrawler limits the number of requests sent out
+  at any one time.
+* **Rate limiting**. Supercrawler will add a delay between requests to avoid
+  bombarding servers.
+
 ## Step 1. Create a New Crawler
@@ -181,12 +194,18 @@
-## Features
-
-* Pluggable priority queues. Supercrawler ships with a simple first-in,
-  first-out style queue. But you can easily plug your own queue in, allowing
-  you to retry failed crawls, prioritize specific pages or save crawl data
-  in a database, for example.
-* Concurrency limiting. You can set a maximum number of requests that can
-  execute at the same time.
-* Rate limiting. You can set a rate limit to prevent crawling too quickly.
-* Robots adherence. Supercrawler automatically downloads, checks and caches
-  the results of robots.txt exclusions.
+## handlers.robotsParser
+
+A function that returns a handler which parses a robots.txt file. Robots.txt
+files are automatically crawled, and sent through the same content handler
+routines as any other file. This handler will look for any `Sitemap: ` directives,
+and add those XML sitemaps to the crawl.
+
+It will ignore any files that are not `/robots.txt`.
+
+If you want to extract the URLs from those XML sitemaps, you will also need
+to add a sitemap parser.
+
+Example usage:
+
+    var rp = supercrawler.handlers.robotsParser();
+    crawler.addHandler("text/plain", supercrawler.handlers.robotsParser());
@@ -276,2 +276,18 @@ var proxyquire = require('proxyquire'),
+  it("adds the robots.txt file itself to the crawl queue", function (done) {
+    var crawler = new Crawler({
+      interval: 10
+    });
+
+    crawler.start();
+
+    setTimeout(function () {
+      crawler.stop();
+      sinon.assert.calledWith(insertIfNotExistsSpy, sinon.match({
+        _url: "https://example.com/robots.txt"
+      }));
+      done();
+    }, 200);
+  });
+
   it("requests a page that is not excluded by robots.txt", function (done) {
@@ -278,0 +294,0 @@ var crawler = new Crawler({
@@ -7,2 +7,3 @@ var proxyquire = require('proxyquire'),
     htmlLinkParserMock,
+    robotsParserMock,
     index;
@@ -14,2 +15,3 @@
   htmlLinkParserMock = function () {};
+  robotsParserMock = function () {};
@@ -20,3 +22,4 @@ index = proxyquire("../lib/index", {
     "./DbUrlList": DbUrlListMock,
-    "./handlers/htmlLinkParser": htmlLinkParserMock
+    "./handlers/htmlLinkParser": htmlLinkParserMock,
+    "./handlers/robotsParser": robotsParserMock
   });
@@ -40,2 +43,6 @@
   });
+
+  it("exposes robotsParser", function () {
+    expect(index.handlers.robotsParser).to.equal(robotsParserMock);
+  });
 });
License Policy Violation: this package is not allowed per your license policy. Review the package's license to ensure compliance. Found 1 instance in 1 package.