supercrawler
Comparing version 0.2.3 to 0.3.0
@@ -255,3 +255,3 @@ var Crawler,
 return Promise.try(function () {
-return handlerFun(body, url);
+return handlerFun(body, url, contentType);
 }).then(function (subArr) {
@@ -279,3 +279,4 @@ if (!(subArr instanceof Array)) {
 "User-Agent": this.getUserAgent()
-}
+},
+encoding: null
 }).catch(function (err) {
@@ -368,6 +369,8 @@ err = new error.RequestError("A request error occured. " + err.message);
 }).then(function (response) {
-var body;
+var body,
+  robotsTxt;
 body = response.body;
-self._robotsCache.set(robotsUrl, body);
+robotsTxt = body.toString();
+self._robotsCache.set(robotsUrl, robotsTxt);
@@ -374,0 +377,0 @@ return robotsTxt;
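A note on the encoding change above: with `encoding: null`, the request module (which the tests below stub as a request(opts, cb) spy) returns the response body as a raw Buffer rather than a decoded string, which is why the robots.txt body is now decoded with toString() before it is cached. A minimal standalone sketch of that behaviour, assuming the request module and a placeholder URL, not the library's own code:

var request = require("request");

// encoding: null makes request return the body as a Buffer, not a string.
request({
  url: "https://example.com/robots.txt",
  encoding: null
}, function (err, response, body) {
  if (err) {
    return console.error(err);
  }
  var robotsTxt = body.toString(); // decode the Buffer before parsing or caching
  console.log(robotsTxt);
});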
@@ -9,6 +9,6 @@ var cheerio = require("cheerio"),
-return function (body, url) {
+return function (buf, url) {
 var $;
-$ = cheerio.load(body);
+$ = cheerio.load(buf);
@@ -15,0 +15,0 @@ return $("a[href]").map(function () {
@@ -10,3 +10,3 @@ var robotsParser = require("robots-parser"),
 module.exports = function () {
-return function (body, url) {
+return function (buf, url) {
 var robots,
@@ -22,3 +22,3 @@ urlObj;
-robots = robotsParser(url, body);
+robots = robotsParser(url, buf.toString());
@@ -25,0 +25,0 @@ return robots.getSitemaps().map(function (sitemapHref) {
@@ -5,3 +5,4 @@ var Crawler = require("./Crawler"),
 htmlLinkParser = require("./handlers/htmlLinkParser"),
-robotsParser = require("./handlers/robotsParser");
+robotsParser = require("./handlers/robotsParser"),
+sitemapsParser = require("./handlers/sitemapsParser");
@@ -14,4 +15,5 @@ module.exports = {
 htmlLinkParser: htmlLinkParser,
-robotsParser: robotsParser
+robotsParser: robotsParser,
+sitemapsParser: sitemapsParser
 }
 };
 {
 "name": "supercrawler",
 "description": "A web crawler. Supercrawler automatically crawls websites. Define custom handlers to parse content. Obeys robots.txt, rate limits and concurrency limits.",
-"version": "0.2.3",
+"version": "0.3.0",
 "homepage": "https://github.com/brendonboshell/supercrawler",
@@ -6,0 +6,0 @@ "author": "Brendon Boshell <brendonboshell@gmail.com>",
@@ -45,3 +45,3 @@ # Supercrawler - Node.js Web Crawler
 }));
-crawler.addHandler("text/html", function (body, url) {
+crawler.addHandler("text/html", function (buf, url) {
 console.log("Got page", url);
@@ -211,1 +211,14 @@ });
 crawler.addHandler("text/plain", supercrawler.handlers.robotsParser());
+# handlers.sitemapsParser
+A function that returns a handler which parses an XML sitemaps file. It will
+pick up any URLs matching `sitemapindex > sitemap > loc, urlset > url > loc`.
+It will also handle a gzipped file, since that is part of the sitemaps
+specification.
+Example usage:
+var sp = supercrawler.handlers.sitemapsParser();
+crawler.addHandler(sp);
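Taken together with the Crawler change above, handlers in 0.3.0 receive the raw body as a Buffer and the content type as a third argument. A minimal handler written against the new signature might look like this (an illustrative sketch; the variable names are assumptions, not the README's wording):

crawler.addHandler("text/html", function (buf, url, contentType) {
  // buf is a Buffer in 0.3.0, so decode it before treating it as text.
  var body = buf.toString();
  console.log("Got", contentType, "from", url, "-", body.length, "characters");
});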
@@ -39,3 +39,4 @@ var proxyquire = require('proxyquire'),
 pageBody,
-robotsStatusCode;
+robotsStatusCode,
+robotsTxt;
@@ -47,2 +48,6 @@ beforeEach(function () {
 robotsStatusCode = 200;
+robotsTxt = ["User-agent: *",
+  "Allow: /",
+  "Disallow: /index17.html"
+].join("\n");
@@ -65,3 +70,3 @@ requestSpy = sinon.spy(function (opts, cb) {
 statusCode: pageStatusCode,
-body: pageBody
+body: new Buffer(pageBody)
 });
@@ -80,6 +85,3 @@ }, 1);
 statusCode: robotsStatusCode,
-body: ["User-agent: *",
-  "Allow: /",
-  "Disallow: /index17.html"
-].join("\n")
+body: new Buffer(robotsTxt)
 });
@@ -310,2 +312,20 @@ }, 1);
+it("skips page excluded by robots.txt, even if robots.txt not in cache", function (done) {
+  var crawler = new Crawler({
+    interval: 10
+  });
+  robotsTxt = ["User-agent: *",
+    "Allow: /",
+    "Disallow: /index1.html"
+  ].join("\n");
+  crawler.start();
+  setTimeout(function () {
+    crawler.stop();
+    expect(numCrawlsOfUrl("https://example.com/index1.html")).to.equal(0);
+    done();
+  }, 200);
+});
 it("skips a page that is excluded by robots.txt", function (done) {
@@ -433,3 +453,3 @@ var crawler = new Crawler({
 sinon.assert.calledWith(handler,
-sinon.match("<html><body>test</body></html>"),
+sinon.match(new Buffer("<html><body>test</body></html>")),
 "https://example.com/index1.html");
@@ -452,3 +472,3 @@ done();
 sinon.assert.calledWith(handler,
-sinon.match("<html><body>test</body></html>"),
+sinon.match(new Buffer("<html><body>test</body></html>")),
 "https://example.com/index1.html");
@@ -487,3 +507,3 @@ done();
 crawler.stop();
-expect(handler.calledWith(sinon.match("<html><body>test</body></html>"),
+expect(handler.calledWith(sinon.match(new Buffer("<html><body>test</body></html>")),
 "https://example.com/index1.html")).to.equal(true);
@@ -505,3 +525,3 @@ done();
 crawler.stop();
-expect(handler.calledWith(sinon.match("<html><body>test</body></html>"),
+expect(handler.calledWith(sinon.match(new Buffer("<html><body>test</body></html>")),
 "https://example.com/index1.html")).to.equal(true);
@@ -529,2 +549,21 @@ done();
+it("passes the content type as the third argument", function (done) {
+  var crawler = new Crawler({
+    interval: 10
+  });
+  crawler.addHandler(handler);
+  crawler.start();
+  pageContentType = "text/plain";
+  setTimeout(function () {
+    crawler.stop();
+    sinon.assert.calledWith(handler,
+      sinon.match(new Buffer("<html><body>test</body></html>")),
+      "https://example.com/index1.html",
+      "text/plain");
+    done();
+  }, 15);
+});
 it("adds URL to the queue", function (done) {
@@ -531,0 +570,0 @@ var crawler = new Crawler({
@@ -14,3 +14,3 @@ var htmlLinkParser = require("../../lib/handlers/htmlLinkParser"),
-return html;
+return new Buffer(html);
 };
@@ -17,0 +17,0 @@
@@ -20,3 +20,3 @@ var robotsParser = require("../../lib/handlers/robotsParser"),
 it("can extract extract a absolute path sitemap", function () {
-expect(rb(robotsTxt, "http://example.com/robots.txt")).to.deep.equal([
+expect(rb(new Buffer(robotsTxt), "http://example.com/robots.txt")).to.deep.equal([
 "http://subdomain.example.com/sitemap_index_1.xml"
@@ -29,3 +29,3 @@ ]);
-expect(rb(robotsTxt, "http://example.com/robots.txt")).to.deep.equal([
+expect(rb(new Buffer(robotsTxt), "http://example.com/robots.txt")).to.deep.equal([
 "http://subdomain.example.com/sitemap_index_1.xml",
@@ -38,8 +38,8 @@ "http://example.com/sitemap_index.xml"
 robotsTxt = "";
-expect(rb(robotsTxt, "http://example.com/robots.txt")).to.deep.equal([]);
+expect(rb(new Buffer(robotsTxt), "http://example.com/robots.txt")).to.deep.equal([]);
 });
 it("returns empty when the URL path is not /robots.txt", function () {
-expect(rb(robotsTxt, "http://example.com/Iamnotarobots.txt")).to.deep.equal([]);
+expect(rb(new Buffer(robotsTxt), "http://example.com/Iamnotarobots.txt")).to.deep.equal([]);
 });
 });
@@ -8,2 +8,3 @@ var proxyquire = require('proxyquire'),
 robotsParserMock,
+sitemapsParserMock,
 index;
@@ -16,2 +17,3 @@
 robotsParserMock = function () {};
+sitemapsParserMock = function () {};
@@ -23,3 +25,4 @@ index = proxyquire("../lib/index", {
 "./handlers/htmlLinkParser": htmlLinkParserMock,
-"./handlers/robotsParser": robotsParserMock
+"./handlers/robotsParser": robotsParserMock,
+"./handlers/sitemapsParser": sitemapsParserMock
 });
@@ -47,2 +50,6 @@
 });
+it("exposes sitemapsParser", function () {
+  expect(index.handlers.sitemapsParser).to.equal(sitemapsParserMock);
+});
 });