supercrawler
Advanced tools
Comparing version 0.11.0 to 0.12.0
@@ -276,4 +276,12 @@ var Crawler, | ||
Crawler.prototype._fireHandlers = function (contentType, body, url) { | ||
var ctx; | ||
contentType = contentType.replace(/;.*$/g, ""); | ||
ctx = { | ||
body: body, | ||
url: url, | ||
contentType: contentType | ||
}; | ||
return Promise.reduce(this._handlers, function (arr, handlerObj) { | ||
@@ -295,3 +303,3 @@ var handlerContentType = handlerObj.contentType, | ||
return Promise.try(function () { | ||
return handlerFun(body, url, contentType); | ||
return handlerFun(ctx); | ||
}).then(function (subArr) { | ||
@@ -298,0 +306,0 @@ if (!(subArr instanceof Array)) { |
@@ -9,6 +9,7 @@ var cheerio = require("cheerio"), | ||
return function (buf, url) { | ||
return function (context) { | ||
var $; | ||
$ = cheerio.load(buf); | ||
$ = context.$ || cheerio.load(context.body); | ||
context.$ = $; | ||
@@ -25,3 +26,3 @@ return $("a[href], link[href][rel=alternate]").map(function () { | ||
targetHref = $this.attr("href"); | ||
absoluteTargetUrl = urlMod.resolve(url, targetHref); | ||
absoluteTargetUrl = urlMod.resolve(context.url, targetHref); | ||
urlObj = urlMod.parse(absoluteTargetUrl); | ||
@@ -28,0 +29,0 @@ protocol = urlObj.protocol; |
@@ -20,7 +20,7 @@ var robotsParser = require("robots-parser"), | ||
return function (buf, url) { | ||
return function (context) { | ||
var robots, | ||
urlObj; | ||
urlObj = urlMod.parse(url); | ||
urlObj = urlMod.parse(context.url); | ||
@@ -32,10 +32,10 @@ // skip if this is not actually a robots.txt file. | ||
robots = robotsParser(url, buf.toString()); | ||
robots = robotsParser(context.url, context.body.toString()); | ||
return robots.getSitemaps().map(function (sitemapHref) { | ||
return urlMod.resolve(url, sitemapHref); | ||
return urlMod.resolve(context.url, sitemapHref); | ||
}).filter(function (sitemapUrl) { | ||
return opts.urlFilter(sitemapUrl, url); | ||
return opts.urlFilter(sitemapUrl, context.url); | ||
}); | ||
}; | ||
}; |
@@ -34,11 +34,11 @@ var cheerio = require("cheerio"), | ||
return function (buf, url, contentType) { | ||
return function (context) { | ||
var xmlBufProm; | ||
// If sitemap has come in compressed state, we must uncompress it! | ||
if (contentType === "application/x-gzip" || | ||
contentType === "application/gzip") { | ||
xmlBufProm = Promise.promisify(zlib.gunzip)(buf); | ||
if (context.contentType === "application/x-gzip" || | ||
context.contentType === "application/gzip") { | ||
xmlBufProm = Promise.promisify(zlib.gunzip)(context.body); | ||
} else { | ||
xmlBufProm = Promise.resolve(buf); | ||
xmlBufProm = Promise.resolve(context.body); | ||
} | ||
@@ -51,5 +51,4 @@ | ||
var $ = cheerio.load(xmlBuf, { | ||
xmlMode: true | ||
}); | ||
var $ = context.$ || cheerio.load(xmlBuf); | ||
context.$ = $; | ||
@@ -56,0 +55,0 @@ // We map over the array rather than using Cheerio's map, because it is |
{ | ||
"name": "supercrawler", | ||
"description": "A web crawler. Supercrawler automatically crawls websites. Define custom handlers to parse content. Obeys robots.txt, rate limits and concurrency limits.", | ||
"version": "0.11.0", | ||
"version": "0.12.0", | ||
"homepage": "https://github.com/brendonboshell/supercrawler", | ||
@@ -6,0 +6,0 @@ "author": "Brendon Boshell <brendonboshell@gmail.com>", |
@@ -100,5 +100,5 @@ # Node.js Web Crawler | ||
// Custom content handler for HTML pages. | ||
crawler.addHandler("text/html", function (buf, url) { | ||
var sizeKb = Buffer.byteLength(buf) / 1024; | ||
logger.info("Processed", url, "Size=", sizeKb, "KB"); | ||
crawler.addHandler("text/html", function (context) { | ||
var sizeKb = Buffer.byteLength(context.body) / 1024; | ||
logger.info("Processed", context.url, "Size=", sizeKb, "KB"); | ||
}); | ||
@@ -313,2 +313,9 @@ ``` | ||
### 0.12.0 | ||
* [Change] Content handlers are no longer called with `(body, url, contentType)`; | |
they are now called with a single `context` argument. This allows you to pass | |
information forward from one handler to the next. For example, you might cache | |
the parsed `cheerio` object so you don't re-parse the body in every content handler. | |
### 0.11.0 | ||
@@ -315,0 +322,0 @@ |
@@ -539,5 +539,7 @@ var proxyquire = require('proxyquire'), | ||
crawler.stop(); | ||
sinon.assert.calledWith(handler, | ||
sinon.match(new Buffer("<html><body>test</body></html>")), | ||
"https://example.com/index1.html"); | ||
sinon.assert.calledWith(handler, sinon.match({ | ||
body: sinon.match(new Buffer("<html><body>test</body></html>")), | ||
url: "https://example.com/index1.html", | ||
contentType: "text/plain" | ||
})); | ||
done(); | ||
@@ -558,5 +560,7 @@ }, 15); | ||
crawler.stop(); | ||
sinon.assert.calledWith(handler, | ||
sinon.match(new Buffer("<html><body>test</body></html>")), | ||
"https://example.com/index1.html"); | ||
sinon.assert.calledWith(handler, sinon.match({ | ||
body: sinon.match(new Buffer("<html><body>test</body></html>")), | ||
url: "https://example.com/index1.html", | ||
contentType: "text/html" | ||
})); | ||
done(); | ||
@@ -577,4 +581,6 @@ }, 15); | ||
crawler.stop(); | ||
expect(handler.calledWith(sinon.match("<html><body>test</body></html>"), | ||
"https://example.com/index1.html")).to.equal(false); | ||
expect(handler.calledWith(sinon.match({ | ||
body: sinon.match("<html><body>test</body></html>"), | ||
url: "https://example.com/index1.html" | ||
}))).to.equal(false); | ||
done(); | ||
@@ -595,4 +601,6 @@ }, 15); | ||
crawler.stop(); | ||
expect(handler.calledWith(sinon.match(new Buffer("<html><body>test</body></html>")), | ||
"https://example.com/index1.html")).to.equal(true); | ||
expect(handler.calledWith(sinon.match({ | ||
body: sinon.match(new Buffer("<html><body>test</body></html>")), | ||
url: "https://example.com/index1.html" | ||
}))).to.equal(true); | ||
done(); | ||
@@ -613,4 +621,6 @@ }, 15); | ||
crawler.stop(); | ||
expect(handler.calledWith(sinon.match(new Buffer("<html><body>test</body></html>")), | ||
"https://example.com/index1.html")).to.equal(true); | ||
expect(handler.calledWith(sinon.match({ | ||
body: sinon.match(new Buffer("<html><body>test</body></html>")), | ||
url: "https://example.com/index1.html" | ||
}))).to.equal(true); | ||
done(); | ||
@@ -631,4 +641,6 @@ }, 15); | ||
crawler.stop(); | ||
expect(handler.calledWith(sinon.match("<html><body>test</body></html>"), | ||
"https://example.com/index1.html")).to.equal(false); | ||
expect(handler.calledWith(sinon.match({ | ||
body: sinon.match("<html><body>test</body></html>"), | ||
url: "https://example.com/index1.html" | ||
}))).to.equal(false); | ||
done(); | ||
@@ -649,6 +661,7 @@ }, 15); | ||
crawler.stop(); | ||
sinon.assert.calledWith(handler, | ||
sinon.match(new Buffer("<html><body>test</body></html>")), | ||
"https://example.com/index1.html", | ||
"text/plain"); | ||
sinon.assert.calledWith(handler, sinon.match({ | ||
body: sinon.match(new Buffer("<html><body>test</body></html>")), | ||
url: "https://example.com/index1.html", | ||
contentType: "text/plain" | ||
})); | ||
done(); | ||
@@ -655,0 +668,0 @@ }, 100); |
@@ -24,3 +24,6 @@ var htmlLinkParser = require("../../lib/handlers/htmlLinkParser"), | ||
expect(hlp(html, "https://example2.com/index")).to.deep.equal([ | ||
expect(hlp({ | ||
body: html, | ||
url: "https://example2.com/index" | ||
})).to.deep.equal([ | ||
"https://example.com/test" | ||
@@ -36,3 +39,6 @@ ]); | ||
expect(hlp(html, "https://example.com/my/page.html")).to.deep.equal([ | ||
expect(hlp({ | ||
body: html, | ||
url: "https://example.com/my/page.html" | ||
})).to.deep.equal([ | ||
"https://example.com/my/page2.html" | ||
@@ -48,3 +54,6 @@ ]); | ||
expect(hlp(html, "https://example.com/my/page.html")).to.deep.equal([ | ||
expect(hlp({ | ||
body: html, | ||
url: "https://example.com/my/page.html" | ||
})).to.deep.equal([ | ||
"https://example.com/page2.html" | ||
@@ -64,3 +73,6 @@ ]); | ||
expect(hlp(html, "https://example.com/my/page.html")).to.deep.equal([ | ||
expect(hlp({ | ||
body: html, | ||
url: "https://example.com/my/page.html" | ||
})).to.deep.equal([ | ||
"https://example.com/page2.html", | ||
@@ -80,3 +92,6 @@ "https://example.com/my/page3.html", | ||
expect(hlp(html, "https://example.com/my/page.html")).to.deep.equal([]); | ||
expect(hlp({ | ||
body: html, | ||
url: "https://example.com/my/page.html" | ||
})).to.deep.equal([]); | ||
}); | ||
@@ -97,3 +112,6 @@ | ||
expect(hlp(html, "https://example.com/my/page.html")).to.deep.equal([ | ||
expect(hlp({ | ||
body: html, | ||
url: "https://example.com/my/page.html" | ||
})).to.deep.equal([ | ||
"https://example.com/page2.html", | ||
@@ -111,3 +129,6 @@ "https://example.com/my/page3.html", | ||
expect(hlp(html, "http://example.com")).to.deep.equal([ | ||
expect(hlp({ | ||
body: html, | ||
url: "http://example.com" | ||
})).to.deep.equal([ | ||
"http://example.com/index-es/" | ||
@@ -123,4 +144,7 @@ ]); | ||
expect(hlp(html, "http://example.com")).to.deep.equal([]); | ||
expect(hlp({ | ||
body: html, | ||
url: "http://example.com" | ||
})).to.deep.equal([]); | ||
}); | ||
}); |
@@ -20,3 +20,6 @@ var robotsParser = require("../../lib/handlers/robotsParser"), | ||
it("can extract extract a absolute path sitemap", function () { | ||
expect(rb(new Buffer(robotsTxt), "http://example.com/robots.txt")).to.deep.equal([ | ||
expect(rb({ | ||
body: new Buffer(robotsTxt), | ||
url: "http://example.com/robots.txt" | ||
})).to.deep.equal([ | ||
"http://subdomain.example.com/sitemap_index_1.xml" | ||
@@ -29,3 +32,6 @@ ]); | ||
expect(rb(new Buffer(robotsTxt), "http://example.com/robots.txt")).to.deep.equal([ | ||
expect(rb({ | ||
body: new Buffer(robotsTxt), | ||
url: "http://example.com/robots.txt" | ||
})).to.deep.equal([ | ||
"http://subdomain.example.com/sitemap_index_1.xml", | ||
@@ -43,3 +49,6 @@ "http://example.com/sitemap_index.xml" | ||
expect(rb(new Buffer(robotsTxt), "http://example.com/robots.txt")).to.deep.equal([ | ||
expect(rb({ | ||
body: new Buffer(robotsTxt), | ||
url: "http://example.com/robots.txt" | ||
})).to.deep.equal([ | ||
"http://subdomain.example.com/sitemap_index_1.xml" | ||
@@ -51,8 +60,14 @@ ]); | ||
robotsTxt = ""; | ||
expect(rb(new Buffer(robotsTxt), "http://example.com/robots.txt")).to.deep.equal([]); | ||
expect(rb({ | ||
body: new Buffer(robotsTxt), | ||
url: "http://example.com/robots.txt" | ||
})).to.deep.equal([]); | ||
}); | ||
it("returns empty when the URL path is not /robots.txt", function () { | ||
expect(rb(new Buffer(robotsTxt), "http://example.com/Iamnotarobots.txt")).to.deep.equal([]); | ||
expect(rb({ | ||
body: new Buffer(robotsTxt), | ||
url: "http://example.com/Iamnotarobots.txt" | ||
})).to.deep.equal([]); | ||
}); | ||
}); |
@@ -47,3 +47,6 @@ var sitemapsParser = require("../../lib/handlers/sitemapsParser"), | ||
it("discovers another sitemap", function (done) { | ||
sp(new Buffer(sitemapindex), "http://example.com/sitemap_index.xml").then(function (urls) { | ||
sp({ | ||
body: new Buffer(sitemapindex), | ||
url: "http://example.com/sitemap_index.xml" | ||
}).then(function (urls) { | ||
expect(urls).to.deep.equal([ | ||
@@ -58,3 +61,6 @@ "http://example.com/sitemap.xml.gz" | ||
sitemapindex = "<html><body><h1>I'm not a sitemap</h1></body></html>"; | ||
sp(new Buffer(sitemapindex), "http://example.com/sitemap_index.xml").then(function (urls) { | ||
sp({ | ||
body: new Buffer(sitemapindex), | ||
url: "http://example.com/sitemap_index.xml" | ||
}).then(function (urls) { | ||
expect(urls).to.deep.equal([]); | ||
@@ -66,3 +72,6 @@ done(); | ||
it("discovers a urlset", function (done) { | ||
sp(new Buffer(urlset), "http://example.com/sitemap_index.xml").then(function (urls) { | ||
sp({ | ||
body: new Buffer(urlset), | ||
url: "http://example.com/sitemap_index.xml" | ||
}).then(function (urls) { | ||
expect(urls).to.deep.equal([ | ||
@@ -76,3 +85,6 @@ "https://example.com/home.html" | ||
it("discovers an alternate link", function (done) { | ||
sp(new Buffer(urlsetWithAlternate), "http://example.com/sitemap_index.xml").then(function (urls) { | ||
sp({ | ||
body: new Buffer(urlsetWithAlternate), | ||
url: "http://example.com/sitemap_index.xml" | ||
}).then(function (urls) { | ||
expect(urls).to.deep.equal([ | ||
@@ -93,3 +105,6 @@ "https://example.com/home.html", | ||
sp(new Buffer(urlsetWithAlternate), "http://example.com/sitemap_index.xml").then(function (urls) { | ||
sp({ | ||
body: new Buffer(urlsetWithAlternate), | ||
url: "http://example.com/sitemap_index.xml" | ||
}).then(function (urls) { | ||
expect(urls).to.deep.equal([ | ||
@@ -104,3 +119,7 @@ "https://example.com/home.html" | ||
Promise.promisify(zlib.gzip)(new Buffer(urlset)).then(function (buf) { | ||
return sp(buf, "http://example.com/sitemap_index.xml", "application/x-gzip"); | ||
return sp({ | ||
body: buf, | ||
url: "http://example.com/sitemap_index.xml", | ||
contentType: "application/x-gzip" | ||
}); | ||
}).then(function (urls) { | ||
@@ -116,3 +135,7 @@ expect(urls).to.deep.equal([ | ||
Promise.promisify(zlib.gzip)(new Buffer(urlset)).then(function (buf) { | ||
return sp(buf, "http://example.com/sitemap_index.xml", "application/gzip"); | ||
return sp({ | ||
body: buf, | ||
url: "http://example.com/sitemap_index.xml", | ||
contentType: "application/gzip" | ||
}); | ||
}).then(function (urls) { | ||
@@ -119,0 +142,0 @@ expect(urls).to.deep.equal([ |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
103553
2488
430