supercrawler - npm Package Compare versions

Comparing version 0.11.0 to 0.12.0

lib/Crawler.js

@@ -276,4 +276,12 @@ var Crawler,

Crawler.prototype._fireHandlers = function (contentType, body, url) {
var ctx;
contentType = contentType.replace(/;.*$/g, "");
ctx = {
body: body,
url: url,
contentType: contentType
};
return Promise.reduce(this._handlers, function (arr, handlerObj) {

@@ -295,3 +303,3 @@ var handlerContentType = handlerObj.contentType,

return Promise.try(function () {
return handlerFun(body, url, contentType);
return handlerFun(ctx);
}).then(function (subArr) {

@@ -298,0 +306,0 @@ if (!(subArr instanceof Array)) {
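
Taken together, the Crawler.js hunks build a single `ctx` object per response and hand it to every registered handler in order. A minimal sketch of that dispatch, with the content-type matching of `handlerObj` omitted and bluebird assumed for `Promise.reduce`/`Promise.try` (both appear in the hunk above); names other than the `ctx` fields are illustrative:

```
// Simplified sketch of the 0.12.0 handler dispatch, not the verbatim implementation.
var Promise = require("bluebird");

function fireHandlers(handlers, contentType, body, url) {
  var ctx = {
    body: body,
    url: url,
    contentType: contentType.replace(/;.*$/g, "") // strip parameters such as "; charset=utf-8"
  };

  // Handlers run in registration order and share the same ctx, so one handler
  // can attach data (e.g. ctx.$) for the handlers that follow it.
  return Promise.reduce(handlers, function (arr, handlerFun) {
    return Promise.try(function () {
      return handlerFun(ctx); // 0.11.0 called handlerFun(body, url, contentType)
    }).then(function (subArr) {
      return arr.concat(subArr instanceof Array ? subArr : []);
    });
  }, []);
}
```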

lib/handlers/htmlLinkParser.js

@@ -9,6 +9,7 @@ var cheerio = require("cheerio"),

return function (buf, url) {
return function (context) {
var $;
$ = cheerio.load(buf);
$ = context.$ || cheerio.load(context.body);
context.$ = $;

@@ -25,3 +26,3 @@ return $("a[href], link[href][rel=alternate]").map(function () {

targetHref = $this.attr("href");
absoluteTargetUrl = urlMod.resolve(url, targetHref);
absoluteTargetUrl = urlMod.resolve(context.url, targetHref);
urlObj = urlMod.parse(absoluteTargetUrl);

@@ -28,0 +29,0 @@ protocol = urlObj.protocol;
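
Because the link parser now stores its cheerio instance on `context.$`, a custom handler registered after it can reuse that parse instead of calling `cheerio.load` again. A hedged sketch: the crawler options and the handler body are illustrative, and the `supercrawler.handlers.htmlLinkParser` export path is an assumption about how the built-in handler is exposed.

```
var supercrawler = require("supercrawler"),
    cheerio = require("cheerio");

var crawler = new supercrawler.Crawler({ interval: 1000 }); // options are illustrative

// The built-in link parser runs first and caches its cheerio instance on context.$.
crawler.addHandler("text/html", supercrawler.handlers.htmlLinkParser());

// A custom handler registered afterwards can reuse that parse instead of
// paying for a second cheerio.load.
crawler.addHandler("text/html", function (context) {
  var $ = context.$ || cheerio.load(context.body);
  context.$ = $; // keep the cache warm for any handler that runs after this one
  console.log("Title of", context.url, "is", $("title").text());
});
```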

lib/handlers/robotsParser.js

@@ -20,7 +20,7 @@ var robotsParser = require("robots-parser"),

return function (buf, url) {
return function (context) {
var robots,
urlObj;
urlObj = urlMod.parse(url);
urlObj = urlMod.parse(context.url);

@@ -32,10 +32,10 @@ // skip if this is not actually a robots.txt file.

robots = robotsParser(url, buf.toString());
robots = robotsParser(context.url, context.body.toString());
return robots.getSitemaps().map(function (sitemapHref) {
return urlMod.resolve(url, sitemapHref);
return urlMod.resolve(context.url, sitemapHref);
}).filter(function (sitemapUrl) {
return opts.urlFilter(sitemapUrl, url);
return opts.urlFilter(sitemapUrl, context.url);
});
};
};

lib/handlers/sitemapsParser.js

@@ -34,11 +34,11 @@ var cheerio = require("cheerio"),

return function (buf, url, contentType) {
return function (context) {
var xmlBufProm;
// If sitemap has come in compressed state, we must uncompress it!
if (contentType === "application/x-gzip" ||
contentType === "application/gzip") {
xmlBufProm = Promise.promisify(zlib.gunzip)(buf);
if (context.contentType === "application/x-gzip" ||
context.contentType === "application/gzip") {
xmlBufProm = Promise.promisify(zlib.gunzip)(context.body);
} else {
xmlBufProm = Promise.resolve(buf);
xmlBufProm = Promise.resolve(context.body);
}

@@ -51,5 +51,4 @@

var $ = cheerio.load(xmlBuf, {
xmlMode: true
});
var $ = context.$ || cheerio.load(xmlBuf);
context.$ = $;

@@ -56,0 +55,0 @@ // We map over the array rather than using Cheerio's map, because it is
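
With the new signature, the gzip branch keys off `context.contentType` instead of a third positional argument. Mirroring the gzip test near the end of this diff, a direct invocation of the sitemaps handler for a compressed body would look roughly like this; the `supercrawler.handlers.sitemapsParser` export path and the sitemap body are assumptions, only the context fields come from the diff.

```
var zlib = require("zlib"),
    Promise = require("bluebird"),
    supercrawler = require("supercrawler");

var sp = supercrawler.handlers.sitemapsParser(); // assumed export, constructed without options
var urlset = "<urlset><url><loc>https://example.com/home.html</loc></url></urlset>";

// Gzip the sitemap, then hand it to the handler as a single context object.
Promise.promisify(zlib.gzip)(new Buffer(urlset)).then(function (buf) {
  return sp({
    body: buf,
    url: "http://example.com/sitemap_index.xml",
    contentType: "application/x-gzip"
  });
}).then(function (urls) {
  console.log(urls); // expected: ["https://example.com/home.html"]
});
```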

package.json

{
"name": "supercrawler",
"description": "A web crawler. Supercrawler automatically crawls websites. Define custom handlers to parse content. Obeys robots.txt, rate limits and concurrency limits.",
"version": "0.11.0",
"version": "0.12.0",
"homepage": "https://github.com/brendonboshell/supercrawler",

@@ -6,0 +6,0 @@ "author": "Brendon Boshell <brendonboshell@gmail.com>",

README.md

@@ -100,5 +100,5 @@ # Node.js Web Crawler

// Custom content handler for HTML pages.
crawler.addHandler("text/html", function (buf, url) {
var sizeKb = Buffer.byteLength(buf) / 1024;
logger.info("Processed", url, "Size=", sizeKb, "KB");
crawler.addHandler("text/html", function (context) {
var sizeKb = Buffer.byteLength(context.body) / 1024;
logger.info("Processed", context.url, "Size=", sizeKb, "KB");
});

@@ -313,2 +313,9 @@ ```

### 0.12.0
* [Change] Rather than calling content handlers with (body, url), they are
now called with a single `context` argument. This allows you to pass information
forward between handlers. For example, you might cache the `cheerio` parse on the
context so the body is not re-parsed by every content handler.
### 0.11.0

@@ -315,0 +322,0 @@
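
The changelog entry above is the point of the release: the context object is shared, mutable state for all handlers of a given response, so one handler can compute something once and later handlers can pick it up. A minimal sketch of that pattern; everything except `addHandler` and the context fields (`body`, `url`, `contentType`) is illustrative.

```
var supercrawler = require("supercrawler"),
    cheerio = require("cheerio");

var crawler = new supercrawler.Crawler({ interval: 1000 }); // options are illustrative

// First handler: parse once and stash both the cheerio instance and a derived
// value on the context for later handlers.
crawler.addHandler("text/html", function (context) {
  context.$ = context.$ || cheerio.load(context.body);
  context.pageTitle = context.$("title").text(); // illustrative field name
});

// Second handler: runs later for the same response and reuses the cached work
// instead of re-parsing the body.
crawler.addHandler("text/html", function (context) {
  console.log("Crawled", context.url, "(", context.contentType, "):", context.pageTitle);
});
```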

@@ -539,5 +539,7 @@ var proxyquire = require('proxyquire'),

crawler.stop();
sinon.assert.calledWith(handler,
sinon.match(new Buffer("<html><body>test</body></html>")),
"https://example.com/index1.html");
sinon.assert.calledWith(handler, sinon.match({
body: sinon.match(new Buffer("<html><body>test</body></html>")),
url: "https://example.com/index1.html",
contentType: "text/plain"
}));
done();

@@ -558,5 +560,7 @@ }, 15);

crawler.stop();
sinon.assert.calledWith(handler,
sinon.match(new Buffer("<html><body>test</body></html>")),
"https://example.com/index1.html");
sinon.assert.calledWith(handler, sinon.match({
body: sinon.match(new Buffer("<html><body>test</body></html>")),
url: "https://example.com/index1.html",
contentType: "text/html"
}));
done();

@@ -577,4 +581,6 @@ }, 15);

crawler.stop();
expect(handler.calledWith(sinon.match("<html><body>test</body></html>"),
"https://example.com/index1.html")).to.equal(false);
expect(handler.calledWith(sinon.match({
body: sinon.match("<html><body>test</body></html>"),
url: "https://example.com/index1.html"
}))).to.equal(false);
done();

@@ -595,4 +601,6 @@ }, 15);

crawler.stop();
expect(handler.calledWith(sinon.match(new Buffer("<html><body>test</body></html>")),
"https://example.com/index1.html")).to.equal(true);
expect(handler.calledWith(sinon.match({
body: sinon.match(new Buffer("<html><body>test</body></html>")),
url: "https://example.com/index1.html"
}))).to.equal(true);
done();

@@ -613,4 +621,6 @@ }, 15);

crawler.stop();
expect(handler.calledWith(sinon.match(new Buffer("<html><body>test</body></html>")),
"https://example.com/index1.html")).to.equal(true);
expect(handler.calledWith(sinon.match({
body: sinon.match(new Buffer("<html><body>test</body></html>")),
url: "https://example.com/index1.html"
}))).to.equal(true);
done();

@@ -631,4 +641,6 @@ }, 15);

crawler.stop();
expect(handler.calledWith(sinon.match("<html><body>test</body></html>"),
"https://example.com/index1.html")).to.equal(false);
expect(handler.calledWith(sinon.match({
body: sinon.match("<html><body>test</body></html>"),
url: "https://example.com/index1.html"
}))).to.equal(false);
done();

@@ -649,6 +661,7 @@ }, 15);

crawler.stop();
sinon.assert.calledWith(handler,
sinon.match(new Buffer("<html><body>test</body></html>")),
"https://example.com/index1.html",
"text/plain");
sinon.assert.calledWith(handler, sinon.match({
body: sinon.match(new Buffer("<html><body>test</body></html>")),
url: "https://example.com/index1.html",
contentType: "text/plain"
}));
done();

@@ -655,0 +668,0 @@ }, 100);

@@ -24,3 +24,6 @@ var htmlLinkParser = require("../../lib/handlers/htmlLinkParser"),

expect(hlp(html, "https://example2.com/index")).to.deep.equal([
expect(hlp({
body: html,
url: "https://example2.com/index"
})).to.deep.equal([
"https://example.com/test"

@@ -36,3 +39,6 @@ ]);

expect(hlp(html, "https://example.com/my/page.html")).to.deep.equal([
expect(hlp({
body: html,
url: "https://example.com/my/page.html"
})).to.deep.equal([
"https://example.com/my/page2.html"

@@ -48,3 +54,6 @@ ]);

expect(hlp(html, "https://example.com/my/page.html")).to.deep.equal([
expect(hlp({
body: html,
url: "https://example.com/my/page.html"
})).to.deep.equal([
"https://example.com/page2.html"

@@ -64,3 +73,6 @@ ]);

expect(hlp(html, "https://example.com/my/page.html")).to.deep.equal([
expect(hlp({
body: html,
url: "https://example.com/my/page.html"
})).to.deep.equal([
"https://example.com/page2.html",

@@ -80,3 +92,6 @@ "https://example.com/my/page3.html",

expect(hlp(html, "https://example.com/my/page.html")).to.deep.equal([]);
expect(hlp({
body: html,
url: "https://example.com/my/page.html"
})).to.deep.equal([]);
});

@@ -97,3 +112,6 @@

expect(hlp(html, "https://example.com/my/page.html")).to.deep.equal([
expect(hlp({
body: html,
url: "https://example.com/my/page.html"
})).to.deep.equal([
"https://example.com/page2.html",

@@ -111,3 +129,6 @@ "https://example.com/my/page3.html",

expect(hlp(html, "http://example.com")).to.deep.equal([
expect(hlp({
body: html,
url: "http://example.com"
})).to.deep.equal([
"http://example.com/index-es/"

@@ -123,4 +144,7 @@ ]);

expect(hlp(html, "http://example.com")).to.deep.equal([]);
expect(hlp({
body: html,
url: "http://example.com"
})).to.deep.equal([]);
});
});

@@ -20,3 +20,6 @@ var robotsParser = require("../../lib/handlers/robotsParser"),

it("can extract extract a absolute path sitemap", function () {
expect(rb(new Buffer(robotsTxt), "http://example.com/robots.txt")).to.deep.equal([
expect(rb({
body: new Buffer(robotsTxt),
url: "http://example.com/robots.txt"
})).to.deep.equal([
"http://subdomain.example.com/sitemap_index_1.xml"

@@ -29,3 +32,6 @@ ]);

expect(rb(new Buffer(robotsTxt), "http://example.com/robots.txt")).to.deep.equal([
expect(rb({
body: new Buffer(robotsTxt),
url: "http://example.com/robots.txt"
})).to.deep.equal([
"http://subdomain.example.com/sitemap_index_1.xml",

@@ -43,3 +49,6 @@ "http://example.com/sitemap_index.xml"

expect(rb(new Buffer(robotsTxt), "http://example.com/robots.txt")).to.deep.equal([
expect(rb({
body: new Buffer(robotsTxt),
url: "http://example.com/robots.txt"
})).to.deep.equal([
"http://subdomain.example.com/sitemap_index_1.xml"

@@ -51,8 +60,14 @@ ]);

robotsTxt = "";
expect(rb(new Buffer(robotsTxt), "http://example.com/robots.txt")).to.deep.equal([]);
expect(rb({
body: new Buffer(robotsTxt),
url: "http://example.com/robots.txt"
})).to.deep.equal([]);
});
it("returns empty when the URL path is not /robots.txt", function () {
expect(rb(new Buffer(robotsTxt), "http://example.com/Iamnotarobots.txt")).to.deep.equal([]);
expect(rb({
body: new Buffer(robotsTxt),
url: "http://example.com/Iamnotarobots.txt"
})).to.deep.equal([]);
});
});

@@ -47,3 +47,6 @@ var sitemapsParser = require("../../lib/handlers/sitemapsParser"),

it("discovers another sitemap", function (done) {
sp(new Buffer(sitemapindex), "http://example.com/sitemap_index.xml").then(function (urls) {
sp({
body: new Buffer(sitemapindex),
url: "http://example.com/sitemap_index.xml"
}).then(function (urls) {
expect(urls).to.deep.equal([

@@ -58,3 +61,6 @@ "http://example.com/sitemap.xml.gz"

sitemapindex = "<html><body><h1>I'm not a sitemap</h1></body></html>";
sp(new Buffer(sitemapindex), "http://example.com/sitemap_index.xml").then(function (urls) {
sp({
body: new Buffer(sitemapindex),
url: "http://example.com/sitemap_index.xml"
}).then(function (urls) {
expect(urls).to.deep.equal([]);

@@ -66,3 +72,6 @@ done();

it("discovers a urlset", function (done) {
sp(new Buffer(urlset), "http://example.com/sitemap_index.xml").then(function (urls) {
sp({
body: new Buffer(urlset),
url: "http://example.com/sitemap_index.xml"
}).then(function (urls) {
expect(urls).to.deep.equal([

@@ -76,3 +85,6 @@ "https://example.com/home.html"

it("discovers an alternate link", function (done) {
sp(new Buffer(urlsetWithAlternate), "http://example.com/sitemap_index.xml").then(function (urls) {
sp({
body: new Buffer(urlsetWithAlternate),
url: "http://example.com/sitemap_index.xml"
}).then(function (urls) {
expect(urls).to.deep.equal([

@@ -93,3 +105,6 @@ "https://example.com/home.html",

sp(new Buffer(urlsetWithAlternate), "http://example.com/sitemap_index.xml").then(function (urls) {
sp({
body: new Buffer(urlsetWithAlternate),
url: "http://example.com/sitemap_index.xml"
}).then(function (urls) {
expect(urls).to.deep.equal([

@@ -104,3 +119,7 @@ "https://example.com/home.html"

Promise.promisify(zlib.gzip)(new Buffer(urlset)).then(function (buf) {
return sp(buf, "http://example.com/sitemap_index.xml", "application/x-gzip");
return sp({
body: buf,
url: "http://example.com/sitemap_index.xml",
contentType: "application/x-gzip"
});
}).then(function (urls) {

@@ -116,3 +135,7 @@ expect(urls).to.deep.equal([

Promise.promisify(zlib.gzip)(new Buffer(urlset)).then(function (buf) {
return sp(buf, "http://example.com/sitemap_index.xml", "application/gzip");
return sp({
body: buf,
url: "http://example.com/sitemap_index.xml",
contentType: "application/gzip"
});
}).then(function (urls) {

@@ -119,0 +142,0 @@ expect(urls).to.deep.equal([
