supercrawler - npm Package Compare versions

Comparing version 0.2.3 to 0.3.0

lib/handlers/sitemapsParser.js (new file; diff not expanded in this comparison)

lib/Crawler.js

@@ -255,3 +255,3 @@ var Crawler,

     return Promise.try(function () {
-      return handlerFun(body, url);
+      return handlerFun(body, url, contentType);
     }).then(function (subArr) {

@@ -279,3 +279,4 @@ if (!(subArr instanceof Array)) {

         "User-Agent": this.getUserAgent()
-      }
+      },
+      encoding: null
     }).catch(function (err) {

@@ -368,6 +369,8 @@ err = new error.RequestError("A request error occured. " + err.message);

     }).then(function (response) {
-      var body;
+      var body,
+        robotsTxt;
       body = response.body;
-      self._robotsCache.set(robotsUrl, body);
+      robotsTxt = body.toString();
+      self._robotsCache.set(robotsUrl, robotsTxt);

@@ -374,0 +377,0 @@ return robotsTxt;
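
Taken together, the Crawler.js changes mean that handler callbacks now receive the raw response body as a Buffer (requests are made with `encoding: null`, so nothing is decoded up front) and the response content type as a new third argument. A minimal sketch of a handler written against this signature; the interval value, URL, and log output are chosen purely for illustration and are not taken from the package:

    var supercrawler = require("supercrawler");

    var crawler = new supercrawler.Crawler({
      interval: 1000
    });

    // As of 0.3.0 handlers are called as handler(buf, url, contentType).
    // buf is a raw Buffer because the request is made with encoding: null,
    // so decode it explicitly whenever a string is needed.
    crawler.addHandler("text/html", function (buf, url, contentType) {
      var html = buf.toString("utf8");
      console.log("Got page", url, "served as", contentType, "(" + html.length + " chars)");
      // Optionally return an array of discovered URLs to add to the queue.
      return [];
    });

    // Seeding the URL queue with starting URLs is omitted in this sketch.
    crawler.start();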

@@ -9,6 +9,6 @@ var cheerio = require("cheerio"),

-  return function (body, url) {
+  return function (buf, url) {
     var $;
-    $ = cheerio.load(body);
+    $ = cheerio.load(buf);

@@ -15,0 +15,0 @@ return $("a[href]").map(function () {

@@ -10,3 +10,3 @@ var robotsParser = require("robots-parser"),

 module.exports = function () {
-  return function (body, url) {
+  return function (buf, url) {
     var robots,

@@ -22,3 +22,3 @@ urlObj;

-  robots = robotsParser(url, body);
+  robots = robotsParser(url, buf.toString());

@@ -25,0 +25,0 @@ return robots.getSitemaps().map(function (sitemapHref) {

@@ -5,3 +5,4 @@ var Crawler = require("./Crawler"),

   htmlLinkParser = require("./handlers/htmlLinkParser"),
-  robotsParser = require("./handlers/robotsParser");
+  robotsParser = require("./handlers/robotsParser"),
+  sitemapsParser = require("./handlers/sitemapsParser");

@@ -14,4 +15,5 @@ module.exports = {

     htmlLinkParser: htmlLinkParser,
-    robotsParser: robotsParser
+    robotsParser: robotsParser,
+    sitemapsParser: sitemapsParser
   }
 };
 {
   "name": "supercrawler",
   "description": "A web crawler. Supercrawler automatically crawls websites. Define custom handlers to parse content. Obeys robots.txt, rate limits and concurrency limits.",
-  "version": "0.2.3",
+  "version": "0.3.0",
   "homepage": "https://github.com/brendonboshell/supercrawler",

@@ -6,0 +6,0 @@ "author": "Brendon Boshell <brendonboshell@gmail.com>",

@@ -45,3 +45,3 @@ # Supercrawler - Node.js Web Crawler

 }));
-crawler.addHandler("text/html", function (body, url) {
+crawler.addHandler("text/html", function (buf, url) {
   console.log("Got page", url);

@@ -211,1 +211,14 @@ });

 crawler.addHandler("text/plain", supercrawler.handlers.robotsParser());
+# handlers.sitemapsParser
+A function that returns a handler which parses an XML sitemaps file. It will
+pick up any URLs matching `sitemapindex > sitemap > loc, urlset > url > loc`.
+It will also handle a gzipped file, since that is part of the sitemaps
+specification.
+Example usage:
+    var sp = supercrawler.handlers.sitemapsParser();
+    crawler.addHandler(supercrawler.handlers.sitemapsParser());
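
The new lib/handlers/sitemapsParser.js is not expanded in this comparison, but the README text above pins down its contract: accept a (possibly gzipped) sitemap body and return the URLs found under `sitemapindex > sitemap > loc` and `urlset > url > loc`. A sketch of a handler meeting that contract, assuming cheerio (already used by htmlLinkParser) and Node's zlib; the actual 0.3.0 implementation may differ:

    var cheerio = require("cheerio"),
      zlib = require("zlib");

    module.exports = function () {
      return function (buf, url) {
        // The sitemaps spec allows gzipped files: check for the gzip magic
        // bytes and inflate before parsing.
        var isGzipped = buf.length > 2 && buf[0] === 0x1f && buf[1] === 0x8b;
        var xml = (isGzipped ? zlib.gunzipSync(buf) : buf).toString("utf8");
        var $ = cheerio.load(xml, { xmlMode: true });

        // Collect <loc> entries from both sitemap index files and URL sets.
        return $("sitemapindex > sitemap > loc, urlset > url > loc").map(function () {
          return $(this).text();
        }).get();
      };
    };

In this sketch, registering the handler for every content type (as in the README example above) is harmless: a response with no matching elements simply yields an empty list of URLs.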

@@ -39,3 +39,4 @@ var proxyquire = require('proxyquire'),

   pageBody,
-  robotsStatusCode;
+  robotsStatusCode,
+  robotsTxt;

@@ -47,2 +48,6 @@ beforeEach(function () {

   robotsStatusCode = 200;
+  robotsTxt = ["User-agent: *",
+    "Allow: /",
+    "Disallow: /index17.html"
+  ].join("\n");

@@ -65,3 +70,3 @@ requestSpy = sinon.spy(function (opts, cb) {

     statusCode: pageStatusCode,
-    body: pageBody
+    body: new Buffer(pageBody)
   });

@@ -80,6 +85,3 @@ }, 1);

     statusCode: robotsStatusCode,
-    body: ["User-agent: *",
-      "Allow: /",
-      "Disallow: /index17.html"
-    ].join("\n")
+    body: new Buffer(robotsTxt)
   });

@@ -310,2 +312,20 @@ }, 1);

+  it("skips page excluded by robots.txt, even if robots.txt not in cache", function (done) {
+    var crawler = new Crawler({
+      interval: 10
+    });
+    robotsTxt = ["User-agent: *",
+      "Allow: /",
+      "Disallow: /index1.html"
+    ].join("\n");
+    crawler.start();
+    setTimeout(function () {
+      crawler.stop();
+      expect(numCrawlsOfUrl("https://example.com/index1.html")).to.equal(0);
+      done();
+    }, 200);
+  });
   it("skips a page that is excluded by robots.txt", function (done) {

@@ -433,3 +453,3 @@ var crawler = new Crawler({

       sinon.assert.calledWith(handler,
-        sinon.match("<html><body>test</body></html>"),
+        sinon.match(new Buffer("<html><body>test</body></html>")),
         "https://example.com/index1.html");

@@ -452,3 +472,3 @@ done();

       sinon.assert.calledWith(handler,
-        sinon.match("<html><body>test</body></html>"),
+        sinon.match(new Buffer("<html><body>test</body></html>")),
         "https://example.com/index1.html");

@@ -487,3 +507,3 @@ done();

       crawler.stop();
-      expect(handler.calledWith(sinon.match("<html><body>test</body></html>"),
+      expect(handler.calledWith(sinon.match(new Buffer("<html><body>test</body></html>")),
         "https://example.com/index1.html")).to.equal(true);

@@ -505,3 +525,3 @@ done();

       crawler.stop();
-      expect(handler.calledWith(sinon.match("<html><body>test</body></html>"),
+      expect(handler.calledWith(sinon.match(new Buffer("<html><body>test</body></html>")),
         "https://example.com/index1.html")).to.equal(true);

@@ -529,2 +549,21 @@ done();

+  it("passes the content type as the third argument", function (done) {
+    var crawler = new Crawler({
+      interval: 10
+    });
+    crawler.addHandler(handler);
+    crawler.start();
+    pageContentType = "text/plain";
+    setTimeout(function () {
+      crawler.stop();
+      sinon.assert.calledWith(handler,
+        sinon.match(new Buffer("<html><body>test</body></html>")),
+        "https://example.com/index1.html",
+        "text/plain");
+      done();
+    }, 15);
+  });
   it("adds URL to the queue", function (done) {

@@ -531,0 +570,0 @@ var crawler = new Crawler({

@@ -14,3 +14,3 @@ var htmlLinkParser = require("../../lib/handlers/htmlLinkParser"),

-  return html;
+  return new Buffer(html);
 };

@@ -17,0 +17,0 @@

@@ -20,3 +20,3 @@ var robotsParser = require("../../lib/handlers/robotsParser"),

   it("can extract an absolute path sitemap", function () {
-    expect(rb(robotsTxt, "http://example.com/robots.txt")).to.deep.equal([
+    expect(rb(new Buffer(robotsTxt), "http://example.com/robots.txt")).to.deep.equal([
       "http://subdomain.example.com/sitemap_index_1.xml"

@@ -29,3 +29,3 @@ ]);

-    expect(rb(robotsTxt, "http://example.com/robots.txt")).to.deep.equal([
+    expect(rb(new Buffer(robotsTxt), "http://example.com/robots.txt")).to.deep.equal([
       "http://subdomain.example.com/sitemap_index_1.xml",

@@ -38,8 +38,8 @@ "http://example.com/sitemap_index.xml"

     robotsTxt = "";
-    expect(rb(robotsTxt, "http://example.com/robots.txt")).to.deep.equal([]);
+    expect(rb(new Buffer(robotsTxt), "http://example.com/robots.txt")).to.deep.equal([]);
   });
   it("returns empty when the URL path is not /robots.txt", function () {
-    expect(rb(robotsTxt, "http://example.com/Iamnotarobots.txt")).to.deep.equal([]);
+    expect(rb(new Buffer(robotsTxt), "http://example.com/Iamnotarobots.txt")).to.deep.equal([]);
   });
 });

@@ -8,2 +8,3 @@ var proxyquire = require('proxyquire'),

   robotsParserMock,
+  sitemapsParserMock,
   index;

@@ -16,2 +17,3 @@

   robotsParserMock = function () {};
+  sitemapsParserMock = function () {};

@@ -23,3 +25,4 @@ index = proxyquire("../lib/index", {

     "./handlers/htmlLinkParser": htmlLinkParserMock,
-    "./handlers/robotsParser": robotsParserMock
+    "./handlers/robotsParser": robotsParserMock,
+    "./handlers/sitemapsParser": sitemapsParserMock
   });

@@ -47,2 +50,6 @@

   });
+  it("exposes sitemapsParser", function () {
+    expect(index.handlers.sitemapsParser).to.equal(sitemapsParserMock);
+  });
 });