
supercrawler

Comparing version 1.7.2 to 2.0.0


lib/Crawler.js

@@ -85,3 +85,7 @@ var Crawler,

  */
-Crawler.prototype.getUserAgent = function () {
+Crawler.prototype.getUserAgent = function (url) {
+  if (typeof this._userAgent === 'function') {
+    return this._userAgent(url);
+  }
   return this._userAgent;

@@ -313,3 +317,3 @@ };

     }).then(function (url) {
-      self.emit("crawledurl", url.getUrl(), url.getErrorCode(), url.getStatusCode());
+      self.emit("crawledurl", url.getUrl(), url.getErrorCode(), url.getStatusCode(), url.getErrorMessage());

@@ -386,3 +390,3 @@ return url;

       headers: {
-        "User-Agent": this.getUserAgent()
+        "User-Agent": this.getUserAgent(url)
       },

@@ -428,3 +432,3 @@ encoding: null,

       robots = robotsParser(self._getRobotsUrl(url), robotsTxt);
-      isAllowed = robots.isAllowed(url, self.getUserAgent());
+      isAllowed = robots.isAllowed(url, self.getUserAgent(url));

@@ -431,0 +435,0 @@ if (!isAllowed) {

@@ -67,3 +67,3 @@ var cheerio = require("cheerio"),

       return match ? match.data : null;
-    }).filter(nullFilter);
+    }).filter(nullFilter).filter(opts.urlFilter);

@@ -70,0 +70,0 @@ urlUrls = $("urlset > url > loc").get().map(function (el) {

 {
   "name": "supercrawler",
   "description": "A web crawler. Supercrawler automatically crawls websites. Define custom handlers to parse content. Obeys robots.txt, rate limits and concurrency limits.",
-  "version": "1.7.2",
+  "version": "2.0.0",
   "homepage": "https://github.com/brendonboshell/supercrawler",

@@ -6,0 +6,0 @@ "author": "Brendon Boshell <brendonboshell@gmail.com>",

@@ -137,3 +137,3 @@ # Node.js Web Crawler

 | robotsIgnoreServerError | Indicates if `500` status code response for robots.txt should be ignored. Defaults to `false`. |
-| userAgent | User agent to use for requests. Defaults to `Mozilla/5.0 (compatible; supercrawler/1.0; +https://github.com/brendonboshell/supercrawler)` |
+| userAgent | User agent to use for requests. This can be either a string or a function that takes the URL being crawled. Defaults to `Mozilla/5.0 (compatible; supercrawler/1.0; +https://github.com/brendonboshell/supercrawler)`. |
 | request | Object of options to be passed to [request](https://github.com/request/request). Note that request does not support an asynchronous (and distributed) cookie jar. |
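The new option can be exercised like this; a minimal sketch based only on the `userAgent` row above, with the agent strings and URL rule being illustrative:

```js
var supercrawler = require("supercrawler");

// userAgent may now be a function of the URL being crawled instead of a
// fixed string (per the option table above). The rule below is illustrative.
var crawler = new supercrawler.Crawler({
  userAgent: function (url) {
    if (url.indexOf("example.com") !== -1) {
      return "mybot-example/1.0";
    }
    return "mybot/1.0";
  }
});
```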

@@ -168,3 +168,3 @@

 | crawlurl(url) | Fires when crawling starts with a new URL. |
-| crawledurl(url, errorCode, statusCode) | Fires when crawling of a URL is complete. `errorCode` is `null` if no error occurred. `statusCode` is set if and only if the request was successful. |
+| crawledurl(url, errorCode, statusCode, errorMessage) | Fires when crawling of a URL is complete. `errorCode` is `null` if no error occurred. `statusCode` is set if and only if the request was successful. `errorMessage` is `null` if no error occurred. |
 | urllistempty | Fires when the URL list is (intermittently) empty. |
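A small sketch of consuming the extended event; the handler body is illustrative, while the argument order follows the `crawledurl` row above:

```js
// errorMessage is the new fourth argument in 2.0.0; it is null when no
// error occurred, mirroring errorCode.
crawler.on("crawledurl", function (url, errorCode, statusCode, errorMessage) {
  if (errorCode !== null) {
    console.error("Failed to crawl " + url + ": " + errorCode + " (" + errorMessage + ")");
  } else {
    console.log("Crawled " + url + " with status " + statusCode);
  }
});
```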

@@ -361,3 +361,3 @@ | urllistcomplete | Fires when the URL list is permanently empty, barring URLs added by external sources. This only makes sense when running Supercrawler in non-distributed fashion. |

 | --- | --- |
-| urlFilter | Function that takes a URL and returns `true` if it should be included. |
+| urlFilter | Function that takes a URL (including sitemap entries) and returns `true` if it should be included. |
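A sketch of passing `urlFilter` to the sitemaps handler, which as of 2.0.0 is also applied to sitemap locations found inside a sitemap index; the filter rule itself is illustrative:

```js
// Illustrative filter: drop any sitemap entry (including nested sitemap
// locations) that points under /private/.
crawler.addHandler(supercrawler.handlers.sitemapsParser({
  urlFilter: function (url) {
    return url.indexOf("/private/") === -1;
  }
}));
```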

@@ -373,2 +373,8 @@ Example usage:

+### 2.0.0
+* [Added] `crawledurl` event to contain the error message, thanks [hjr3](https://github.com/hjr3).
+* [Changed] `sitemapsParser` to apply `urlFilter` on the sitemaps entries, thanks [hjr3](https://github.com/hjr3).
+* [Added] `Crawler` to take `userAgent` option as a function, thanks [hjr3](https://github.com/hjr3).
+
 ### 1.7.2

@@ -375,0 +381,0 @@

@@ -201,2 +201,18 @@ var proxyquire = require('proxyquire'),

   });
+
+  it("will accept a function as a user agent", function () {
+    expect(new Crawler({
+      userAgent: () => "mybot/1.1"
+    }).getUserAgent()).to.equal("mybot/1.1");
+
+    expect(new Crawler({
+      userAgent: (url) => {
+        if (url === 'http://www.example.com/some/random/page') {
+          return 'url specific user agent';
+        }
+
+        return "mybot/1.1";
+      }
+    }).getUserAgent()).to.equal("mybot/1.1");
+  });
 });

@@ -1021,3 +1037,3 @@

         crawler.stop();
-        sinon.assert.calledWith(spy, "https://example.com/index1.html", "OTHER_ERROR", null);
+        sinon.assert.calledWith(spy, "https://example.com/index1.html", "OTHER_ERROR", null, "abitrary error");
         done();

@@ -1024,0 +1040,0 @@ }, 200);

@@ -22,2 +22,6 @@ var sitemapsParser = require("../../lib/handlers/sitemapsParser"),

"</sitemap>",
"<sitemap>",
"<loc>http://example.com/sitemap-de.xml.gz</loc>",
"<lastmod>2015-07-17T18:16:02.754-07:00</lastmod>",
"</sitemap>",
"</sitemapindex>"

@@ -53,3 +57,4 @@ ].join("\n");

       expect(urls).to.deep.equal([
-        "http://example.com/sitemap.xml.gz"
+        "http://example.com/sitemap.xml.gz",
+        "http://example.com/sitemap-de.xml.gz"
       ]);

@@ -114,2 +119,20 @@ done();

it("can apply a filter to sitemap URLs discovered within sitemap", function (done) {
var sp = new sitemapsParser({
urlFilter: function (url) {
return url.indexOf("de") === -1;
}
});
sp({
body: new Buffer(sitemapindex ),
url: "http://example.com/sitemap_index.xml"
}).then(function (urls) {
expect(urls).to.deep.equal([
"http://example.com/sitemap.xml.gz"
]);
done();
});
});
it("supports a .gz sitemap file", function (done) {

@@ -116,0 +139,0 @@ Promise.promisify(zlib.gzip)(new Buffer(urlset)).then(function (buf) {
