supercrawler
Comparing version 1.7.2 to 2.0.0
@@ -85,3 +85,7 @@ var Crawler,
  */
-Crawler.prototype.getUserAgent = function () {
+Crawler.prototype.getUserAgent = function (url) {
+  if (typeof this._userAgent === 'function') {
+    return this._userAgent(url);
+  }
   return this._userAgent;
@@ -313,3 +317,3 @@ };
 }).then(function (url) {
-  self.emit("crawledurl", url.getUrl(), url.getErrorCode(), url.getStatusCode());
+  self.emit("crawledurl", url.getUrl(), url.getErrorCode(), url.getStatusCode(), url.getErrorMessage());
@@ -386,3 +390,3 @@ return url;
   headers: {
-    "User-Agent": this.getUserAgent()
+    "User-Agent": this.getUserAgent(url)
   },
@@ -428,3 +432,3 @@ encoding: null,
 robots = robotsParser(self._getRobotsUrl(url), robotsTxt);
-isAllowed = robots.isAllowed(url, self.getUserAgent());
+isAllowed = robots.isAllowed(url, self.getUserAgent(url));
@@ -431,0 +435,0 @@ if (!isAllowed) {
@@ -67,3 +67,3 @@ var cheerio = require("cheerio"),
   return match ? match.data : null;
-}).filter(nullFilter);
+}).filter(nullFilter).filter(opts.urlFilter);
@@ -70,0 +70,0 @@ urlUrls = $("urlset > url > loc").get().map(function (el) {
 {
   "name": "supercrawler",
   "description": "A web crawler. Supercrawler automatically crawls websites. Define custom handlers to parse content. Obeys robots.txt, rate limits and concurrency limits.",
-  "version": "1.7.2",
+  "version": "2.0.0",
   "homepage": "https://github.com/brendonboshell/supercrawler",
@@ -6,0 +6,0 @@ "author": "Brendon Boshell <brendonboshell@gmail.com>",
@@ -137,3 +137,3 @@ # Node.js Web Crawler
 | robotsIgnoreServerError | Indicates if `500` status code response for robots.txt should be ignored. Defaults to `false`. |
-| userAgent | User agent to use for requests. Defaults to `Mozilla/5.0 (compatible; supercrawler/1.0; +https://github.com/brendonboshell/supercrawler)` |
+| userAgent | User agent to use for requests. This can be either a string or a function that takes the URL being crawled. Defaults to `Mozilla/5.0 (compatible; supercrawler/1.0; +https://github.com/brendonboshell/supercrawler)`. |
 | request | Object of options to be passed to [request](https://github.com/request/request). Note that request does not support an asynchronous (and distributed) cookie jar. |
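The function form of `userAgent` lets the user agent vary by URL. A minimal usage sketch, assuming a standard `require("supercrawler")` setup (the bot names and the URL test are illustrative):

```js
var supercrawler = require("supercrawler");

// userAgent may be a plain string, or a function of the URL being
// crawled; per the diff above it is consulted both for request
// headers and for robots.txt permission checks.
var crawler = new supercrawler.Crawler({
  interval: 1000,
  userAgent: function (url) {
    if (url.indexOf("/sitemap") !== -1) {
      return "mybot-sitemaps/1.0"; // illustrative bot name
    }
    return "mybot/1.0";
  }
});
```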
@@ -168,3 +168,3 @@
 | crawlurl(url) | Fires when crawling starts with a new URL. |
-| crawledurl(url, errorCode, statusCode) | Fires when crawling of a URL is complete. `errorCode` is `null` if no error occurred. `statusCode` is set if and only if the request was successful. |
+| crawledurl(url, errorCode, statusCode, errorMessage) | Fires when crawling of a URL is complete. `errorCode` is `null` if no error occurred. `statusCode` is set if and only if the request was successful. `errorMessage` is `null` if no error occurred. |
 | urllistempty | Fires when the URL list is (intermittently) empty. |
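With the fourth argument, a listener can report why a crawl failed without digging into the URL record. A short sketch of such a listener, assuming the `crawler` instance above (the logging is illustrative):

```js
crawler.on("crawledurl", function (url, errorCode, statusCode, errorMessage) {
  if (errorCode !== null) {
    // errorMessage carries the underlying error's message; null on success.
    console.error("Failed %s: %s (%s)", url, errorCode, errorMessage);
  } else {
    console.log("Crawled %s, status %d", url, statusCode);
  }
});
```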
@@ -361,3 +361,3 @@ | urllistcomplete | Fires when the URL list is permanently empty, barring URLs added by external sources. This only makes sense when running Supercrawler in non-distributed fashion. |
 | --- | --- |
-| urlFilter | Function that takes a URL and returns `true` if it should be included. |
+| urlFilter | Function that takes a URL (including sitemap entries) and returns `true` if it should be included. |
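Because the filter now also runs over `<sitemap>` entries in a sitemap index, whole sub-sitemaps can be pruned before they are queued. A minimal sketch, assuming the `crawler` instance above (the filter predicate is illustrative):

```js
// Drop the German sub-sitemap and keep everything else.
crawler.addHandler(supercrawler.handlers.sitemapsParser({
  urlFilter: function (url) {
    return url.indexOf("sitemap-de") === -1;
  }
}));
```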
@@ -373,2 +373,8 @@ Example usage:
+### 2.0.0
+* [Added] Error message argument to the `crawledurl` event, thanks [hjr3](https://github.com/hjr3).
+* [Changed] `sitemapsParser` to apply `urlFilter` to sitemap entries, thanks [hjr3](https://github.com/hjr3).
+* [Added] Support for passing the `userAgent` option to `Crawler` as a function, thanks [hjr3](https://github.com/hjr3).
 ### 1.7.2
@@ -375,0 +381,0 @@
@@ -201,2 +201,18 @@ var proxyquire = require('proxyquire'),
   });
+  it("will accept a function as a user agent", function () {
+    expect(new Crawler({
+      userAgent: () => "mybot/1.1"
+    }).getUserAgent()).to.equal("mybot/1.1");
+    expect(new Crawler({
+      userAgent: (url) => {
+        if (url === 'http://www.example.com/some/random/page') {
+          return 'url specific user agent';
+        }
+        return "mybot/1.1";
+      }
+    }).getUserAgent()).to.equal("mybot/1.1");
+  });
 });
@@ -1021,3 +1037,3 @@
   crawler.stop();
-  sinon.assert.calledWith(spy, "https://example.com/index1.html", "OTHER_ERROR", null);
+  sinon.assert.calledWith(spy, "https://example.com/index1.html", "OTHER_ERROR", null, "abitrary error");
   done();
@@ -1024,0 +1040,0 @@ }, 200);
@@ -22,2 +22,6 @@ var sitemapsParser = require("../../lib/handlers/sitemapsParser"),
   "</sitemap>",
+  "<sitemap>",
+  "<loc>http://example.com/sitemap-de.xml.gz</loc>",
+  "<lastmod>2015-07-17T18:16:02.754-07:00</lastmod>",
+  "</sitemap>",
   "</sitemapindex>"
@@ -53,3 +57,4 @@ ].join("\n");
   expect(urls).to.deep.equal([
-    "http://example.com/sitemap.xml.gz"
+    "http://example.com/sitemap.xml.gz",
+    "http://example.com/sitemap-de.xml.gz"
   ]);
@@ -114,2 +119,20 @@ done();
+  it("can apply a filter to sitemap URLs discovered within sitemap", function (done) {
+    var sp = new sitemapsParser({
+      urlFilter: function (url) {
+        return url.indexOf("de") === -1;
+      }
+    });
+    sp({
+      body: new Buffer(sitemapindex),
+      url: "http://example.com/sitemap_index.xml"
+    }).then(function (urls) {
+      expect(urls).to.deep.equal([
+        "http://example.com/sitemap.xml.gz"
+      ]);
+      done();
+    });
+  });
   it("supports a .gz sitemap file", function (done) {
@@ -116,0 +139,0 @@ Promise.promisify(zlib.gzip)(new Buffer(urlset)).then(function (buf) {