supercrawler
Comparing version 0.3.3 to 0.4.0
```diff
@@ -189,10 +189,20 @@ var Crawler,
   return this._downloadAndCheckRobots(url).then(function () {
-    return self._downloadUrl(url);
+    return self._downloadUrl(url, false);
   }).then(function (_response) {
-    var contentType;
+    var contentType,
+      statusCode,
+      location;
     response = _response;
     contentType = response.headers["content-type"] || mime.lookup(url);
-    return self._fireHandlers(contentType, response.body, url);
+    statusCode = response.statusCode;
+    location = response.headers.location;
+    // If this is a redirect, we follow the location header.
+    // Otherwise, we get the discovered URLs from the content handlers.
+    if (statusCode >= 300 && statusCode < 400) {
+      return [urlMod.resolve(url, location)];
+    } else {
+      return self._fireHandlers(contentType, response.body, url);
+    }
   }).then(function (links) {
```
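
For context, the new redirect branch leans on Node's built-in `url` module: the `urlMod.resolve(url, location)` call above suggests `urlMod` is an alias for `require("url")`, though the require itself is not shown in this hunk. A minimal sketch of what that resolution does, using the destination URLs that appear in the tests further down:

```js
var urlMod = require("url");

// An absolute Location header passes through unchanged.
urlMod.resolve("https://example.com/index.html",
  "https://example.com/destination.html");
// => "https://example.com/destination.html"

// A relative Location header is resolved against the crawled URL,
// so the queue only ever receives absolute URLs.
urlMod.resolve("https://example.com/index.html", "/destination2.html");
// => "https://example.com/destination2.html"
```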
```diff
@@ -268,8 +278,12 @@ return Promise.map(links, function (link) {
 /**
- * Download a particular URL.
+ * Download a particular URL. Generally speaking, we do not want to follow
+ * redirects, because we just add the destination URLs to the queue and crawl
+ * them later. But, when requesting /robots.txt, we do follow the redirects.
+ * This is an edge case.
  *
- * @param {string} url URL to fetch.
- * @return {Promise} Promise of result.
+ * @param {string} url URL to fetch.
+ * @param {Boolean} followRedirect True if redirect should be followed.
+ * @return {Promise} Promise of result.
  */
-Crawler.prototype._downloadUrl = function (url) {
+Crawler.prototype._downloadUrl = function (url, followRedirect) {
   return request({
@@ -281,3 +295,4 @@ url: url,
     },
-    encoding: null
+    encoding: null,
+    followRedirect: Boolean(followRedirect)
   }).catch(function (err) {
@@ -353,3 +368,3 @@ err = new error.RequestError("A request error occured. " + err.message);
     // server to get it.
-    return self._downloadUrl(robotsUrl);
+    return self._downloadUrl(robotsUrl, true);
   }).catch(error.HttpError, function (err) {
@@ -356,0 +371,0 @@ var robotsStatusCode = err.statusCode;
```
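
Taken together, these hunks mean crawled pages are now fetched with redirects disabled, while the robots.txt download keeps following them. A rough sketch of the resulting request options, using the plain callback form of the `request` library; the real `_downloadUrl` returns a Promise and sets additional options not shown in this diff:

```js
var request = require("request");

// Options for an ordinary crawled page: redirects are NOT followed, so a
// 301/302 response is surfaced to the crawler, which then queues the
// destination URL as a separate entry.
var pageOptions = {
  url: "https://example.com/index.html",
  encoding: null,        // return the body as a Buffer rather than a string
  followRedirect: false
};

// Options for the robots.txt fetch, where redirects ARE followed, because
// we need the rules themselves rather than another queue entry.
var robotsOptions = {
  url: "https://example.com/robots.txt",
  encoding: null,
  followRedirect: true
};

request(pageOptions, function (err, response) {
  if (err) {
    return console.error(err);
  }
  // With followRedirect: false, a redirect arrives here as-is, so the
  // crawler can read response.statusCode and response.headers.location.
  console.log(response.statusCode, response.headers.location);
});
```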
```diff
 {
   "name": "supercrawler",
   "description": "A web crawler. Supercrawler automatically crawls websites. Define custom handlers to parse content. Obeys robots.txt, rate limits and concurrency limits.",
-  "version": "0.3.3",
+  "version": "0.4.0",
   "homepage": "https://github.com/brendonboshell/supercrawler",
@@ -6,0 +6,0 @@ "author": "Brendon Boshell <brendonboshell@gmail.com>",
```
```diff
@@ -282,1 +282,52 @@ # Node.js Web Crawler
 crawler.addHandler(supercrawler.handlers.sitemapsParser());
```

The addition to the README is a new changelog section:
## Changelog

### 0.4.0

* [Changed] Supercrawler no longer follows redirects on crawled URLs. It now adds the redirect's destination URL to the queue as a separate entry. Redirects are still followed for the `/robots.txt` that is used for checking rules, but not for a `/robots.txt` added to the queue.

### 0.3.3

* [Fix] `DbUrlList` now marks a URL as taken, ensuring it never returns a URL that is being crawled in another concurrent request. This required a new `holdDate` field on the `url` table.

### 0.3.2

* [Fix] Time-based unit tests made more reliable.

### 0.3.1

* [Added] Support for Travis CI.

### 0.3.0

* [Added] Content type passed as third argument to all content type handlers.
* [Added] Sitemaps parser to extract sitemap URLs and urlset URLs.
* [Changed] Content handlers receive Buffers rather than strings for the first argument.
* [Fix] Robots.txt checking now works for the first crawled URL. Previously, a bug caused robots.txt to be ignored if it wasn't in the cache.

### 0.2.3

* [Added] A robots.txt parser that identifies `Sitemap:` directives.

### 0.2.2

* [Fixed] Support for URLs up to 10,000 characters long. This required a new SHA1 `urlHash` field on the `url` table to support the unique index.

### 0.2.1

* [Added] Extensive documentation.

### 0.2.0

* [Added] Status code is updated in the queue for successfully crawled pages (HTTP code < 400).
* [Added] A new error type, `error.RequestError`, for all errors that occur when requesting a page.
* [Added] `DbUrlList` queue object that stores URLs in a SQL database. Includes exponential backoff retry logic.
* [Changed] The interface to `DbUrlList` and `FifoUrlList` is now via the methods `insertIfNotExists`, `upsert` and `getNextUrl`. Previously it was just `insert` (which also updated) and `upsert`, but we need a way to differentiate discovered URLs, which should not update the crawl state.

### 0.1.0

* [Added] `Crawler` object, supporting rate limiting, concurrent request limiting and robots.txt caching.
* [Added] `FifoUrlList` object, a first-in, first-out in-memory list of URLs to be crawled.
* [Added] `Url` object, representing a URL in the crawl queue.
* [Added] `htmlLinkParser`, a function to extract links from crawled HTML documents.
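
The 0.3.0 entries above pin down the handler contract this release relies on: handlers receive the raw body as a Buffer and the content type as the third argument, and the URLs they return are what `_fireHandlers` feeds back into the queue. A small sketch of a custom handler along those lines; the URL as the second argument is an inference from the changelog and the `_fireHandlers(contentType, response.body, url)` call in the diff above, and the `interval` option mirrors the `new Crawler({ interval: ... })` usage in the tests:

```js
var supercrawler = require("supercrawler");

var crawler = new supercrawler.Crawler({
  interval: 1000 // delay between requests, in milliseconds (the tests below use 10)
});

// Custom content handler. Per the 0.3.0 notes, the first argument is a
// Buffer and the content type is the third argument; treating the second
// argument as the crawled URL is an assumption.
crawler.addHandler(function (body, url, contentType) {
  console.log("Crawled " + url + " (" + contentType + "), " + body.length + " bytes");
  // URLs returned here are treated as discovered links and queued.
  return [];
});

// Built-in sitemaps parser, as registered in the README line shown above.
crawler.addHandler(supercrawler.handlers.sitemapsParser());

crawler.start();
```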
```diff
@@ -37,2 +37,3 @@ var proxyquire = require('proxyquire'),
   pageContentType,
+  pageLocationHeader,
   pageStatusCode,
@@ -62,2 +63,6 @@ pageBody,
+  if (pageLocationHeader) {
+    headers.location = pageLocationHeader;
+  }
   if (pageStatusCode === 0) {
@@ -108,3 +113,3 @@ return cb(new Error("Some request error"));
-var numCrawlsOfUrl = function (url) {
+var numCrawlsOfUrl = function (url, followRedirect) {
   var numCalls = 0;
@@ -119,3 +124,4 @@ var n = 0;
     url: url,
-    forever: true
+    forever: true,
+    followRedirect: followRedirect
   }))) {
@@ -132,3 +138,3 @@ numCalls++;
 var numRobotsCalls = function () {
-  return numCrawlsOfUrl("https://example.com/robots.txt");
+  return numCrawlsOfUrl("https://example.com/robots.txt", true);
 };
@@ -299,2 +305,36 @@
+  it("will add destination URL to queue when redirected", function (done) {
+    var crawler = new Crawler({ interval: 10 });
+    crawler.start();
+    pageStatusCode = 301;
+    pageLocationHeader = "https://example.com/destination.html";
+    setTimeout(function () {
+      crawler.stop();
+      sinon.assert.calledWith(insertIfNotExistsSpy, sinon.match({
+        _url: "https://example.com/destination.html"
+      }));
+      done();
+    }, 200);
+  });
+  it("will add relative destination URL to queue when redirected", function (done) {
+    var crawler = new Crawler({ interval: 10 });
+    crawler.start();
+    pageStatusCode = 301;
+    pageLocationHeader = "/destination2.html";
+    setTimeout(function () {
+      crawler.stop();
+      sinon.assert.calledWith(insertIfNotExistsSpy, sinon.match({
+        _url: "https://example.com/destination2.html"
+      }));
+      done();
+    }, 200);
+  });
   it("requests a page that is not excluded by robots.txt", function (done) {
@@ -309,3 +349,3 @@ var crawler = new Crawler({
       crawler.stop();
-      expect(numCrawlsOfUrl("https://example.com/index18.html")).to.equal(1);
+      expect(numCrawlsOfUrl("https://example.com/index18.html", false)).to.equal(1);
       done();
@@ -328,3 +368,3 @@ }, 200);
       crawler.stop();
-      expect(numCrawlsOfUrl("https://example.com/index1.html")).to.equal(0);
+      expect(numCrawlsOfUrl("https://example.com/index1.html", false)).to.equal(0);
       done();
@@ -343,3 +383,3 @@ }, 200);
       crawler.stop();
-      expect(numCrawlsOfUrl("https://example.com/index17.html")).to.equal(0);
+      expect(numCrawlsOfUrl("https://example.com/index17.html", false)).to.equal(0);
       done();
@@ -359,3 +399,3 @@ }, 200);
       crawler.stop();
-      expect(numCrawlsOfUrl("https://example.com/index17.html")).to.equal(1);
+      expect(numCrawlsOfUrl("https://example.com/index17.html", false)).to.equal(1);
       done();
@@ -375,3 +415,3 @@ }, 200);
       crawler.stop();
-      expect(numCrawlsOfUrl("https://example.com/index5.html")).to.equal(0);
+      expect(numCrawlsOfUrl("https://example.com/index5.html", false)).to.equal(0);
       done();
@@ -378,0 +418,0 @@ }, 200);
```