supercrawler
Comparing version 0.6.1 to 0.7.0
@@ -215,7 +215,19 @@ var Crawler,
   }).then(function (links) {
-    return Promise.map(links, function (link) {
-      return urlList.insertIfNotExists(new Url({
-        url: link
-      }));
-    });
+    var insertProm;
+
+    if (typeof urlList.insertIfNotExistsBulk === "undefined") {
+      insertProm = Promise.map(links, function (link) {
+        return urlList.insertIfNotExists(new Url({
+          url: link
+        }));
+      });
+    } else {
+      insertProm = urlList.insertIfNotExistsBulk(links.map(function (link) {
+        return new Url({
+          url: link
+        });
+      }));
+    }
+
+    return insertProm;
   }).then(function () {
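The change above means the crawler duck-types its URL list: when the supplied urlList exposes insertIfNotExistsBulk, discovered links are queued in a single call, otherwise it falls back to one insertIfNotExists per link. A minimal sketch of a custom in-memory list that opts into the bulk path (illustrative only, not from the package; a complete UrlList also needs upsert and getNextUrl, which are omitted here):

// Illustrative in-memory URL list keyed by URL string. Only the insert
// methods relevant to this change are shown.
function MemoryUrlList() {
  this._urls = {};
}

// Called once per discovered link when no bulk method is available.
MemoryUrlList.prototype.insertIfNotExists = function (url) {
  if (!this._urls[url.getUrl()]) {
    this._urls[url.getUrl()] = url;
  }
  return Promise.resolve(url);
};

// Optional bulk variant; if present, the crawler passes all links
// discovered on a page as a single array.
MemoryUrlList.prototype.insertIfNotExistsBulk = function (urls) {
  var self = this;
  urls.forEach(function (url) {
    if (!self._urls[url.getUrl()]) {
      self._urls[url.getUrl()] = url;
    }
  });
  return Promise.resolve(urls);
};

Any object exposing these methods can be supplied as the crawler's urlList (per the README); the bulk method remains purely optional.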
@@ -96,17 +96,6 @@ var DbUrlList,
 DbUrlList.prototype.insertIfNotExists = function (url) {
-  var self = this,
-    urlHash;
-
-  urlHash = crypto.createHash('sha1').update(url.getUrl()).digest("hex");
-
+  var self = this;
+
   return this._getUrlTable().then(function (urlTable) {
-    return urlTable.create({
-      urlHash: urlHash,
-      url: url.getUrl(),
-      statusCode: url.getStatusCode(),
-      errorCode: url.getErrorCode(),
-      numErrors: url.getErrorCode() === null ? 0 : 1,
-      nextRetryDate: url.getErrorCode() === null ? self._calcNextRetryDate(0) : self._calcNextRetryDate(1),
-      holdDate: new Date(0)
-    }).catch(Sequelize.UniqueConstraintError, function () {
+    return urlTable.create(self._makeUrlRow(url)).catch(Sequelize.UniqueConstraintError, function () {
       // we ignore unqiue constraint errors
@@ -119,2 +108,44 @@ return true;
+/**
+ * A method to insert an array of URLs in bulk. This is useful when we are
+ * trying to insert 50,000 URLs discovered in a sitemaps file, for example.
+ *
+ * @param {Array} urls Array of URL objects to insert.
+ * @return {Promise} Promise resolves when everything is inserted.
+ */
+DbUrlList.prototype.insertIfNotExistsBulk = function (urls) {
+  var self = this;
+
+  return this._getUrlTable().then(function (urlTable) {
+    return urlTable.bulkCreate(urls.map(function (url) {
+      return self._makeUrlRow(url);
+    }), {
+      ignoreDuplicates: true
+    });
+  });
+};
+
+/**
+ * Given a URL object, create the corresponding row to be inserted into the
+ * urls table.
+ *
+ * @param {Url} url Url object.
+ * @return {Object} Row to be inserted into the url table.
+ */
+DbUrlList.prototype._makeUrlRow = function (url) {
+  var urlHash;
+
+  urlHash = crypto.createHash('sha1').update(url.getUrl()).digest("hex");
+
+  return {
+    urlHash: urlHash,
+    url: url.getUrl(),
+    statusCode: url.getStatusCode(),
+    errorCode: url.getErrorCode(),
+    numErrors: url.getErrorCode() === null ? 0 : 1,
+    nextRetryDate: url.getErrorCode() === null ? this._calcNextRetryDate(0) : this._calcNextRetryDate(1),
+    holdDate: new Date(0)
+  };
+};
+
 /**
  * Calculate the next retry date, given the number of errors that have now
  * occurred. The retry interval is based on an exponentially (power of 2)
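A rough usage sketch of the new bulk method (not part of the diff): seeding a DbUrlList queue in one call. The db options follow the shape documented in the project README; the dialect, credentials and URLs below are placeholders.

var supercrawler = require("supercrawler");

// DbUrlList stores the crawl queue in a database via Sequelize; the options
// here are placeholders and should match your own database setup.
var urlList = new supercrawler.DbUrlList({
  db: {
    database: "crawler",
    username: "root",
    password: "secret",
    sequelizeOpts: {
      dialect: "sqlite",
      storage: "./crawler.sqlite"
    }
  }
});

// Build Url objects (e.g. URLs pulled from a large sitemap) and insert them
// with a single bulk call; already-queued URLs are ignored as duplicates.
var urls = [
  "https://example.com/page1.html",
  "https://example.com/page2.html"
].map(function (u) {
  return new supercrawler.Url({ url: u });
});

urlList.insertIfNotExistsBulk(urls).then(function () {
  console.log("Queued %d URLs", urls.length);
});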
 {
   "name": "supercrawler",
   "description": "A web crawler. Supercrawler automatically crawls websites. Define custom handlers to parse content. Obeys robots.txt, rate limits and concurrency limits.",
-  "version": "0.6.1",
+  "version": "0.7.0",
   "homepage": "https://github.com/brendonboshell/supercrawler",
   "author": "Brendon Boshell <brendonboshell@gmail.com>",
@@ -304,2 +304,8 @@ # Node.js Web Crawler
+### 0.7.0
+
+* [Added] Support for optional `insertIfNotExistsBulk` method which can insert
+  a large list of URLs into the crawl queue.
+* [Updated] `DbUrlList` supports the bulk insert method.
+
 ### 0.6.1
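A rough end-to-end sketch of what this changelog entry enables (not part of the diff; it assumes the Crawler, DbUrlList and htmlLinkParser options documented in the project README, with placeholder database settings): a crawler backed by DbUrlList now writes the links each handler discovers with a single bulk insert per page, and the queue can also be seeded in bulk before starting.

var supercrawler = require("supercrawler");

var crawler = new supercrawler.Crawler({
  // Any UrlList works; as of 0.7.0 DbUrlList implements insertIfNotExistsBulk,
  // so discovered links are written with one bulk insert per crawled page.
  urlList: new supercrawler.DbUrlList({
    db: {
      database: "crawler",
      username: "root",
      password: "secret",
      sequelizeOpts: { dialect: "sqlite", storage: "./crawler.sqlite" }
    }
  }),
  interval: 1000
});

// Follow links found on crawled HTML pages (hostname restriction is illustrative).
crawler.addHandler("text/html", supercrawler.handlers.htmlLinkParser({
  hostnames: ["example.com"]
}));

// Seed the queue in bulk, then start crawling.
crawler.getUrlList().insertIfNotExistsBulk([
  new supercrawler.Url({ url: "https://example.com/" })
]).then(function () {
  crawler.start();
});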
@@ -17,2 +17,3 @@ var proxyquire = require('proxyquire'),
     insertIfNotExistsSpy,
+    insertIfNotExistsBulkSpy,
     upsertSpy,
@@ -102,2 +103,6 @@ pageContentType,
+    insertIfNotExistsBulkSpy = sinon.spy(function () {
+      return Promise.resolve();
+    });
+
     upsertSpy = sinon.spy(function () {
@@ -656,2 +661,26 @@ return Promise.resolve();
+  it("uses the bulk insert method if it exists", function (done) {
+    var crawler = new Crawler({
+      interval: 10
+    });
+
+    handlerRet = [
+      "https://example.com/page98.html",
+      "https://example.com/page99.html"
+    ];
+    crawler.addHandler(handler);
+    crawler.getUrlList().insertIfNotExistsBulk = insertIfNotExistsBulkSpy;
+    crawler.start();
+
+    setTimeout(function () {
+      crawler.stop();
+      sinon.assert.calledWith(insertIfNotExistsBulkSpy, sinon.match([sinon.match({
+        _url: "https://example.com/page98.html"
+      }), sinon.match({
+        _url: "https://example.com/page99.html"
+      })]));
+      done();
+    }, 15);
+  });
+
   it("works if handler returns invalid value", function (done) {
@@ -14,2 +14,3 @@ var expect = require("chai").expect,
     createSpy,
+    bulkCreateSpy,
     upsertSpy,
@@ -53,2 +54,6 @@ numErrors,
+    bulkCreateSpy = sinon.spy(function () {
+      return Promise.resolve();
+    });
+
     upsertSpy = sinon.spy(function () {
@@ -123,2 +128,3 @@ return Promise.resolve();
       create: createSpy,
+      bulkCreate: bulkCreateSpy,
       upsert: upsertSpy,
@@ -253,2 +259,28 @@ findOne: findOneSpy,
+  describe("#insertIfNotExistsBulk", function () {
+    it("inserts multiple records in one go", function (done) {
+      new DbUrlList(opts).insertIfNotExistsBulk([
+        makeUrl("https://example.com"),
+        makeUrl("https://example.com/page2.html")
+      ]).then(function () {
+        sinon.assert.calledWith(bulkCreateSpy, sinon.match([sinon.match({
+          urlHash: "327c3fda87ce286848a574982ddd0b7c7487f816",
+          url: "https://example.com",
+          statusCode: null,
+          errorCode: null,
+          numErrors: 0
+        }), sinon.match({
+          urlHash: "cf1b134e852ef25837ff7ed5888684a8f5213213",
+          url: "https://example.com/page2.html",
+          statusCode: null,
+          errorCode: null,
+          numErrors: 0
+        })]), sinon.match({
+          ignoreDuplicates: true
+        }));
+
+        done();
+      });
+    });
+  });
+
   describe("#upsert", function () {