Huge News! Announcing our $40M Series B led by Abstract Ventures. Learn More
Socket
Sign inDemoInstall
Socket

supercrawler

Package Overview
Dependencies
Maintainers
1
Versions
45
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

supercrawler - npm Package Compare versions

Comparing version 0.6.1 to 0.7.0

20

lib/Crawler.js

@@ -215,7 +215,19 @@ var Crawler,

}).then(function (links) {
return Promise.map(links, function (link) {
return urlList.insertIfNotExists(new Url({
url: link
var insertProm;
if (typeof urlList.insertIfNotExistsBulk === "undefined") {
insertProm = Promise.map(links, function (link) {
return urlList.insertIfNotExists(new Url({
url: link
}));
});
} else {
insertProm = urlList.insertIfNotExistsBulk(links.map(function (link) {
return new Url({
url: link
});
}));
});
}
return insertProm;
}).then(function () {

@@ -222,0 +234,0 @@ return new Url({

57

lib/DbUrlList.js

@@ -96,17 +96,6 @@ var DbUrlList,

DbUrlList.prototype.insertIfNotExists = function (url) {
var self = this,
urlHash;
var self = this;
urlHash = crypto.createHash('sha1').update(url.getUrl()).digest("hex");
return this._getUrlTable().then(function (urlTable) {
return urlTable.create({
urlHash: urlHash,
url: url.getUrl(),
statusCode: url.getStatusCode(),
errorCode: url.getErrorCode(),
numErrors: url.getErrorCode() === null ? 0 : 1,
nextRetryDate: url.getErrorCode() === null ? self._calcNextRetryDate(0) : self._calcNextRetryDate(1),
holdDate: new Date(0)
}).catch(Sequelize.UniqueConstraintError, function () {
return urlTable.create(self._makeUrlRow(url)).catch(Sequelize.UniqueConstraintError, function () {
// we ignore unique constraint errors

@@ -119,2 +108,44 @@ return true;

/**
 * Bulk counterpart of insertIfNotExists. Every Url in the array is turned
 * into a table row via _makeUrlRow and all rows are handed to a single
 * bulkCreate call, which suits very large batches (for example, tens of
 * thousands of URLs discovered in a sitemap file).
 *
 * bulkCreate is invoked with `ignoreDuplicates: true`, so rows that already
 * exist are meant to be skipped rather than reported as errors.
 *
 * @param {Array} urls Array of URL objects to insert.
 * @return {Promise} Promise resolves when everything is inserted.
 */
DbUrlList.prototype.insertIfNotExistsBulk = function (urls) {
  var list = this;

  // Named converter so the row mapping reads as a single step below.
  function toRow(item) {
    return list._makeUrlRow(item);
  }

  return list._getUrlTable().then(function (table) {
    // Rows are built only once the table is available, so any failure while
    // building them surfaces as a rejection of the returned promise.
    var rows = urls.map(toRow),
      createOptions = {
        ignoreDuplicates: true
      };

    return table.bulkCreate(rows, createOptions);
  });
};
/**
 * Build the plain object that represents a Url in the urls table.
 *
 * The row is keyed by the SHA-1 hex digest of the URL string. A Url whose
 * error code is null counts as zero errors; any other error code counts as
 * one error, and the next retry date is calculated from that count. The
 * hold date always starts at the Unix epoch.
 *
 * @param {Url} url Url object.
 * @return {Object} Row to be inserted into the url table.
 */
DbUrlList.prototype._makeUrlRow = function (url) {
  var address = url.getUrl(),
    errorCode = url.getErrorCode(),
    errorCount = errorCode === null ? 0 : 1,
    hasher = crypto.createHash('sha1');

  hasher.update(address);

  return {
    urlHash: hasher.digest("hex"),
    url: address,
    statusCode: url.getStatusCode(),
    errorCode: errorCode,
    numErrors: errorCount,
    nextRetryDate: this._calcNextRetryDate(errorCount),
    holdDate: new Date(0)
  };
};
/**
* Calculate the next retry date, given the number of errors that have now

@@ -121,0 +152,0 @@ * occurred. The retry interval is based on an exponentially (power of 2)

{
"name": "supercrawler",
"description": "A web crawler. Supercrawler automatically crawls websites. Define custom handlers to parse content. Obeys robots.txt, rate limits and concurrency limits.",
"version": "0.6.1",
"version": "0.7.0",
"homepage": "https://github.com/brendonboshell/supercrawler",

@@ -6,0 +6,0 @@ "author": "Brendon Boshell <brendonboshell@gmail.com>",

@@ -304,2 +304,8 @@ # Node.js Web Crawler

### 0.7.0
* [Added] Support for optional `insertIfNotExistsBulk` method which can insert
a large list of URLs into the crawl queue.
* [Updated] `DbUrlList` supports the bulk insert method.
### 0.6.1

@@ -306,0 +312,0 @@

@@ -17,2 +17,3 @@ var proxyquire = require('proxyquire'),

insertIfNotExistsSpy,
insertIfNotExistsBulkSpy,
upsertSpy,

@@ -102,2 +103,6 @@ pageContentType,

insertIfNotExistsBulkSpy = sinon.spy(function () {
return Promise.resolve();
});
upsertSpy = sinon.spy(function () {

@@ -656,2 +661,26 @@ return Promise.resolve();

// Checks that a crawler whose URL list exposes insertIfNotExistsBulk passes
// the discovered links to that method, wrapped as objects carrying `_url`.
it("uses the bulk insert method if it exists", function (done) {
  var crawler = new Crawler({
      interval: 10
    }),
    expectedBatch = sinon.match([
      sinon.match({
        _url: "https://example.com/page98.html"
      }),
      sinon.match({
        _url: "https://example.com/page99.html"
      })
    ]);

  // handlerRet is declared outside this block; presumably it is what the
  // shared `handler` stub returns -- confirm against the suite setup.
  handlerRet = [
    "https://example.com/page98.html",
    "https://example.com/page99.html"
  ];

  crawler.addHandler(handler);
  // Attach the optional bulk method to the crawler's URL list.
  crawler.getUrlList().insertIfNotExistsBulk = insertIfNotExistsBulkSpy;
  crawler.start();

  // Stop after 15 ms (the crawl interval is 10 ms) and then inspect the spy.
  setTimeout(function () {
    crawler.stop();
    sinon.assert.calledWith(insertIfNotExistsBulkSpy, expectedBatch);
    done();
  }, 15);
});
it("works if handler returns invalid value", function (done) {

@@ -658,0 +687,0 @@ var crawler = new Crawler({

@@ -14,2 +14,3 @@ var expect = require("chai").expect,

createSpy,
bulkCreateSpy,
upsertSpy,

@@ -53,2 +54,6 @@ numErrors,

bulkCreateSpy = sinon.spy(function () {
return Promise.resolve();
});
upsertSpy = sinon.spy(function () {

@@ -123,2 +128,3 @@ return Promise.resolve();

create: createSpy,
bulkCreate: bulkCreateSpy,
upsert: upsertSpy,

@@ -253,2 +259,28 @@ findOne: findOneSpy,

// Covers DbUrlList#insertIfNotExistsBulk: a batch of Url objects must reach
// the table's bulkCreate as row objects, with duplicate-ignoring enabled.
describe("#insertIfNotExistsBulk", function () {
  it("inserts multiple records in one go", function (done) {
    new DbUrlList(opts).insertIfNotExistsBulk([
      makeUrl("https://example.com"),
      makeUrl("https://example.com/page2.html")
    ]).then(function () {
      // Each row is matched on its SHA-1 url hash plus the fields expected
      // for a URL that has no status code or error code yet.
      sinon.assert.calledWith(bulkCreateSpy, sinon.match([sinon.match({
        urlHash: "327c3fda87ce286848a574982ddd0b7c7487f816",
        url: "https://example.com",
        statusCode: null,
        errorCode: null,
        numErrors: 0
      }), sinon.match({
        urlHash: "cf1b134e852ef25837ff7ed5888684a8f5213213",
        url: "https://example.com/page2.html",
        statusCode: null,
        errorCode: null,
        numErrors: 0
      })]), sinon.match({
        ignoreDuplicates: true
      }));
      done();
    }).catch(done); // report a failed assertion or rejection right away instead of timing out
  });
});
describe("#upsert", function () {

@@ -255,0 +287,0 @@ it("upserts record in database", function (done) {

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc