Socket
Socket
Sign in | Demo | Install

supercrawler

Package Overview
Dependencies
179
Maintainers
1
Versions
45
Alerts
File Explorer

Advanced tools

Install Socket

Detect and block malicious and high-risk dependencies

Install

Comparing version 1.5.0 to 1.6.0

11

lib/DbUrlList.js

@@ -30,2 +30,9 @@ var DbUrlList,

// Some options defaults
if (opts.db.table === undefined) {
opts.db.table = "url";
}
this._recrawlInMs = opts.recrawlInMs || YEAR_MS;
opts.db.sequelizeOpts.logging = false;

@@ -35,3 +42,3 @@

opts.db.sequelizeOpts);
this._urlTable = this._db.define('url', {
this._urlTable = this._db.define(opts.db.table, {
urlHash: {

@@ -226,3 +233,3 @@ type: Sequelize.STRING(40),

// again.
nextRetryDate = new Date(new Date().getTime() + YEAR_MS);
nextRetryDate = new Date(new Date().getTime() + self._recrawlInMs);
}

@@ -229,0 +236,0 @@ } else {

@@ -9,2 +9,8 @@ var cheerio = require("cheerio"),

if (!opts.urlFilter) {
opts.urlFilter = function () {
return true;
};
}
return function (context) {

@@ -49,4 +55,6 @@ var $;

});
}).get();
}).get().filter(function (url) {
return opts.urlFilter(url, context.url);
});
};
};

2

package.json
{
"name": "supercrawler",
"description": "A web crawler. Supercrawler automatically crawls websites. Define custom handlers to parse content. Obeys robots.txt, rate limits and concurrency limits.",
"version": "1.5.0",
"version": "1.6.0",
"homepage": "https://github.com/brendonboshell/supercrawler",

@@ -6,0 +6,0 @@ "author": "Brendon Boshell <brendonboshell@gmail.com>",

@@ -182,2 +182,4 @@ # Node.js Web Crawler

| opts.db.sequelizeOpts | Options to pass to sequelize. |
| opts.db.table | Table name to store URL queue. Default = 'url' |
| opts.recrawlInMs | Number of milliseconds to recrawl a URL. Default = 31536000000 (1 year) |

@@ -304,2 +306,3 @@ Example usage:

| hostnames | Array of hostnames that are allowed to be crawled. |
| urlFilter(url, pageUrl) | Function that takes a URL and returns `true` if it should be included. |

@@ -314,2 +317,10 @@ Example usage:

```js
var hlp = supercrawler.handlers.htmlLinkParser({
urlFilter: function (url) {
return url.indexOf("page1") === -1;
}
});
```
## handlers.robotsParser

@@ -359,2 +370,8 @@

### 1.6.0
* [Added] Added `opts.db.table` option to `DbUrlList` ([adversinc](https://github.com/adversinc)).
* [Added] Added `recrawlInMs` option to `DbUrlList` ([adversinc](https://github.com/adversinc)).
* [Added] Added the `urlFilter` option to `htmlLinkParser` ([adversinc](https://github.com/adversinc)).
### 1.5.0

@@ -361,0 +378,0 @@

@@ -142,2 +142,23 @@ var htmlLinkParser = require("../../lib/handlers/htmlLinkParser"),

});
it ("can apply a filter to the URLs discovered", function () {
var hlp = htmlLinkParser({
urlFilter: function (url) {
return url.indexOf("page1") === -1;
}
}),
html;
html = makeHtmlWithLinks([
"page1.html",
"page2.html"
]);
expect(hlp({
body: html,
url: "http://example.com"
})).to.deep.equal([
"http://example.com/page2.html"
]);
});
});
Socket | Socket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc