supercrawler
Advanced tools
Comparing version 1.5.0 to 1.6.0
@@ -30,2 +30,9 @@ var DbUrlList, | ||
// Some options defaults | ||
if (opts.db.table === undefined) { | ||
opts.db.table = "url"; | ||
} | ||
this._recrawlInMs = opts.recrawlInMs || YEAR_MS; | ||
opts.db.sequelizeOpts.logging = false; | ||
@@ -35,3 +42,3 @@ | ||
opts.db.sequelizeOpts); | ||
this._urlTable = this._db.define('url', { | ||
this._urlTable = this._db.define(opts.db.table, { | ||
urlHash: { | ||
@@ -226,3 +233,3 @@ type: Sequelize.STRING(40), | ||
// again. | ||
nextRetryDate = new Date(new Date().getTime() + YEAR_MS); | ||
nextRetryDate = new Date(new Date().getTime() + self._recrawlInMs); | ||
} | ||
@@ -229,0 +236,0 @@ } else { |
@@ -9,2 +9,8 @@ var cheerio = require("cheerio"), | ||
if (!opts.urlFilter) { | ||
opts.urlFilter = function () { | ||
return true; | ||
}; | ||
} | ||
return function (context) { | ||
@@ -49,4 +55,6 @@ var $; | ||
}); | ||
}).get(); | ||
}).get().filter(function (url) { | ||
return opts.urlFilter(url, context.url); | ||
}); | ||
}; | ||
}; |
{ | ||
"name": "supercrawler", | ||
"description": "A web crawler. Supercrawler automatically crawls websites. Define custom handlers to parse content. Obeys robots.txt, rate limits and concurrency limits.", | ||
"version": "1.5.0", | ||
"version": "1.6.0", | ||
"homepage": "https://github.com/brendonboshell/supercrawler", | ||
@@ -6,0 +6,0 @@ "author": "Brendon Boshell <brendonboshell@gmail.com>", |
@@ -182,2 +182,4 @@ # Node.js Web Crawler | ||
| opts.db.sequelizeOpts | Options to pass to sequelize. | | ||
| opts.db.table | Name of the table used to store the URL queue. Default = 'url' | | ||
| opts.recrawlInMs | Number of milliseconds after which a crawled URL becomes eligible for recrawling. Default = 31536000000 (1 year) | | ||
@@ -304,2 +306,3 @@ Example usage: | ||
| hostnames | Array of hostnames that are allowed to be crawled. | | ||
| urlFilter(url, pageUrl) | Function that receives each discovered URL and the URL of the page it was found on; return `true` to include the URL in the results. | | ||
@@ -314,2 +317,10 @@ Example usage: | ||
```js | ||
var hlp = supercrawler.handlers.htmlLinkParser({ | ||
urlFilter: function (url) { | ||
return url.indexOf("page1") === -1; | ||
} | ||
}); | ||
``` | ||
## handlers.robotsParser | ||
@@ -359,2 +370,8 @@ | ||
### 1.6.0 | ||
* [Added] Added `opts.db.table` option to `DbUrlList` ([adversinc](https://github.com/adversinc)). | ||
* [Added] Added the `opts.recrawlInMs` option to `DbUrlList` ([adversinc](https://github.com/adversinc)). | ||
* [Added] Added the `urlFilter` option to `htmlLinkParser` ([adversinc](https://github.com/adversinc)). | ||
### 1.5.0 | ||
@@ -361,0 +378,0 @@ |
@@ -142,2 +142,23 @@ var htmlLinkParser = require("../../lib/handlers/htmlLinkParser"), | ||
}); | ||
it ("can apply a filter to the URLs discovered", function () { | ||
var hlp = htmlLinkParser({ | ||
urlFilter: function (url) { | ||
return url.indexOf("page1") === -1; | ||
} | ||
}), | ||
html; | ||
html = makeHtmlWithLinks([ | ||
"page1.html", | ||
"page2.html" | ||
]); | ||
expect(hlp({ | ||
body: html, | ||
url: "http://example.com" | ||
})).to.deep.equal([ | ||
"http://example.com/page2.html" | ||
]); | ||
}); | ||
}); |
129141
3142
562