supercrawler - npm Package Compare versions

Comparing version 0.3.3 to 0.4.0


lib/Crawler.js

@@ -189,10 +189,20 @@ var Crawler,
     return this._downloadAndCheckRobots(url).then(function () {
-      return self._downloadUrl(url);
+      return self._downloadUrl(url, false);
     }).then(function (_response) {
-      var contentType;
+      var contentType,
+        statusCode,
+        location;

       response = _response;
       contentType = response.headers["content-type"] || mime.lookup(url);
+      statusCode = response.statusCode;
+      location = response.headers.location;

-      return self._fireHandlers(contentType, response.body, url);
+      // If this is a redirect, we follow the location header.
+      // Otherwise, we get the discovered URLs from the content handlers.
+      if (statusCode >= 300 && statusCode < 400) {
+        return [urlMod.resolve(url, location)];
+      } else {
+        return self._fireHandlers(contentType, response.body, url);
+      }
     }).then(function (links) {

@@ -268,8 +278,12 @@ return Promise.map(links, function (link) {
 /**
- * Download a particular URL.
+ * Download a particular URL. Generally speaking, we do not want to follow
+ * redirects, because we just add the destination URLs to the queue and crawl
+ * them later. But, when requesting /robots.txt, we do follow the redirects.
+ * This is an edge case.
  *
- * @param {string} url URL to fetch.
- * @return {Promise} Promise of result.
+ * @param {string} url URL to fetch.
+ * @param {Boolean} followRedirect True if redirect should be followed.
+ * @return {Promise} Promise of result.
  */
-Crawler.prototype._downloadUrl = function (url) {
+Crawler.prototype._downloadUrl = function (url, followRedirect) {
   return request({

@@ -281,3 +295,4 @@ url: url,
     },
-    encoding: null
+    encoding: null,
+    followRedirect: Boolean(followRedirect)
   }).catch(function (err) {

@@ -353,3 +368,3 @@ err = new error.RequestError("A request error occured. " + err.message);
       // server to get it.
-      return self._downloadUrl(robotsUrl);
+      return self._downloadUrl(robotsUrl, true);
     }).catch(error.HttpError, function (err) {

@@ -356,0 +371,0 @@ var robotsStatusCode = err.statusCode;
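
Read together, these hunks change how the crawler treats redirects: `_downloadUrl` now passes `followRedirect: Boolean(followRedirect)` to `request`, ordinary page fetches pass `false`, only the robots.txt fetch passes `true`, and a 3xx status short-circuits the content handlers by returning the resolved `Location` target as the only discovered link. A condensed sketch of that response branch, pulled out of the Crawler prototype for illustration (the helper name and signature below are not part of the package):

var urlMod = require("url");

// Illustrative helper only; it mirrors the branch added to lib/Crawler.js above.
function discoveredUrls(requestUrl, response, fireHandlers) {
  var statusCode = response.statusCode;
  var location = response.headers.location;
  var contentType = response.headers["content-type"];

  // Redirect: treat the Location header as a discovered link. Relative
  // values are resolved against the requested URL, so a 301 from
  // https://example.com/index.html with "Location: /destination2.html"
  // yields "https://example.com/destination2.html", which is then queued.
  if (statusCode >= 300 && statusCode < 400) {
    return [urlMod.resolve(requestUrl, location)];
  }

  // Not a redirect: let the registered content handlers extract links from the body.
  return fireHandlers(contentType, response.body, requestUrl);
}

Because the destination is queued via `insertIfNotExists` like any other discovered URL, a redirect chain is now crawled one hop per crawl tick rather than being followed inside a single request.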

package.json

 {
   "name": "supercrawler",
   "description": "A web crawler. Supercrawler automatically crawls websites. Define custom handlers to parse content. Obeys robots.txt, rate limits and concurrency limits.",
-  "version": "0.3.3",
+  "version": "0.4.0",
   "homepage": "https://github.com/brendonboshell/supercrawler",

@@ -6,0 +6,0 @@ "author": "Brendon Boshell <brendonboshell@gmail.com>",

@@ -282,1 +282,52 @@ # Node.js Web Crawler
 crawler.addHandler(supercrawler.handlers.sitemapsParser());
+
+## Changelog
+
+### 0.4.0
+
+* [Changed] Supercrawler no longer follows redirects on crawled URLs. Supercrawler will now add a redirected URL to the queue as a separate entry. We still follow redirects for the `/robots.txt` that is used for checking rules; but not for `/robots.txt` added to the queue.
+
+### 0.3.3
+
+* [Fix] `DbUrlList` to mark a URL as taken, and ensure it never returns a URL that is being crawled in another concurrent request. This has required a new field called `holdDate` on the `url` table
+
+### 0.3.2
+
+* [Fix] Time-based unit tests made more reliable.
+
+### 0.3.1
+
+* [Added] Support for Travis CI.
+
+### 0.3.0
+
+* [Added] Content type passed as third argument to all content type handlers.
+* [Added] Sitemaps parser to extract sitemap URLs and urlset URLs.
+* [Changed] Content handlers receive Buffers rather than strings for the first argument.
+* [Fix] Robots.txt checking to work for the first crawled URL. There was a bug that caused robots.txt to be ignored if it wasn't in the cache.
+
+### 0.2.3
+
+* [Added] A robots.txt parser that identifies `Sitemap:` directives.
+
+### 0.2.2
+
+* [Fixed] Support for URLs up to 10,000 characters long. This required a new `urlHash` SHA1 field on the `url` table, to support the unique index.
+
+### 0.2.1
+
+* [Added] Extensive documentation.
+
+### 0.2.0
+
+* [Added] Status code is updated in the queue for successfully crawled pages (HTTP code < 400).
+* [Added] A new error type `error.RequestError` for all errors that occur when requesting a page.
+* [Added] `DbUrlList` queue object that stores URLs in a SQL database. Includes exponential backoff retry logic.
+* [Changed] Interface to `DbUrlList` and `FifoUrlList` is now via methods `insertIfNotExists`, `upsert` and `getNextUrl`. Previously, it was just `insert` (which also updated) and `upsert`, but we need a way to differentiate between discovered URLs which should not update the crawl state.
+
+### 0.1.0
+
+* [Added] `Crawler` object, supporting rate limiting, concurrent requests limiting, robots.txt caching.
+* [Added] `FifoUrlList` object, a first-in, first-out in-memory list of URLs to be crawled.
+* [Added] `Url` object, representing a URL in the crawl queue.
+* [Added] `htmlLinkParser`, a function to extract links from crawled HTML documents.
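
For readers landing on this compare view without context, here is a minimal usage sketch assembled from the names the changelog mentions (`Crawler`, `Url`, `FifoUrlList`, `handlers.htmlLinkParser`, `handlers.sitemapsParser`, `insertIfNotExists`). Only `interval`, `addHandler`, `start` and `stop` appear in this diff; the handler option shape and the `getUrlList()` accessor below are assumptions:

var supercrawler = require("supercrawler");

// The crawler keeps its queue in an in-memory FifoUrlList by default and
// waits `interval` milliseconds between requests (the tests below use interval: 10).
var crawler = new supercrawler.Crawler({
  interval: 1000
});

// Content handlers parse crawled pages and return further URLs to queue.
crawler.addHandler(supercrawler.handlers.htmlLinkParser({
  hostnames: ["example.com"]   // assumed option shape; not shown in this diff
}));
crawler.addHandler(supercrawler.handlers.sitemapsParser());

// Seed the queue. insertIfNotExists is documented in the 0.2.0 changelog entry;
// the getUrlList() accessor is an assumption, not shown in this diff.
crawler.getUrlList().insertIfNotExists(new supercrawler.Url("https://example.com/"));

crawler.start();
// ...
crawler.stop();

Under 0.4.0, any redirects encountered during the crawl appear in this same queue as separate entries instead of being followed in place.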

@@ -37,2 +37,3 @@ var proxyquire = require('proxyquire'),
     pageContentType,
+    pageLocationHeader,
     pageStatusCode,

@@ -62,2 +63,6 @@ pageBody,
+    if (pageLocationHeader) {
+      headers.location = pageLocationHeader;
+    }
+
     if (pageStatusCode === 0) {

@@ -108,3 +113,3 @@ return cb(new Error("Some request error"));
-  var numCrawlsOfUrl = function (url) {
+  var numCrawlsOfUrl = function (url, followRedirect) {
     var numCalls = 0;

@@ -119,3 +124,4 @@ var n = 0;
         url: url,
-        forever: true
+        forever: true,
+        followRedirect: followRedirect
       }))) {

@@ -132,3 +138,3 @@ numCalls++;
   var numRobotsCalls = function () {
-    return numCrawlsOfUrl("https://example.com/robots.txt");
+    return numCrawlsOfUrl("https://example.com/robots.txt", true);
   };

@@ -299,2 +305,36 @@
+  it("will add destination URL to queue when redirected", function (done) {
+    var crawler = new Crawler({ interval: 10 });
+
+    crawler.start();
+    pageStatusCode = 301;
+    pageLocationHeader = "https://example.com/destination.html";
+
+    setTimeout(function () {
+      crawler.stop();
+      sinon.assert.calledWith(insertIfNotExistsSpy, sinon.match({
+        _url: "https://example.com/destination.html"
+      }));
+      done();
+    }, 200);
+  });
+
+  it("will add relative destination URL to queue when redirected", function (done) {
+    var crawler = new Crawler({ interval: 10 });
+
+    crawler.start();
+    pageStatusCode = 301;
+    pageLocationHeader = "/destination2.html";
+
+    setTimeout(function () {
+      crawler.stop();
+      sinon.assert.calledWith(insertIfNotExistsSpy, sinon.match({
+        _url: "https://example.com/destination2.html"
+      }));
+      done();
+    }, 200);
+  });
+
   it("requests a page that is not excluded by robots.txt", function (done) {

@@ -309,3 +349,3 @@ var crawler = new Crawler({
       crawler.stop();
-      expect(numCrawlsOfUrl("https://example.com/index18.html")).to.equal(1);
+      expect(numCrawlsOfUrl("https://example.com/index18.html", false)).to.equal(1);
       done();

@@ -328,3 +368,3 @@ }, 200);
       crawler.stop();
-      expect(numCrawlsOfUrl("https://example.com/index1.html")).to.equal(0);
+      expect(numCrawlsOfUrl("https://example.com/index1.html", false)).to.equal(0);
       done();

@@ -343,3 +383,3 @@ }, 200);
       crawler.stop();
-      expect(numCrawlsOfUrl("https://example.com/index17.html")).to.equal(0);
+      expect(numCrawlsOfUrl("https://example.com/index17.html", false)).to.equal(0);
       done();

@@ -359,3 +399,3 @@ }, 200);
       crawler.stop();
-      expect(numCrawlsOfUrl("https://example.com/index17.html")).to.equal(1);
+      expect(numCrawlsOfUrl("https://example.com/index17.html", false)).to.equal(1);
       done();

@@ -375,3 +415,3 @@ }, 200);
       crawler.stop();
-      expect(numCrawlsOfUrl("https://example.com/index5.html")).to.equal(0);
+      expect(numCrawlsOfUrl("https://example.com/index5.html", false)).to.equal(0);
       done();

@@ -378,0 +418,0 @@ }, 200);
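
The test hunks above only show fragments of the updated `numCrawlsOfUrl` helper, so here is one way to read them: a reconstruction under the assumption that the stubbed `request` function is wrapped in a sinon spy (named `requestSpy` here for illustration) and that each recorded call is matched on its options object, now including the expected `followRedirect` flag:

// Reconstruction for illustration; names outside the shown fragments are assumptions.
var numCrawlsOfUrl = function (url, followRedirect) {
  var numCalls = 0;
  var n = 0;
  var call;

  // Walk every recorded call to the stubbed request function and count those
  // whose options match the URL and the expected followRedirect value.
  while ((call = requestSpy.getCall(n)) !== null) {
    if (sinon.match({
      url: url,
      forever: true,
      followRedirect: followRedirect
    }).test(call.args[0])) {
      numCalls++;
    }

    n++;
  }

  return numCalls;
};

With `followRedirect` part of the match, `numRobotsCalls()` can assert that only the robots.txt request was made with redirects enabled, while every page fetch is asserted with `false`.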
