Socket
Socket
Sign inDemoInstall

downcache

Package Overview
Dependencies
47
Maintainers
1
Versions
16
Alerts
File Explorer

Advanced tools

Install Socket

Detect and block malicious and high-risk dependencies

Install

Comparing version 0.0.5 to 0.0.6-a

72

index.js

@@ -6,4 +6,7 @@ var request = require('request'),

mkdirp = require('mkdirp'),
log = require('npmlog');
log = require('npmlog'),
extend = require("extend");
var RateLimiter = require('limiter').RateLimiter;
/* OPTIONS */

@@ -17,20 +20,31 @@ /*

module.exports = function(url, opts, callback) {
if (arguments.length === 1) {
opts = {};
} else if (arguments.length === 2 && typeof opts === "function") {
callback = opts;
opts = {};
//log.level = "verbose";
// initial options, which we can overwrite any time
// These are the defaults merged beneath the per-call options by the main export,
// and they are modified in place by `module.exports.set`.
//   dir:   directory where the cache is stored
//   limit: passed as the second argument to RateLimiter (the README describes it as
//          the wait in milliseconds between http calls)
//   log:   npmlog level; assigned to `log.level` on each call
var global_options = {
dir: "./cache",
limit: 1000,
log: "warn"
}
// Shared limiter: `download` removes one token from it per request, and `set`
// replaces it when the `limit` option changes.
var limiter = new RateLimiter(1, global_options.limit);
// downcache({ url: "http://whatever.com" })
// downcache("http://whatever.com", { opts: values }, function(err, resp, body) {} )
// downcache("http://whatever.com", function(err, resp, body) {} )
module.exports = function(url, my_opts, callback) {
if (arguments.length === 2 && typeof my_opts === "function") {
callback = my_opts;
}
var opts = extend(false, {}, global_options, my_opts || {});
log.level = opts.log;
if (!callback) {
log.verbose("FYI, no callback provided.");
log.info("FYI, no callback provided to downcache.");
callback = function() {};
}
// directory where the cache will be stored
if (!opts.dir) {
opts.dir = "./cache/";
}
log.verbose("directory for cache is", opts.dir);

@@ -82,8 +96,26 @@

/**
 * Rate-limited download. Takes one token from the shared limiter and, once it
 * is granted, hands the request over to `downloadDirect`.
 *
 * @param {Object} opts - request options; `opts.url` is used in the warning message
 * @param {Function} callback - forwarded to `downloadDirect`, or invoked with the
 *   string "rate limited" when the limiter reports an error
 */
var download = module.exports.download = function(opts, callback) {
	var onToken = function(err, remainingRequests) {
		if (!err) {
			downloadDirect(opts, callback);
			return;
		}
		log.warn("rate limited " + opts.url);
		return callback("rate limited");
	};
	limiter.removeTokens(1, onToken);
};
var downloadDirect = module.exports.downloadDirect = function(opts, callback) {
request(opts.url, function(err, resp, body) {
if (err) {
log.error("Error retrieving", opts.url, ":", err);
return callback(err, resp, null);
log.error(err, resp, body);
return callback(err, null, null);
};
// make sure it's a valid response
if (resp.statusCode != 200) {
log.info("Did not cache", opts.url, "because response code was", resp.statusCode);
return callback("Bad response code", resp, body);
}
var response = {

@@ -126,2 +158,14 @@ response: resp,

callback(null, resp, body);
}
/**
 * Update the global settings that get used in the absence of a specification
 * in the individual call.
 *
 * Two call shapes are supported:
 *   set("dir", "/tmp/cache")            // one option by name
 *   set({ log: "info", limit: 2000 })   // several options at once
 *
 * @param {string|Object} property - option name, or an object of option/value pairs
 * @param {*} [value] - new value when `property` is an option name; a call with
 *   no value is ignored
 */
module.exports.set = function(property, value) {
	var isName = typeof property === "string";
	var isBag = property !== null && typeof property === "object";

	if (isName) {
		if (value === undefined) {
			return;
		}
		// Accept any value type, not just strings: `limit` is numeric and
		// `force` / `nocache` / `json` are booleans.
		global_options[property] = value;
	} else if (isBag) {
		extend(false, global_options, property);
	} else {
		// nothing usable was passed (undefined, null, number, ...); leave settings alone
		return;
	}

	// Rebuild the limiter only when the interval may actually have changed.
	if (property === "limit" || (isBag && property.limit !== undefined)) {
		limiter = new RateLimiter(1, global_options.limit);
	}
};

9

package.json
{
"name": "downcache",
"version": "0.0.5",
"version": "0.0.6a",
"author": "Chris Wilson <christopher.e.wilson@gmail.com>",

@@ -21,4 +21,9 @@ "description": "Download and cache webpages and write them to disk for fast future retrieval",

"graceful-fs": "2.0.3",
"npmlog": "*"
"npmlog": "1.0.0",
"limiter": "1.0.x",
"extend": "2.0.x"
},
"devDependencies": {
"rimraf": "2.2.x"
}
}
downcache
=========
Version 0.0.5
Version 0.0.6a
[![Build Status](https://travis-ci.org/wilson428/downcache.png)](https://travis-ci.org/wilson428/downcache)

@@ -8,10 +8,8 @@

Any sort of application or project that involves live calls to webpages often ends up hitting them far more often than is reasonably necessary. This module functions like @mikeal's [request](https://github.com/mikeal/request) -- in fact, it uses it as a dependency -- but stores a copy of the HTML on your local machine. The next time you make a request to that page using downcache, it checks for that local copy before making another call to the live page.
Any sort of scraping project often ends up hitting pages far more often than is reasonably necessary. This module functions like @mikeal's [request](https://github.com/mikeal/request) -- in fact, it uses it as a dependency -- but stores a copy of the HTML on your local machine. The next time you make a request to that page using downcache, it checks for that local copy before making another call to the live page.
#Installation
```npm install downcache``` (local)
`npm install downcache`
```sudo npm install -g downcache``` (global)
#Usage

@@ -29,3 +27,3 @@

The only required input to ```downcache``` is a url. Most of the time, you'll want to pass a callback function as well. This receives three variables: An error (hopefully null), a response object that is either the response provided by ```request``` or an object indicating that the page was loaded from cache, and the body of the page requested. Do with them what you will (or not).
The only required input to `downcache` is a url. Most of the time, you'll want to pass a callback function as well. This receives three variables: An error (hopefully null), a response object that is either the response provided by `request` or an object indicating that the page was loaded from cache, and the body of the page requested. Do with them what you will (or not).

@@ -45,15 +43,49 @@ downcache("http://time.com/7612/americas-mood-map-an-interactive-guide-to-the-united-states-of-attitude/", function(err, resp, body) {

By default, this module creates a ```cache``` directory in the current directory. The path to the cached file is created from the url so that the local file structure resembles the website being crawled.
By default, this module creates a `cache` directory in the current directory. The path to the cached file is created from the url so that the local file structure resembles the website being crawled.
#Options
To specify options, pass a third argument to ```downcache``` between the url and the callback. Here are your choices:
There are a variety of options for you to specify and two ways to specify them.
-```dir```: The cache directory. Default is "cache"
-```path```: The filepath to write the url response to. Default is the url itself, minus the schema (http://)
-```force```: Don't bother looking for the file in cache and call it live
-```nocache```: Don't write the response to cache. Then question why you are using this module.
-```json```: Run ```JSON.parse``` on the response
| option | default | description |
| -------| ------- | ----------- |
| dir | './cache' | The directory where pages will be cached. Will be created if not present. |
| path | url | The file path to write the url response to inside the cache directory. Default is the url itself as a file structure. |
| force | false | If true, don't bother looking for the file in cache and call it live. |
| nocache | false | Don't write the response to cache. Then question why you are using this module. |
| json | false | Run `JSON.parse` on the response. |
| log | 'warn' | Log level for module, using [npmlog](https://www.npmjs.com/package/npmlog) values: `verbose`, `info`, `warn`, `error`. |
| limit | 1000 | How long to wait in milliseconds between each http call. |
To specify options for a _single URL call_, you can pass a third argument to `downcache` between the url and the callback, like so:
downcache("http://example.com/article.html", {
dir: '/Users/jsmith/Desktop/cache/',
log: 'verbose'
}, function(err, resp, body) {
//callback
});
But if you want to store the cache somewhere else, like in the above example, it would be a hassle to specify this every time. So you can also set pseudo-global options that apply to all future `downcache` calls using the `set` method:
downcache.set("dir", "/Users/jsmith/Desktop/cache/");
// or
downcache.set({
log: "info",
json: true
});
If you specify these universal options, you can still override them with options passed to the main `downcache()` function call, but doing so will not overwrite the options specified by `.set()` for future calls.
#Note on downcache and wget
The default behavior for "path" is similar to the structure created by `wget`, in which the directory structure of the website is replicated on disk. At some future point, I may make this identical so that one can "precache" a site by mirroring it and then use this module to make requests to it, falling back on the live site.
The most common case for specifying your own path is if the site that you're requesting attaches session keys to the links in the source code, as many government database search results have the awful habit of doing. If you don't specify a path without these keys, they will fool the module into requesting a new URL each time.
#Rate limiting
If you invoke this module many times in a row, there is built-in rate limiting to prevent bad behavior. The default is not more than one call per 1000 milliseconds, which you can adjust with the `limit` option. (Feel free to call `downcache()` as you would normally, and the rate limiter will store the calls until their turn comes up.) I recommend you set the rate limiting interval using `.set()` at the beginning of the program and leave it at that value for the duration of the execution. Changing it midstream should not cause an interruption, but it can create some confusing race conditions.
#To Do
-Allow for optional "should I cache?" logic so as to ignore certain types of responses you don't want (say, those that are under 1KB, indicating an error)
-Allow for cache expiration

@@ -63,2 +95,14 @@ -Return a better response when called from cache

#Changes
**v0.0.6a**
Cleaned up tests and fixed the damn markdown list.
**v0.0.6**
Rate limiting implemented. Can now set "global" options for all downcache calls in a session.
**v0.0.5b**
Try out rate limiting
**v0.0.5a**
Does not cache responses that have a status code other than 200.
**v0.0.4**

@@ -70,3 +114,7 @@ Checks to see if cached version is empty, and calls live if so.

#Compatibility
Downcache is [tested](https://travis-ci.org/wilson428/downcache) on Node v0.10 and v0.11. It breaks on v0.8 due to a [problem with an npmlog dependency](https://travis-ci.org/wilson428/downcache).
#License
[MIT](/LICENSE.md)
#!/usr/bin/env node
// Module under test plus helpers. All three names belong to a single `var`
// declaration so that none of them leaks as an implicit global.
var downcache = require("../index"),
	log = require("npmlog"),
	rimraf = require("rimraf");
log.level = "verbose";
downcache.set({
log: "info",
limit: 2000
});
downcache("http://en.wikipedia.org/w/api.php?action=query&prop=revisions&titles=Jimmy%20Rollins&rvprop=content&format=json");
// remove old cache from previous tests
rimraf("./cache", function(err) {
// see what happens when we get a status code other than 200
downcache("http://api.meetup.com/2/members?group_id=741891&key=dsafsadfsadfasf", function(err, resp, html) {
console.log(err);
});
downcache("http://time.com/selfies-cities-world-rankings/", function(err, resp, html) {
console.log("Downloaded", html.length, "characters");
});
downcache("http://en.wikipedia.org/w/api.php?action=query&prop=revisions&titles=Jimmy%20Rollins&rvprop=content&format=json");
downcache("http://time.com/27821/us-college-rankings/", {
path: "great-articles/rankings.html"
}, function(err, resp, html) {
if (resp.socket) {
console.log(resp.socket.bytesRead);
}
// try again, see it load from cache
downcache("http://time.com/selfies-cities-world-rankings/", function(err, resp, html) {
console.log("Downloaded", html.length, "characters");
});
downcache("http://time.com/27821/us-college-rankings/", {
path: "great-articles/rankings.html"
path: "great-articles/rankings.html",
log: "verbose" // will override just for this call
}, function(err, resp, html) {
console.log(resp);
});
});
if (resp.socket) {
console.log(resp.socket.bytesRead);
}
// try again, see it load from cache
downcache("http://time.com/27821/us-college-rankings/", {
path: "great-articles/rankings.html",
log: "verbose"
}, function(err, resp, html) {
//console.log(resp);
});
});
});

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc