canvas-data-cli
Comparing version 0.1.0 to 0.2.0
 module.exports = {
   Api: require('./lib/Api'),
   Sync: require('./lib/Sync'),
-  cli: require('./lib/cli')
+  cli: require('./lib/cli'),
+  Fetch: require('./lib/Fetch'),
+  Unpack: require('./lib/Unpack')
 }
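The entry point now exports the two new classes alongside the existing ones. A quick way to see what 0.2.0 exposes programmatically — a trivial sketch, grounded only in the export list above:

```js
// Lists the public exports of canvas-data-cli 0.2.0
var canvasDataCli = require('canvas-data-cli');
console.log(Object.keys(canvasDataCli)); // ['Api', 'Sync', 'cli', 'Fetch', 'Unpack']
```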
@@ -10,2 +10,3 @@ 'use strict';
 var Unpack = require('./Unpack');
+var Fetch = require('./Fetch');
@@ -37,2 +38,14 @@ var cli = yargs.usage('npm <command>').demand(1, 'must provide a valid command').option('level', {
-}).help('help');
+}).command('fetch', 'fetch a single table', function (yargs) {
+  yargs.options('config', {
+    alias: 'c',
+    demand: true,
+    describe: 'the configuration file to use',
+    type: 'string'
+  }).option('table', {
+    alias: 't',
+    describe: 'the table to fetch',
+    demand: true,
+    type: 'string'
+  });
+}).help('help').alias('v', 'version').version(function () {
@@ -45,3 +58,4 @@ return require('../package').version;
   sampleConfig: { 'class': Config },
-  unpack: { requireConfig: true, 'class': Unpack }
+  unpack: { requireConfig: true, 'class': Unpack },
+  fetch: { requireConfig: true, 'class': Fetch }
 };
@@ -48,0 +62,0 @@ module.exports = {
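Since both `--config` and `--table` are demanded, the new subcommand is invoked as in the README examples further down:

```Shell
canvasDataCli fetch -c path/to/config.js -t user_dim
```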
@@ -7,10 +7,16 @@ 'use strict';
-var MAX_ATTEMPTS = 5;
 var fs = require('fs');
 var request = require('request');
 var pump = require('pump');
+var Re = require('re');
+var reOpts = {
+  retries: 5,
+  strategy: {
+    "type": Re.STRATEGIES.EXPONENTIAL,
+    "initial": 100,
+    "base": 2
+  }
+};
+var re = new Re(reOpts);
-// ghetto backoff solution
-var backoff = [100, 200, 500, 1500, 3000];
 var FileDownloader = (function () {
@@ -26,29 +32,25 @@ function FileDownloader(logger) {
     value: function downloadToFile(downloadLink, artifact, dest, cb) {
-      this._downloadRetry(downloadLink, artifact, dest, 0, cb);
-    }
-  }, {
-    key: '_downloadRetry',
-    value: function _downloadRetry(downloadLink, artifact, dest, attempt, cb) {
       var _this = this;
-      if (attempt > MAX_ATTEMPTS) return cb(new Error('max number of retries reached for ' + fileUrl + ', aborting'));
-      this.logger.debug('downlading ' + downloadLink.filename + ' for artifact ' + artifact.tableName + ' from dump ' + artifact.sequence + ', attempt ' + attempt);
-      var r = request({ method: 'GET', url: downloadLink.url });
-      var badStatusCode = false;
-      r.on('response', function (resp) {
-        if (resp.statusCode !== 200) {
-          _this.logger.debug('got non 200 status code (actual ' + resp.statusCode + ') from ' + downloadLink.url);
-          badStatusCode = true;
-        }
-      });
+      re['try'](function (retryCount, done) {
+        _this.logger.debug('downloading ' + downloadLink.filename + ' for artifact ' + artifact.tableName + ' from dump ' + artifact.sequence + ', attempt ' + (retryCount + 1));
+        var r = request({ method: 'GET', url: downloadLink.url });
+        var badStatusCode = false;
+        r.on('response', function (resp) {
+          if (resp.statusCode !== 200) {
+            _this.logger.debug('got non 200 status code (actual ' + resp.statusCode + ') from ' + downloadLink.url);
+            badStatusCode = true;
+          }
+        });
+        pump(r, fs.createWriteStream(dest), function (err) {
+          if (err || badStatusCode) {
+            _this.logger.debug('failed attempt ' + (retryCount + 1) + ' for ' + downloadLink.filename + ', err: ' + (err || badStatusCode));
+            return done(new Error("Failed Attempt."), retryCount);
+          }
+          _this.logger.debug('finished downlading ' + downloadLink.filename + ' for artifact ' + artifact.tableName + ' from dump ' + artifact.sequence);
+          done(null, retryCount);
+        });
+      }, function (err, retryCount) {
+        cb(err ? new Error('max number of retries reached for ' + downloadLink.filename + ', aborting') : null);
+      });
-      pump(r, fs.createWriteStream(dest), function (err) {
-        if (err || badStatusCode) {
-          _this.logger.debug('failed attempt ' + attempt + ' for ' + downloadLink.filename + ', err: ' + err);
-          return setTimeout(function () {
-            return _this._downloadRetry(downloadLink, artifact, dest, attempt + 1, cb);
-          }, backoff[attempt]);
-        }
-        _this.logger.debug('finished downlading ' + downloadLink.filename + ' for artifact ' + artifact.tableName + ' from dump ' + artifact.sequence);
-        cb();
-      });
-    }
@@ -55,0 +57,0 @@ }]);
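Two things change here: the hand-rolled retry (a fixed `backoff` array driven by recursive `setTimeout` calls) is replaced by the `re` library's exponential strategy, and the rewrite incidentally fixes a 0.1.0 bug where the max-retries error referenced an undefined `fileUrl` variable. Assuming `re`'s EXPONENTIAL strategy computes roughly `initial * base^retryCount` (check the `re` docs for the exact formula and any jitter), the new delay schedule works out like this sketch:

```js
// Approximate delay schedule implied by reOpts above.
// Assumption: delay = initial * Math.pow(base, retryCount); verify against the re docs.
var initial = 100, base = 2, retries = 5;
for (var retry = 0; retry < retries; retry++) {
  console.log('retry ' + (retry + 1) + ': ~' + initial * Math.pow(base, retry) + 'ms');
}
// ~100ms, ~200ms, ~400ms, ~800ms, ~1600ms — close to the old [100, 200, 500, 1500, 3000] table
```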
@@ -34,2 +34,7 @@ 'use strict';
   _createClass(Sync, [{
+    key: 'getNewCollector',
+    value: function getNewCollector() {
+      return { partialTables: {}, groups: {}, artifactCount: 0 };
+    }
+  }, {
     key: 'run',
@@ -44,27 +49,21 @@ value: function run(cb) {
       _this.logger.info('starting from sequence ' + lastSequence);
-      _this.getLatestDumps(lastSequence, function (err, dumps) {
+      _this.getToDownload(lastSequence, function (err, res) {
         if (err) return cb(err);
-        if (!dumps.length) {
+        var toDownload = res.toDownload;
+        var artifactCount = res.artifactCount;
+        var schemaVersion = res.schemaVersion;
+        var newestSequence = res.newestSequence;
+        if (toDownload.length === 0) {
           _this.logger.info('no new dumps to process');
           return cb();
         }
-        _this.logger.info('will process ' + dumps.length + ' dumps');
-        dumps = dumps.map(function (dump, index) {
-          return { dump: dump, index: index };
-        });
-        dumps[0].latestDump = true;
-        var latestDump = dumps[0].dump;
-        var newestSequence = dumps[0].dump.sequence;
-        async.reduce(dumps, { partialTables: {}, groups: {}, artifactCount: 0 }, _this.processDump.bind(_this), function (err, results) {
+        _this.logger.info('downloading ' + artifactCount + ' artifacts');
+        async.eachLimit(toDownload, CONCURRENCY_LIMIT, _this.downloadArtifactGroup.bind(_this), function (err) {
           if (err) return cb(err);
-          var toDownload = _.values(results.groups);
-          _this.logger.info('downloading ' + results.artifactCount + ' artifacts');
-          async.eachLimit(toDownload, CONCURRENCY_LIMIT, _this.downloadArtifactGroup.bind(_this), function (err) {
+          state.sequence = newestSequence;
+          _this.downloadSchema(schemaVersion, function (err) {
             if (err) return cb(err);
-            state.sequence = newestSequence;
-            _this.downloadSchema(latestDump, function (err) {
-              if (err) return cb(err);
-              _this.logger.info('finished, saving out state, newest sequence: ' + newestSequence);
-              _this.stateStore.save(state, cb);
-            });
+            _this.logger.info('finished, saving out state, newest sequence: ' + newestSequence);
+            _this.stateStore.save(state, cb);
           });
@@ -76,18 +75,41 @@ });
     }, {
-      key: 'downloadSchema',
-      value: function downloadSchema(dump, cb) {
+      key: 'getToDownload',
+      value: function getToDownload(lastSequence, cb) {
         var _this2 = this;
-        this.api.getSchemaVersion(dump.schemaVersion, function (err, schema) {
+        this.getLatestDumps(lastSequence, function (err, dumps) {
           if (err) return cb(err);
-          fs.writeFile(path.join(_this2.saveLocation, 'schema.json'), JSON.stringify(schema, 0, 2), cb);
+          if (dumps.length === 0) return cb();
+          _this2.logger.info('will process ' + dumps.length + ' dumps');
+          dumps = dumps.map(function (dump, index) {
+            return { dump: dump, index: index };
+          });
+          dumps[0].latestDump = true;
+          var latestDump = dumps[0].dump;
+          var newestSequence = dumps[0].dump.sequence;
+          async.reduce(dumps, _this2.getNewCollector(), _this2.processDump.bind(_this2), function (err, results) {
+            if (err) return cb(err);
+            var toDownload = _.values(results.groups);
+            cb(null, { toDownload: toDownload, newestSequence: newestSequence, artifactCount: results.artifactCount, schemaVersion: latestDump.schemaVersion });
+          });
         });
       }
+    }, {
+      key: 'downloadSchema',
+      value: function downloadSchema(schemaVersion, cb) {
+        var _this3 = this;
+        this.api.getSchemaVersion(schemaVersion, function (err, schema) {
+          if (err) return cb(err);
+          fs.writeFile(path.join(_this3.saveLocation, 'schema.json'), JSON.stringify(schema, 0, 2), cb);
+        });
+      }
     }, {
       key: 'processDump',
       value: function processDump(collector, dumpInfo, cb) {
-        var _this3 = this;
+        var _this4 = this;
         this.api.getFilesForDump(dumpInfo.dump.dumpId, function (err, dumpFiles) {
           if (err) return cb(err);
           for (var tableName in dumpFiles.artifactsByTable) {
@@ -97,14 +119,16 @@ var artifact = dumpFiles.artifactsByTable[tableName];
             var willDownload = false;
             if (dumpInfo.latestDump) {
-              _this3.logger.debug('will download artifact ' + tableName + ' from dump ' + artifactInfo.sequence + ' as latestDump');
+              _this4.logger.debug('will download artifact ' + tableName + ' from dump ' + artifactInfo.sequence + ' as latestDump');
               willDownload = true;
             } else if (artifact.partial && collector.partialTables[tableName] !== 'foundFull') {
-              _this3.logger.debug('will download artifact ' + tableName + ' from dump ' + artifactInfo.sequence + ' as partial');
+              _this4.logger.debug('will download artifact ' + tableName + ' from dump ' + artifactInfo.sequence + ' as partial');
               willDownload = true;
               collector.partialTables[tableName] = 'partial';
             } else if (collector.partialTables[tableName] === 'partial' && artifact.partial === false) {
-              _this3.logger.debug('will download artifact ' + tableName + ' from dump ' + artifactInfo.sequence + ' as first in partial');
+              _this4.logger.debug('will download artifact ' + tableName + ' from dump ' + artifactInfo.sequence + ' as first in partial');
               willDownload = true;
               collector.partialTables[tableName] = 'foundFull';
             }
             if (willDownload) {
@@ -116,2 +140,3 @@ collector.artifactCount++;
             }
             cb(null, collector);
@@ -128,16 +153,16 @@ });
       value: function downloadArtifact(artifact, cb) {
-        var _this4 = this;
+        var _this5 = this;
         this.removeOldArtifact(artifact, function (err) {
           if (err) return cb(err);
-          mkdirp(path.join(_this4.saveLocation, artifact.tableName), function (err) {
+          mkdirp(path.join(_this5.saveLocation, artifact.tableName), function (err) {
             if (err) return cb(err);
-            _this4.logger.info('artifact ' + artifact.tableName + ' from dump ' + artifact.sequence + ' has ' + artifact.artifact.files.length + ' to download');
+            _this5.logger.info('artifact ' + artifact.tableName + ' from dump ' + artifact.sequence + ' has ' + artifact.artifact.files.length + ' to download');
             async.eachLimit(artifact.artifact.files, CONCURRENCY_LIMIT, function (downloadLink, cb) {
-              var fileName = path.join(_this4.saveLocation, artifact.tableName, artifact.sequence + '_' + downloadLink.filename);
-              _this4.fileDownloader.downloadToFile(downloadLink, artifact, fileName, cb);
+              var fileName = path.join(_this5.saveLocation, artifact.tableName, artifact.sequence + '_' + downloadLink.filename);
+              _this5.fileDownloader.downloadToFile(downloadLink, artifact, fileName, cb);
             }, function (err) {
               if (err) return cb(err);
-              _this4.logger.info('artifact ' + artifact.tableName + ' from dump ' + artifact.sequence + ' finished');
+              _this5.logger.info('artifact ' + artifact.tableName + ' from dump ' + artifact.sequence + ' finished');
               cb();
@@ -158,3 +183,3 @@ });
       value: function getLatestDumps(lastSequence, collector, cb) {
-        var _this5 = this;
+        var _this6 = this;
@@ -169,6 +194,6 @@ if (typeof collector === 'function') {
           if (err) return cb(err);
-          (_collector = collector).push.apply(_collector, _toConsumableArray(dumps));
+          (_collector = collector).unshift.apply(_collector, _toConsumableArray(dumps));
           if (dumps.length < DEFAULT_LIMIT) return cb(null, collector);
           var newestSeq = dumps[0].sequence;
-          _this5.getLatestDumps(newestSeq, collector, cb);
+          _this6.getLatestDumps(newestSeq, collector, cb);
         });
@@ -175,0 +200,0 @@ }
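Net effect of the Sync refactor: the dump-walking logic moves out of `run()` into `getToDownload`, `downloadSchema` now takes a bare `schemaVersion` instead of a whole dump object, and `getLatestDumps` switches from `push` to `unshift` so the newest dump stays at index 0 across paginated API calls. The collector threaded through `processDump` now comes from `getNewCollector()`; a sketch of its shape, inferred from how the code above reads and writes it:

```js
// Collector shape (inferred from getNewCollector/processDump/getToDownload above)
var collector = {
  partialTables: {}, // tableName -> 'partial' | 'foundFull'; tracks completion of partial tables
  groups: {},        // artifacts selected for download; _.values(groups) becomes toDownload
  artifactCount: 0   // incremented per queued artifact; drives the 'downloading N artifacts' log
};
```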
 {
   "name": "canvas-data-cli",
-  "version": "0.1.0",
+  "version": "0.2.0",
   "description": "A CLI tool for interacting with the Canvas Data API",
   "main": "index.js",
   "scripts": {
-    "test": "mocha test/*Test.js",
+    "test": "mocha --compilers js:babel/register test/*Test.js",
     "prepublish": "babel src --out-dir lib/"
@@ -31,2 +31,3 @@ },
     "pump": "^1.0.1",
+    "re": "^0.1.4",
     "request": "^2.65.0",
@@ -39,4 +40,9 @@ "rimraf": "^2.4.3",
     "babel": "^5.8.29",
-    "mocha": "^2.3.3"
+    "chai": "^3.5.0",
+    "chai-fs": "^0.1.0",
+    "mocha": "^2.3.3",
+    "mocha-sinon": "^1.1.5",
+    "sinon": "^1.17.3",
+    "touch": "^1.0.0"
   }
 }
@@ -22,4 +22,13 @@ # Canvas Data CLI
 ## Usage
 ### Syncing
-`canvasDataCli sync -c path/to/config.js` will start the sync process. On the first sync, it will look through all the data exports and download only the latest version of any tables that are not
+If you want to simply download all the data from Canvas Data, the `sync` command can be used to keep an up-to-date copy locally.
+```Shell
+canvasDataCli sync -c path/to/config.js
+```
+This will start the sync process. On the first sync, it will look through all the data exports and download only the latest version of any tables that are not
 marked as `partial` and will download any files from older exports to complete a partial table.
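A note on the `-c` flag used throughout: it points at a JS config file, and the CLI's `sampleConfig` command (registered alongside `sync`, `unpack`, and the new `fetch` above) generates a starter one. The sketch below is only a guess at its shape — `saveLocation` is the one property visible in the Sync code; treat every other field as a placeholder and use the generated sample for the real names:

```js
// config.js — illustrative sketch only; generate the real file with the sampleConfig command
module.exports = {
  saveLocation: './dataFiles' // where sync/fetch write table files (read by Sync)
  // plus your Canvas Data API credentials, whatever fields sampleConfig emits
};
```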
@@ -29,3 +38,51 @@
-If you run this daily, you should keep all of your data from Canvas Data up to date.
+Currently, for plus and pro customers, Canvas Data is updated daily, so running this in a daily cron job should keep your files up to date.
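For instance, a crontab entry along these lines (illustrative — adjust the binary path and config location to your install) would sync every night at 3am:

```Shell
0 3 * * * canvasDataCli sync -c /path/to/config.js >> /var/log/canvas-data-sync.log 2>&1
```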
+### Fetch
+Fetches the most up-to-date data for a single table from the API. This ignores any previously downloaded files and will redownload all the files associated with that table.
+```Shell
+canvasDataCli fetch -c path/to/config.js -t user_dim
+```
+This will start the fetch process and download what is needed to get the most recent data for that table (in this case, `user_dim`).
+On subsequent executions, this will redownload all the data for that table, ignoring any previous days' data.
+### Unpack
+*NOTE*: This only works after properly running a `sync` command.
+This command will unpack the gzipped files, concatenate any partitioned files, and add a header to the output file.
+```Shell
+canvasDataCli unpack -c path/to/config.js -f user_dim,account_dim
+```
+This command will unpack the `user_dim` and `account_dim` tables to a directory. Currently, you have to explicitly list the tables you want to unpack,
+as this has the potential to create very large files.
+## Developing
+Process:
+1. Write some code
+2. Write tests
+3. Open a pull request
+### Running tests
+#### In Docker
+If you use Docker, you can run the tests inside a Docker container:
+```Shell
+./build.sh
+```
+#### Native
+```Shell
+npm install .
+npm test
+```
Supply chain risk note: this module accesses the network (found 1 instance in 1 package).