Comparing version 0.4.13 to 0.4.14
#!/usr/bin/env node | ||
/* global log */ | ||
var program = require('commander') | ||
var fs = require('fs') | ||
var winston = require('winston') | ||
var log = require('winston') | ||
var api = require('../lib/api.js') | ||
var loglevels = require('../lib/loglevels.js') | ||
var mkdirp = require('mkdirp') | ||
var pjson = require('../package.json') | ||
@@ -60,3 +58,3 @@ | ||
if (allowedlevels.indexOf(program.loglevel) === -1) { | ||
winston.error('Loglevel must be one of: ', | ||
log.error('Loglevel must be one of: ', | ||
'quiet, verbose, data, info, warn, error, debug') | ||
@@ -66,8 +64,6 @@ process.exit(1) | ||
log = new (winston.Logger)({ | ||
transports: [new winston.transports.Console({ | ||
level: program.loglevel, | ||
levels: loglevels.levels, | ||
colorize: true | ||
})], | ||
log.addColors(loglevels.colors) | ||
log.remove(log.transports.Console) // reset logger to nothing | ||
log.add(log.transports.Console, { | ||
level: program.loglevel, | ||
@@ -77,7 +73,6 @@ levels: loglevels.levels, | ||
}) | ||
winston.addColors(loglevels.colors) | ||
if (program.hasOwnProperty('logfile')) { | ||
logstream = fs.createWriteStream(program.logfile.toString()) | ||
log.add(winston.transports.File, { | ||
var logstream = fs.createWriteStream(program.logfile.toString()) | ||
log.add(log.transports.File, { | ||
stream: logstream, | ||
@@ -90,3 +85,3 @@ level: 'debug' | ||
// check arguments | ||
if (typeof program.query === "undefined" && program.api!=='crossref') { | ||
if (typeof program.query === 'undefined' && program.api !== 'crossref') { | ||
log.error('No query given. ' + | ||
@@ -97,3 +92,3 @@ 'You must provide the --query argument.') | ||
if (program.filter && program.api!=='crossref') { | ||
if (program.filter && program.api !== 'crossref') { | ||
log.warn('Filter given but not using CrossRef api ' + | ||
@@ -130,4 +125,4 @@ 'so no filter applied.') | ||
var chosenapi = api(program.api) | ||
var searchapi = new chosenapi(options) | ||
var Chosenapi = api(program.api) | ||
var searchapi = new Chosenapi(options) | ||
searchapi.search(program.query) |
@@ -5,2 +5,3 @@ var eupmc = require('./eupmc.js') | ||
var ieee = require('./ieee.js') | ||
var log = require('winston') | ||
@@ -7,0 +8,0 @@ var chooseAPI = function (api) { |
299
lib/arxiv.js
@@ -1,100 +0,88 @@ | ||
var util = require('util') | ||
, fs = require('fs') | ||
, chalk = require('chalk') | ||
, got = require('got') | ||
, mkdirp = require('mkdirp') | ||
, _ = require('lodash') | ||
, ProgressBar = require('progress') | ||
, request = require('requestretry') | ||
, urlDl = require('./download.js') | ||
, config = require('./config.js') | ||
var fs = require('fs') | ||
var chalk = require('chalk') | ||
var ProgressBar = require('progress') | ||
var request = require('requestretry') | ||
var urlDl = require('./download.js') | ||
var config = require('./config.js') | ||
var log = require('winston') | ||
var parseString = require('xml2js').parseString | ||
var ArXiv = function(opts) { | ||
var ArXiv = function (opts) { | ||
this.baseurl = 'http://export.arxiv.org/api/query?search_query=' | ||
this.opts = opts; | ||
this.opts = opts | ||
} | ||
ArXiv.prototype.search = function(query) { | ||
ArXiv.prototype.search = function (query) { | ||
var arxiv = this | ||
var arxiv = this; | ||
if (arxiv.opts.xml) { | ||
log.warn("The ArXiv API does not provide fulltext XML, so the --xml flag will be ignored"); | ||
log.warn('The ArXiv API does not provide fulltext XML, so the --xml flag will be ignored') | ||
} | ||
if (arxiv.opts.minedterms) { | ||
log.warn("The ArXiv API does not provide mined terms so the --minedterms flag will be ignored"); | ||
log.warn('The ArXiv API does not provide mined terms so the --minedterms flag will be ignored') | ||
} | ||
var options = {} | ||
var options = {}; | ||
arxiv.queryurl = arxiv.buildQuery(query, options); | ||
arxiv.first = true; | ||
arxiv.hitlimit = arxiv.opts.hitlimit ? arxiv.opts.hitlimit : 0; | ||
arxiv.hitcount = 0; | ||
arxiv.residualhits = 0; | ||
arxiv.allresults = []; | ||
arxiv.iter = 0; | ||
arxiv.pagesize = 500; | ||
arxiv.page_delay = 3000; // miliseconds to wait between requests | ||
arxiv.pageQuery(); | ||
arxiv.queryurl = arxiv.buildQuery(query, options) | ||
arxiv.first = true | ||
arxiv.hitlimit = arxiv.opts.hitlimit ? arxiv.opts.hitlimit : 0 | ||
arxiv.hitcount = 0 | ||
arxiv.residualhits = 0 | ||
arxiv.allresults = [] | ||
arxiv.iter = 0 | ||
arxiv.pagesize = 500 | ||
arxiv.page_delay = 3000 // miliseconds to wait between requests | ||
arxiv.pageQuery() | ||
} | ||
ArXiv.prototype.pageQuery = function() { | ||
ArXiv.prototype.pageQuery = function () { | ||
var arxiv = this | ||
var arxiv = this; | ||
var thisQueryUrl = arxiv.queryurl | ||
var thisQueryUrl = arxiv.queryurl; | ||
var pageterm = | ||
'&start=' + arxiv.iter + | ||
'&max_results=' + arxiv.pagesize; | ||
thisQueryUrl += pageterm; | ||
'&max_results=' + arxiv.pagesize | ||
thisQueryUrl += pageterm | ||
log.debug(thisQueryUrl); | ||
log.debug(thisQueryUrl) | ||
var rq = request.get({url: thisQueryUrl, | ||
headers: {'User-Agent': config.userAgent} | ||
}); | ||
headers: {'User-Agent': config.userAgent}}) | ||
var convertXML2JSON = function (data) { | ||
//console.log(data.body) | ||
// console.log(data.body) | ||
parseString(data.body, function (err, datum) { | ||
cb = arxiv.completeCallback.bind(arxiv, datum) | ||
cb() } ) | ||
if (err) throw err | ||
var cb = arxiv.completeCallback.bind(arxiv, datum) | ||
cb() | ||
}) | ||
} | ||
rq.on('complete', convertXML2JSON); | ||
rq.on('timeout', arxiv.timeoutCallback); | ||
rq.on('complete', convertXML2JSON) | ||
rq.on('timeout', arxiv.timeoutCallback) | ||
} | ||
ArXiv.prototype.completeCallback = function(data) { | ||
ArXiv.prototype.completeCallback = function (data) { | ||
var arxiv = this | ||
var arxiv = this; | ||
var totalfound = parseInt(data.feed['opensearch:totalResults'][0]._) | ||
var totalfound = parseInt(data.feed['opensearch:totalResults'][0]._); | ||
if (arxiv.first) { | ||
arxiv.first = false; | ||
arxiv.hitcount = totalfound; | ||
log.info('Found ' + arxiv.hitcount + ' results'); | ||
if (arxiv.hitcount == 0 || arxiv.opts.noexecute) { | ||
process.exit(0); | ||
arxiv.first = false | ||
arxiv.hitcount = totalfound | ||
log.info('Found ' + arxiv.hitcount + ' results') | ||
if (arxiv.hitcount === 0 || arxiv.opts.noexecute) { | ||
process.exit(0) | ||
} | ||
// set hitlimit | ||
// set hitlimit | ||
if (arxiv.hitlimit && arxiv.hitlimit < arxiv.hitcount) { | ||
log.info('Limiting to ' + arxiv.hitlimit + ' hits'); | ||
} | ||
else { arxiv.hitlimit = arxiv.hitcount; } | ||
log.info('Limiting to ' + arxiv.hitlimit + ' hits') | ||
} else { arxiv.hitlimit = arxiv.hitcount } | ||
// create progress bar | ||
var progmsg = 'Retrieving results [:bar] :percent' + | ||
' (eta :etas)'; | ||
' (eta :etas)' | ||
var progopts = { | ||
@@ -104,50 +92,46 @@ total: arxiv.hitlimit, | ||
complete: chalk.green('=') | ||
}; | ||
arxiv.pageprogress = new ProgressBar(progmsg, progopts); | ||
} | ||
arxiv.pageprogress = new ProgressBar(progmsg, progopts) | ||
} | ||
if (data && data.feed && data.feed.entry) { | ||
if (!arxiv.residualhits) { var result = data.feed.entry; } | ||
else { var result = data.feed.entry.slice(0,arxiv.hitlimit); } | ||
var result | ||
if (!arxiv.residualhits) { result = data.feed.entry } else { result = data.feed.entry.slice(0, arxiv.hitlimit) } | ||
} else { | ||
log.error('Malformed response from arXiv API - no data in feed'); | ||
log.debug(data); | ||
log.info('Retrying failed request'); | ||
setTimeout(arxiv.pageQuery.bind(arxiv), arxiv.page_delay); | ||
return; | ||
log.error('Malformed response from arXiv API - no data in feed') | ||
log.debug(data) | ||
log.info('Retrying failed request') | ||
setTimeout(arxiv.pageQuery.bind(arxiv), arxiv.page_delay) | ||
return | ||
} | ||
log.debug('Got', result.length, 'results in this page'); | ||
arxiv.allresults = arxiv.allresults.concat(result); | ||
arxiv.pageprogress.tick(result.length); | ||
log.debug('Got', result.length, 'results in this page') | ||
arxiv.allresults = arxiv.allresults.concat(result) | ||
arxiv.pageprogress.tick(result.length) | ||
if (arxiv.allresults.length < arxiv.hitlimit) { | ||
arxiv.iter += arxiv.pagesize; | ||
hitsremaining = arxiv.hitlimit - arxiv.allresults.length; | ||
if(hitsremaining<arxiv.pagesize) { | ||
arxiv.residualhits = hitsremaining | ||
} | ||
setTimeout(arxiv.pageQuery.bind(arxiv), arxiv.page_delay); | ||
arxiv.iter += arxiv.pagesize | ||
var hitsremaining = arxiv.hitlimit - arxiv.allresults.length | ||
if (hitsremaining < arxiv.pagesize) { | ||
arxiv.residualhits = hitsremaining | ||
} | ||
setTimeout(arxiv.pageQuery.bind(arxiv), arxiv.page_delay) | ||
} else { | ||
log.info('Done collecting results'); | ||
arxiv.handleSearchResults(arxiv); | ||
log.info('Done collecting results') | ||
arxiv.handleSearchResults(arxiv) | ||
} | ||
} | ||
ArXiv.prototype.handleSearchResults = function(arxiv) { | ||
ArXiv.prototype.handleSearchResults = function (arxiv) { | ||
// write the full result set to a file | ||
log.info('Saving result metadata'); | ||
var pretty = JSON.stringify(arxiv.allresults, null, 2); | ||
log.info('Saving result metadata') | ||
var pretty = JSON.stringify(arxiv.allresults, null, 2) | ||
fs.writeFileSync('arxiv_results.json', pretty) | ||
var filename = chalk.blue('arxiv_results.json') | ||
log.info('Full ArXiv result metadata written to ' + filename); | ||
log.info('Full ArXiv result metadata written to ' + filename) | ||
var dlTasks = [] | ||
var dlTasks = []; | ||
// download the fullText PDF | ||
if (arxiv.opts.pdf) { | ||
dlTasks.push(arxiv.downloadFulltextPDFs); | ||
dlTasks.push(arxiv.downloadFulltextPDFs) | ||
} | ||
@@ -157,120 +141,101 @@ | ||
if (arxiv.opts.supp) { | ||
dlTasks.push(arxiv.downloadSuppFiles); | ||
dlTasks.push(arxiv.downloadSuppFiles) | ||
} | ||
arxiv.runDlTasks(dlTasks); | ||
arxiv.runDlTasks(dlTasks) | ||
} | ||
ArXiv.prototype.runDlTasks = function(dlTasks) { | ||
ArXiv.prototype.runDlTasks = function (dlTasks) { | ||
var arxiv = this | ||
var arxiv = this; | ||
arxiv.dlTasks = dlTasks; | ||
arxiv.currDlTask = -1; | ||
arxiv.nextDlTask(); | ||
arxiv.dlTasks = dlTasks | ||
arxiv.currDlTask = -1 | ||
arxiv.nextDlTask() | ||
} | ||
ArXiv.prototype.nextDlTask = function() { | ||
ArXiv.prototype.nextDlTask = function () { | ||
var arxiv = this | ||
var arxiv = this; | ||
arxiv.currDlTask ++; | ||
arxiv.currDlTask ++ | ||
if (arxiv.dlTasks.length > arxiv.currDlTask) { | ||
var fun = arxiv.dlTasks[arxiv.currDlTask]; | ||
fun(arxiv); | ||
var fun = arxiv.dlTasks[arxiv.currDlTask] | ||
fun(arxiv) | ||
} else { | ||
process.exit(0); | ||
process.exit(0) | ||
} | ||
} | ||
ArXiv.prototype.timeoutCallback = function (ms) { | ||
log.error('Did not get a response from the ArXiv API within ' + ms + 'ms') | ||
} | ||
ArXiv.prototype.timeoutCallback = function(ms) { | ||
ArXiv.prototype.buildQuery = function (query, options) { | ||
var arxiv = this | ||
log.error('Did not get a response from the ArXiv API within ' + ms + 'ms'); | ||
var queryurl = arxiv.baseurl + encodeURIComponent(query) | ||
}; | ||
ArXiv.prototype.buildQuery = function(query, options) { | ||
var arxiv = this; | ||
var queryurl = arxiv.baseurl + encodeURIComponent(query); | ||
Object.keys(options).forEach(function(key) { | ||
var val = options[key]; | ||
Object.keys(options).forEach(function (key) { | ||
var val = options[key] | ||
if (key.length > 0) { | ||
queryurl += '&' + key + '=' + val; | ||
queryurl += '&' + key + '=' + val | ||
} | ||
}); | ||
}) | ||
return queryurl; | ||
return queryurl | ||
} | ||
ArXiv.prototype.getFulltextPDFUrl = function(result) { | ||
ArXiv.prototype.getFulltextPDFUrl = function (result) { | ||
var urls = result.link | ||
var pdfurls = urls.filter(function (u) { | ||
return u['$'].type === 'application/pdf' | ||
}) | ||
var arxiv = this; | ||
var urls = result.link; | ||
var pdfurls = urls.filter(function(u) { | ||
return u['$'].type === "application/pdf"; | ||
}); | ||
if (pdfurls.length == 0) { | ||
//log.info('pdf missing') | ||
return null; | ||
if (pdfurls.length === 0) { | ||
// log.info('pdf missing') | ||
return null | ||
} else { | ||
return [ pdfurls[0]['$'].href, result.id[0].split('abs/')[1] + '/' ]; | ||
return [ pdfurls[0]['$'].href, result.id[0].split('abs/')[1] + '/' ] | ||
} | ||
} | ||
ArXiv.prototype.getIdentifier = function(result) { | ||
return result.id[0]; | ||
ArXiv.prototype.getIdentifier = function (result) { | ||
return result.id[0] | ||
} | ||
ArXiv.prototype.getSuppFilesUrl = function(result) { | ||
ArXiv.prototype.getSuppFilesUrl = function (result) { | ||
var arxiv = this | ||
var arxiv = this; | ||
var id = arxiv.getIdentifier(result) | ||
var id = arxiv.getIdentifier(result); | ||
return [id.split('abs').join('e-print'), id.split('abs/')[1]]; | ||
return [id.split('abs').join('e-print'), id.split('abs/')[1]] | ||
} | ||
ArXiv.prototype.urlQueueBuilder = function(urls, type, rename) { | ||
return urls.map(function urlQueueBuilder(url_id) { | ||
return {url: url_id[0], id: url_id[1], type: type, rename: rename } | ||
ArXiv.prototype.urlQueueBuilder = function (urls, type, rename) { | ||
return urls.map(function urlQueueBuilder (urlId) { | ||
return { url: urlId[0], id: urlId[1], type: type, rename: rename } | ||
}) | ||
}; | ||
} | ||
ArXiv.prototype.downloadFulltextPDFs = function(arxiv) { | ||
urls = arxiv.allresults | ||
ArXiv.prototype.downloadFulltextPDFs = function (arxiv) { | ||
var urls = arxiv.allresults | ||
.map(arxiv.getFulltextPDFUrl, arxiv) | ||
.filter(function(x) { return !(x === null) }); | ||
.filter(function (x) { return !(x === null) }) | ||
log.info('Downloading fulltext PDF files'); | ||
log.info('Downloading fulltext PDF files') | ||
var urlQueue = arxiv.urlQueueBuilder(urls, 'PDF', 'fulltext.pdf'); | ||
urlDl.downloadurlQueue(urlQueue, arxiv.nextDlTask.bind(arxiv)); | ||
var urlQueue = arxiv.urlQueueBuilder(urls, 'PDF', 'fulltext.pdf') | ||
urlDl.downloadurlQueue(urlQueue, arxiv.nextDlTask.bind(arxiv)) | ||
} | ||
ArXiv.prototype.downloadSuppFiles = function(arxiv) { | ||
urls = arxiv.allresults | ||
ArXiv.prototype.downloadSuppFiles = function (arxiv) { | ||
var urls = arxiv.allresults | ||
.map(arxiv.getSuppFilesUrl, arxiv) | ||
.filter(function(x) { return !(x === null) }); | ||
.filter(function (x) { return !(x === null) }) | ||
log.info('Downloading supplementary files'); | ||
log.info('Downloading supplementary files') | ||
var urlQueue = arxiv.urlQueueBuilder(urls, 'supplementary files', 'supplementaryFiles.tar.gz'); | ||
urlDl.downloadurlQueue(urlQueue, arxiv.nextDlTask.bind(arxiv)); | ||
var urlQueue = arxiv.urlQueueBuilder(urls, 'supplementary files', 'supplementaryFiles.tar.gz') | ||
urlDl.downloadurlQueue(urlQueue, arxiv.nextDlTask.bind(arxiv)) | ||
} | ||
module.exports = ArXiv; | ||
module.exports = ArXiv |
@@ -1,3 +0,1 @@ | ||
/* global log */ | ||
var fs = require('fs') | ||
@@ -9,3 +7,4 @@ var chalk = require('chalk') | ||
var CrossRefAPI = require('crossref') | ||
var sanitize = require("sanitize-filename") | ||
var sanitize = require('sanitize-filename') | ||
var log = require('winston') | ||
@@ -33,3 +32,3 @@ var CrossRef = function (opts) { | ||
log.info('Found', crossref.hitcount, 'results') | ||
if (crossref.opts.noexecute){ process.exit(0) } | ||
if (crossref.opts.noexecute) { process.exit(0) } | ||
} | ||
@@ -50,15 +49,15 @@ | ||
var message = {} | ||
if (query != null) { message.query = query} | ||
if (query != null) { message.query = query } | ||
message.rows = crossref.pagesize | ||
if (crossref.opts.filter) { | ||
var filters = crossref.opts.filter.split(',') | ||
message.filter = message.filter ? message.filter : {} | ||
for (var singleFilter of filters){ | ||
if (!message.filter[singleFilter.split(':')[0]]) { | ||
message.filter[singleFilter.split(':')[0]] = [] | ||
if (crossref.opts.filter) { | ||
var filters = crossref.opts.filter.split(',') | ||
message.filter = message.filter ? message.filter : {} | ||
for (var singleFilter of filters) { | ||
if (!message.filter[singleFilter.split(':')[0]]) { | ||
message.filter[singleFilter.split(':')[0]] = [] | ||
} | ||
message.filter[singleFilter.split(':')[0]].push(singleFilter.split(':')[1]) | ||
} | ||
message.filter[singleFilter.split(':')[0]].push(singleFilter.split(':')[1]) | ||
} | ||
} | ||
@@ -116,3 +115,3 @@ CrossRefAPI.works(message, pageQuery) | ||
if (crossref.allresults.length > crossref.hitlimit) { | ||
crossref.allresults = crossref.allresults.slice(0,crossref.hitlimit) | ||
crossref.allresults = crossref.allresults.slice(0, crossref.hitlimit) | ||
log.info('limiting hits') | ||
@@ -256,4 +255,4 @@ } | ||
CrossRef.prototype.urlQueueBuilder = function (urls, type, rename) { | ||
return urls.map(function (url_id) { | ||
return { url: url_id[0], id: url_id[1], type: type, rename: rename } | ||
return urls.map(function (urlId) { | ||
return { url: urlId[0], id: urlId[1], type: type, rename: rename } | ||
}) | ||
@@ -265,3 +264,3 @@ } | ||
// First convert slashes to underscores to aid readability | ||
var id = crossref.getIdentifier(record).id.replace(/\//g,"_") | ||
var id = crossref.getIdentifier(record).id.replace(/\//g, '_') | ||
var sanid = sanitize(id) | ||
@@ -268,0 +267,0 @@ mkdirp.sync(sanid) |
@@ -1,2 +0,1 @@ | ||
var util = require('util') | ||
var fs = require('fs') | ||
@@ -7,14 +6,14 @@ var chalk = require('chalk') | ||
var _ = require('lodash') | ||
var ProgressBar = require('progress'); | ||
var sanitize = require("sanitize-filename") | ||
var ProgressBar = require('progress') | ||
var sanitize = require('sanitize-filename') | ||
var config = require('./config.js') | ||
var log = require('winston') | ||
exports.downloadurlQueue = function(urlQueue, nextDlTaskcb) { | ||
var failed = []; | ||
var retries = 0; | ||
var missing = 0; | ||
exports.downloadurlQueue = function (urlQueue, nextDlTaskcb) { | ||
var failed = [] | ||
var missing = 0 | ||
//Setup ProgressBar | ||
// Setup ProgressBar | ||
var progmsg = 'Downloading files [:bar] :percent' + | ||
' (:current/:total) [:elapseds elapsed, eta :eta]'; | ||
' (:current/:total) [:elapseds elapsed, eta :eta]' | ||
var progopts = { | ||
@@ -24,101 +23,89 @@ total: urlQueue.length, | ||
complete: chalk.green('=') | ||
}; | ||
var dlprogress = new ProgressBar(progmsg, progopts); | ||
} | ||
var dlprogress = new ProgressBar(progmsg, progopts) | ||
var donefunc = function() { | ||
var donefunc = function () { | ||
if (failed.length > 0) { | ||
log.warn(failed.length + ' downloads timed out on retry.'); | ||
log.warn(failed.length + ' downloads timed out on retry.') | ||
} else if (missing > 0) { | ||
var succeeded = urlQueue.length - missing; | ||
var succeeded = urlQueue.length - missing | ||
var suffix = missing > 1 ? 's' : '' | ||
log.info(succeeded + ' downloads succeeded. ' + missing + | ||
' paper' + suffix + ' had urlQueue that could not be reached (404 error).'); | ||
' paper' + suffix + ' had urlQueue that could not be reached (404 error).') | ||
} else { | ||
log.info('All downloads succeeded!'); | ||
log.info('All downloads succeeded!') | ||
} | ||
nextDlTaskcb(); | ||
nextDlTaskcb() | ||
} | ||
var done = _.after(urlQueue.length, donefunc); | ||
var done = _.after(urlQueue.length, donefunc) | ||
for(var i=0; i<10; i++) { | ||
nextUrlTask(urlQueue); //spawn 10 workers | ||
for (var i = 0; i < 10; i++) { | ||
nextUrlTask(urlQueue) // spawn 10 workers | ||
} | ||
function nextUrlTask() { | ||
if (urlQueue instanceof Array && urlQueue.length > 0) { | ||
var urlObj = urlQueue.splice(0,1)[0]; | ||
testIfFileExists(urlObj, downloadURL); | ||
function nextUrlTask () { | ||
if (urlQueue instanceof Array && urlQueue.length > 0) { | ||
var urlObj = urlQueue.splice(0, 1)[0] | ||
testIfFileExists(urlObj, downloadURL) | ||
} else { | ||
log.debug('ending thread because urlQueue is now empty') | ||
} | ||
} | ||
else { | ||
log.debug('ending thread because urlQueue is now empty' ) | ||
} | ||
} | ||
// Run callback if file doesn't exist | ||
function testIfFileExists(urlObj, cb) { | ||
dlprogress.tick(); | ||
var url = urlObj.url; | ||
var id = urlObj.id; | ||
var type = urlObj.type; | ||
var rename = urlObj.rename; | ||
var base = id + '/'; | ||
fs.readFile(base + rename, (err, data) => { | ||
if ((err)&&(err.code=='ENOENT')) { | ||
cb(urlObj) | ||
return | ||
//File doesn't exist so start download procedure | ||
} | ||
else if (err) { | ||
throw err | ||
} | ||
else { | ||
log.info('File of type: '+type+' and id: '+id+' already exists. Skipping.') | ||
nextUrlTask(urlQueue) | ||
return | ||
} | ||
}) | ||
} | ||
function testIfFileExists (urlObj, cb) { | ||
dlprogress.tick() | ||
var id = urlObj.id | ||
var type = urlObj.type | ||
var rename = urlObj.rename | ||
var base = id + '/' | ||
fs.readFile(base + rename, (err, data) => { | ||
if ((err) && (err.code === 'ENOENT')) { | ||
cb(urlObj) | ||
function downloadURL(urlObj) { | ||
var url = urlObj.url; | ||
var id = urlObj.id; | ||
var type = urlObj.type; | ||
var rename = sanitize(urlObj.rename); | ||
var base = sanitize(id) + '/'; | ||
log.debug('Creating directory: ' + base); | ||
mkdirp.sync(base); | ||
log.debug('Downloading ' + type + ': ' + url); | ||
var options = { | ||
timeout: 15000, | ||
encoding: null, | ||
retries: 3 | ||
// File doesn't exist so start download procedure | ||
} else if (err) { | ||
throw err | ||
} else { | ||
log.info('File of type: ' + type + ' and id: ' + id + ' already exists. Skipping.') | ||
nextUrlTask(urlQueue) | ||
} | ||
}) | ||
} | ||
function fileWriteCB(err) { | ||
if (err) throw error | ||
done() | ||
} | ||
function downloadURL (urlObj) { | ||
var url = urlObj.url | ||
var id = urlObj.id | ||
var type = urlObj.type | ||
var rename = sanitize(urlObj.rename) | ||
var base = sanitize(id) + '/' | ||
log.debug('Creating directory: ' + base) | ||
mkdirp.sync(base) | ||
function handleDownload(data) { | ||
fs.writeFile(base + rename, data, fileWriteCB); | ||
nextUrlTask(urlQueue); | ||
log.debug('Downloading ' + type + ': ' + url) | ||
function fileWriteCB (err) { | ||
if (err) throw err | ||
done() | ||
} | ||
function throwErr(err){ | ||
if (err) throw err | ||
} | ||
function handleDownload (data) { | ||
fs.writeFile(base + rename, data, fileWriteCB) | ||
nextUrlTask(urlQueue) | ||
} | ||
rq = requestretry.get({url: url, | ||
fullResponse: false, | ||
headers: {'User-Agent': config.userAgent} | ||
}); | ||
rq.then(handleDownload) | ||
rq.catch(throwErr) | ||
} | ||
function throwErr (err) { | ||
if (err) throw err | ||
} | ||
var fourohfour = function() { | ||
missing ++; | ||
var rq = requestretry.get({ | ||
url: url, | ||
fullResponse: false, | ||
headers: {'User-Agent': config.userAgent}, | ||
encoding: null | ||
}) | ||
rq.then(handleDownload) | ||
rq.catch(throwErr) | ||
} | ||
} |
642
lib/eupmc.js
@@ -1,99 +0,87 @@ | ||
var util = require('util') | ||
, fs = require('fs') | ||
, chalk = require('chalk') | ||
, got = require('got') | ||
, mkdirp = require('mkdirp') | ||
, _ = require('lodash') | ||
, ProgressBar = require('progress') | ||
, urlDl = require('./download.js') | ||
, requestretry = require('requestretry') | ||
, glob = require('matched') | ||
, vc = require('version_compare') | ||
, config = require('./config.js') | ||
var fs = require('fs') | ||
var chalk = require('chalk') | ||
var got = require('got') | ||
var mkdirp = require('mkdirp') | ||
var _ = require('lodash') | ||
var ProgressBar = require('progress') | ||
var urlDl = require('./download.js') | ||
var requestretry = require('requestretry') | ||
var glob = require('matched') | ||
var vc = require('version_compare') | ||
var config = require('./config.js') | ||
var log = require('winston') | ||
var parseString = require('xml2js').parseString | ||
var minimumEuPMCResponseLength = 100 // Shortest length we could expect that countains | ||
// at least one result | ||
var EuPMCVersion = '5.1.1' | ||
var EuPmc = function(opts) { | ||
var EuPmc = function (opts) { | ||
var eupmc = this | ||
this.baseurl = 'http://www.ebi.ac.uk/' + | ||
'europepmc/webservices/rest/search/'; | ||
this.opts = opts; | ||
'europepmc/webservices/rest/search/' | ||
this.opts = opts || {} | ||
eupmc.first = true | ||
eupmc.hitlimit = eupmc.opts.hitlimit ? eupmc.opts.hitlimit : 0 | ||
eupmc.hitcount = 0 | ||
eupmc.residualhits = 0 | ||
eupmc.allresults = [] | ||
eupmc.nextCursorMark = '*' // we always get back the first page | ||
eupmc.pagesize = '1000' | ||
eupmc.unfillledPage = false | ||
} | ||
EuPmc.prototype.search = function(query) { | ||
EuPmc.prototype.search = function (query) { | ||
var eupmc = this | ||
var eupmc = this; | ||
if (!eupmc.opts.all) { | ||
query += " OPEN_ACCESS:y"; | ||
query += ' OPEN_ACCESS:y' | ||
} | ||
eupmc.pagesize = '1000' | ||
var options = { resulttype: 'core', pageSize: eupmc.pagesize }; | ||
eupmc.queryurl = eupmc.buildQuery(query, options); | ||
eupmc.first = true; | ||
eupmc.hitlimit = eupmc.opts.hitlimit ? eupmc.opts.hitlimit : 0; | ||
eupmc.hitcount = 0; | ||
eupmc.residualhits = 0; | ||
eupmc.allresults = []; | ||
eupmc.nextCursorMark = '*'; //we always get back the first page | ||
var options = { resulttype: 'core', pageSize: eupmc.pagesize } | ||
eupmc.queryurl = eupmc.buildQuery(query, options) | ||
if (eupmc.opts.restart) { | ||
fs.readFile('eupmc_results.json', (err,data) => { | ||
if ((err) && (err.code == 'ENOENT')) { | ||
log.error('No existing download to restart') | ||
process.exit(1) | ||
} | ||
else if (err) { | ||
throw err | ||
} | ||
else { | ||
log.info('Restarting previous download') | ||
eupmc.allresults=JSON.parse(data) | ||
eupmc.addDlTasks() | ||
} | ||
} ) | ||
fs.readFile('eupmc_results.json', (err, data) => { | ||
if ((err) && (err.code === 'ENOENT')) { | ||
log.error('No existing download to restart') | ||
process.exit(1) | ||
} else if (err) { | ||
throw err | ||
} else { | ||
log.info('Restarting previous download') | ||
eupmc.allresults = JSON.parse(data) | ||
eupmc.addDlTasks() | ||
} | ||
}) | ||
} else { | ||
eupmc.pageQuery() | ||
} | ||
else { | ||
eupmc.pageQuery(); | ||
} | ||
} | ||
EuPmc.prototype.testApi = function(version) { | ||
if(!vc.matches(version, EuPMCVersion)) { | ||
EuPmc.prototype.testApi = function (version) { | ||
if (!vc.matches(version, EuPMCVersion)) { | ||
log.warn('This version of getpapers wasn\'t built with this version of the EuPMC api in mind') | ||
log.warn(`getpapers EuPMCVersion: ${EuPMCVersion} vs. ${version} reported by api` ) | ||
log.warn(`getpapers EuPMCVersion: ${EuPMCVersion} vs. ${version} reported by api`) | ||
} | ||
} | ||
EuPmc.prototype.pageQuery = function() { | ||
EuPmc.prototype.pageQuery = function () { | ||
var eupmc = this | ||
var eupmc = this; | ||
var thisQueryUrl = eupmc.queryurl + '' | ||
var thisQueryUrl = eupmc.queryurl + ''; | ||
var pageterm = '&cursorMark=' + eupmc.nextCursorMark | ||
thisQueryUrl += pageterm | ||
var pageterm = '&cursorMark=' + eupmc.nextCursorMark; | ||
thisQueryUrl += pageterm; | ||
log.debug(thisQueryUrl) | ||
log.debug(thisQueryUrl); | ||
var retryOnHTTPNetOrEuPMCFailure = function (err, response, body){ | ||
var retryOnHTTPNetOrEuPMCFailure = function (err, response, body) { | ||
return requestretry.RetryStrategies.HTTPOrNetworkError(err, response, body) || | ||
~body.indexOf('<resultList/>') //hacky way to see if resultsList is empty | ||
~body.indexOf('<resultList/>') // hacky way to see if resultsList is empty | ||
} | ||
var rq = requestretry.get({url: thisQueryUrl, | ||
maxAttempts: 50, | ||
retryStrategy: retryOnHTTPNetOrEuPMCFailure, | ||
headers: {'User-Agent': config.userAgent} | ||
}); | ||
maxAttempts: 50, | ||
retryStrategy: retryOnHTTPNetOrEuPMCFailure, | ||
headers: {'User-Agent': config.userAgent} | ||
}) | ||
var handleResquestResponse = function (data) { | ||
@@ -103,3 +91,3 @@ if (data.attempts > 1) { | ||
} | ||
convertXML2JSON(data) | ||
convertXML2JSON(data) | ||
} | ||
@@ -109,31 +97,30 @@ var convertXML2JSON = function (data) { | ||
if (err) throw err | ||
cb = eupmc.completeCallback.bind(eupmc, datum) | ||
cb() } ) | ||
var cb = eupmc.completeCallback.bind(eupmc, datum) | ||
cb() | ||
}) | ||
} | ||
rq.then(handleResquestResponse); | ||
rq.on('timeout', eupmc.timeoutCallback); | ||
rq.then(handleResquestResponse) | ||
rq.on('timeout', eupmc.timeoutCallback) | ||
} | ||
EuPmc.prototype.completeCallback = function(data) { | ||
EuPmc.prototype.completeCallback = function (data) { | ||
var eupmc = this | ||
var eupmc = this; | ||
var resp = data.responseWrapper | ||
var resp = data.responseWrapper; | ||
if(!resp.hitCount || !resp.hitCount[0] || !resp.resultList[0].result) { | ||
log.error("Malformed or empty response from EuropePMC. Try running again. Perhaps your query is wrong."); | ||
process.exit(1); | ||
if (!resp.hitCount || !resp.hitCount[0] || !resp.resultList[0].result) { | ||
log.error('Malformed or empty response from EuropePMC. Try running again. Perhaps your query is wrong.') | ||
process.exit(1) | ||
} | ||
if (eupmc.first){ | ||
eupmc.first = false; | ||
eupmc.hitcount = parseInt(resp.hitCount[0]); | ||
var oaclause = eupmc.opts.all ? '' : ' open access'; | ||
log.info('Found ' + eupmc.hitcount + oaclause + ' results'); | ||
if (eupmc.first) { | ||
eupmc.first = false | ||
eupmc.hitcount = parseInt(resp.hitCount[0]) | ||
var oaclause = eupmc.opts.all ? '' : ' open access' | ||
log.info('Found ' + eupmc.hitcount + oaclause + ' results') | ||
eupmc.testApi(resp.version[0]) | ||
if (eupmc.hitcount == 0 || eupmc.opts.noexecute) { | ||
process.exit(0); | ||
if (eupmc.hitcount === 0 || eupmc.opts.noexecute) { | ||
process.exit(0) | ||
} | ||
@@ -143,9 +130,8 @@ | ||
if (eupmc.hitlimit && eupmc.hitlimit < eupmc.hitcount) { | ||
log.info('Limiting to ' + eupmc.hitlimit + ' hits'); | ||
} | ||
else { eupmc.hitlimit = eupmc.hitcount; } | ||
log.info('Limiting to ' + eupmc.hitlimit + ' hits') | ||
} else { eupmc.hitlimit = eupmc.hitcount } | ||
// create progress bar | ||
var progmsg = 'Retrieving results [:bar] :percent' + | ||
' (eta :etas)'; | ||
' (eta :etas)' | ||
var progopts = { | ||
@@ -155,32 +141,38 @@ total: eupmc.hitlimit, | ||
complete: chalk.green('=') | ||
}; | ||
eupmc.pageprogress = new ProgressBar(progmsg, progopts); | ||
} | ||
eupmc.pageprogress = new ProgressBar(progmsg, progopts) | ||
} | ||
var result | ||
if (eupmc.residualhits) { | ||
var result = resp.resultList[0].result.slice(0,eupmc.residualhits); | ||
result = resp.resultList[0].result.slice(0, eupmc.residualhits) | ||
} else { | ||
result = resp.resultList[0].result | ||
// if less results in this page than page count (and we were expecting an entire page) | ||
// EuPMC has been lying and we shouldn't keep searching for more results | ||
if (result.length < eupmc.pagesize) eupmc.unfilledPage = true | ||
} | ||
else { var result = resp.resultList[0].result; } | ||
log.debug('In this batch got: ' + result.length + ' results') | ||
eupmc.allresults = eupmc.allresults.concat(result); | ||
eupmc.pageprogress.tick(result.length); | ||
eupmc.allresults = eupmc.allresults.concat(result) | ||
eupmc.pageprogress.tick(result.length) | ||
if (eupmc.allresults.length < eupmc.hitlimit) { //we still have more results to get | ||
if (eupmc.allresults.length < eupmc.hitlimit) { // we still have more results to get | ||
if (eupmc.unfilledPage) { // but the last page wasn't full then something is wrong | ||
log.info('EuPMC gave us the wrong hitcount. We\'ve already found all the results') | ||
eupmc.handleSearchResults(eupmc) | ||
return | ||
} | ||
if (eupmc.hitlimit - eupmc.allresults.length < eupmc.pagesize) { | ||
eupmc.residualhits = eupmc.hitlimit - eupmc.allresults.length; | ||
eupmc.residualhits = eupmc.hitlimit - eupmc.allresults.length | ||
} | ||
eupmc.nextCursorMark = resp.nextCursorMark[0]; | ||
eupmc.pageQuery(); | ||
eupmc.nextCursorMark = resp.nextCursorMark[0] | ||
eupmc.pageQuery() | ||
} else { | ||
log.info('Done collecting results'); | ||
eupmc.handleSearchResults(eupmc); | ||
log.info('Done collecting results') | ||
eupmc.handleSearchResults(eupmc) | ||
} | ||
} | ||
EuPmc.prototype.timeoutCallback = function(ms) { | ||
eupmc = this | ||
log.error('Did not get a response from Europe PMC within ' + ms + 'ms'); | ||
EuPmc.prototype.timeoutCallback = function (ms) { | ||
var eupmc = this | ||
log.error('Did not get a response from Europe PMC within ' + ms + 'ms') | ||
if (eupmc.allresults) { | ||
@@ -191,43 +183,37 @@ log.info('Handling the limited number of search results we got.') | ||
} | ||
} | ||
EuPmc.prototype.buildQuery = function(query, options) { | ||
EuPmc.prototype.buildQuery = function (query, options) { | ||
var eupmc = this | ||
var eupmc = this; | ||
var queryurl = eupmc.baseurl + 'query=' + encodeURIComponent(query); | ||
Object.keys(options).forEach(function(key) { | ||
var val = options[key]; | ||
var queryurl = eupmc.baseurl + 'query=' + encodeURIComponent(query) | ||
Object.keys(options).forEach(function (key) { | ||
var val = options[key] | ||
if (key.length > 0) { | ||
queryurl += '&' + key + '=' + val; | ||
queryurl += '&' + key + '=' + val | ||
} | ||
}); | ||
return queryurl; | ||
}) | ||
return queryurl | ||
} | ||
EuPmc.prototype.formatResult = function(result) { | ||
EuPmc.prototype.formatResult = function (result) { | ||
return result.authorString + | ||
' (' + result.pubYear + '). ' + | ||
result.title + ' http://dx.doi.org/' + result.DOI; | ||
result.title + ' http://dx.doi.org/' + result.DOI | ||
} | ||
EuPmc.prototype.handleSearchResults = function(eupmc) { | ||
EuPmc.prototype.handleSearchResults = function (eupmc) { | ||
// see how many results were unique | ||
var originalLength = eupmc.allresults.length; | ||
eupmc.allresults = _.uniq(eupmc.allresults, function(x) { | ||
return eupmc.getIdentifier(x).id; | ||
}); | ||
var originalLength = eupmc.allresults.length | ||
eupmc.allresults = _.uniq(eupmc.allresults, function (x) { | ||
return eupmc.getIdentifier(x).id | ||
}) | ||
if (eupmc.allresults.length < originalLength) { | ||
log.info('Duplicate records found: ' + | ||
eupmc.allresults.length + | ||
' unique results identified'); | ||
' unique results identified') | ||
} | ||
if (eupmc.allresults.length > eupmc.hitlimit) { | ||
eupmc.allresults = eupmc.allresults.slice(0,eupmc.hitlimit) | ||
eupmc.allresults = eupmc.allresults.slice(0, eupmc.hitlimit) | ||
log.info('limiting hits') | ||
@@ -237,19 +223,19 @@ } | ||
// write the full result set to a file | ||
log.info('Saving result metadata'); | ||
var pretty = JSON.stringify(eupmc.allresults, null, 2); | ||
log.info('Saving result metadata') | ||
var pretty = JSON.stringify(eupmc.allresults, null, 2) | ||
fs.writeFileSync('eupmc_results.json', pretty) | ||
var filename = chalk.blue('eupmc_results.json') | ||
log.info('Full EUPMC result metadata written to ' + filename); | ||
var resultsFilename = chalk.blue('eupmc_results.json') | ||
log.info('Full EUPMC result metadata written to ' + resultsFilename) | ||
// write individual results to their respective directories | ||
eupmc.allresults.forEach(function(result) { | ||
eupmc.allresults.forEach(function (result) { | ||
eupmc.writeRecord(result, eupmc) | ||
}) | ||
log.info('Individual EUPMC result metadata records written'); | ||
log.info('Individual EUPMC result metadata records written') | ||
// write only the url list to file | ||
log.info('Extracting fulltext HTML URL list (may not be available for all articles)'); | ||
log.info('Extracting fulltext HTML URL list (may not be available for all articles)') | ||
var urls = eupmc.allresults | ||
.map(eupmc.getFulltextHTMLUrl, eupmc) | ||
.filter(function(x) { return !(x === null) }); | ||
.filter(function (x) { return !(x === null) }) | ||
@@ -259,19 +245,18 @@ if (urls.length > 0) { | ||
'eupmc_fulltext_html_urls.txt', | ||
urls.concat(["\n"]).join("\n") | ||
); | ||
var filename = chalk.blue('eupmc_fulltext_html_urls.txt') | ||
log.info('Fulltext HTML URL list written to ' + filename); | ||
urls.concat(['\n']).join('\n') | ||
) | ||
var urlFilename = chalk.blue('eupmc_fulltext_html_urls.txt') | ||
log.info('Fulltext HTML URL list written to ' + urlFilename) | ||
} | ||
eupmc.addDlTasks() | ||
} | ||
EuPmc.prototype.addDlTasks = function() { | ||
eupmc = this | ||
var dlTasks = []; | ||
EuPmc.prototype.addDlTasks = function () { | ||
var eupmc = this | ||
var dlTasks = [] | ||
// download the fullText XML | ||
if (eupmc.opts.xml) { | ||
dlTasks.push(eupmc.downloadFulltextXMLs); | ||
dlTasks.push(eupmc.downloadFulltextXMLs) | ||
} | ||
@@ -281,3 +266,3 @@ | ||
if (eupmc.opts.pdf) { | ||
dlTasks.push(eupmc.downloadFulltextPDFs); | ||
dlTasks.push(eupmc.downloadFulltextPDFs) | ||
} | ||
@@ -287,3 +272,3 @@ | ||
if (eupmc.opts.supp) { | ||
dlTasks.push(eupmc.downloadSuppFiles); | ||
dlTasks.push(eupmc.downloadSuppFiles) | ||
} | ||
@@ -293,109 +278,101 @@ | ||
if (eupmc.opts.minedterms) { | ||
dlTasks.push(eupmc.downloadMinedTerms); | ||
dlTasks.push(eupmc.summariseMinedTerms); | ||
dlTasks.push(eupmc.downloadMinedTerms) | ||
dlTasks.push(eupmc.summariseMinedTerms) | ||
} | ||
eupmc.runDlTasks(dlTasks); | ||
eupmc.runDlTasks(dlTasks) | ||
} | ||
EuPmc.prototype.runDlTasks = function(dlTasks) { | ||
EuPmc.prototype.runDlTasks = function (dlTasks) { | ||
var eupmc = this | ||
var eupmc = this; | ||
eupmc.dlTasks = dlTasks; | ||
eupmc.currDlTask = -1; | ||
eupmc.nextDlTask(); | ||
eupmc.dlTasks = dlTasks | ||
eupmc.currDlTask = -1 | ||
eupmc.nextDlTask() | ||
} | ||
EuPmc.prototype.nextDlTask = function() { | ||
EuPmc.prototype.nextDlTask = function () { | ||
var eupmc = this | ||
var eupmc = this; | ||
eupmc.currDlTask ++; | ||
eupmc.currDlTask ++ | ||
if (eupmc.dlTasks.length > eupmc.currDlTask) { | ||
var fun = eupmc.dlTasks[eupmc.currDlTask]; | ||
fun(eupmc); | ||
var fun = eupmc.dlTasks[eupmc.currDlTask] | ||
fun(eupmc) | ||
} else { | ||
process.exit(0); | ||
process.exit(0) | ||
} | ||
} | ||
EuPmc.prototype.downloadFulltextXMLs = function(eupmc) { | ||
urls = eupmc.allresults | ||
EuPmc.prototype.downloadFulltextXMLs = function (eupmc) { | ||
var urls = eupmc.allresults | ||
.map(eupmc.getFulltextXMLUrl, eupmc) | ||
.filter(function(x) { return !(x === null) }); | ||
.filter(function (x) { return !(x === null) }) | ||
log.info('Got XML URLs for ' + urls.length + ' out of ' + eupmc.allresults.length + ' results'); | ||
log.info('Got XML URLs for ' + urls.length + ' out of ' + eupmc.allresults.length + ' results') | ||
log.info('Downloading fulltext XML files'); | ||
log.info('Downloading fulltext XML files') | ||
var urlQueue = eupmc.urlQueueBuilder(urls, 'XML', 'fulltext.xml'); | ||
urlDl.downloadurlQueue(urlQueue, eupmc.nextDlTask.bind(eupmc)); | ||
var urlQueue = eupmc.urlQueueBuilder(urls, 'XML', 'fulltext.xml') | ||
urlDl.downloadurlQueue(urlQueue, eupmc.nextDlTask.bind(eupmc)) | ||
} | ||
EuPmc.prototype.downloadMinedTerms = function(eupmc) { | ||
urls = eupmc.allresults | ||
EuPmc.prototype.downloadMinedTerms = function (eupmc) { | ||
var urls = eupmc.allresults | ||
.map(eupmc.getMinedTermsURL, eupmc) | ||
.filter(function(x) { return !(x === null) }); | ||
.filter(function (x) { return !(x === null) }) | ||
log.info('Got mined terms JSON URLs for ' + urls.length + ' out of ' + eupmc.allresults.length + ' results'); | ||
log.info('Got mined terms JSON URLs for ' + urls.length + ' out of ' + eupmc.allresults.length + ' results') | ||
log.info('Downloading mined terms JSON files'); | ||
log.info('Downloading mined terms JSON files') | ||
var urlQueue = eupmc.urlQueueBuilder(urls, 'JSON', 'textMinedTerms.json'); | ||
urlDl.downloadurlQueue(urlQueue, eupmc.nextDlTask.bind(eupmc)); | ||
var urlQueue = eupmc.urlQueueBuilder(urls, 'JSON', 'textMinedTerms.json') | ||
urlDl.downloadurlQueue(urlQueue, eupmc.nextDlTask.bind(eupmc)) | ||
} | ||
EuPmc.prototype.downloadFulltextPDFs = function(eupmc) { | ||
urls = eupmc.allresults | ||
EuPmc.prototype.downloadFulltextPDFs = function (eupmc) { | ||
var urls = eupmc.allresults | ||
.map(eupmc.getFulltextPDFUrl, eupmc) | ||
.filter(function(x) { return !(x === null) }); | ||
.filter(function (x) { return !(x === null) }) | ||
log.info('Downloading fulltext PDF files'); | ||
log.info('Downloading fulltext PDF files') | ||
var urlQueue = eupmc.urlQueueBuilder(urls, 'PDF', 'fulltext.pdf'); | ||
urlDl.downloadurlQueue(urlQueue, eupmc.nextDlTask.bind(eupmc)); | ||
var urlQueue = eupmc.urlQueueBuilder(urls, 'PDF', 'fulltext.pdf') | ||
urlDl.downloadurlQueue(urlQueue, eupmc.nextDlTask.bind(eupmc)) | ||
} | ||
EuPmc.prototype.downloadSuppFiles = function(eupmc) { | ||
urls = eupmc.allresults | ||
EuPmc.prototype.downloadSuppFiles = function (eupmc) { | ||
var urls = eupmc.allresults | ||
.map(eupmc.getSuppFilesUrl, eupmc) | ||
.filter(function(x) { return !(x === null) }); | ||
.filter(function (x) { return !(x === null) }) | ||
log.info('Downloading supplementary files'); | ||
log.info('Downloading supplementary files') | ||
var failed = []; | ||
var retries = 0; | ||
var missing = 0; | ||
var failed = [] | ||
var retries = 0 | ||
var missing = 0 | ||
var fourohfour = function() { | ||
missing ++; | ||
var fourohfour = function () { | ||
missing++ | ||
} | ||
var done = _.after(urls.length, function() { | ||
if (failed.length > 0 && retries == 0) { | ||
log.warn(failed.length + ' downloads timed out. Retrying.'); | ||
failed = []; | ||
var done = _.after(urls.length, function () { | ||
if (failed.length > 0 && retries === 0) { | ||
log.warn(failed.length + ' downloads timed out. Retrying.') | ||
failed = [] | ||
eupmc.downloadUrls(urls, | ||
'supplementary files', | ||
'supplementaryFiles.zip', | ||
failed, done, eupmc, fourohfour); | ||
failed, done, eupmc, fourohfour) | ||
} else if (failed.length > 0) { | ||
log.warn(failed.length + ' downloads timed out on retry. Skipping.'); | ||
log.warn(failed.length + ' downloads timed out on retry. Skipping.') | ||
} else if (missing > 0) { | ||
var succeeded = urls.length - missing; | ||
var succeeded = urls.length - missing | ||
var suffix = missing > 1 ? 's' : '' | ||
log.info(succeeded + ' downloads succeeded. ' + missing + | ||
' paper' + suffix + ' had no supplementary files.'); | ||
' paper' + suffix + ' had no supplementary files.') | ||
} else { | ||
log.info('All supplementary file downloads succeeded!'); | ||
log.info('All supplementary file downloads succeeded!') | ||
} | ||
eupmc.nextDlTask(); | ||
}); | ||
eupmc.nextDlTask() | ||
}) | ||
@@ -405,13 +382,10 @@ eupmc.downloadUrls(urls, | ||
'supplementaryFiles.zip', | ||
failed, done, eupmc, fourohfour); | ||
failed, done, eupmc, fourohfour) | ||
} | ||
EuPmc.prototype.downloadUrls = function(urls, type, rename, failed, | ||
EuPmc.prototype.downloadUrls = function (urls, type, rename, failed, | ||
cb, thisArg, fourohfour) { | ||
var eupmc = thisArg; | ||
// setup progress bar | ||
var progmsg = 'Downloading files [:bar] :percent' + | ||
' (:current/:total) [:elapseds elapsed, eta :eta]'; | ||
' (:current/:total) [:elapseds elapsed, eta :eta]' | ||
var progopts = { | ||
@@ -421,12 +395,12 @@ total: urls.length, | ||
complete: chalk.green('=') | ||
}; | ||
var dlprogress = new ProgressBar(progmsg, progopts); | ||
} | ||
var dlprogress = new ProgressBar(progmsg, progopts) | ||
urls.forEach(function(url_id) { | ||
var url = url_id[0]; | ||
var id = url_id[1]; | ||
var base = id + '/'; | ||
log.debug('Creating directory: ' + base); | ||
mkdirp.sync(base); | ||
log.debug('Downloading ' + type + ': ' + url); | ||
urls.forEach(function (urlId) { | ||
var url = urlId[0] | ||
var id = urlId[1] | ||
var base = id + '/' | ||
log.debug('Creating directory: ' + base) | ||
mkdirp.sync(base) | ||
log.debug('Downloading ' + type + ': ' + url) | ||
var options = { | ||
@@ -436,54 +410,48 @@ timeout: 15000, | ||
} | ||
var get = got(url, options, function(err, data, res) { | ||
dlprogress.tick(); | ||
got(url, options, function (err, data, res) { | ||
dlprogress.tick() | ||
if (err) { | ||
if (err.code === 'ETIMEDOUT' || err.code === 'ESOCKETTIMEDOUT') { | ||
log.warn('Download timed out for URL ' + url); | ||
log.warn('Download timed out for URL ' + url) | ||
} | ||
if (!res) { | ||
failed.push(url); | ||
} else if ((res.statusCode == 404) && !(fourohfour === null)) { | ||
fourohfour(); | ||
failed.push(url) | ||
} else if ((res.statusCode === 404) && !(fourohfour === null)) { | ||
fourohfour() | ||
} else { | ||
failed.push(url); | ||
failed.push(url) | ||
} | ||
cb(); | ||
cb() | ||
} else { | ||
fs.writeFile(base + rename, data, cb); | ||
fs.writeFile(base + rename, data, cb) | ||
} | ||
}); | ||
}); | ||
}) | ||
}) | ||
} | ||
EuPmc.prototype.getFulltextHTMLUrl = function (result, oa) { | ||
var eupmc = this | ||
var id = eupmc.getIdentifier(result) | ||
EuPmc.prototype.getFulltextHTMLUrl = function(result, oa) { | ||
if (!result.fullTextUrlList) { return eupmc.noFulltextUrls(id) } | ||
var eupmc = this; | ||
var id = eupmc.getIdentifier(result); | ||
if (!result.fullTextUrlList) { return eupmc.noFulltextUrls(id); } | ||
var urls = result.fullTextUrlList[0].fullTextUrl; | ||
var htmlUrls = urls.filter(function(u) { | ||
return (u.documentStyle[0] == 'html' || u.documentStyle[0] == 'doi') | ||
}).sort(function(a, b) { | ||
return (a.availabilityCode[0] == 'OA' || eupmc.opts.all) ? -1 : 1 | ||
}); | ||
if (htmlUrls.length == 0) { | ||
var id = eupmc.getIdentifier(result); | ||
var urls = result.fullTextUrlList[0].fullTextUrl | ||
var htmlUrls = urls.filter(function (u) { | ||
return (u.documentStyle[0] === 'html' || u.documentStyle[0] === 'doi') | ||
}).sort(function (a, b) { | ||
return (a.availabilityCode[0] === 'OA' || eupmc.opts.all) ? -1 : 1 | ||
}) | ||
if (htmlUrls.length === 0) { | ||
log.warn('Article with ' + id.type + ' "' + | ||
id.id + '" had no fulltext HTML url'); | ||
return null; | ||
id.id + '" had no fulltext HTML url') | ||
return null | ||
} else { | ||
return htmlUrls[0].url[0]; | ||
return htmlUrls[0].url[0] | ||
} | ||
} | ||
EuPmc.prototype.getIdentifier = function(result) { | ||
var types = ['pmcid', 'doi', 'pmid', 'title']; | ||
EuPmc.prototype.getIdentifier = function (result) { | ||
var types = ['pmcid', 'doi', 'pmid', 'title'] | ||
for (var i = 0; i < types.length; i++) { | ||
var type = types[i]; | ||
var type = types[i] | ||
if (result.hasOwnProperty(type) && result[type].length > 0) { | ||
@@ -501,123 +469,111 @@ return { | ||
} | ||
} | ||
EuPmc.prototype.getFulltextXMLUrl = function (result) { | ||
var eupmc = this | ||
EuPmc.prototype.getFulltextXMLUrl = function(result) { | ||
var id = eupmc.getIdentifier(result) | ||
var eupmc = this; | ||
var xmlurl = null | ||
var id = eupmc.getIdentifier(result); | ||
var xmlurl = null; | ||
if (id.type === 'pmcid') { | ||
xmlurl = 'http://www.ebi.ac.uk/europepmc/webservices/rest/' + | ||
id.id + '/fullTextXML'; | ||
id.id + '/fullTextXML' | ||
} else { | ||
log.warn('Article with ' + id.type + ' "' + | ||
id.id + ' did not have a PMCID (therefore no XML)'); | ||
return null; | ||
id.id + ' did not have a PMCID (therefore no XML)') | ||
return null | ||
} | ||
if (!result.fullTextUrlList) { return eupmc.noFulltextUrls(id); } | ||
if (!result.fullTextUrlList) { return eupmc.noFulltextUrls(id) } | ||
var urls = result.fullTextUrlList[0].fullTextUrl; | ||
var htmlUrls = urls.filter(function(u) { | ||
var urls = result.fullTextUrlList[0].fullTextUrl | ||
var htmlUrls = urls.filter(function (u) { | ||
return (u.documentStyle[0] === 'html' || u.documentStyle[0] === 'doi') | ||
}).filter(function(a, b) { | ||
return (a.availabilityCode[0] === 'OA'); | ||
}); | ||
if (htmlUrls.length == 0) { | ||
var id = eupmc.getIdentifier(result); | ||
}).filter(function (a, b) { | ||
return (a.availabilityCode[0] === 'OA') | ||
}) | ||
if (htmlUrls.length === 0) { | ||
log.warn('Article with ' + id.type + ' "' + | ||
id.id + '" was not Open Access (therefore no XML)'); | ||
return null; | ||
id.id + '" was not Open Access (therefore no XML)') | ||
return null | ||
} | ||
return [xmlurl, id.id]; | ||
return [xmlurl, id.id] | ||
} | ||
EuPmc.prototype.getFulltextPDFUrl = function(result) { | ||
EuPmc.prototype.getFulltextPDFUrl = function (result) { | ||
var eupmc = this | ||
var id = eupmc.getIdentifier(result) | ||
var eupmc = this; | ||
var id = eupmc.getIdentifier(result); | ||
var noPDF = function(id) { | ||
log.warn('Article with ' + id.type + ' "' + | ||
id.id + '" had no fulltext PDF url'); | ||
return null; | ||
var noPDF = function (id) { | ||
log.warn('Article with ' + id.type + ' "' + | ||
id.id + '" had no fulltext PDF url') | ||
return null | ||
} | ||
if (!result.fullTextUrlList) { return eupmc.noFulltextUrls(id); } | ||
if (result.hasPDF == 'N') { return noPDF(id); } | ||
if (!result.fullTextUrlList) { return eupmc.noFulltextUrls(id) } | ||
if (result.hasPDF === 'N') { return noPDF(id) } | ||
var urls = result.fullTextUrlList[0].fullTextUrl; | ||
var pdfOAurls = urls.filter(function(u) { | ||
var urls = result.fullTextUrlList[0].fullTextUrl | ||
var pdfOAurls = urls.filter(function (u) { | ||
return u.documentStyle[0] === 'pdf' && | ||
u.availabilityCode[0] === 'OA' | ||
}); | ||
}) | ||
if (pdfOAurls.length == 0) { | ||
return noPDF(id); | ||
if (pdfOAurls.length === 0) { | ||
return noPDF(id) | ||
} else { | ||
return [pdfOAurls[0].url[0], id.id]; | ||
return [pdfOAurls[0].url[0], id.id] | ||
} | ||
} | ||
EuPmc.prototype.urlQueueBuilder = function(urls, type, rename) { | ||
return urls.map(function(url_id) { | ||
return {url: url_id[0], id: url_id[1], type: type, rename: rename } | ||
EuPmc.prototype.urlQueueBuilder = function (urls, type, rename) { | ||
return urls.map(function (urlId) { | ||
return { url: urlId[0], id: urlId[1], type: type, rename: rename } | ||
}) | ||
}; | ||
} | ||
EuPmc.prototype.getSuppFilesUrl = function (result) { | ||
var eupmc = this | ||
EuPmc.prototype.getSuppFilesUrl = function(result) { | ||
var id = eupmc.getIdentifier(result) | ||
var eupmc = this; | ||
var id = eupmc.getIdentifier(result); | ||
if (id.type == 'pmcid') { | ||
if (id.type === 'pmcid') { | ||
return ['http://www.ebi.ac.uk/europepmc/webservices/rest/' + | ||
id.id + '/supplementaryFiles', id.id]; | ||
id.id + '/supplementaryFiles', id.id] | ||
} else { | ||
log.warn('Article with ' + id.type + ' "' + | ||
id.id + ' did not have a PMCID (therefore no supplementary files)'); | ||
return null; | ||
id.id + ' did not have a PMCID (therefore no supplementary files)') | ||
return null | ||
} | ||
} | ||
EuPmc.prototype.getMinedTermsURL = function(result) { | ||
EuPmc.prototype.getMinedTermsURL = function (result) { | ||
var eupmc = this | ||
var eupmc = this; | ||
var id = eupmc.getIdentifier(result) | ||
var id = eupmc.getIdentifier(result); | ||
if (id.type == 'pmcid') { | ||
if (id.type === 'pmcid') { | ||
return ['http://www.ebi.ac.uk/europepmc/webservices/rest/PMC/' + | ||
id.id + '/textMinedTerms//1/1000/json', id.id]; | ||
id.id + '/textMinedTerms//1/1000/json', id.id] | ||
} else { | ||
log.warn('Article with ' + id.type + ' "' + | ||
id.id + ' did not have a PMCID (therefore no mined terms)'); | ||
return null; | ||
id.id + ' did not have a PMCID (therefore no mined terms)') | ||
return null | ||
} | ||
} | ||
EuPmc.prototype.summariseMinedTerms = function() { | ||
EuPmc.prototype.summariseMinedTerms = function () { | ||
log.info('Writing mined term summary CSV files to minedterms_summary/') | ||
mkdirp.sync('minedterms_summary') | ||
var termstore = {} | ||
glob.sync(['*/textMinedTerms.json']).forEach(function(termsFile) { | ||
glob.sync(['*/textMinedTerms.json']).forEach(function (termsFile) { | ||
var json = fs.readFileSync(termsFile, 'utf8') | ||
var terms = JSON.parse(json) | ||
terms.semanticTypeList.semanticType.forEach(function(termset) { | ||
terms.semanticTypeList.semanticType.forEach(function (termset) { | ||
if (!termstore[termset.name]) { | ||
termstore[termset.name] = [] | ||
} | ||
var rows = termset.tmSummary.map(function(term) { | ||
var rows = termset.tmSummary.map(function (term) { | ||
return [ | ||
@@ -634,5 +590,5 @@ terms.request.id, | ||
}) | ||
Object.keys(termstore).forEach(function(key) { | ||
Object.keys(termstore).forEach(function (key) { | ||
var head = 'article,' + key + ',count,dbName,dbId\n' | ||
var csv = head + termstore[key].map(function(row) { | ||
var csv = head + termstore[key].map(function (row) { | ||
return row.join(',') | ||
@@ -644,5 +600,5 @@ }).join('\n') + '\n' | ||
EuPmc.prototype.writeRecord = function(record, eupmc) { | ||
var json = JSON.stringify(record, null, 2); | ||
var id = eupmc.getIdentifier(record).id; | ||
EuPmc.prototype.writeRecord = function (record, eupmc) { | ||
var json = JSON.stringify(record, null, 2) | ||
var id = eupmc.getIdentifier(record).id | ||
mkdirp.sync(id) | ||
@@ -652,12 +608,8 @@ fs.writeFileSync(id + '/eupmc_result.json', json) | ||
EuPmc.prototype.noFulltextUrls = function(id) { | ||
EuPmc.prototype.noFulltextUrls = function (id) { | ||
log.debug('Article with ' + id.type + ' "' + | ||
id.id + '" had no fulltext Urls'); | ||
return null; | ||
id.id + '" had no fulltext Urls') | ||
return null | ||
} | ||
module.exports = EuPmc; | ||
module.exports = EuPmc |
260
lib/ieee.js
@@ -1,123 +0,114 @@ | ||
var util = require('util') | ||
, fs = require('fs') | ||
, chalk = require('chalk') | ||
, got = require('got') | ||
, mkdirp = require('mkdirp') | ||
, _ = require('lodash') | ||
, request = require('requestretry') | ||
, ProgressBar = require('progress') | ||
, config = require ('./config.js') | ||
var fs = require('fs') | ||
var chalk = require('chalk') | ||
var request = require('requestretry') | ||
var ProgressBar = require('progress') | ||
var config = require('./config.js') | ||
var log = require('winston') | ||
var parseString = require('xml2js').parseString | ||
var IEEE = function(opts) { | ||
var IEEE = function (opts) { | ||
this.baseurl = 'http://ieeexplore.ieee.org/gateway/ipsSearch.jsp?' | ||
this.opts = opts; | ||
this.opts = opts | ||
} | ||
IEEE.prototype.search = function(query) { | ||
IEEE.prototype.search = function (query) { | ||
var ieee = this | ||
var ieee = this; | ||
if (ieee.opts.xml) { | ||
log.warn("The IEEE API does not provide fulltext XML, so the --xml flag will be ignored"); | ||
log.warn('The IEEE API does not provide fulltext XML, so the --xml flag will be ignored') | ||
} | ||
if (ieee.opts.pdf) { | ||
log.warn("The IEEE API does not provide fulltext PDF links, so the --pdf flag will be ignored"); | ||
log.warn('The IEEE API does not provide fulltext PDF links, so the --pdf flag will be ignored') | ||
} | ||
if (ieee.opts.minedterms) { | ||
log.warn("The IEEE API does not provide mined terms, so the --minedterms flag will be ignored"); | ||
log.warn('The IEEE API does not provide mined terms, so the --minedterms flag will be ignored') | ||
} | ||
if (ieee.opts.supp) { | ||
log.warn("The IEEE API does not provide supplementary materials, so the --supp flag will be ignored"); | ||
log.warn('The IEEE API does not provide supplementary materials, so the --supp flag will be ignored') | ||
} | ||
ieee.pagesize = 200; | ||
ieee.pagesize = 200 | ||
options = { | ||
var options = { | ||
hc: ieee.pagesize | ||
}; | ||
} | ||
if (!ieee.opts.all) { | ||
options['oa'] = 1; | ||
options['oa'] = 1 | ||
} | ||
ieee.queryurl = ieee.buildQuery(query, options); | ||
ieee.first = true; | ||
ieee.residualhits = 0; | ||
ieee.hitlimit = ieee.opts.hitlimit ? ieee.opts.hitlimit : 0; | ||
ieee.hitcount = 0; | ||
ieee.allresults = []; | ||
ieee.iter = 1; | ||
ieee.queryurl = ieee.buildQuery(query, options) | ||
ieee.first = true | ||
ieee.residualhits = 0 | ||
ieee.hitlimit = ieee.opts.hitlimit ? ieee.opts.hitlimit : 0 | ||
ieee.hitcount = 0 | ||
ieee.allresults = [] | ||
ieee.iter = 1 | ||
ieee.timeouts = 0; | ||
ieee.timeouts = 0 | ||
ieee.resultstream = fs.createWriteStream('ieee_results.json'); | ||
ieee.fulltextURLstream = fs.createWriteStream('ieee_fulltext_html_urls.txt'); | ||
ieee.resultstream = fs.createWriteStream('ieee_results.json') | ||
ieee.fulltextURLstream = fs.createWriteStream('ieee_fulltext_html_urls.txt') | ||
ieee.pageQuery(); | ||
ieee.pageQuery() | ||
} | ||
IEEE.prototype.pageQuery = function() { | ||
IEEE.prototype.pageQuery = function () { | ||
var ieee = this | ||
var ieee = this; | ||
var thisQueryUrl = ieee.queryurl | ||
var thisQueryUrl = ieee.queryurl; | ||
if (ieee.iter > 0) { | ||
var pageterm = '&rs=' + ieee.iter; | ||
thisQueryUrl += pageterm; | ||
var pageterm = '&rs=' + ieee.iter | ||
thisQueryUrl += pageterm | ||
} | ||
log.debug(thisQueryUrl); | ||
log.debug(thisQueryUrl) | ||
var rq = request.get({url: thisQueryUrl, | ||
headers: { 'Accept': 'application/json', | ||
'User-Agent': config.userAgent}}); | ||
headers: { | ||
'Accept': 'application/json', | ||
'User-Agent': config.userAgent | ||
}}) | ||
var convertXML2JSON = function (data) { | ||
//console.log(data.body) | ||
// console.log(data.body) | ||
parseString(data.body, function (err, datum) { | ||
cb = ieee.completeCallback.bind(ieee, datum) | ||
cb() } ) | ||
if (err) throw err | ||
var cb = ieee.completeCallback.bind(ieee, datum) | ||
cb() | ||
}) | ||
} | ||
rq.on('complete', convertXML2JSON); | ||
rq.on('timeout', ieee.timeoutCallback.bind(ieee)); | ||
rq.on('complete', convertXML2JSON) | ||
rq.on('timeout', ieee.timeoutCallback.bind(ieee)) | ||
} | ||
IEEE.prototype.completeCallback = function(data) { | ||
IEEE.prototype.completeCallback = function (data) { | ||
var ieee = this | ||
var ieee = this; | ||
var totalfound = 0 | ||
var totalfound = 0; | ||
if (data.hasOwnProperty('root')) { | ||
var totalfound = parseInt(data.root.totalfound[0]); | ||
totalfound = parseInt(data.root.totalfound[0]) | ||
} | ||
if (ieee.first) { | ||
ieee.first = false; | ||
ieee.hitcount = totalfound; | ||
var oaclause = ieee.opts.all ? '' : ' open access'; | ||
log.info('Found ' + ieee.hitcount + oaclause + ' results'); | ||
if (ieee.hitcount == 0 || ieee.opts.noexecute) { | ||
process.exit(0); | ||
ieee.first = false | ||
ieee.hitcount = totalfound | ||
var oaclause = ieee.opts.all ? '' : ' open access' | ||
log.info('Found ' + ieee.hitcount + oaclause + ' results') | ||
if (ieee.hitcount === 0 || ieee.opts.noexecute) { | ||
process.exit(0) | ||
} | ||
log.info('The IEEE API does not provide fulltext HTML links, but we will try to guess them from other metadata'); | ||
log.info('The IEEE API does not provide fulltext HTML links, but we will try to guess them from other metadata') | ||
// set hitlimit | ||
// set hitlimit | ||
if (ieee.hitlimit && ieee.hitlimit < ieee.hitcount) { | ||
log.info('Limiting to ' + ieee.hitlimit + ' hits'); | ||
} | ||
else { ieee.hitlimit = ieee.hitcount; } | ||
log.info('Limiting to ' + ieee.hitlimit + ' hits') | ||
} else { ieee.hitlimit = ieee.hitcount } | ||
// create progress bar | ||
var progmsg = 'Fetching result metadata [:bar] :percent' + | ||
' (:current/:total) [:elapseds elapsed, eta :etas]'; | ||
' (:current/:total) [:elapseds elapsed, eta :etas]' | ||
var progopts = { | ||
@@ -127,117 +118,100 @@ total: ieee.hitlimit, | ||
complete: chalk.green('=') | ||
}; | ||
ieee.pageprogress = new ProgressBar(progmsg, progopts); | ||
} | ||
ieee.pageprogress = new ProgressBar(progmsg, progopts) | ||
} | ||
var result | ||
if (!ieee.residualhits) { | ||
result = data.root.document | ||
} else { | ||
result = data.root.document.slice(0, ieee.residualhits) | ||
} | ||
var pretty = JSON.stringify(result, null, 2) | ||
ieee.resultstream.write(pretty) | ||
if(!ieee.residualhits) { var result = data.root.document; } | ||
else { var result = data.root.document.slice(0,ieee.residualhits); } | ||
var pretty = JSON.stringify(result, null, 2); | ||
ieee.resultstream.write(pretty); | ||
var urls = ieee.getFulltextHTMLUrls(result) | ||
urls.forEach(function (url) { ieee.fulltextURLstream.write(url + '\n') }) | ||
var urls = ieee.getFulltextHTMLUrls(result); | ||
urls.forEach(function(url) { ieee.fulltextURLstream.write(url + '\n') }); | ||
ieee.allresults = ieee.allresults.concat(result) | ||
ieee.pageprogress.tick(result.length) | ||
ieee.allresults = ieee.allresults.concat(result); | ||
ieee.pageprogress.tick(result.length); | ||
if (ieee.allresults.length < ieee.hitcount) { | ||
ieee.iter += 1; | ||
remaininghits = ieee.hitcount - ieee.allresults.length; | ||
if(remaininghits<ieee.pagesize) { ieee.residualhits = remaininghits; } | ||
log.debug(ieee.allresults.length); | ||
ieee.pageQuery(); | ||
ieee.iter += 1 | ||
var remaininghits = ieee.hitcount - ieee.allresults.length | ||
if (remaininghits < ieee.pagesize) { ieee.residualhits = remaininghits } | ||
log.debug(ieee.allresults.length) | ||
ieee.pageQuery() | ||
} else { | ||
log.info('Done collecting results. Got ' + ieee.allresults.length); | ||
ieee.handleSearchResults(ieee); | ||
log.info('Done collecting results. Got ' + ieee.allresults.length) | ||
ieee.handleSearchResults(ieee) | ||
} | ||
} | ||
IEEE.prototype.handleSearchResults = function(ieee) { | ||
IEEE.prototype.handleSearchResults = function (ieee) { | ||
// write the full result set to a file | ||
log.info('Saving result metadata'); | ||
var pretty = JSON.stringify(ieee.allresults, null, 2); | ||
log.info('Saving result metadata') | ||
var pretty = JSON.stringify(ieee.allresults, null, 2) | ||
fs.writeFileSync('ieee_results.json', pretty) | ||
var filename = chalk.blue('ieee_results.json') | ||
log.info('Full IEEE result metadata written to ' + filename); | ||
log.info('Full IEEE result metadata written to ' + filename) | ||
ieee.fulltextURLstream.end(); | ||
ieee.fulltextURLstream.end() | ||
filename = chalk.blue('ieee_fulltext_html_urls.txt') | ||
log.info('Fulltext HTML URL list written to ' + filename); | ||
log.info('Fulltext HTML URL list written to ' + filename) | ||
} | ||
IEEE.prototype.timeoutCallback = function(ms) { | ||
IEEE.prototype.timeoutCallback = function (ms) { | ||
var ieee = this | ||
var ieee = this; | ||
log.error('Did not get a response from the IEEE API within ' + ms + 'ms') | ||
log.error('There have been ' + ieee.timeouts + ' total timeouts') | ||
ieee.timeouts += 1 | ||
log.error('Did not get a response from the IEEE API within ' + ms + 'ms'); | ||
log.error('There have been ' + ieee.timeouts + ' total timeouts'); | ||
ieee.timeouts += 1; | ||
if (ieee.timeouts > 99) { | ||
log.info('Timed out 100 times - the connection is probably broken'); | ||
log.info('Timed out 100 times - the connection is probably broken') | ||
log.info('You have either been disconnected from the internet, or ' + | ||
'the API provider has blocked your IP'); | ||
process.exit(1); | ||
'the API provider has blocked your IP') | ||
process.exit(1) | ||
} else { | ||
log.info('Retrying timed-out query'); | ||
ieee.pageQuery(); | ||
log.info('Retrying timed-out query') | ||
ieee.pageQuery() | ||
} | ||
} | ||
}; | ||
IEEE.prototype.buildQuery = function (query, options) { | ||
var ieee = this | ||
IEEE.prototype.buildQuery = function(query, options) { | ||
var queryurl = ieee.baseurl + 'querytext=' + encodeURIComponent(query) | ||
var ieee = this; | ||
var queryurl = ieee.baseurl + 'querytext=' + encodeURIComponent(query); | ||
Object.keys(options).forEach(function(key) { | ||
var val = options[key]; | ||
Object.keys(options).forEach(function (key) { | ||
var val = options[key] | ||
if (key.length > 0) { | ||
queryurl += '&' + key + '=' + val; | ||
queryurl += '&' + key + '=' + val | ||
} | ||
}); | ||
}) | ||
return queryurl; | ||
return queryurl | ||
} | ||
IEEE.prototype.getFulltextHTMLUrl = function(result) { | ||
var ieee = this; | ||
IEEE.prototype.getFulltextHTMLUrl = function (result) { | ||
if (result.htmlFlag && result.htmlFlag[0] === '1') { | ||
var arnumber = result.arnumber[0] | ||
var arnumber = result.arnumber[0]; | ||
var url = "http://ieeexplore.ieee.org/xpls/icp.jsp?arnumber=" + arnumber; | ||
result.html = url; | ||
return url; | ||
var url = 'http://ieeexplore.ieee.org/xpls/icp.jsp?arnumber=' + arnumber | ||
result.html = url | ||
return url | ||
} else { | ||
return null; | ||
return null | ||
} | ||
} | ||
IEEE.prototype.getFulltextHTMLUrls = function(results) { | ||
IEEE.prototype.getFulltextHTMLUrls = function (results) { | ||
var ieee = this | ||
var ieee = this; | ||
return results | ||
.map(ieee.getFulltextHTMLUrl, ieee) | ||
.filter(function(x) { return !(x === null) }); | ||
.filter(function (x) { return !(x === null) }) | ||
} | ||
module.exports = IEEE; | ||
module.exports = IEEE |
@@ -1,2 +0,2 @@ | ||
var log = module.exports; | ||
var log = module.exports | ||
@@ -14,3 +14,3 @@ log.levels = { | ||
error: 9 | ||
}; | ||
} | ||
@@ -28,2 +28,2 @@ log.colors = { | ||
error: 'red' | ||
}; | ||
} |
{ | ||
"name": "getpapers", | ||
"description": "Get fulltexts or fulltext URLs of papers matching a search query", | ||
"version": "0.4.13", | ||
"version": "0.4.14", | ||
"homepage": "https://github.com/ContentMine/getpapers", | ||
@@ -29,3 +29,3 @@ "author": { | ||
"scripts": { | ||
"test": "mocha", | ||
"test": "standard && mocha ", | ||
"coverage": "istanbul cover ./node_modules/mocha/bin/_mocha --report lcovonly -- -R spec", | ||
@@ -47,3 +47,3 @@ "coveralls": "istanbul cover ./node_modules/mocha/bin/_mocha --report lcovonly -- -R spec && cat ./coverage/lcov.info | ./node_modules/coveralls/bin/coveralls.js && rm -rf ./coverage" | ||
"version_compare": "0.0.3", | ||
"winston": "~1.0.0", | ||
"winston": "~2.3.1", | ||
"xml2js": "^0.4.17" | ||
@@ -55,8 +55,13 @@ }, | ||
"devDependencies": { | ||
"chai": "^4.0.2", | ||
"coveralls": "~2.11.2", | ||
"grunt": "~0.4.5", | ||
"coveralls": "~2.11.2", | ||
"istanbul": "~0.3.13", | ||
"mocha": "~2.2.4", | ||
"mocha-lcov-reporter": "0.0.2", | ||
"nock": "^9.0.13", | ||
"should": "~4.0.0", | ||
"istanbul": "~0.3.13", | ||
"mocha": "~2.2.4" | ||
"standard": "^10.0.2", | ||
"sinon": "^2.3.5", | ||
"sinon-chai": "^2.11.0" | ||
}, | ||
@@ -68,3 +73,15 @@ "keywords": [ | ||
"science" | ||
] | ||
], | ||
"standard": { | ||
"globals": [ | ||
"describe", | ||
"context", | ||
"before", | ||
"beforeEach", | ||
"after", | ||
"afterEach", | ||
"it", | ||
"expect" | ||
] | ||
} | ||
} |
170845
20
1375
11
+ Added winston@2.3.1 (transitive)
- Removed pkginfo@0.3.1 (transitive)
- Removed winston@1.0.2 (transitive)
Updated winston@~2.3.1