New Case Study:See how Anthropic automated 95% of dependency reviews with Socket.Learn More
Socket
Sign inDemoInstall
Socket

getpapers

Package Overview
Dependencies
Maintainers
3
Versions
31
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

getpapers - npm Package Compare versions

Comparing version 0.4.13 to 0.4.14

.travis.yml

29

bin/getpapers.js
#!/usr/bin/env node
/* global log */
var program = require('commander')
var fs = require('fs')
var winston = require('winston')
var log = require('winston')
var api = require('../lib/api.js')
var loglevels = require('../lib/loglevels.js')
var mkdirp = require('mkdirp')
var pjson = require('../package.json')

@@ -60,3 +58,3 @@

if (allowedlevels.indexOf(program.loglevel) === -1) {
winston.error('Loglevel must be one of: ',
log.error('Loglevel must be one of: ',
'quiet, verbose, data, info, warn, error, debug')

@@ -66,8 +64,6 @@ process.exit(1)

log = new (winston.Logger)({
transports: [new winston.transports.Console({
level: program.loglevel,
levels: loglevels.levels,
colorize: true
})],
log.addColors(loglevels.colors)
log.remove(log.transports.Console) // reset logger to nothing
log.add(log.transports.Console, {
level: program.loglevel,

@@ -77,7 +73,6 @@ levels: loglevels.levels,

})
winston.addColors(loglevels.colors)
if (program.hasOwnProperty('logfile')) {
logstream = fs.createWriteStream(program.logfile.toString())
log.add(winston.transports.File, {
var logstream = fs.createWriteStream(program.logfile.toString())
log.add(log.transports.File, {
stream: logstream,

@@ -90,3 +85,3 @@ level: 'debug'

// check arguments
if (typeof program.query === "undefined" && program.api!=='crossref') {
if (typeof program.query === 'undefined' && program.api !== 'crossref') {
log.error('No query given. ' +

@@ -97,3 +92,3 @@ 'You must provide the --query argument.')

if (program.filter && program.api!=='crossref') {
if (program.filter && program.api !== 'crossref') {
log.warn('Filter given but not using CrossRef api ' +

@@ -130,4 +125,4 @@ 'so no filter applied.')

var chosenapi = api(program.api)
var searchapi = new chosenapi(options)
var Chosenapi = api(program.api)
var searchapi = new Chosenapi(options)
searchapi.search(program.query)

@@ -5,2 +5,3 @@ var eupmc = require('./eupmc.js')

var ieee = require('./ieee.js')
var log = require('winston')

@@ -7,0 +8,0 @@ var chooseAPI = function (api) {

@@ -1,100 +0,88 @@

var util = require('util')
, fs = require('fs')
, chalk = require('chalk')
, got = require('got')
, mkdirp = require('mkdirp')
, _ = require('lodash')
, ProgressBar = require('progress')
, request = require('requestretry')
, urlDl = require('./download.js')
, config = require('./config.js')
var fs = require('fs')
var chalk = require('chalk')
var ProgressBar = require('progress')
var request = require('requestretry')
var urlDl = require('./download.js')
var config = require('./config.js')
var log = require('winston')
var parseString = require('xml2js').parseString
var ArXiv = function(opts) {
var ArXiv = function (opts) {
this.baseurl = 'http://export.arxiv.org/api/query?search_query='
this.opts = opts;
this.opts = opts
}
ArXiv.prototype.search = function(query) {
ArXiv.prototype.search = function (query) {
var arxiv = this
var arxiv = this;
if (arxiv.opts.xml) {
log.warn("The ArXiv API does not provide fulltext XML, so the --xml flag will be ignored");
log.warn('The ArXiv API does not provide fulltext XML, so the --xml flag will be ignored')
}
if (arxiv.opts.minedterms) {
log.warn("The ArXiv API does not provide mined terms so the --minedterms flag will be ignored");
log.warn('The ArXiv API does not provide mined terms so the --minedterms flag will be ignored')
}
var options = {}
var options = {};
arxiv.queryurl = arxiv.buildQuery(query, options);
arxiv.first = true;
arxiv.hitlimit = arxiv.opts.hitlimit ? arxiv.opts.hitlimit : 0;
arxiv.hitcount = 0;
arxiv.residualhits = 0;
arxiv.allresults = [];
arxiv.iter = 0;
arxiv.pagesize = 500;
arxiv.page_delay = 3000; // miliseconds to wait between requests
arxiv.pageQuery();
arxiv.queryurl = arxiv.buildQuery(query, options)
arxiv.first = true
arxiv.hitlimit = arxiv.opts.hitlimit ? arxiv.opts.hitlimit : 0
arxiv.hitcount = 0
arxiv.residualhits = 0
arxiv.allresults = []
arxiv.iter = 0
arxiv.pagesize = 500
arxiv.page_delay = 3000 // miliseconds to wait between requests
arxiv.pageQuery()
}
ArXiv.prototype.pageQuery = function() {
ArXiv.prototype.pageQuery = function () {
var arxiv = this
var arxiv = this;
var thisQueryUrl = arxiv.queryurl
var thisQueryUrl = arxiv.queryurl;
var pageterm =
'&start=' + arxiv.iter +
'&max_results=' + arxiv.pagesize;
thisQueryUrl += pageterm;
'&max_results=' + arxiv.pagesize
thisQueryUrl += pageterm
log.debug(thisQueryUrl);
log.debug(thisQueryUrl)
var rq = request.get({url: thisQueryUrl,
headers: {'User-Agent': config.userAgent}
});
headers: {'User-Agent': config.userAgent}})
var convertXML2JSON = function (data) {
//console.log(data.body)
// console.log(data.body)
parseString(data.body, function (err, datum) {
cb = arxiv.completeCallback.bind(arxiv, datum)
cb() } )
if (err) throw err
var cb = arxiv.completeCallback.bind(arxiv, datum)
cb()
})
}
rq.on('complete', convertXML2JSON);
rq.on('timeout', arxiv.timeoutCallback);
rq.on('complete', convertXML2JSON)
rq.on('timeout', arxiv.timeoutCallback)
}
ArXiv.prototype.completeCallback = function(data) {
ArXiv.prototype.completeCallback = function (data) {
var arxiv = this
var arxiv = this;
var totalfound = parseInt(data.feed['opensearch:totalResults'][0]._)
var totalfound = parseInt(data.feed['opensearch:totalResults'][0]._);
if (arxiv.first) {
arxiv.first = false;
arxiv.hitcount = totalfound;
log.info('Found ' + arxiv.hitcount + ' results');
if (arxiv.hitcount == 0 || arxiv.opts.noexecute) {
process.exit(0);
arxiv.first = false
arxiv.hitcount = totalfound
log.info('Found ' + arxiv.hitcount + ' results')
if (arxiv.hitcount === 0 || arxiv.opts.noexecute) {
process.exit(0)
}
// set hitlimit
// set hitlimit
if (arxiv.hitlimit && arxiv.hitlimit < arxiv.hitcount) {
log.info('Limiting to ' + arxiv.hitlimit + ' hits');
}
else { arxiv.hitlimit = arxiv.hitcount; }
log.info('Limiting to ' + arxiv.hitlimit + ' hits')
} else { arxiv.hitlimit = arxiv.hitcount }
// create progress bar
var progmsg = 'Retrieving results [:bar] :percent' +
' (eta :etas)';
' (eta :etas)'
var progopts = {

@@ -104,50 +92,46 @@ total: arxiv.hitlimit,

complete: chalk.green('=')
};
arxiv.pageprogress = new ProgressBar(progmsg, progopts);
}
arxiv.pageprogress = new ProgressBar(progmsg, progopts)
}
if (data && data.feed && data.feed.entry) {
if (!arxiv.residualhits) { var result = data.feed.entry; }
else { var result = data.feed.entry.slice(0,arxiv.hitlimit); }
var result
if (!arxiv.residualhits) { result = data.feed.entry } else { result = data.feed.entry.slice(0, arxiv.hitlimit) }
} else {
log.error('Malformed response from arXiv API - no data in feed');
log.debug(data);
log.info('Retrying failed request');
setTimeout(arxiv.pageQuery.bind(arxiv), arxiv.page_delay);
return;
log.error('Malformed response from arXiv API - no data in feed')
log.debug(data)
log.info('Retrying failed request')
setTimeout(arxiv.pageQuery.bind(arxiv), arxiv.page_delay)
return
}
log.debug('Got', result.length, 'results in this page');
arxiv.allresults = arxiv.allresults.concat(result);
arxiv.pageprogress.tick(result.length);
log.debug('Got', result.length, 'results in this page')
arxiv.allresults = arxiv.allresults.concat(result)
arxiv.pageprogress.tick(result.length)
if (arxiv.allresults.length < arxiv.hitlimit) {
arxiv.iter += arxiv.pagesize;
hitsremaining = arxiv.hitlimit - arxiv.allresults.length;
if(hitsremaining<arxiv.pagesize) {
arxiv.residualhits = hitsremaining
}
setTimeout(arxiv.pageQuery.bind(arxiv), arxiv.page_delay);
arxiv.iter += arxiv.pagesize
var hitsremaining = arxiv.hitlimit - arxiv.allresults.length
if (hitsremaining < arxiv.pagesize) {
arxiv.residualhits = hitsremaining
}
setTimeout(arxiv.pageQuery.bind(arxiv), arxiv.page_delay)
} else {
log.info('Done collecting results');
arxiv.handleSearchResults(arxiv);
log.info('Done collecting results')
arxiv.handleSearchResults(arxiv)
}
}
ArXiv.prototype.handleSearchResults = function(arxiv) {
ArXiv.prototype.handleSearchResults = function (arxiv) {
// write the full result set to a file
log.info('Saving result metadata');
var pretty = JSON.stringify(arxiv.allresults, null, 2);
log.info('Saving result metadata')
var pretty = JSON.stringify(arxiv.allresults, null, 2)
fs.writeFileSync('arxiv_results.json', pretty)
var filename = chalk.blue('arxiv_results.json')
log.info('Full ArXiv result metadata written to ' + filename);
log.info('Full ArXiv result metadata written to ' + filename)
var dlTasks = []
var dlTasks = [];
// download the fullText PDF
if (arxiv.opts.pdf) {
dlTasks.push(arxiv.downloadFulltextPDFs);
dlTasks.push(arxiv.downloadFulltextPDFs)
}

@@ -157,120 +141,101 @@

if (arxiv.opts.supp) {
dlTasks.push(arxiv.downloadSuppFiles);
dlTasks.push(arxiv.downloadSuppFiles)
}
arxiv.runDlTasks(dlTasks);
arxiv.runDlTasks(dlTasks)
}
ArXiv.prototype.runDlTasks = function(dlTasks) {
ArXiv.prototype.runDlTasks = function (dlTasks) {
var arxiv = this
var arxiv = this;
arxiv.dlTasks = dlTasks;
arxiv.currDlTask = -1;
arxiv.nextDlTask();
arxiv.dlTasks = dlTasks
arxiv.currDlTask = -1
arxiv.nextDlTask()
}
ArXiv.prototype.nextDlTask = function() {
ArXiv.prototype.nextDlTask = function () {
var arxiv = this
var arxiv = this;
arxiv.currDlTask ++;
arxiv.currDlTask ++
if (arxiv.dlTasks.length > arxiv.currDlTask) {
var fun = arxiv.dlTasks[arxiv.currDlTask];
fun(arxiv);
var fun = arxiv.dlTasks[arxiv.currDlTask]
fun(arxiv)
} else {
process.exit(0);
process.exit(0)
}
}
ArXiv.prototype.timeoutCallback = function (ms) {
log.error('Did not get a response from the ArXiv API within ' + ms + 'ms')
}
ArXiv.prototype.timeoutCallback = function(ms) {
ArXiv.prototype.buildQuery = function (query, options) {
var arxiv = this
log.error('Did not get a response from the ArXiv API within ' + ms + 'ms');
var queryurl = arxiv.baseurl + encodeURIComponent(query)
};
ArXiv.prototype.buildQuery = function(query, options) {
var arxiv = this;
var queryurl = arxiv.baseurl + encodeURIComponent(query);
Object.keys(options).forEach(function(key) {
var val = options[key];
Object.keys(options).forEach(function (key) {
var val = options[key]
if (key.length > 0) {
queryurl += '&' + key + '=' + val;
queryurl += '&' + key + '=' + val
}
});
})
return queryurl;
return queryurl
}
ArXiv.prototype.getFulltextPDFUrl = function(result) {
ArXiv.prototype.getFulltextPDFUrl = function (result) {
var urls = result.link
var pdfurls = urls.filter(function (u) {
return u['$'].type === 'application/pdf'
})
var arxiv = this;
var urls = result.link;
var pdfurls = urls.filter(function(u) {
return u['$'].type === "application/pdf";
});
if (pdfurls.length == 0) {
//log.info('pdf missing')
return null;
if (pdfurls.length === 0) {
// log.info('pdf missing')
return null
} else {
return [ pdfurls[0]['$'].href, result.id[0].split('abs/')[1] + '/' ];
return [ pdfurls[0]['$'].href, result.id[0].split('abs/')[1] + '/' ]
}
}
ArXiv.prototype.getIdentifier = function(result) {
return result.id[0];
ArXiv.prototype.getIdentifier = function (result) {
return result.id[0]
}
ArXiv.prototype.getSuppFilesUrl = function(result) {
ArXiv.prototype.getSuppFilesUrl = function (result) {
var arxiv = this
var arxiv = this;
var id = arxiv.getIdentifier(result)
var id = arxiv.getIdentifier(result);
return [id.split('abs').join('e-print'), id.split('abs/')[1]];
return [id.split('abs').join('e-print'), id.split('abs/')[1]]
}
ArXiv.prototype.urlQueueBuilder = function(urls, type, rename) {
return urls.map(function urlQueueBuilder(url_id) {
return {url: url_id[0], id: url_id[1], type: type, rename: rename }
ArXiv.prototype.urlQueueBuilder = function (urls, type, rename) {
return urls.map(function urlQueueBuilder (urlId) {
return { url: urlId[0], id: urlId[1], type: type, rename: rename }
})
};
}
ArXiv.prototype.downloadFulltextPDFs = function(arxiv) {
urls = arxiv.allresults
ArXiv.prototype.downloadFulltextPDFs = function (arxiv) {
var urls = arxiv.allresults
.map(arxiv.getFulltextPDFUrl, arxiv)
.filter(function(x) { return !(x === null) });
.filter(function (x) { return !(x === null) })
log.info('Downloading fulltext PDF files');
log.info('Downloading fulltext PDF files')
var urlQueue = arxiv.urlQueueBuilder(urls, 'PDF', 'fulltext.pdf');
urlDl.downloadurlQueue(urlQueue, arxiv.nextDlTask.bind(arxiv));
var urlQueue = arxiv.urlQueueBuilder(urls, 'PDF', 'fulltext.pdf')
urlDl.downloadurlQueue(urlQueue, arxiv.nextDlTask.bind(arxiv))
}
ArXiv.prototype.downloadSuppFiles = function(arxiv) {
urls = arxiv.allresults
ArXiv.prototype.downloadSuppFiles = function (arxiv) {
var urls = arxiv.allresults
.map(arxiv.getSuppFilesUrl, arxiv)
.filter(function(x) { return !(x === null) });
.filter(function (x) { return !(x === null) })
log.info('Downloading supplementary files');
log.info('Downloading supplementary files')
var urlQueue = arxiv.urlQueueBuilder(urls, 'supplementary files', 'supplementaryFiles.tar.gz');
urlDl.downloadurlQueue(urlQueue, arxiv.nextDlTask.bind(arxiv));
var urlQueue = arxiv.urlQueueBuilder(urls, 'supplementary files', 'supplementaryFiles.tar.gz')
urlDl.downloadurlQueue(urlQueue, arxiv.nextDlTask.bind(arxiv))
}
module.exports = ArXiv;
module.exports = ArXiv

@@ -1,3 +0,1 @@

/* global log */
var fs = require('fs')

@@ -9,3 +7,4 @@ var chalk = require('chalk')

var CrossRefAPI = require('crossref')
var sanitize = require("sanitize-filename")
var sanitize = require('sanitize-filename')
var log = require('winston')

@@ -33,3 +32,3 @@ var CrossRef = function (opts) {

log.info('Found', crossref.hitcount, 'results')
if (crossref.opts.noexecute){ process.exit(0) }
if (crossref.opts.noexecute) { process.exit(0) }
}

@@ -50,15 +49,15 @@

var message = {}
if (query != null) { message.query = query}
if (query != null) { message.query = query }
message.rows = crossref.pagesize
if (crossref.opts.filter) {
var filters = crossref.opts.filter.split(',')
message.filter = message.filter ? message.filter : {}
for (var singleFilter of filters){
if (!message.filter[singleFilter.split(':')[0]]) {
message.filter[singleFilter.split(':')[0]] = []
if (crossref.opts.filter) {
var filters = crossref.opts.filter.split(',')
message.filter = message.filter ? message.filter : {}
for (var singleFilter of filters) {
if (!message.filter[singleFilter.split(':')[0]]) {
message.filter[singleFilter.split(':')[0]] = []
}
message.filter[singleFilter.split(':')[0]].push(singleFilter.split(':')[1])
}
message.filter[singleFilter.split(':')[0]].push(singleFilter.split(':')[1])
}
}

@@ -116,3 +115,3 @@ CrossRefAPI.works(message, pageQuery)

if (crossref.allresults.length > crossref.hitlimit) {
crossref.allresults = crossref.allresults.slice(0,crossref.hitlimit)
crossref.allresults = crossref.allresults.slice(0, crossref.hitlimit)
log.info('limiting hits')

@@ -256,4 +255,4 @@ }

CrossRef.prototype.urlQueueBuilder = function (urls, type, rename) {
return urls.map(function (url_id) {
return { url: url_id[0], id: url_id[1], type: type, rename: rename }
return urls.map(function (urlId) {
return { url: urlId[0], id: urlId[1], type: type, rename: rename }
})

@@ -265,3 +264,3 @@ }

// First convert slashes to underscores to aid readability
var id = crossref.getIdentifier(record).id.replace(/\//g,"_")
var id = crossref.getIdentifier(record).id.replace(/\//g, '_')
var sanid = sanitize(id)

@@ -268,0 +267,0 @@ mkdirp.sync(sanid)

@@ -1,2 +0,1 @@

var util = require('util')
var fs = require('fs')

@@ -7,14 +6,14 @@ var chalk = require('chalk')

var _ = require('lodash')
var ProgressBar = require('progress');
var sanitize = require("sanitize-filename")
var ProgressBar = require('progress')
var sanitize = require('sanitize-filename')
var config = require('./config.js')
var log = require('winston')
exports.downloadurlQueue = function(urlQueue, nextDlTaskcb) {
var failed = [];
var retries = 0;
var missing = 0;
exports.downloadurlQueue = function (urlQueue, nextDlTaskcb) {
var failed = []
var missing = 0
//Setup ProgressBar
// Setup ProgressBar
var progmsg = 'Downloading files [:bar] :percent' +
' (:current/:total) [:elapseds elapsed, eta :eta]';
' (:current/:total) [:elapseds elapsed, eta :eta]'
var progopts = {

@@ -24,101 +23,89 @@ total: urlQueue.length,

complete: chalk.green('=')
};
var dlprogress = new ProgressBar(progmsg, progopts);
}
var dlprogress = new ProgressBar(progmsg, progopts)
var donefunc = function() {
var donefunc = function () {
if (failed.length > 0) {
log.warn(failed.length + ' downloads timed out on retry.');
log.warn(failed.length + ' downloads timed out on retry.')
} else if (missing > 0) {
var succeeded = urlQueue.length - missing;
var succeeded = urlQueue.length - missing
var suffix = missing > 1 ? 's' : ''
log.info(succeeded + ' downloads succeeded. ' + missing +
' paper' + suffix + ' had urlQueue that could not be reached (404 error).');
' paper' + suffix + ' had urlQueue that could not be reached (404 error).')
} else {
log.info('All downloads succeeded!');
log.info('All downloads succeeded!')
}
nextDlTaskcb();
nextDlTaskcb()
}
var done = _.after(urlQueue.length, donefunc);
var done = _.after(urlQueue.length, donefunc)
for(var i=0; i<10; i++) {
nextUrlTask(urlQueue); //spawn 10 workers
for (var i = 0; i < 10; i++) {
nextUrlTask(urlQueue) // spawn 10 workers
}
function nextUrlTask() {
if (urlQueue instanceof Array && urlQueue.length > 0) {
var urlObj = urlQueue.splice(0,1)[0];
testIfFileExists(urlObj, downloadURL);
function nextUrlTask () {
if (urlQueue instanceof Array && urlQueue.length > 0) {
var urlObj = urlQueue.splice(0, 1)[0]
testIfFileExists(urlObj, downloadURL)
} else {
log.debug('ending thread because urlQueue is now empty')
}
}
else {
log.debug('ending thread because urlQueue is now empty' )
}
}
// Run callback if file doesn't exist
function testIfFileExists(urlObj, cb) {
dlprogress.tick();
var url = urlObj.url;
var id = urlObj.id;
var type = urlObj.type;
var rename = urlObj.rename;
var base = id + '/';
fs.readFile(base + rename, (err, data) => {
if ((err)&&(err.code=='ENOENT')) {
cb(urlObj)
return
//File doesn't exist so start download procedure
}
else if (err) {
throw err
}
else {
log.info('File of type: '+type+' and id: '+id+' already exists. Skipping.')
nextUrlTask(urlQueue)
return
}
})
}
function testIfFileExists (urlObj, cb) {
dlprogress.tick()
var id = urlObj.id
var type = urlObj.type
var rename = urlObj.rename
var base = id + '/'
fs.readFile(base + rename, (err, data) => {
if ((err) && (err.code === 'ENOENT')) {
cb(urlObj)
function downloadURL(urlObj) {
var url = urlObj.url;
var id = urlObj.id;
var type = urlObj.type;
var rename = sanitize(urlObj.rename);
var base = sanitize(id) + '/';
log.debug('Creating directory: ' + base);
mkdirp.sync(base);
log.debug('Downloading ' + type + ': ' + url);
var options = {
timeout: 15000,
encoding: null,
retries: 3
// File doesn't exist so start download procedure
} else if (err) {
throw err
} else {
log.info('File of type: ' + type + ' and id: ' + id + ' already exists. Skipping.')
nextUrlTask(urlQueue)
}
})
}
function fileWriteCB(err) {
if (err) throw error
done()
}
function downloadURL (urlObj) {
var url = urlObj.url
var id = urlObj.id
var type = urlObj.type
var rename = sanitize(urlObj.rename)
var base = sanitize(id) + '/'
log.debug('Creating directory: ' + base)
mkdirp.sync(base)
function handleDownload(data) {
fs.writeFile(base + rename, data, fileWriteCB);
nextUrlTask(urlQueue);
log.debug('Downloading ' + type + ': ' + url)
function fileWriteCB (err) {
if (err) throw err
done()
}
function throwErr(err){
if (err) throw err
}
function handleDownload (data) {
fs.writeFile(base + rename, data, fileWriteCB)
nextUrlTask(urlQueue)
}
rq = requestretry.get({url: url,
fullResponse: false,
headers: {'User-Agent': config.userAgent}
});
rq.then(handleDownload)
rq.catch(throwErr)
}
function throwErr (err) {
if (err) throw err
}
var fourohfour = function() {
missing ++;
var rq = requestretry.get({
url: url,
fullResponse: false,
headers: {'User-Agent': config.userAgent},
encoding: null
})
rq.then(handleDownload)
rq.catch(throwErr)
}
}

@@ -1,99 +0,87 @@

var util = require('util')
, fs = require('fs')
, chalk = require('chalk')
, got = require('got')
, mkdirp = require('mkdirp')
, _ = require('lodash')
, ProgressBar = require('progress')
, urlDl = require('./download.js')
, requestretry = require('requestretry')
, glob = require('matched')
, vc = require('version_compare')
, config = require('./config.js')
var fs = require('fs')
var chalk = require('chalk')
var got = require('got')
var mkdirp = require('mkdirp')
var _ = require('lodash')
var ProgressBar = require('progress')
var urlDl = require('./download.js')
var requestretry = require('requestretry')
var glob = require('matched')
var vc = require('version_compare')
var config = require('./config.js')
var log = require('winston')
var parseString = require('xml2js').parseString
var minimumEuPMCResponseLength = 100 // Shortest length we could expect that countains
// at least one result
var EuPMCVersion = '5.1.1'
var EuPmc = function(opts) {
var EuPmc = function (opts) {
var eupmc = this
this.baseurl = 'http://www.ebi.ac.uk/' +
'europepmc/webservices/rest/search/';
this.opts = opts;
'europepmc/webservices/rest/search/'
this.opts = opts || {}
eupmc.first = true
eupmc.hitlimit = eupmc.opts.hitlimit ? eupmc.opts.hitlimit : 0
eupmc.hitcount = 0
eupmc.residualhits = 0
eupmc.allresults = []
eupmc.nextCursorMark = '*' // we always get back the first page
eupmc.pagesize = '1000'
eupmc.unfillledPage = false
}
EuPmc.prototype.search = function(query) {
EuPmc.prototype.search = function (query) {
var eupmc = this
var eupmc = this;
if (!eupmc.opts.all) {
query += " OPEN_ACCESS:y";
query += ' OPEN_ACCESS:y'
}
eupmc.pagesize = '1000'
var options = { resulttype: 'core', pageSize: eupmc.pagesize };
eupmc.queryurl = eupmc.buildQuery(query, options);
eupmc.first = true;
eupmc.hitlimit = eupmc.opts.hitlimit ? eupmc.opts.hitlimit : 0;
eupmc.hitcount = 0;
eupmc.residualhits = 0;
eupmc.allresults = [];
eupmc.nextCursorMark = '*'; //we always get back the first page
var options = { resulttype: 'core', pageSize: eupmc.pagesize }
eupmc.queryurl = eupmc.buildQuery(query, options)
if (eupmc.opts.restart) {
fs.readFile('eupmc_results.json', (err,data) => {
if ((err) && (err.code == 'ENOENT')) {
log.error('No existing download to restart')
process.exit(1)
}
else if (err) {
throw err
}
else {
log.info('Restarting previous download')
eupmc.allresults=JSON.parse(data)
eupmc.addDlTasks()
}
} )
fs.readFile('eupmc_results.json', (err, data) => {
if ((err) && (err.code === 'ENOENT')) {
log.error('No existing download to restart')
process.exit(1)
} else if (err) {
throw err
} else {
log.info('Restarting previous download')
eupmc.allresults = JSON.parse(data)
eupmc.addDlTasks()
}
})
} else {
eupmc.pageQuery()
}
else {
eupmc.pageQuery();
}
}
EuPmc.prototype.testApi = function(version) {
if(!vc.matches(version, EuPMCVersion)) {
EuPmc.prototype.testApi = function (version) {
if (!vc.matches(version, EuPMCVersion)) {
log.warn('This version of getpapers wasn\'t built with this version of the EuPMC api in mind')
log.warn(`getpapers EuPMCVersion: ${EuPMCVersion} vs. ${version} reported by api` )
log.warn(`getpapers EuPMCVersion: ${EuPMCVersion} vs. ${version} reported by api`)
}
}
EuPmc.prototype.pageQuery = function() {
EuPmc.prototype.pageQuery = function () {
var eupmc = this
var eupmc = this;
var thisQueryUrl = eupmc.queryurl + ''
var thisQueryUrl = eupmc.queryurl + '';
var pageterm = '&cursorMark=' + eupmc.nextCursorMark
thisQueryUrl += pageterm
var pageterm = '&cursorMark=' + eupmc.nextCursorMark;
thisQueryUrl += pageterm;
log.debug(thisQueryUrl)
log.debug(thisQueryUrl);
var retryOnHTTPNetOrEuPMCFailure = function (err, response, body){
var retryOnHTTPNetOrEuPMCFailure = function (err, response, body) {
return requestretry.RetryStrategies.HTTPOrNetworkError(err, response, body) ||
~body.indexOf('<resultList/>') //hacky way to see if resultsList is empty
~body.indexOf('<resultList/>') // hacky way to see if resultsList is empty
}
var rq = requestretry.get({url: thisQueryUrl,
maxAttempts: 50,
retryStrategy: retryOnHTTPNetOrEuPMCFailure,
headers: {'User-Agent': config.userAgent}
});
maxAttempts: 50,
retryStrategy: retryOnHTTPNetOrEuPMCFailure,
headers: {'User-Agent': config.userAgent}
})
var handleResquestResponse = function (data) {

@@ -103,3 +91,3 @@ if (data.attempts > 1) {

}
convertXML2JSON(data)
convertXML2JSON(data)
}

@@ -109,31 +97,30 @@ var convertXML2JSON = function (data) {

if (err) throw err
cb = eupmc.completeCallback.bind(eupmc, datum)
cb() } )
var cb = eupmc.completeCallback.bind(eupmc, datum)
cb()
})
}
rq.then(handleResquestResponse);
rq.on('timeout', eupmc.timeoutCallback);
rq.then(handleResquestResponse)
rq.on('timeout', eupmc.timeoutCallback)
}
EuPmc.prototype.completeCallback = function(data) {
EuPmc.prototype.completeCallback = function (data) {
var eupmc = this
var eupmc = this;
var resp = data.responseWrapper
var resp = data.responseWrapper;
if(!resp.hitCount || !resp.hitCount[0] || !resp.resultList[0].result) {
log.error("Malformed or empty response from EuropePMC. Try running again. Perhaps your query is wrong.");
process.exit(1);
if (!resp.hitCount || !resp.hitCount[0] || !resp.resultList[0].result) {
log.error('Malformed or empty response from EuropePMC. Try running again. Perhaps your query is wrong.')
process.exit(1)
}
if (eupmc.first){
eupmc.first = false;
eupmc.hitcount = parseInt(resp.hitCount[0]);
var oaclause = eupmc.opts.all ? '' : ' open access';
log.info('Found ' + eupmc.hitcount + oaclause + ' results');
if (eupmc.first) {
eupmc.first = false
eupmc.hitcount = parseInt(resp.hitCount[0])
var oaclause = eupmc.opts.all ? '' : ' open access'
log.info('Found ' + eupmc.hitcount + oaclause + ' results')
eupmc.testApi(resp.version[0])
if (eupmc.hitcount == 0 || eupmc.opts.noexecute) {
process.exit(0);
if (eupmc.hitcount === 0 || eupmc.opts.noexecute) {
process.exit(0)
}

@@ -143,9 +130,8 @@

if (eupmc.hitlimit && eupmc.hitlimit < eupmc.hitcount) {
log.info('Limiting to ' + eupmc.hitlimit + ' hits');
}
else { eupmc.hitlimit = eupmc.hitcount; }
log.info('Limiting to ' + eupmc.hitlimit + ' hits')
} else { eupmc.hitlimit = eupmc.hitcount }
// create progress bar
var progmsg = 'Retrieving results [:bar] :percent' +
' (eta :etas)';
' (eta :etas)'
var progopts = {

@@ -155,32 +141,38 @@ total: eupmc.hitlimit,

complete: chalk.green('=')
};
eupmc.pageprogress = new ProgressBar(progmsg, progopts);
}
eupmc.pageprogress = new ProgressBar(progmsg, progopts)
}
var result
if (eupmc.residualhits) {
var result = resp.resultList[0].result.slice(0,eupmc.residualhits);
result = resp.resultList[0].result.slice(0, eupmc.residualhits)
} else {
result = resp.resultList[0].result
// if less results in this page than page count (and we were expecting an entire page)
// EuPMC has been lying and we shouldn't keep searching for more results
if (result.length < eupmc.pagesize) eupmc.unfilledPage = true
}
else { var result = resp.resultList[0].result; }
log.debug('In this batch got: ' + result.length + ' results')
eupmc.allresults = eupmc.allresults.concat(result);
eupmc.pageprogress.tick(result.length);
eupmc.allresults = eupmc.allresults.concat(result)
eupmc.pageprogress.tick(result.length)
if (eupmc.allresults.length < eupmc.hitlimit) { //we still have more results to get
if (eupmc.allresults.length < eupmc.hitlimit) { // we still have more results to get
if (eupmc.unfilledPage) { // but the last page wasn't full then something is wrong
log.info('EuPMC gave us the wrong hitcount. We\'ve already found all the results')
eupmc.handleSearchResults(eupmc)
return
}
if (eupmc.hitlimit - eupmc.allresults.length < eupmc.pagesize) {
eupmc.residualhits = eupmc.hitlimit - eupmc.allresults.length;
eupmc.residualhits = eupmc.hitlimit - eupmc.allresults.length
}
eupmc.nextCursorMark = resp.nextCursorMark[0];
eupmc.pageQuery();
eupmc.nextCursorMark = resp.nextCursorMark[0]
eupmc.pageQuery()
} else {
log.info('Done collecting results');
eupmc.handleSearchResults(eupmc);
log.info('Done collecting results')
eupmc.handleSearchResults(eupmc)
}
}
EuPmc.prototype.timeoutCallback = function(ms) {
eupmc = this
log.error('Did not get a response from Europe PMC within ' + ms + 'ms');
EuPmc.prototype.timeoutCallback = function (ms) {
var eupmc = this
log.error('Did not get a response from Europe PMC within ' + ms + 'ms')
if (eupmc.allresults) {

@@ -191,43 +183,37 @@ log.info('Handling the limited number of search results we got.')

}
}
EuPmc.prototype.buildQuery = function(query, options) {
EuPmc.prototype.buildQuery = function (query, options) {
var eupmc = this
var eupmc = this;
var queryurl = eupmc.baseurl + 'query=' + encodeURIComponent(query);
Object.keys(options).forEach(function(key) {
var val = options[key];
var queryurl = eupmc.baseurl + 'query=' + encodeURIComponent(query)
Object.keys(options).forEach(function (key) {
var val = options[key]
if (key.length > 0) {
queryurl += '&' + key + '=' + val;
queryurl += '&' + key + '=' + val
}
});
return queryurl;
})
return queryurl
}
EuPmc.prototype.formatResult = function(result) {
EuPmc.prototype.formatResult = function (result) {
return result.authorString +
' (' + result.pubYear + '). ' +
result.title + ' http://dx.doi.org/' + result.DOI;
result.title + ' http://dx.doi.org/' + result.DOI
}
EuPmc.prototype.handleSearchResults = function(eupmc) {
EuPmc.prototype.handleSearchResults = function (eupmc) {
// see how many results were unique
var originalLength = eupmc.allresults.length;
eupmc.allresults = _.uniq(eupmc.allresults, function(x) {
return eupmc.getIdentifier(x).id;
});
var originalLength = eupmc.allresults.length
eupmc.allresults = _.uniq(eupmc.allresults, function (x) {
return eupmc.getIdentifier(x).id
})
if (eupmc.allresults.length < originalLength) {
log.info('Duplicate records found: ' +
eupmc.allresults.length +
' unique results identified');
' unique results identified')
}
if (eupmc.allresults.length > eupmc.hitlimit) {
eupmc.allresults = eupmc.allresults.slice(0,eupmc.hitlimit)
eupmc.allresults = eupmc.allresults.slice(0, eupmc.hitlimit)
log.info('limiting hits')

@@ -237,19 +223,19 @@ }

// write the full result set to a file
log.info('Saving result metadata');
var pretty = JSON.stringify(eupmc.allresults, null, 2);
log.info('Saving result metadata')
var pretty = JSON.stringify(eupmc.allresults, null, 2)
fs.writeFileSync('eupmc_results.json', pretty)
var filename = chalk.blue('eupmc_results.json')
log.info('Full EUPMC result metadata written to ' + filename);
var resultsFilename = chalk.blue('eupmc_results.json')
log.info('Full EUPMC result metadata written to ' + resultsFilename)
// write individual results to their respective directories
eupmc.allresults.forEach(function(result) {
eupmc.allresults.forEach(function (result) {
eupmc.writeRecord(result, eupmc)
})
log.info('Individual EUPMC result metadata records written');
log.info('Individual EUPMC result metadata records written')
// write only the url list to file
log.info('Extracting fulltext HTML URL list (may not be available for all articles)');
log.info('Extracting fulltext HTML URL list (may not be available for all articles)')
var urls = eupmc.allresults
.map(eupmc.getFulltextHTMLUrl, eupmc)
.filter(function(x) { return !(x === null) });
.filter(function (x) { return !(x === null) })

@@ -259,19 +245,18 @@ if (urls.length > 0) {

'eupmc_fulltext_html_urls.txt',
urls.concat(["\n"]).join("\n")
);
var filename = chalk.blue('eupmc_fulltext_html_urls.txt')
log.info('Fulltext HTML URL list written to ' + filename);
urls.concat(['\n']).join('\n')
)
var urlFilename = chalk.blue('eupmc_fulltext_html_urls.txt')
log.info('Fulltext HTML URL list written to ' + urlFilename)
}
eupmc.addDlTasks()
}
EuPmc.prototype.addDlTasks = function() {
eupmc = this
var dlTasks = [];
EuPmc.prototype.addDlTasks = function () {
var eupmc = this
var dlTasks = []
// download the fullText XML
if (eupmc.opts.xml) {
dlTasks.push(eupmc.downloadFulltextXMLs);
dlTasks.push(eupmc.downloadFulltextXMLs)
}

@@ -281,3 +266,3 @@

if (eupmc.opts.pdf) {
dlTasks.push(eupmc.downloadFulltextPDFs);
dlTasks.push(eupmc.downloadFulltextPDFs)
}

@@ -287,3 +272,3 @@

if (eupmc.opts.supp) {
dlTasks.push(eupmc.downloadSuppFiles);
dlTasks.push(eupmc.downloadSuppFiles)
}

@@ -293,109 +278,101 @@

if (eupmc.opts.minedterms) {
dlTasks.push(eupmc.downloadMinedTerms);
dlTasks.push(eupmc.summariseMinedTerms);
dlTasks.push(eupmc.downloadMinedTerms)
dlTasks.push(eupmc.summariseMinedTerms)
}
eupmc.runDlTasks(dlTasks);
eupmc.runDlTasks(dlTasks)
}
EuPmc.prototype.runDlTasks = function(dlTasks) {
EuPmc.prototype.runDlTasks = function (dlTasks) {
var eupmc = this
var eupmc = this;
eupmc.dlTasks = dlTasks;
eupmc.currDlTask = -1;
eupmc.nextDlTask();
eupmc.dlTasks = dlTasks
eupmc.currDlTask = -1
eupmc.nextDlTask()
}
EuPmc.prototype.nextDlTask = function() {
EuPmc.prototype.nextDlTask = function () {
var eupmc = this
var eupmc = this;
eupmc.currDlTask ++;
eupmc.currDlTask ++
if (eupmc.dlTasks.length > eupmc.currDlTask) {
var fun = eupmc.dlTasks[eupmc.currDlTask];
fun(eupmc);
var fun = eupmc.dlTasks[eupmc.currDlTask]
fun(eupmc)
} else {
process.exit(0);
process.exit(0)
}
}
EuPmc.prototype.downloadFulltextXMLs = function(eupmc) {
urls = eupmc.allresults
EuPmc.prototype.downloadFulltextXMLs = function (eupmc) {
var urls = eupmc.allresults
.map(eupmc.getFulltextXMLUrl, eupmc)
.filter(function(x) { return !(x === null) });
.filter(function (x) { return !(x === null) })
log.info('Got XML URLs for ' + urls.length + ' out of ' + eupmc.allresults.length + ' results');
log.info('Got XML URLs for ' + urls.length + ' out of ' + eupmc.allresults.length + ' results')
log.info('Downloading fulltext XML files');
log.info('Downloading fulltext XML files')
var urlQueue = eupmc.urlQueueBuilder(urls, 'XML', 'fulltext.xml');
urlDl.downloadurlQueue(urlQueue, eupmc.nextDlTask.bind(eupmc));
var urlQueue = eupmc.urlQueueBuilder(urls, 'XML', 'fulltext.xml')
urlDl.downloadurlQueue(urlQueue, eupmc.nextDlTask.bind(eupmc))
}
EuPmc.prototype.downloadMinedTerms = function(eupmc) {
urls = eupmc.allresults
EuPmc.prototype.downloadMinedTerms = function (eupmc) {
var urls = eupmc.allresults
.map(eupmc.getMinedTermsURL, eupmc)
.filter(function(x) { return !(x === null) });
.filter(function (x) { return !(x === null) })
log.info('Got mined terms JSON URLs for ' + urls.length + ' out of ' + eupmc.allresults.length + ' results');
log.info('Got mined terms JSON URLs for ' + urls.length + ' out of ' + eupmc.allresults.length + ' results')
log.info('Downloading mined terms JSON files');
log.info('Downloading mined terms JSON files')
var urlQueue = eupmc.urlQueueBuilder(urls, 'JSON', 'textMinedTerms.json');
urlDl.downloadurlQueue(urlQueue, eupmc.nextDlTask.bind(eupmc));
var urlQueue = eupmc.urlQueueBuilder(urls, 'JSON', 'textMinedTerms.json')
urlDl.downloadurlQueue(urlQueue, eupmc.nextDlTask.bind(eupmc))
}
EuPmc.prototype.downloadFulltextPDFs = function(eupmc) {
urls = eupmc.allresults
EuPmc.prototype.downloadFulltextPDFs = function (eupmc) {
var urls = eupmc.allresults
.map(eupmc.getFulltextPDFUrl, eupmc)
.filter(function(x) { return !(x === null) });
.filter(function (x) { return !(x === null) })
log.info('Downloading fulltext PDF files');
log.info('Downloading fulltext PDF files')
var urlQueue = eupmc.urlQueueBuilder(urls, 'PDF', 'fulltext.pdf');
urlDl.downloadurlQueue(urlQueue, eupmc.nextDlTask.bind(eupmc));
var urlQueue = eupmc.urlQueueBuilder(urls, 'PDF', 'fulltext.pdf')
urlDl.downloadurlQueue(urlQueue, eupmc.nextDlTask.bind(eupmc))
}
EuPmc.prototype.downloadSuppFiles = function(eupmc) {
urls = eupmc.allresults
EuPmc.prototype.downloadSuppFiles = function (eupmc) {
var urls = eupmc.allresults
.map(eupmc.getSuppFilesUrl, eupmc)
.filter(function(x) { return !(x === null) });
.filter(function (x) { return !(x === null) })
log.info('Downloading supplementary files');
log.info('Downloading supplementary files')
var failed = [];
var retries = 0;
var missing = 0;
var failed = []
var retries = 0
var missing = 0
var fourohfour = function() {
missing ++;
var fourohfour = function () {
missing++
}
var done = _.after(urls.length, function() {
if (failed.length > 0 && retries == 0) {
log.warn(failed.length + ' downloads timed out. Retrying.');
failed = [];
var done = _.after(urls.length, function () {
if (failed.length > 0 && retries === 0) {
log.warn(failed.length + ' downloads timed out. Retrying.')
failed = []
eupmc.downloadUrls(urls,
'supplementary files',
'supplementaryFiles.zip',
failed, done, eupmc, fourohfour);
failed, done, eupmc, fourohfour)
} else if (failed.length > 0) {
log.warn(failed.length + ' downloads timed out on retry. Skipping.');
log.warn(failed.length + ' downloads timed out on retry. Skipping.')
} else if (missing > 0) {
var succeeded = urls.length - missing;
var succeeded = urls.length - missing
var suffix = missing > 1 ? 's' : ''
log.info(succeeded + ' downloads succeeded. ' + missing +
' paper' + suffix + ' had no supplementary files.');
' paper' + suffix + ' had no supplementary files.')
} else {
log.info('All supplementary file downloads succeeded!');
log.info('All supplementary file downloads succeeded!')
}
eupmc.nextDlTask();
});
eupmc.nextDlTask()
})

@@ -405,13 +382,10 @@ eupmc.downloadUrls(urls,

'supplementaryFiles.zip',
failed, done, eupmc, fourohfour);
failed, done, eupmc, fourohfour)
}
EuPmc.prototype.downloadUrls = function(urls, type, rename, failed,
EuPmc.prototype.downloadUrls = function (urls, type, rename, failed,
cb, thisArg, fourohfour) {
var eupmc = thisArg;
// setup progress bar
var progmsg = 'Downloading files [:bar] :percent' +
' (:current/:total) [:elapseds elapsed, eta :eta]';
' (:current/:total) [:elapseds elapsed, eta :eta]'
var progopts = {

@@ -421,12 +395,12 @@ total: urls.length,

complete: chalk.green('=')
};
var dlprogress = new ProgressBar(progmsg, progopts);
}
var dlprogress = new ProgressBar(progmsg, progopts)
urls.forEach(function(url_id) {
var url = url_id[0];
var id = url_id[1];
var base = id + '/';
log.debug('Creating directory: ' + base);
mkdirp.sync(base);
log.debug('Downloading ' + type + ': ' + url);
urls.forEach(function (urlId) {
var url = urlId[0]
var id = urlId[1]
var base = id + '/'
log.debug('Creating directory: ' + base)
mkdirp.sync(base)
log.debug('Downloading ' + type + ': ' + url)
var options = {

@@ -436,54 +410,48 @@ timeout: 15000,

}
var get = got(url, options, function(err, data, res) {
dlprogress.tick();
got(url, options, function (err, data, res) {
dlprogress.tick()
if (err) {
if (err.code === 'ETIMEDOUT' || err.code === 'ESOCKETTIMEDOUT') {
log.warn('Download timed out for URL ' + url);
log.warn('Download timed out for URL ' + url)
}
if (!res) {
failed.push(url);
} else if ((res.statusCode == 404) && !(fourohfour === null)) {
fourohfour();
failed.push(url)
} else if ((res.statusCode === 404) && !(fourohfour === null)) {
fourohfour()
} else {
failed.push(url);
failed.push(url)
}
cb();
cb()
} else {
fs.writeFile(base + rename, data, cb);
fs.writeFile(base + rename, data, cb)
}
});
});
})
})
}
EuPmc.prototype.getFulltextHTMLUrl = function (result, oa) {
var eupmc = this
var id = eupmc.getIdentifier(result)
EuPmc.prototype.getFulltextHTMLUrl = function(result, oa) {
if (!result.fullTextUrlList) { return eupmc.noFulltextUrls(id) }
var eupmc = this;
var id = eupmc.getIdentifier(result);
if (!result.fullTextUrlList) { return eupmc.noFulltextUrls(id); }
var urls = result.fullTextUrlList[0].fullTextUrl;
var htmlUrls = urls.filter(function(u) {
return (u.documentStyle[0] == 'html' || u.documentStyle[0] == 'doi')
}).sort(function(a, b) {
return (a.availabilityCode[0] == 'OA' || eupmc.opts.all) ? -1 : 1
});
if (htmlUrls.length == 0) {
var id = eupmc.getIdentifier(result);
var urls = result.fullTextUrlList[0].fullTextUrl
var htmlUrls = urls.filter(function (u) {
return (u.documentStyle[0] === 'html' || u.documentStyle[0] === 'doi')
}).sort(function (a, b) {
return (a.availabilityCode[0] === 'OA' || eupmc.opts.all) ? -1 : 1
})
if (htmlUrls.length === 0) {
log.warn('Article with ' + id.type + ' "' +
id.id + '" had no fulltext HTML url');
return null;
id.id + '" had no fulltext HTML url')
return null
} else {
return htmlUrls[0].url[0];
return htmlUrls[0].url[0]
}
}
EuPmc.prototype.getIdentifier = function(result) {
var types = ['pmcid', 'doi', 'pmid', 'title'];
EuPmc.prototype.getIdentifier = function (result) {
var types = ['pmcid', 'doi', 'pmid', 'title']
for (var i = 0; i < types.length; i++) {
var type = types[i];
var type = types[i]
if (result.hasOwnProperty(type) && result[type].length > 0) {

@@ -501,123 +469,111 @@ return {

}
}
EuPmc.prototype.getFulltextXMLUrl = function (result) {
var eupmc = this
EuPmc.prototype.getFulltextXMLUrl = function(result) {
var id = eupmc.getIdentifier(result)
var eupmc = this;
var xmlurl = null
var id = eupmc.getIdentifier(result);
var xmlurl = null;
if (id.type === 'pmcid') {
xmlurl = 'http://www.ebi.ac.uk/europepmc/webservices/rest/' +
id.id + '/fullTextXML';
id.id + '/fullTextXML'
} else {
log.warn('Article with ' + id.type + ' "' +
id.id + ' did not have a PMCID (therefore no XML)');
return null;
id.id + ' did not have a PMCID (therefore no XML)')
return null
}
if (!result.fullTextUrlList) { return eupmc.noFulltextUrls(id); }
if (!result.fullTextUrlList) { return eupmc.noFulltextUrls(id) }
var urls = result.fullTextUrlList[0].fullTextUrl;
var htmlUrls = urls.filter(function(u) {
var urls = result.fullTextUrlList[0].fullTextUrl
var htmlUrls = urls.filter(function (u) {
return (u.documentStyle[0] === 'html' || u.documentStyle[0] === 'doi')
}).filter(function(a, b) {
return (a.availabilityCode[0] === 'OA');
});
if (htmlUrls.length == 0) {
var id = eupmc.getIdentifier(result);
}).filter(function (a, b) {
return (a.availabilityCode[0] === 'OA')
})
if (htmlUrls.length === 0) {
log.warn('Article with ' + id.type + ' "' +
id.id + '" was not Open Access (therefore no XML)');
return null;
id.id + '" was not Open Access (therefore no XML)')
return null
}
return [xmlurl, id.id];
return [xmlurl, id.id]
}
EuPmc.prototype.getFulltextPDFUrl = function(result) {
EuPmc.prototype.getFulltextPDFUrl = function (result) {
var eupmc = this
var id = eupmc.getIdentifier(result)
var eupmc = this;
var id = eupmc.getIdentifier(result);
var noPDF = function(id) {
log.warn('Article with ' + id.type + ' "' +
id.id + '" had no fulltext PDF url');
return null;
var noPDF = function (id) {
log.warn('Article with ' + id.type + ' "' +
id.id + '" had no fulltext PDF url')
return null
}
if (!result.fullTextUrlList) { return eupmc.noFulltextUrls(id); }
if (result.hasPDF == 'N') { return noPDF(id); }
if (!result.fullTextUrlList) { return eupmc.noFulltextUrls(id) }
if (result.hasPDF === 'N') { return noPDF(id) }
var urls = result.fullTextUrlList[0].fullTextUrl;
var pdfOAurls = urls.filter(function(u) {
var urls = result.fullTextUrlList[0].fullTextUrl
var pdfOAurls = urls.filter(function (u) {
return u.documentStyle[0] === 'pdf' &&
u.availabilityCode[0] === 'OA'
});
})
if (pdfOAurls.length == 0) {
return noPDF(id);
if (pdfOAurls.length === 0) {
return noPDF(id)
} else {
return [pdfOAurls[0].url[0], id.id];
return [pdfOAurls[0].url[0], id.id]
}
}
EuPmc.prototype.urlQueueBuilder = function(urls, type, rename) {
return urls.map(function(url_id) {
return {url: url_id[0], id: url_id[1], type: type, rename: rename }
EuPmc.prototype.urlQueueBuilder = function (urls, type, rename) {
return urls.map(function (urlId) {
return { url: urlId[0], id: urlId[1], type: type, rename: rename }
})
};
}
EuPmc.prototype.getSuppFilesUrl = function (result) {
var eupmc = this
EuPmc.prototype.getSuppFilesUrl = function(result) {
var id = eupmc.getIdentifier(result)
var eupmc = this;
var id = eupmc.getIdentifier(result);
if (id.type == 'pmcid') {
if (id.type === 'pmcid') {
return ['http://www.ebi.ac.uk/europepmc/webservices/rest/' +
id.id + '/supplementaryFiles', id.id];
id.id + '/supplementaryFiles', id.id]
} else {
log.warn('Article with ' + id.type + ' "' +
id.id + ' did not have a PMCID (therefore no supplementary files)');
return null;
id.id + ' did not have a PMCID (therefore no supplementary files)')
return null
}
}
EuPmc.prototype.getMinedTermsURL = function(result) {
EuPmc.prototype.getMinedTermsURL = function (result) {
var eupmc = this
var eupmc = this;
var id = eupmc.getIdentifier(result)
var id = eupmc.getIdentifier(result);
if (id.type == 'pmcid') {
if (id.type === 'pmcid') {
return ['http://www.ebi.ac.uk/europepmc/webservices/rest/PMC/' +
id.id + '/textMinedTerms//1/1000/json', id.id];
id.id + '/textMinedTerms//1/1000/json', id.id]
} else {
log.warn('Article with ' + id.type + ' "' +
id.id + ' did not have a PMCID (therefore no mined terms)');
return null;
id.id + ' did not have a PMCID (therefore no mined terms)')
return null
}
}
EuPmc.prototype.summariseMinedTerms = function() {
EuPmc.prototype.summariseMinedTerms = function () {
log.info('Writing mined term summary CSV files to minedterms_summary/')
mkdirp.sync('minedterms_summary')
var termstore = {}
glob.sync(['*/textMinedTerms.json']).forEach(function(termsFile) {
glob.sync(['*/textMinedTerms.json']).forEach(function (termsFile) {
var json = fs.readFileSync(termsFile, 'utf8')
var terms = JSON.parse(json)
terms.semanticTypeList.semanticType.forEach(function(termset) {
terms.semanticTypeList.semanticType.forEach(function (termset) {
if (!termstore[termset.name]) {
termstore[termset.name] = []
}
var rows = termset.tmSummary.map(function(term) {
var rows = termset.tmSummary.map(function (term) {
return [

@@ -634,5 +590,5 @@ terms.request.id,

})
Object.keys(termstore).forEach(function(key) {
Object.keys(termstore).forEach(function (key) {
var head = 'article,' + key + ',count,dbName,dbId\n'
var csv = head + termstore[key].map(function(row) {
var csv = head + termstore[key].map(function (row) {
return row.join(',')

@@ -644,5 +600,5 @@ }).join('\n') + '\n'

EuPmc.prototype.writeRecord = function(record, eupmc) {
var json = JSON.stringify(record, null, 2);
var id = eupmc.getIdentifier(record).id;
EuPmc.prototype.writeRecord = function (record, eupmc) {
var json = JSON.stringify(record, null, 2)
var id = eupmc.getIdentifier(record).id
mkdirp.sync(id)

@@ -652,12 +608,8 @@ fs.writeFileSync(id + '/eupmc_result.json', json)

EuPmc.prototype.noFulltextUrls = function(id) {
EuPmc.prototype.noFulltextUrls = function (id) {
log.debug('Article with ' + id.type + ' "' +
id.id + '" had no fulltext Urls');
return null;
id.id + '" had no fulltext Urls')
return null
}
module.exports = EuPmc;
module.exports = EuPmc

@@ -1,123 +0,114 @@

var util = require('util')
, fs = require('fs')
, chalk = require('chalk')
, got = require('got')
, mkdirp = require('mkdirp')
, _ = require('lodash')
, request = require('requestretry')
, ProgressBar = require('progress')
, config = require ('./config.js')
var fs = require('fs')
var chalk = require('chalk')
var request = require('requestretry')
var ProgressBar = require('progress')
var config = require('./config.js')
var log = require('winston')
var parseString = require('xml2js').parseString
var IEEE = function(opts) {
var IEEE = function (opts) {
this.baseurl = 'http://ieeexplore.ieee.org/gateway/ipsSearch.jsp?'
this.opts = opts;
this.opts = opts
}
IEEE.prototype.search = function(query) {
IEEE.prototype.search = function (query) {
var ieee = this
var ieee = this;
if (ieee.opts.xml) {
log.warn("The IEEE API does not provide fulltext XML, so the --xml flag will be ignored");
log.warn('The IEEE API does not provide fulltext XML, so the --xml flag will be ignored')
}
if (ieee.opts.pdf) {
log.warn("The IEEE API does not provide fulltext PDF links, so the --pdf flag will be ignored");
log.warn('The IEEE API does not provide fulltext PDF links, so the --pdf flag will be ignored')
}
if (ieee.opts.minedterms) {
log.warn("The IEEE API does not provide mined terms, so the --minedterms flag will be ignored");
log.warn('The IEEE API does not provide mined terms, so the --minedterms flag will be ignored')
}
if (ieee.opts.supp) {
log.warn("The IEEE API does not provide supplementary materials, so the --supp flag will be ignored");
log.warn('The IEEE API does not provide supplementary materials, so the --supp flag will be ignored')
}
ieee.pagesize = 200;
ieee.pagesize = 200
options = {
var options = {
hc: ieee.pagesize
};
}
if (!ieee.opts.all) {
options['oa'] = 1;
options['oa'] = 1
}
ieee.queryurl = ieee.buildQuery(query, options);
ieee.first = true;
ieee.residualhits = 0;
ieee.hitlimit = ieee.opts.hitlimit ? ieee.opts.hitlimit : 0;
ieee.hitcount = 0;
ieee.allresults = [];
ieee.iter = 1;
ieee.queryurl = ieee.buildQuery(query, options)
ieee.first = true
ieee.residualhits = 0
ieee.hitlimit = ieee.opts.hitlimit ? ieee.opts.hitlimit : 0
ieee.hitcount = 0
ieee.allresults = []
ieee.iter = 1
ieee.timeouts = 0;
ieee.timeouts = 0
ieee.resultstream = fs.createWriteStream('ieee_results.json');
ieee.fulltextURLstream = fs.createWriteStream('ieee_fulltext_html_urls.txt');
ieee.resultstream = fs.createWriteStream('ieee_results.json')
ieee.fulltextURLstream = fs.createWriteStream('ieee_fulltext_html_urls.txt')
ieee.pageQuery();
ieee.pageQuery()
}
IEEE.prototype.pageQuery = function() {
IEEE.prototype.pageQuery = function () {
var ieee = this
var ieee = this;
var thisQueryUrl = ieee.queryurl
var thisQueryUrl = ieee.queryurl;
if (ieee.iter > 0) {
var pageterm = '&rs=' + ieee.iter;
thisQueryUrl += pageterm;
var pageterm = '&rs=' + ieee.iter
thisQueryUrl += pageterm
}
log.debug(thisQueryUrl);
log.debug(thisQueryUrl)
var rq = request.get({url: thisQueryUrl,
headers: { 'Accept': 'application/json',
'User-Agent': config.userAgent}});
headers: {
'Accept': 'application/json',
'User-Agent': config.userAgent
}})
var convertXML2JSON = function (data) {
//console.log(data.body)
// console.log(data.body)
parseString(data.body, function (err, datum) {
cb = ieee.completeCallback.bind(ieee, datum)
cb() } )
if (err) throw err
var cb = ieee.completeCallback.bind(ieee, datum)
cb()
})
}
rq.on('complete', convertXML2JSON);
rq.on('timeout', ieee.timeoutCallback.bind(ieee));
rq.on('complete', convertXML2JSON)
rq.on('timeout', ieee.timeoutCallback.bind(ieee))
}
IEEE.prototype.completeCallback = function(data) {
IEEE.prototype.completeCallback = function (data) {
var ieee = this
var ieee = this;
var totalfound = 0
var totalfound = 0;
if (data.hasOwnProperty('root')) {
var totalfound = parseInt(data.root.totalfound[0]);
totalfound = parseInt(data.root.totalfound[0])
}
if (ieee.first) {
ieee.first = false;
ieee.hitcount = totalfound;
var oaclause = ieee.opts.all ? '' : ' open access';
log.info('Found ' + ieee.hitcount + oaclause + ' results');
if (ieee.hitcount == 0 || ieee.opts.noexecute) {
process.exit(0);
ieee.first = false
ieee.hitcount = totalfound
var oaclause = ieee.opts.all ? '' : ' open access'
log.info('Found ' + ieee.hitcount + oaclause + ' results')
if (ieee.hitcount === 0 || ieee.opts.noexecute) {
process.exit(0)
}
log.info('The IEEE API does not provide fulltext HTML links, but we will try to guess them from other metadata');
log.info('The IEEE API does not provide fulltext HTML links, but we will try to guess them from other metadata')
// set hitlimit
// set hitlimit
if (ieee.hitlimit && ieee.hitlimit < ieee.hitcount) {
log.info('Limiting to ' + ieee.hitlimit + ' hits');
}
else { ieee.hitlimit = ieee.hitcount; }
log.info('Limiting to ' + ieee.hitlimit + ' hits')
} else { ieee.hitlimit = ieee.hitcount }
// create progress bar
var progmsg = 'Fetching result metadata [:bar] :percent' +
' (:current/:total) [:elapseds elapsed, eta :etas]';
' (:current/:total) [:elapseds elapsed, eta :etas]'
var progopts = {

@@ -127,117 +118,100 @@ total: ieee.hitlimit,

complete: chalk.green('=')
};
ieee.pageprogress = new ProgressBar(progmsg, progopts);
}
ieee.pageprogress = new ProgressBar(progmsg, progopts)
}
var result
if (!ieee.residualhits) {
result = data.root.document
} else {
result = data.root.document.slice(0, ieee.residualhits)
}
var pretty = JSON.stringify(result, null, 2)
ieee.resultstream.write(pretty)
if(!ieee.residualhits) { var result = data.root.document; }
else { var result = data.root.document.slice(0,ieee.residualhits); }
var pretty = JSON.stringify(result, null, 2);
ieee.resultstream.write(pretty);
var urls = ieee.getFulltextHTMLUrls(result)
urls.forEach(function (url) { ieee.fulltextURLstream.write(url + '\n') })
var urls = ieee.getFulltextHTMLUrls(result);
urls.forEach(function(url) { ieee.fulltextURLstream.write(url + '\n') });
ieee.allresults = ieee.allresults.concat(result)
ieee.pageprogress.tick(result.length)
ieee.allresults = ieee.allresults.concat(result);
ieee.pageprogress.tick(result.length);
if (ieee.allresults.length < ieee.hitcount) {
ieee.iter += 1;
remaininghits = ieee.hitcount - ieee.allresults.length;
if(remaininghits<ieee.pagesize) { ieee.residualhits = remaininghits; }
log.debug(ieee.allresults.length);
ieee.pageQuery();
ieee.iter += 1
var remaininghits = ieee.hitcount - ieee.allresults.length
if (remaininghits < ieee.pagesize) { ieee.residualhits = remaininghits }
log.debug(ieee.allresults.length)
ieee.pageQuery()
} else {
log.info('Done collecting results. Got ' + ieee.allresults.length);
ieee.handleSearchResults(ieee);
log.info('Done collecting results. Got ' + ieee.allresults.length)
ieee.handleSearchResults(ieee)
}
}
IEEE.prototype.handleSearchResults = function(ieee) {
IEEE.prototype.handleSearchResults = function (ieee) {
// write the full result set to a file
log.info('Saving result metadata');
var pretty = JSON.stringify(ieee.allresults, null, 2);
log.info('Saving result metadata')
var pretty = JSON.stringify(ieee.allresults, null, 2)
fs.writeFileSync('ieee_results.json', pretty)
var filename = chalk.blue('ieee_results.json')
log.info('Full IEEE result metadata written to ' + filename);
log.info('Full IEEE result metadata written to ' + filename)
ieee.fulltextURLstream.end();
ieee.fulltextURLstream.end()
filename = chalk.blue('ieee_fulltext_html_urls.txt')
log.info('Fulltext HTML URL list written to ' + filename);
log.info('Fulltext HTML URL list written to ' + filename)
}
IEEE.prototype.timeoutCallback = function(ms) {
IEEE.prototype.timeoutCallback = function (ms) {
var ieee = this
var ieee = this;
log.error('Did not get a response from the IEEE API within ' + ms + 'ms')
log.error('There have been ' + ieee.timeouts + ' total timeouts')
ieee.timeouts += 1
log.error('Did not get a response from the IEEE API within ' + ms + 'ms');
log.error('There have been ' + ieee.timeouts + ' total timeouts');
ieee.timeouts += 1;
if (ieee.timeouts > 99) {
log.info('Timed out 100 times - the connection is probably broken');
log.info('Timed out 100 times - the connection is probably broken')
log.info('You have either been disconnected from the internet, or ' +
'the API provider has blocked your IP');
process.exit(1);
'the API provider has blocked your IP')
process.exit(1)
} else {
log.info('Retrying timed-out query');
ieee.pageQuery();
log.info('Retrying timed-out query')
ieee.pageQuery()
}
}
};
IEEE.prototype.buildQuery = function (query, options) {
var ieee = this
IEEE.prototype.buildQuery = function(query, options) {
var queryurl = ieee.baseurl + 'querytext=' + encodeURIComponent(query)
var ieee = this;
var queryurl = ieee.baseurl + 'querytext=' + encodeURIComponent(query);
Object.keys(options).forEach(function(key) {
var val = options[key];
Object.keys(options).forEach(function (key) {
var val = options[key]
if (key.length > 0) {
queryurl += '&' + key + '=' + val;
queryurl += '&' + key + '=' + val
}
});
})
return queryurl;
return queryurl
}
IEEE.prototype.getFulltextHTMLUrl = function(result) {
var ieee = this;
IEEE.prototype.getFulltextHTMLUrl = function (result) {
if (result.htmlFlag && result.htmlFlag[0] === '1') {
var arnumber = result.arnumber[0]
var arnumber = result.arnumber[0];
var url = "http://ieeexplore.ieee.org/xpls/icp.jsp?arnumber=" + arnumber;
result.html = url;
return url;
var url = 'http://ieeexplore.ieee.org/xpls/icp.jsp?arnumber=' + arnumber
result.html = url
return url
} else {
return null;
return null
}
}
IEEE.prototype.getFulltextHTMLUrls = function(results) {
IEEE.prototype.getFulltextHTMLUrls = function (results) {
var ieee = this
var ieee = this;
return results
.map(ieee.getFulltextHTMLUrl, ieee)
.filter(function(x) { return !(x === null) });
.filter(function (x) { return !(x === null) })
}
module.exports = IEEE;
module.exports = IEEE

@@ -1,2 +0,2 @@

var log = module.exports;
var log = module.exports

@@ -14,3 +14,3 @@ log.levels = {

error: 9
};
}

@@ -28,2 +28,2 @@ log.colors = {

error: 'red'
};
}
{
"name": "getpapers",
"description": "Get fulltexts or fulltext URLs of papers matching a search query",
"version": "0.4.13",
"version": "0.4.14",
"homepage": "https://github.com/ContentMine/getpapers",

@@ -29,3 +29,3 @@ "author": {

"scripts": {
"test": "mocha",
"test": "standard && mocha ",
"coverage": "istanbul cover ./node_modules/mocha/bin/_mocha --report lcovonly -- -R spec",

@@ -47,3 +47,3 @@ "coveralls": "istanbul cover ./node_modules/mocha/bin/_mocha --report lcovonly -- -R spec && cat ./coverage/lcov.info | ./node_modules/coveralls/bin/coveralls.js && rm -rf ./coverage"

"version_compare": "0.0.3",
"winston": "~1.0.0",
"winston": "~2.3.1",
"xml2js": "^0.4.17"

@@ -55,8 +55,13 @@ },

"devDependencies": {
"chai": "^4.0.2",
"coveralls": "~2.11.2",
"grunt": "~0.4.5",
"coveralls": "~2.11.2",
"istanbul": "~0.3.13",
"mocha": "~2.2.4",
"mocha-lcov-reporter": "0.0.2",
"nock": "^9.0.13",
"should": "~4.0.0",
"istanbul": "~0.3.13",
"mocha": "~2.2.4"
"standard": "^10.0.2",
"sinon": "^2.3.5",
"sinon-chai": "^2.11.0"
},

@@ -68,3 +73,15 @@ "keywords": [

"science"
]
],
"standard": {
"globals": [
"describe",
"context",
"before",
"beforeEach",
"after",
"afterEach",
"it",
"expect"
]
}
}
SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc