Huge News!Announcing our $40M Series B led by Abstract Ventures.Learn More
Sign inDemoInstall


Package Overview
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies


academia - npm Package Compare versions

Comparing version 0.3.0 to 0.4.0




@@ -1,316 +0,95 @@

/// <reference path="./type_declarations/index.d.ts" />
var lexing = require('lexing');
var academia;
(function (academia) {
var styles;
(function (styles) {
var acl;
(function (acl) {
function pushAll(array, items) {
return Array.prototype.push.apply(array, items);
var name = '[A-Z][^()\\s]+(?: [IV]+)?';
var year = '[0-9]{4}(?:[-–—][0-9]{4})?[a-z]?';
var citeSources = [
// et al., duo, and single, with year in parens
(name + "\\s+et\\s+al.\\s+\\(" + year + "\\)"),
(name + "\\s+(?:and|&)\\s+" + name + "\\s+\\(" + year + "\\)"),
(name + "\\s+\\(" + year + "\\)"),
// et al., duo, and single, with year not in parens (note the commas)
(name + "\\s+et\\s+al.,\\s+" + year + "\\b"),
(name + "\\s+(?:and|&)\\s+" + name + ",\\s+" + year + "\\b"),
(name + ",\\s+" + year + "\\b"),
acl.citeRegExp = new RegExp(citeSources.join('|'), 'g');
acl.yearRegExp = new RegExp(year);
var citeCleanRegExp = new RegExp("[(),]|" + year, 'g');
Given the text of a paper, extract the `Cite`s using regular expressions.
function parseCites(body) {
// when String.prototype.match is called with a RegExp with the 'g' (global)
// flag set, the result will ignore any capture groups and return an Array of
// strings, or null if the RegExp matched nothing.
var cites = body.match(acl.citeRegExp) || [];
return (cite) {
var year_match = cite.match(acl.yearRegExp);
// we cull it down to just the names by removing parentheses, commas,
// and years (with optional suffixes), and trimming any extra whitespace
var names_string = cite.replace(citeCleanRegExp, '').trim();
return {
authors: names.parseNames(names_string),
year: year_match ? year_match[0] : null,
style: types.CiteStyle.Textual,
source: cite,
acl.parseCites = parseCites;
acl.referenceRegExp = new RegExp("^(.+?)[.,]?\\s*\\(?(" + year + ")\\)?\\.\\s*(.+?)\\.");
Given a string representing an individual reference in a bibliography, parse
it into a Reference structure.
function parseReference(reference) {
var match = reference.match(acl.referenceRegExp);
var authors = match ? names.parseNames(match[1]) : [];
return {
authors: authors,
year: match ? match[2] : undefined,
title: match ? match[3] : undefined,
source: reference,
acl.parseReference = parseReference;
Given a Reference, format it as a string.
function formatReference(reference) {
var authors = names.formatNames(reference.authors);
var parts = [authors, reference.year, reference.title, reference.venue, reference.publisher, reference.pages];
return parts.filter(function (part) { return part !== undefined && part !== null; }).join('. ') + '.';
acl.formatReference = formatReference;
In-place modifies `cites` by setting the `reference` value of each one where
a unique match from `references` is found.
TODO: handle multiple matches somehow.
function linkCites(cites, references) {
cites.forEach(function (cite) {
var matching_references = references.filter(function (reference) {
return names.authorsMatch(cite.authors, reference.authors) && (cite.year == reference.year);
if (matching_references.length === 1) {
cite.reference = matching_references[0];
acl.linkCites = linkCites;
Join the paper's sections into a single string, for searching, and find all cites
in that string. Parse references, and link the cites to them heuristically.
Extend the given paper with the parsed references and cites (linked or not),
and return it.
function linkPaper(paper, referencesTitleRegExp) {
if (referencesTitleRegExp === void 0) { referencesTitleRegExp = /References?/; }
var body = paper.sections
.filter(function (section) { return !referencesTitleRegExp.test(section.title); })
.map(function (section) { return ("# " + section.title + "\n" + section.paragraphs.join('\n')); })
paper.references = paper.sections
.filter(function (section) { return referencesTitleRegExp.test(section.title); })
.map(function (section) { return; })
.reduce(function (accumulator, references) {
pushAll(accumulator, references);
return accumulator;
}, []);
var cites = parseCites(body);
linkCites(cites, paper.references);
paper.cites = cites;
return paper;
acl.linkPaper = linkPaper;
})(acl = styles.acl || (styles.acl = {}));
})(styles = academia.styles || (academia.styles = {}));
var names;
(function (names_1) {
var Token = lexing.Token;
Given a name represented as an array of name parts, parse it into first name,
middle name, and last name.
parseName(['Leonardo', 'da', 'Vinci']) -> { first: 'Leonardo', last: 'da Vinci' }
parseName(['Chris', 'Callison-Burch']) -> { first: 'Chris', last: 'Callison-Burch' }
parseName(['Hanna', 'M', 'Wallach']) -> { first: 'Hanna', middle: 'M', last: 'Wallach' }
parseName(['Zhou']) -> { last: 'Zhou' }
parseName(['McCallum', 'Andrew']) -> { first: 'Andrew', last: 'McCallum' }
TODO: handle 'van', 'von', 'da', etc.
function parseName(parts) {
var n = parts.length;
if (n >= 3) {
return {
first: parts[0],
middle: parts.slice(1, n - 1).join(' '),
last: parts[n - 1],
else if (n == 2) {
return {
first: parts[0],
last: parts[1],
return {
last: parts[0]
var fs_1 = require('fs');
var chalk = require('chalk');
var yargs = require('yargs');
var acl_1 = require('./styles/acl');
function stderr(line) {
process.stderr.write(chalk.magenta(line) + '\n');
function highlight(filename) {
stderr("highlighting " + filename);
var paper_json = fs_1.readFileSync(filename, { encoding: 'utf8' });
var paper = JSON.parse(paper_json);
return paper.sections
.map(function (section) { return ("# " + section.title + "\n" + section.paragraphs.join('\n')); })
.replace(/# References?/g, function (group0) {
.replace(acl_1.citeRegExp, function (group0) {
function link(filename) {
var paper_json = fs_1.readFileSync(filename, { encoding: 'utf8' });
var original_paper = JSON.parse(paper_json);
// extract body and references from Paper object
var paper = acl_1.linkPaper(original_paper);
var linked_cites = paper.cites.filter(function (cite) { return cite.references.length > 0; });
// report
var report = {
filename: filename,
references: paper.references.length,
cites: paper.cites.length,
linked: linked_cites.length,
linking_success: (100 * linked_cites.length / paper.cites.length).toFixed(0) + '%'
// report
// output analysis
return paper;
function main() {
var argvparser = yargs
.usage('Usage: academia <command> <file>')
.command('highlight', 'highlight references in paper')
.example('academia highlight P14-1148.pdf.json', 'Print the Paper specified in P14-1148.pdf.json as plaintext with the references highlighted')
.command('link', 'detect references, citations, and link citations to references as possible')
.example('academia link P14-1148.pdf.json', 'Detect cites and references, link them, and print the full enhanced Paper object')
output: 'output file (- for STDOUT)',
help: 'print this help message',
verbose: 'print debug messages',
version: 'print version',
o: 'output',
h: 'help',
v: 'verbose',
output: '-',
var argv = argvparser.argv;
if ( {
else if (argv.version) {
else {
argv = argvparser.demand(2).argv;
// pull off positional arguments
var command = argv._[0];
var input_filename = argv._[1];
// apply command to input
var output;
if (command === 'highlight') {
output = highlight(input_filename);
names_1.parseName = parseName;
Opinionated name formatting.
function formatName(name) {
return [name.first, name.middle, name.last].filter(function (part) { return part !== null && part !== undefined; }).join(' ');
else if (command === 'link') {
var paper = link(input_filename);
output = JSON.stringify(paper);
names_1.formatName = formatName;
function formatNames(names) {
var name_strings =;
if (name_strings.length < 3) {
return name_strings.join(' and ');
// use the Oxford comma
var parts = name_strings.slice(0, -2); // might be []
parts.push(name_strings.slice(-2).join(', and '));
return parts.join(', ');
else {
stderr("Unrecognized command: \"" + command + "\"");
names_1.formatNames = formatNames;
var default_rules = [
[/^$/, function (match) { return Token('EOF'); }],
[/^\s+/, function (match) { return null; }],
[/^,/, function (match) { return Token('SEPARATOR', match[0]); }],
[/^(and|et|&)/, function (match) { return Token('CONJUNCTION', match[0]); }],
[/^[A-Z](\.|\s)/, function (match) { return Token('INITIAL', match[0].trim()); }],
[/^((van|von|da|de)\s+)?[A-Z][^,\s]+(\s+[IVX]+\b)?/i, function (match) { return Token('NAME', match[0]); }],
// pretty much a catch-all:
[/^[^,\s]+/i, function (match) { return Token('NAME', match[0]); }],
1. Typical list of 3+
'David Mimno, Hanna M Wallach, and Andrew McCallum' ->
['David Mimno', 'Hanna M Wallach', 'Andrew McCallum']
2. List of 3+ without the Oxford comma, in case that ever happens
'Aravind K Joshi, Ben King and Steven Abney' ->
['Aravind K Joshi', 'Ben King', 'Steven Abney']
3. Duo
'Daniel Ramage and Chris Callison-Burch' ->
['Daniel Ramage', 'Chris Callison-Burch']
4. Single author
'David Sankofl' ->
['David Sankofl']
5. Et al. abbreviation
'Zhao et al.' ->
['Zhao', 'al.']
TODO: handle last-name-first swaps, e.g.,
'Levy, R., & Daumé III, H.' -> 'R. Levy, H. Daumé III' -> ['R. Levy', 'H. Daumé III']
'Liu, F., Tian, F., & Zhu, Q.' -> 'F. Liu, F. Tian, & Q. Zhu' -> ['F. Liu', 'F. Tian', 'Q. Zhu']
Technically, this is ambiguous, since we could support lists of only last names
(e.g., 'Liu, Tian'; is this ['Tian Liu'] or ['Liu', 'Tian']?), but heuristics are better than nothing.
Example chunks:
function parseNames(input) {
var input_iterable = new lexing.StringIterator(input);
var tokenizer = new lexing.Tokenizer(default_rules);
var token_iterator =;
var names = [];
var buffer = [];
var buffer_swap = false;
function flush() {
if (buffer_swap) {
// move the first item to the last item
var name = parseName(buffer);
// reset
buffer = [];
buffer_swap = false;
while (1) {
var token =;
// console.error('%s=%s',, token.value);
if ( === 'EOF') {
else if ( === 'NAME') {
// the first long name after
if (buffer.length > 0 && buffer_swap) {
else if ( === 'INITIAL') {
// console.log('INITIAL=%s', token.value);
else if ( === 'SEPARATOR' || === 'CONJUNCTION') {
if (buffer.length === 1) {
buffer_swap = true;
else if (buffer.length > 1) {
else {
// finish up
if (buffer.length > 0) {
return names;
names_1.parseNames = parseNames;
Typically, in-paper citations (`Cite`s) only have the last names of the authors,
while the `Reference`s in the Bibliography have full names, or at least first
initials and last names.
This method determines whether a `Cite`'s names match a `Reference`'s authors.
authorsMatch(['Joshi'], ['Aravind K Joshi']) -> true
authorsMatch(['Diab', 'Kamboj'], ['Mona Diab', 'Ankit Kamboj']) -> true
'et al.' gets special treatment. 'et al.' is a match if and only if there are
more reference authors beyond the one parallel to the 'et al.' citation author.
In other words, 'et al.' cannot stand in for a single author.
authorsMatch(['Blei', 'et al.'], ['David M Blei', 'Andrew Y Ng', 'Michael I Jordan']) -> true
function authorsMatch(citeAuthors, referenceAuthors) {
for (var i = 0, l = Math.max(citeAuthors.length, referenceAuthors.length); i < l; i++) {
var citeAuthor = citeAuthors[i];
var referenceAuthor = referenceAuthors[i];
// the et al. handling has to precede the normal name-checking conditional below
if (citeAuthor && citeAuthor.last === 'al.' && referenceAuthors.length > (i + 1)) {
// early exit: ignore the rest of the reference authors
return true;
if (citeAuthor === undefined || referenceAuthor === undefined || citeAuthor.last !== referenceAuthor.last) {
return false;
return true;
names_1.authorsMatch = authorsMatch;
})(names = academia.names || (academia.names = {}));
var types;
(function (types) {
Textual: Brown (2015)
Parenthetical: (Brown 2015)
Alternate: Brown 2015
(function (CiteStyle) {
CiteStyle[CiteStyle["Textual"] = 0] = "Textual";
CiteStyle[CiteStyle["Parenthetical"] = 1] = "Parenthetical";
CiteStyle[CiteStyle["Alternate"] = 2] = "Alternate";
})(types.CiteStyle || (types.CiteStyle = {}));
var CiteStyle = types.CiteStyle;
})(types = academia.types || (academia.types = {}));
})(academia || (academia = {}));
module.exports = academia;
var outputStream = (argv.output == '-') ? process.stdout : fs_1.createWriteStream(argv.output, { encoding: 'utf8' });
outputStream.write(output + '\n');
exports.main = main;

@@ -1,4 +0,2 @@

/// <reference path="./type_declarations/index.d.ts" />
var lexing = require('lexing');
var Token = lexing.Token;
var lexing_1 = require('lexing');

@@ -55,10 +53,10 @@ Given a name represented by a single string, parse it into first name, middle

var default_rules = [
[/^$/, function (match) { return Token('EOF'); }],
[/^$/, function (match) { return lexing_1.Token('EOF'); }],
[/^\s+/, function (match) { return null; }],
[/^,/, function (match) { return Token('SEPARATOR', match[0]); }],
[/^(and|et|&)/, function (match) { return Token('CONJUNCTION', match[0]); }],
[/^[A-Z](\.|\s)/, function (match) { return Token('INITIAL', match[0].trim()); }],
[/^((van|von|da|de)\s+)?[A-Z][^,\s]+(\s+[IVX]+\b)?/i, function (match) { return Token('NAME', match[0]); }],
[/^,/, function (match) { return lexing_1.Token('SEPARATOR', match[0]); }],
[/^(and|et|&)/, function (match) { return lexing_1.Token('CONJUNCTION', match[0]); }],
[/^[A-Z](\.|\s)/, function (match) { return lexing_1.Token('INITIAL', match[0].trim()); }],
[/^((van|von|da|de)\s+)?[A-Z][^,\s]+(\s+[IVX]+\b)?/i, function (match) { return lexing_1.Token('NAME', match[0]); }],
// pretty much a catch-all:
[/^[^,\s]+/i, function (match) { return Token('NAME', match[0]); }],
[/^[^,\s]+/i, function (match) { return lexing_1.Token('NAME', match[0]); }],

@@ -98,4 +96,4 @@ /**

function parseNames(input) {
var input_iterable = new lexing.StringIterator(input);
var tokenizer = new lexing.Tokenizer(default_rules);
var input_iterable = new lexing_1.StringIterator(input);
var tokenizer = new lexing_1.Tokenizer(default_rules);
var token_iterator =;

@@ -102,0 +100,0 @@ var names = [];

@@ -11,5 +11,8 @@ {

"version": "0.3.0",
"version": "0.4.0",
"homepage": "",
"repository": "git://",
"repository": {
"type": "git",
"url": ""
"author": "Christopher Brown <> (",

@@ -23,5 +26,6 @@ "license": "MIT",

"devDependencies": {
"babel-core": "^5.0.0",
"declarations": "*",
"mocha": "*",
"typescript": "*",
"typescript-declare": "*"
"typescript": "next"

@@ -28,0 +32,0 @@ "scripts": {

@@ -75,2 +75,2 @@ # academia

Copyright 2015 Christopher Brown. [MIT Licensed](
Copyright 2015 Christopher Brown. [MIT Licensed](
var types = require('../types');
var names = require('../names');
function pushAll(array, items) {
return Array.prototype.push.apply(array, items);
var name = '[A-Z][^()\\s]+(?: [IV]+)?';

@@ -22,23 +19,18 @@ var year = '[0-9]{4}(?:[-–—][0-9]{4})?[a-z]?';

Given the text of a paper, extract the `Cite`s using regular expressions.
Find the start indices and lengths of all non-overlapping substrings matching
`regExp` in `input`.
function parseCites(body) {
// when String.prototype.match is called with a RegExp with the 'g' (global)
// flag set, the result will ignore any capture groups and return an Array of
// strings, or null if the RegExp matched nothing.
var cites = body.match(exports.citeRegExp) || [];
return (cite) {
var year_match = cite.match(exports.yearRegExp);
// we cull it down to just the names by removing parentheses, commas,
// and years (with optional suffixes), and trimming any extra whitespace
var names_string = cite.replace(citeCleanRegExp, '').trim();
return {
authors: names.parseNames(names_string),
year: year_match ? year_match[0] : null,
style: types.CiteStyle.Textual,
source: cite,
function matchSpans(input, regExp) {
if (regExp === void 0) { regExp = exports.citeRegExp; }
// reset the regex
regExp.lastIndex = 0;
// set up the iteration variables
var previousLastIndex = regExp.lastIndex;
var spans = [];
var match;
while ((match = regExp.exec(input)) !== null) {
spans.push([match.index, match[0].length]);
return spans;
exports.parseCites = parseCites;
exports.referenceRegExp = new RegExp("^(.+?)[.,]?\\s*\\(?(" + year + ")\\)?\\.\\s*(.+?)\\.");

@@ -77,8 +69,12 @@ /**

cites.forEach(function (cite) {
var matching_references = references.filter(function (reference) {
cite.references = references
.map(function (reference, reference_i) { return ({ reference: reference, reference_i: reference_i }); })
.filter(function (_a) {
var reference = _a.reference, reference_i = _a.reference_i;
return names.authorsMatch(cite.authors, reference.authors) && (cite.year == reference.year);
.map(function (_a) {
var reference = _a.reference, reference_i = _a.reference_i;
return ("/references/" + reference_i);
if (matching_references.length === 1) {
cite.reference = matching_references[0];

@@ -88,2 +84,28 @@ }

Given the text of some part of a paper, extract the `Cite`s using regular expressions.
function findCites(input, pointer) {
return matchSpans(input, exports.citeRegExp).map(function (_a) {
var offset = _a[0], length = _a[1];
var text = input.slice(offset, offset + length);
var year_match = text.match(exports.yearRegExp);
// we cull it down to just the names by removing parentheses, commas,
// and years (with optional suffixes), and trimming any extra whitespace
var names_string = text.replace(citeCleanRegExp, '').trim();
return {
style: types.CiteStyle.Textual,
text: text,
origin: {
pointer: pointer,
offset: offset,
length: length,
authors: names.parseNames(names_string),
year: year_match ? year_match[0] : null,
references: [],
exports.findCites = findCites;
Join the paper's sections into a single string, for searching, and find all cites

@@ -97,18 +119,20 @@ in that string. Parse references, and link the cites to them heuristically.

if (referencesTitleRegExp === void 0) { referencesTitleRegExp = /References?/; }
var body = paper.sections
.filter(function (section) { return !referencesTitleRegExp.test(section.title); })
.map(function (section) { return ("# " + section.title + "\n" + section.paragraphs.join('\n')); })
paper.references = paper.sections
var sections = paper.sections;
var body_sections = sections.filter(function (section) { return !referencesTitleRegExp.test(section.title); });
var references = sections
.filter(function (section) { return referencesTitleRegExp.test(section.title); })
.map(function (section) { return; })
.reduce(function (accumulator, references) {
pushAll(accumulator, references);
accumulator.push.apply(accumulator, references);
return accumulator;
}, []);
var cites = parseCites(body);
linkCites(cites, paper.references);
paper.cites = cites;
return paper;
var cites = [];
body_sections.forEach(function (section, section_i) {
section.paragraphs.forEach(function (paragraph, paragraph_i) {
cites.push.apply(cites, findCites(paragraph, "/sections/" + section_i + "/paragraphs/" + paragraph_i));
linkCites(cites, references);
return { sections: sections, references: references, cites: cites };
exports.linkPaper = linkPaper;

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo


  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog



Stay in touch

Get open source security insights delivered straight into your inbox.

  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc