academia - npm Package Compare versions

Comparing version 0.3.0 to 0.4.0

.npmignore

405

index.js

@@ -1,316 +0,95 @@

/// <reference path="./type_declarations/index.d.ts" />
var lexing = require('lexing');
var academia;
(function (academia) {
var styles;
(function (styles) {
var acl;
(function (acl) {
function pushAll(array, items) {
return Array.prototype.push.apply(array, items);
}
var name = '[A-Z][^()\\s]+(?: [IV]+)?';
var year = '[0-9]{4}(?:[-–—][0-9]{4})?[a-z]?';
var citeSources = [
// et al., duo, and single, with year in parens
(name + "\\s+et\\s+al.\\s+\\(" + year + "\\)"),
(name + "\\s+(?:and|&)\\s+" + name + "\\s+\\(" + year + "\\)"),
(name + "\\s+\\(" + year + "\\)"),
// et al., duo, and single, with year not in parens (note the commas)
(name + "\\s+et\\s+al.,\\s+" + year + "\\b"),
(name + "\\s+(?:and|&)\\s+" + name + ",\\s+" + year + "\\b"),
(name + ",\\s+" + year + "\\b"),
];
acl.citeRegExp = new RegExp(citeSources.join('|'), 'g');
acl.yearRegExp = new RegExp(year);
var citeCleanRegExp = new RegExp("[(),]|" + year, 'g');
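// Illustrative strings the combined citeRegExp is built to match (hypothetical examples):
//   "Brown et al. (2015)", "Brown and Smith (2015)", "Brown (2015)",
//   "Brown et al., 2015", "Brown & Smith, 2015", "Brown, 2015"
// year ranges and suffixes such as "2014-2015" or "2015a" are also accepted by `year`.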
/**
Given the text of a paper, extract the `Cite`s using regular expressions.
*/
function parseCites(body) {
// when String.prototype.match is called with a RegExp with the 'g' (global)
// flag set, the result will ignore any capture groups and return an Array of
// strings, or null if the RegExp matched nothing.
var cites = body.match(acl.citeRegExp) || [];
return cites.map(function (cite) {
var year_match = cite.match(acl.yearRegExp);
// we cull it down to just the names by removing parentheses, commas,
// and years (with optional suffixes), and trimming any extra whitespace
var names_string = cite.replace(citeCleanRegExp, '').trim();
return {
authors: names.parseNames(names_string),
year: year_match ? year_match[0] : null,
style: types.CiteStyle.Textual,
source: cite,
};
});
}
acl.parseCites = parseCites;
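// Usage sketch (hypothetical input; `acl` is exposed as academia.styles.acl):
//   parseCites('As shown by Brown and Smith (2015), ...') ->
//     [{ authors: [{ last: 'Brown' }, { last: 'Smith' }], year: '2015',
//        style: types.CiteStyle.Textual, source: 'Brown and Smith (2015)' }]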
acl.referenceRegExp = new RegExp("^(.+?)[.,]?\\s*\\(?(" + year + ")\\)?\\.\\s*(.+?)\\.");
/**
Given a string representing an individual reference in a bibliography, parse
it into a Reference structure.
*/
function parseReference(reference) {
var match = reference.match(acl.referenceRegExp);
var authors = match ? names.parseNames(match[1]) : [];
return {
authors: authors,
year: match ? match[2] : undefined,
title: match ? match[3] : undefined,
source: reference,
};
}
acl.parseReference = parseReference;
/**
Given a Reference, format it as a string.
*/
function formatReference(reference) {
var authors = names.formatNames(reference.authors);
var parts = [authors, reference.year, reference.title, reference.venue, reference.publisher, reference.pages];
return parts.filter(function (part) { return part !== undefined && part !== null; }).join('. ') + '.';
}
acl.formatReference = formatReference;
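// e.g. (hypothetical reference):
//   formatReference({ authors: [{ first: 'Jane', last: 'Doe' }], year: '2015',
//                     title: 'A Title', venue: 'ACL' })
//   -> 'Jane Doe. 2015. A Title. ACL.'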
/**
In-place modifies `cites` by setting the `reference` value of each one where
a unique match from `references` is found.
TODO: handle multiple matches somehow.
*/
function linkCites(cites, references) {
cites.forEach(function (cite) {
var matching_references = references.filter(function (reference) {
return names.authorsMatch(cite.authors, reference.authors) && (cite.year == reference.year);
});
if (matching_references.length === 1) {
cite.reference = matching_references[0];
}
});
}
acl.linkCites = linkCites;
/**
Join the paper's sections into a single string, for searching, and find all cites
in that string. Parse references, and link the cites to them heuristically.
Extend the given paper with the parsed references and cites (linked or not),
and return it.
*/
function linkPaper(paper, referencesTitleRegExp) {
if (referencesTitleRegExp === void 0) { referencesTitleRegExp = /References?/; }
var body = paper.sections
.filter(function (section) { return !referencesTitleRegExp.test(section.title); })
.map(function (section) { return ("# " + section.title + "\n" + section.paragraphs.join('\n')); })
.join('\n');
paper.references = paper.sections
.filter(function (section) { return referencesTitleRegExp.test(section.title); })
.map(function (section) { return section.paragraphs.map(parseReference); })
.reduce(function (accumulator, references) {
pushAll(accumulator, references);
return accumulator;
}, []);
var cites = parseCites(body);
linkCites(cites, paper.references);
paper.cites = cites;
return paper;
}
acl.linkPaper = linkPaper;
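// Usage sketch (minimal hypothetical Paper):
//   linkPaper({ sections: [
//     { title: 'Introduction', paragraphs: ['... as in Brown (2015) ...'] },
//     { title: 'References', paragraphs: ['Jane Brown. 2015. A Title. ACL.'] },
//   ] })
//   returns the same object, extended with `references` (parsed from the References
//   section) and `cites`, where a uniquely matched cite gets its `reference` set.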
})(acl = styles.acl || (styles.acl = {}));
})(styles = academia.styles || (academia.styles = {}));
var names;
(function (names_1) {
var Token = lexing.Token;
/**
Given a name represented as an array of name parts, parse it into first name, middle
name, and last name.
parseName(['Leonardo', 'da', 'Vinci']) -> { first: 'Leonardo', last: 'da Vinci' }
parseName(['Chris', 'Callison-Burch']) -> { first: 'Chris', last: 'Callison-Burch' }
parseName(['Hanna', 'M', 'Wallach']) -> { first: 'Hanna', middle: 'M', last: 'Wallach' }
parseName(['Zhou']) -> { last: 'Zhou' }
parseName(['McCallum', 'Andrew']) -> { first: 'Andrew', last: 'McCallum' }
TODO: handle 'van', 'von', 'da', etc.
*/
function parseName(parts) {
var n = parts.length;
if (n >= 3) {
return {
first: parts[0],
middle: parts.slice(1, n - 1).join(' '),
last: parts[n - 1],
};
}
else if (n == 2) {
return {
first: parts[0],
last: parts[1],
};
}
return {
last: parts[0]
};
var fs_1 = require('fs');
var chalk = require('chalk');
var yargs = require('yargs');
var acl_1 = require('./styles/acl');
function stderr(line) {
process.stderr.write(chalk.magenta(line) + '\n');
}
function highlight(filename) {
stderr("highlighting " + filename);
var paper_json = fs_1.readFileSync(filename, { encoding: 'utf8' });
var paper = JSON.parse(paper_json);
return paper.sections
.map(function (section) { return ("# " + section.title + "\n" + section.paragraphs.join('\n')); })
.join('\n')
.replace(/# References?/g, function (group0) {
return chalk.blue(group0).toString();
})
.replace(acl_1.citeRegExp, function (group0) {
return chalk.green(group0).toString();
});
}
function link(filename) {
var paper_json = fs_1.readFileSync(filename, { encoding: 'utf8' });
var original_paper = JSON.parse(paper_json);
// extract body and references from Paper object
var paper = acl_1.linkPaper(original_paper);
var linked_cites = paper.cites.filter(function (cite) { return cite.references.length > 0; });
// report
var report = {
filename: filename,
references: paper.references.length,
cites: paper.cites.length,
linked: linked_cites.length,
linking_success: (100 * linked_cites.length / paper.cites.length).toFixed(0) + '%'
};
// report
stderr(JSON.stringify(report));
// output analysis
return paper;
}
function main() {
var argvparser = yargs
.usage('Usage: academia <command> <file>')
.command('highlight', 'highlight references in paper')
.example('academia highlight P14-1148.pdf.json', 'Print the Paper specified in P14-1148.pdf.json as plaintext with the references highlighted')
.command('link', 'detect references, citations, and link citations to references as possible')
.example('academia link P14-1148.pdf.json', 'Detect cites and references, link them, and print the full enhanced Paper object')
.describe({
output: 'output file (- for STDOUT)',
help: 'print this help message',
verbose: 'print debug messages',
version: 'print version',
})
.alias({
o: 'output',
h: 'help',
v: 'verbose',
})
.boolean([
'help',
'verbose',
])
.default({
output: '-',
});
var argv = argvparser.argv;
if (argv.help) {
argvparser.showHelp();
}
else if (argv.version) {
console.log(require('../package').version);
}
else {
argv = argvparser.demand(2).argv;
// pull off positional arguments
var command = argv._[0];
var input_filename = argv._[1];
// apply command to input
var output;
if (command === 'highlight') {
output = highlight(input_filename);
}
names_1.parseName = parseName;
/**
Opinionated name formatting.
*/
function formatName(name) {
return [name.first, name.middle, name.last].filter(function (part) { return part !== null && part !== undefined; }).join(' ');
else if (command === 'link') {
var paper = link(input_filename);
output = JSON.stringify(paper);
}
names_1.formatName = formatName;
function formatNames(names) {
var name_strings = names.map(formatName);
if (name_strings.length < 3) {
return name_strings.join(' and ');
}
// use the Oxford comma
var parts = name_strings.slice(0, -2); // might be []
parts.push(name_strings.slice(-2).join(', and '));
return parts.join(', ');
else {
stderr("Unrecognized command: \"" + command + "\"");
process.exit(1);
}
names_1.formatNames = formatNames;
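// e.g., formatNames([{ first: 'David', last: 'Mimno' }, { first: 'Hanna', middle: 'M', last: 'Wallach' },
//                    { first: 'Andrew', last: 'McCallum' }])
//   -> 'David Mimno, Hanna M Wallach, and Andrew McCallum'
// formatNames([{ last: 'Brown' }, { last: 'Smith' }]) -> 'Brown and Smith'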
var default_rules = [
[/^$/, function (match) { return Token('EOF'); }],
[/^\s+/, function (match) { return null; }],
[/^,/, function (match) { return Token('SEPARATOR', match[0]); }],
[/^(and|et|&)/, function (match) { return Token('CONJUNCTION', match[0]); }],
[/^[A-Z](\.|\s)/, function (match) { return Token('INITIAL', match[0].trim()); }],
[/^((van|von|da|de)\s+)?[A-Z][^,\s]+(\s+[IVX]+\b)?/i, function (match) { return Token('NAME', match[0]); }],
// pretty much a catch-all:
[/^[^,\s]+/i, function (match) { return Token('NAME', match[0]); }],
];
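// Illustrative tokenizations under these rules (hypothetical inputs):
//   'Hanna M Wallach' -> NAME('Hanna'), INITIAL('M'), NAME('Wallach'), EOF
//   'Levy, R.'        -> NAME('Levy'), SEPARATOR(','), INITIAL('R.'), EOF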
/**
1. Typical list of 3+
'David Mimno, Hanna M Wallach, and Andrew McCallum' ->
['David Mimno', 'Hanna M Wallach', 'Andrew McCallum']
2. List of 3+ without the Oxford comma, in case that ever happens
'Aravind K Joshi, Ben King and Steven Abney' ->
['Aravind K Joshi', 'Ben King', 'Steven Abney']
3. Duo
'Daniel Ramage and Chris Callison-Burch' ->
['Daniel Ramage', 'Chris Callison-Burch']
4. Single author
'David Sankofl' ->
['David Sankofl']
5. Et al. abbreviation
'Zhao et al.' ->
['Zhao', 'al.']
TODO: handle last-name-first swaps, e.g.,
'Levy, R., & Daumé III, H.' -> 'R. Levy, H. Daumé III' -> ['R. Levy', 'H. Daumé III']
Or:
'Liu, F., Tian, F., & Zhu, Q.' -> 'F. Liu, F. Tian, & Q. Zhu' -> ['F. Liu', 'F. Tian', 'Q. Zhu']
Technically, this is ambiguous, since we could support lists of only last names
(e.g., 'Liu, Tian'; is this ['Tian Liu'] or ['Liu', 'Tian']?), but heuristics are better than nothing.
Example chunks:
[FIRST MIDDLE LAST] SEP
[FIRST LAST] SEP
[LAST SEP FIRST] SEP
[LAST SEP INITIAL] [LAST2 SEP INITIAL2]
*/
function parseNames(input) {
var input_iterable = new lexing.StringIterator(input);
var tokenizer = new lexing.Tokenizer(default_rules);
var token_iterator = tokenizer.map(input_iterable);
var names = [];
var buffer = [];
var buffer_swap = false;
function flush() {
if (buffer_swap) {
// move the first item to the last item
buffer.push(buffer.shift());
}
var name = parseName(buffer);
names.push(name);
// reset
buffer = [];
buffer_swap = false;
}
while (1) {
var token = token_iterator.next();
// console.error('%s=%s', token.name, token.value);
// tokens: EOF NAME INITIAL SEPARATOR CONJUNCTION
if (token.name === 'EOF') {
break;
}
else if (token.name === 'NAME') {
// a long name arriving while a swap is pending starts a new name, so flush the buffered one
if (buffer.length > 0 && buffer_swap) {
flush();
}
buffer.push(token.value);
}
else if (token.name === 'INITIAL') {
// console.log('INITIAL=%s', token.value);
buffer.push(token.value);
}
else if (token.name === 'SEPARATOR' || token.name === 'CONJUNCTION') {
if (buffer.length === 1) {
buffer_swap = true;
}
else if (buffer.length > 1) {
flush();
}
else {
}
}
}
// finish up
if (buffer.length > 0) {
flush();
}
return names;
}
names_1.parseNames = parseNames;
/**
Typically, in-paper citations (`Cite`s) only have the last names of the authors,
while the `Reference`s in the Bibliography have full names, or at least first
initials and last names.
This method determines whether a `Cite`'s names match a `Reference`'s authors.
authorsMatch(['Joshi'], ['Aravind K Joshi']) -> true
authorsMatch(['Diab', 'Kamboj'], ['Mona Diab', 'Ankit Kamboj']) -> true
'et al.' gets special treatment. 'et al.' is a match if and only if there are
more reference authors beyond the one parallel to the 'et al.' citation author.
In other words, 'et al.' cannot stand in for a single author.
authorsMatch(['Blei', 'et al.'], ['David M Blei', 'Andrew Y Ng', 'Michael I Jordan']) -> true
*/
function authorsMatch(citeAuthors, referenceAuthors) {
for (var i = 0, l = Math.max(citeAuthors.length, referenceAuthors.length); i < l; i++) {
var citeAuthor = citeAuthors[i];
var referenceAuthor = referenceAuthors[i];
// the et al. handling has to precede the normal name-checking conditional below
if (citeAuthor && citeAuthor.last === 'al.' && referenceAuthors.length > (i + 1)) {
// early exit: ignore the rest of the reference authors
return true;
}
if (citeAuthor === undefined || referenceAuthor === undefined || citeAuthor.last !== referenceAuthor.last) {
return false;
}
}
return true;
}
names_1.authorsMatch = authorsMatch;
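// Complementary sketch of the 'et al.' rule (hypothetical Name objects):
//   authorsMatch([{ last: 'Blei' }, { last: 'al.' }], [{ last: 'Blei' }, { last: 'Ng' }]) -> false
//   (no reference authors remain beyond the one sitting parallel to 'et al.')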
})(names = academia.names || (academia.names = {}));
var types;
(function (types) {
/**
Textual: Brown (2015)
Parenthetical: (Brown 2015)
Alternate: Brown 2015
*/
(function (CiteStyle) {
CiteStyle[CiteStyle["Textual"] = 0] = "Textual";
CiteStyle[CiteStyle["Parenthetical"] = 1] = "Parenthetical";
CiteStyle[CiteStyle["Alternate"] = 2] = "Alternate";
})(types.CiteStyle || (types.CiteStyle = {}));
var CiteStyle = types.CiteStyle;
})(types = academia.types || (academia.types = {}));
})(academia || (academia = {}));
module.exports = academia;
var outputStream = (argv.output == '-') ? process.stdout : fs_1.createWriteStream(argv.output, { encoding: 'utf8' });
outputStream.write(output + '\n');
}
}
exports.main = main;

@@ -1,4 +0,2 @@

/// <reference path="./type_declarations/index.d.ts" />
var lexing = require('lexing');
var Token = lexing.Token;
var lexing_1 = require('lexing');
/**

@@ -55,10 +53,10 @@ Given a name represented by a single string, parse it into first name, middle

var default_rules = [
[/^$/, function (match) { return Token('EOF'); }],
[/^$/, function (match) { return lexing_1.Token('EOF'); }],
[/^\s+/, function (match) { return null; }],
[/^,/, function (match) { return Token('SEPARATOR', match[0]); }],
[/^(and|et|&)/, function (match) { return Token('CONJUNCTION', match[0]); }],
[/^[A-Z](\.|\s)/, function (match) { return Token('INITIAL', match[0].trim()); }],
[/^((van|von|da|de)\s+)?[A-Z][^,\s]+(\s+[IVX]+\b)?/i, function (match) { return Token('NAME', match[0]); }],
[/^,/, function (match) { return lexing_1.Token('SEPARATOR', match[0]); }],
[/^(and|et|&)/, function (match) { return lexing_1.Token('CONJUNCTION', match[0]); }],
[/^[A-Z](\.|\s)/, function (match) { return lexing_1.Token('INITIAL', match[0].trim()); }],
[/^((van|von|da|de)\s+)?[A-Z][^,\s]+(\s+[IVX]+\b)?/i, function (match) { return lexing_1.Token('NAME', match[0]); }],
// pretty much a catch-all:
[/^[^,\s]+/i, function (match) { return Token('NAME', match[0]); }],
[/^[^,\s]+/i, function (match) { return lexing_1.Token('NAME', match[0]); }],
];

@@ -98,4 +96,4 @@ /**

function parseNames(input) {
var input_iterable = new lexing.StringIterator(input);
var tokenizer = new lexing.Tokenizer(default_rules);
var input_iterable = new lexing_1.StringIterator(input);
var tokenizer = new lexing_1.Tokenizer(default_rules);
var token_iterator = tokenizer.map(input_iterable);

@@ -102,0 +100,0 @@ var names = [];

@@ -11,5 +11,8 @@ {

],
"version": "0.3.0",
"version": "0.4.0",
"homepage": "https://github.com/chbrown/academia",
"repository": "git://github.com/chbrown/academia.git",
"repository": {
"type": "git",
"url": "https://github.com/chbrown/academia.git"
},
"author": "Christopher Brown <io@henrian.com> (http://henrian.com)",

@@ -23,5 +26,6 @@ "license": "MIT",

"devDependencies": {
"babel-core": "^5.0.0",
"declarations": "*",
"mocha": "*",
"typescript": "*",
"typescript-declare": "*"
"typescript": "next"
},

@@ -28,0 +32,0 @@ "scripts": {

@@ -75,2 +75,2 @@ # academia

Copyright 2015 Christopher Brown. [MIT Licensed](http://opensource.org/licenses/MIT).
Copyright 2015 Christopher Brown. [MIT Licensed](http://chbrown.github.io/licenses/MIT/#2015).
var types = require('../types');
var names = require('../names');
function pushAll(array, items) {
return Array.prototype.push.apply(array, items);
}
var name = '[A-Z][^()\\s]+(?: [IV]+)?';

@@ -22,23 +19,18 @@ var year = '[0-9]{4}(?:[-–—][0-9]{4})?[a-z]?';

/**
Given the text of a paper, extract the `Cite`s using regular expressions.
find the start indices and lengths of all non-overlapping substrings matching
`regExp` in `input`.
*/
function parseCites(body) {
// when String.prototype.match is called with a RegExp with the 'g' (global)
// flag set, the result will ignore any capture groups and return an Array of
// strings, or null if the RegExp matched nothing.
var cites = body.match(exports.citeRegExp) || [];
return cites.map(function (cite) {
var year_match = cite.match(exports.yearRegExp);
// we cull it down to just the names by removing parentheses, commas,
// and years (with optional suffixes), and trimming any extra whitespace
var names_string = cite.replace(citeCleanRegExp, '').trim();
return {
authors: names.parseNames(names_string),
year: year_match ? year_match[0] : null,
style: types.CiteStyle.Textual,
source: cite,
};
});
function matchSpans(input, regExp) {
if (regExp === void 0) { regExp = exports.citeRegExp; }
// reset the regex
regExp.lastIndex = 0;
// set up the iteration variables
var previousLastIndex = regExp.lastIndex;
var spans = [];
var match;
while ((match = regExp.exec(input)) !== null) {
spans.push([match.index, match[0].length]);
}
return spans;
}
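// e.g. (hypothetical input): matchSpans('see Brown (2015) and Smith (2016)') -> [[4, 12], [21, 12]]
// each span is the [start index, length] of one non-overlapping citeRegExp match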
exports.parseCites = parseCites;
exports.referenceRegExp = new RegExp("^(.+?)[.,]?\\s*\\(?(" + year + ")\\)?\\.\\s*(.+?)\\.");

@@ -77,8 +69,12 @@ /**

cites.forEach(function (cite) {
var matching_references = references.filter(function (reference) {
cite.references = references
.map(function (reference, reference_i) { return ({ reference: reference, reference_i: reference_i }); })
.filter(function (_a) {
var reference = _a.reference, reference_i = _a.reference_i;
return names.authorsMatch(cite.authors, reference.authors) && (cite.year == reference.year);
})
.map(function (_a) {
var reference = _a.reference, reference_i = _a.reference_i;
return ("/references/" + reference_i);
});
if (matching_references.length === 1) {
cite.reference = matching_references[0];
}
});

@@ -88,2 +84,28 @@ }

/**
Given the text of some part of a paper, extract the `Cite`s using regular expressions.
*/
function findCites(input, pointer) {
return matchSpans(input, exports.citeRegExp).map(function (_a) {
var offset = _a[0], length = _a[1];
var text = input.slice(offset, offset + length);
var year_match = text.match(exports.yearRegExp);
// we cull it down to just the names by removing parentheses, commas,
// and years (with optional suffixes), and trimming any extra whitespace
var names_string = text.replace(citeCleanRegExp, '').trim();
return {
style: types.CiteStyle.Textual,
text: text,
origin: {
pointer: pointer,
offset: offset,
length: length,
},
authors: names.parseNames(names_string),
year: year_match ? year_match[0] : null,
references: [],
};
});
}
exports.findCites = findCites;
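// Usage sketch (hypothetical paragraph and pointer):
//   findCites('... Brown (2015) ...', '/sections/0/paragraphs/0') ->
//     [{ style: types.CiteStyle.Textual, text: 'Brown (2015)',
//        origin: { pointer: '/sections/0/paragraphs/0', offset: 4, length: 12 },
//        authors: [{ last: 'Brown' }], year: '2015', references: [] }]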
/**
Join the paper's sections into a single string, for searching, and find all cites

@@ -97,18 +119,20 @@ in that string. Parse references, and link the cites to them heuristically.

if (referencesTitleRegExp === void 0) { referencesTitleRegExp = /References?/; }
var body = paper.sections
.filter(function (section) { return !referencesTitleRegExp.test(section.title); })
.map(function (section) { return ("# " + section.title + "\n" + section.paragraphs.join('\n')); })
.join('\n');
paper.references = paper.sections
var sections = paper.sections;
var body_sections = sections.filter(function (section) { return !referencesTitleRegExp.test(section.title); });
var references = sections
.filter(function (section) { return referencesTitleRegExp.test(section.title); })
.map(function (section) { return section.paragraphs.map(parseReference); })
.reduce(function (accumulator, references) {
pushAll(accumulator, references);
accumulator.push.apply(accumulator, references);
return accumulator;
}, []);
var cites = parseCites(body);
linkCites(cites, paper.references);
paper.cites = cites;
return paper;
var cites = [];
body_sections.forEach(function (section, section_i) {
section.paragraphs.forEach(function (paragraph, paragraph_i) {
cites.push.apply(cites, findCites(paragraph, "/sections/" + section_i + "/paragraphs/" + paragraph_i));
});
});
linkCites(cites, references);
return { sections: sections, references: references, cites: cites };
}
exports.linkPaper = linkPaper;
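// Usage sketch (hypothetical Paper; unlike 0.3.0, this returns a new object rather than
// mutating its argument, and linked cites carry pointer strings instead of Reference objects):
//   linkPaper({ sections: [
//     { title: 'Introduction', paragraphs: ['... as in Brown (2015) ...'] },
//     { title: 'References', paragraphs: ['Jane Brown. 2015. A Title. ACL.'] },
//   ] })
//   -> { sections: [...], references: [...],
//        cites: [{ text: 'Brown (2015)',
//                  origin: { pointer: '/sections/0/paragraphs/0', offset: ..., length: ... },
//                  references: ['/references/0'], ... }] }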

