academia - npm Package Compare versions

test/names.js

test/names.ts

type_declarations/DefinitelyTyped/mocha/mocha.d.ts

type_declarations/DefinitelyTyped/node/node.d.ts

type_declarations/index.d.ts

46

academia.d.ts

		@@ -6,2 +6,3 @@ declare module "academia" {
		const citeRegExp: RegExp;
		const yearRegExp: RegExp;
		/**
		@@ -11,2 +12,3 @@ Given the text of a paper, extract the `Cite`s using regular expressions.
		function parseCites(body: string): types.AuthorYearCite[];
		const referenceRegExp: RegExp;
		/**
		@@ -20,3 +22,3 @@ Given a list of strings representing individual references in a bibliography,
		a unique match from `references` is found.


		TODO: handle multiple matches somehow.
		@@ -43,5 +45,20 @@ */
		'Zhao et al.' ->
		['Zhao', 'et al.']
		['Zhao', 'al.']

		TODO: handle last-name-first swaps, e.g.,
		'Levy, R., & Daumé III, H.' -> 'R. Levy, H. Daumé III' -> ['R. Levy', 'H. Daumé III']
		Or:
		'Liu, F., Tian, F., & Zhu, Q.' -> 'F. Liu, F. Tian, & Q. Zhu' -> ['F. Liu', 'F. Tian', 'Q. Zhu']
		Technically, this is ambiguous, since we could support lists of only last names
		(e.g., 'Liu, Tian'; is this ['Tian Liu'] or ['Liu', 'Tian']?), but heuristics are better than nothing.

		Example chunks:

		[FIRST MIDDLE LAST] SEP
		[FIRST LAST] SEP
		[LAST SEP FIRST] SEP
		[LAST SEP INITIAL] [LAST2 SEP INITIAL2]

		*/
		function splitNames(input: string): string[];
		function parseNames(input: string): types.Name[];
		/**
		@@ -51,26 +68,15 @@ Typically, in-paper citations (`Cite`s) only have the last names of the authors,
		initials and last names.


		This method determines whether a `Cite`'s names match a `Reference`'s authors.


		authorsMatch(['Joshi'], ['Aravind K Joshi']) -> true
		authorsMatch(['Diab', 'Kamboj'], ['Mona Diab', 'Ankit Kamboj']) -> true


		'et al.' gets special treatment. 'et al.' is a match if and only if there are
		more reference authors beyond the one parallel to the 'et al.' citation author.
		In other words, 'et al.' cannot stand in for a single author.


		authorsMatch(['Blei', 'et al.'], ['David M Blei', 'Andrew Y Ng', 'Michael I Jordan']) -> true
		*/
		function authorsMatch(citeAuthors: types.Name[], referenceAuthors: types.Name[]): boolean;
		/**
		Given a name represented by a single string, parse it into first name, middle
		name, and last name.

		parseAuthor('Leonardo da Vinci') -> { first: 'Leonardo', last: 'da Vinci' }
		parseAuthor('Chris Callison-Burch') -> { first: 'Chris', last: 'Callison-Burch' }
		parseAuthor('Hanna M Wallach') -> { first: 'Hanna', middle: 'M', last: 'Wallach' }
		parseAuthor('Zhou') -> { last: 'Zhou' }
		parseAuthor('McCallum, Andrew') -> { first: 'Andrew', last: 'McCallum' }
		*/
		function parseName(input: string): types.Name;
		}
		@@ -100,3 +106,3 @@ module types {
		interface AuthorYearCite extends Cite {
		/** usually only last names, one of which may be 'et al.' */
		/** usually only last names, one of which may be 'al.' (from 'et al.') */
		authors: Name[];
		@@ -136,3 +142,3 @@ /** not necessarily a number, if there is a letter suffix */
		paragraph distinctions.


		`sections` is a flat list; abstracts / subsections / references all count at
		@@ -139,0 +145,0 @@ the same level.

185

index.js

		@@ -0,1 +1,3 @@
		/// <reference path="./type_declarations/index.d.ts" />
		var lexing = require('lexing');
		var academia;
		@@ -30,3 +32,3 @@ (function (academia) {
		acl.citeRegExp = new RegExp(citeSources.join('\|'), 'g');
		var yearRegExp = new RegExp(year);
		acl.yearRegExp = new RegExp(year);
		var citeCleanRegExp = new RegExp("[(),]\|" + year, 'g');
		@@ -41,3 +43,3 @@ /**
		return (body.match(acl.citeRegExp) \|\| []).map(function (cite) {
		var year_match = cite.match(yearRegExp);
		var year_match = cite.match(acl.yearRegExp);
		// we cull it down to just the names by removing parentheses, commas,
		@@ -47,3 +49,3 @@ // and years (with optional suffixes), and trimming any extra whitespace
		return {
		authors: names.splitNames(names_string).map(names.parseName),
		authors: names.parseNames(names_string),
		year: year_match ? year_match[0] : null,
		@@ -55,3 +57,3 @@ style: types.CiteStyle.Textual,
		acl.parseCites = parseCites;
		var referenceRegExp = new RegExp("^(.+?)\\.\\s(" + year + ")\\.\\s(.+?)\\.");
		acl.referenceRegExp = new RegExp("^(.+?)[.,]\\s\$?(" + year + ")\$?\\.\\s(.+?)\\.");
		/**
		@@ -63,4 +65,4 @@ Given a list of strings representing individual references in a bibliography,
		return references.map(function (reference) {
		var match = reference.match(referenceRegExp);
		var authors = match ? names.splitNames(match[1]).map(names.parseName) : [];
		var match = reference.match(acl.referenceRegExp);
		var authors = match ? names.parseNames(match[1]) : [];
		return {
		@@ -94,4 +96,44 @@ authors: authors,
		var names;
		(function (names) {
		(function (names_1) {
		var Token = lexing.Token;
		/**
		Build a structured name from an ordered list of name parts (an array of
		strings in reading order, first ... last).

		makeName(['Hanna', 'M', 'Wallach']) -> { first: 'Hanna', middle: 'M', last: 'Wallach' }
		makeName(['Chris', 'Callison-Burch']) -> { first: 'Chris', last: 'Callison-Burch' }
		makeName(['Zhou']) -> { last: 'Zhou' }

		With three or more parts, every part between the first and the final one is
		joined with single spaces into `middle`. An empty list yields
		{ last: undefined }.

		TODO: handle 'van', 'von', 'da', etc. -- nothing here merges particles, so
		['Leonardo', 'da', 'Vinci'] gives middle 'da' and last 'Vinci'.
		*/
		function makeName(parts) {
		  var count = parts.length;
		  var head = parts[0];
		  if (count >= 3) {
		    // first / everything in between / final part
		    return {
		      first: head,
		      middle: parts.slice(1, count - 1).join(' '),
		      last: parts[count - 1],
		    };
		  }
		  // two parts: first + last; otherwise the lone part (if any) is the last name
		  return count === 2 ? { first: head, last: parts[1] } : { last: head };
		}
		var default_rules = [
		[/^$/, function (match) { return Token('EOF'); }],
		[/^\s+/, function (match) { return null; }],
		[/^,\s+/, function (match) { return Token('SEPARATOR', match[0]); }],
		[/^(and\|et\|&)/, function (match) { return Token('CONJUNCTION', match[0]); }],
		[/^[A-Z](\.\|\b)/, function (match) { return Token('INITIAL', match[0]); }],
		[/^((van\|von\|da\|de)\s+)?[A-Z][^,\s]+(\s+[IVX]+)?/i, function (match) { return Token('NAME', match[0]); }],
		];
		/**
		1. Typical list of 3+
		@@ -111,10 +153,72 @@ 'David Mimno, Hanna M Wallach, and Andrew McCallum' ->
		'Zhao et al.' ->
		['Zhao', 'et al.']
		['Zhao', 'al.']

		TODO: handle last-name-first swaps, e.g.,
		'Levy, R., & Daumé III, H.' -> 'R. Levy, H. Daumé III' -> ['R. Levy', 'H. Daumé III']
		Or:
		'Liu, F., Tian, F., & Zhu, Q.' -> 'F. Liu, F. Tian, & Q. Zhu' -> ['F. Liu', 'F. Tian', 'Q. Zhu']
		Technically, this is ambiguous, since we could support lists of only last names
		(e.g., 'Liu, Tian'; is this ['Tian Liu'] or ['Liu', 'Tian']?), but heuristics are better than nothing.

		Example chunks:

		[FIRST MIDDLE LAST] SEP
		[FIRST LAST] SEP
		[LAST SEP FIRST] SEP
		[LAST SEP INITIAL] [LAST2 SEP INITIAL2]

		*/
		function splitNames(input) {
		// three split options: (, and ) or ( and ) or (, )
		// TODO: fix the 'et al.' hack
		return input.replace(/\s+et al\./, ', et al.').split(/,\s(?:and\|&)\s+\|\s(?:and\|&)\s+\|,\s*/);
		/**
		Parse a string listing one or more people into structured names.

		The input is tokenized with `default_rules`, and the tokens drive a small
		state machine:

		- NAME / INITIAL tokens are collected in `buffer` as parts of the current name.
		- A SEPARATOR or CONJUNCTION after exactly one buffered part marks that part
		  as a possible leading last name ('Levy, R.'); after two or more parts it
		  ends the current name.
		- A NAME that arrives while a lone part is pending ends that part as a name
		  of its own ('Liu, Tian' -> two last names).

		Each finished buffer is turned into a name object by `makeName`.
		*/
		function parseNames(input) {
		  var input_iterable = new lexing.StringIterator(input);
		  var tokenizer = new lexing.Tokenizer(default_rules);
		  var token_iterator = tokenizer.map(input_iterable);
		  var names = [];
		  // parts of the name currently being assembled
		  var buffer = [];
		  // true when the first buffered part was followed by a separator, i.e. it
		  // may be a last name written first
		  var buffer_swap = false;
		  // emit the buffered parts as one name and reset the state
		  function flush() {
		    if (buffer_swap) {
		      // move the first item to the last position (last-name-first order)
		      buffer.push(buffer.shift());
		    }
		    names.push(makeName(buffer));
		    buffer = [];
		    buffer_swap = false;
		  }
		  while (true) {
		    var token = token_iterator.next();
		    // tokens: EOF NAME INITIAL SEPARATOR CONJUNCTION
		    if (token.name === 'EOF') {
		      break;
		    }
		    else if (token.name === 'NAME') {
		      // a full name following a separated lone part: that part was a
		      // complete name by itself
		      if (buffer.length > 0 && buffer_swap) {
		        flush();
		      }
		      buffer.push(token.value);
		    }
		    else if (token.name === 'INITIAL') {
		      buffer.push(token.value);
		    }
		    else if (token.name === 'SEPARATOR' || token.name === 'CONJUNCTION') {
		      if (buffer.length === 1) {
		        buffer_swap = true;
		      }
		      else if (buffer.length > 1) {
		        flush();
		      }
		      // an empty buffer means a second separator in a row: nothing to do
		    }
		  }
		  // finish up: whatever is still buffered is the final name
		  if (buffer.length > 0) {
		    flush();
		  }
		  return names;
		}
		names.splitNames = splitNames;
		names_1.parseNames = parseNames;
		/**
		@@ -137,52 +241,17 @@ Typically, in-paper citations (`Cite`s) only have the last names of the authors,
		function authorsMatch(citeAuthors, referenceAuthors) {
		return citeAuthors.every(function (citeAuthor, i) {
		if (referenceAuthors[i] && citeAuthor.last === referenceAuthors[i].last) {
		for (var i = 0, l = Math.max(citeAuthors.length, referenceAuthors.length); i < l; i++) {
		var citeAuthor = citeAuthors[i];
		var referenceAuthor = referenceAuthors[i];
		// the et al. handling has to precede the normal name-checking conditional below
		if (citeAuthor && citeAuthor.last === 'al.' && referenceAuthors.length > (i + 1)) {
		// early exit: ignore the rest of the reference authors
		return true;
		}
		if (citeAuthor.last === 'et al.' && referenceAuthors.length > (i + 1)) {
		return true;
		if (citeAuthor === undefined \|\| referenceAuthor === undefined \|\| citeAuthor.last !== referenceAuthor.last) {
		return false;
		}
		return false;
		});
		}
		names.authorsMatch = authorsMatch;
		/**
		Given a name represented by a single string, parse it into first name, middle
		name, and last name.

		parseAuthor('Leonardo da Vinci') -> { first: 'Leonardo', last: 'da Vinci' }
		parseAuthor('Chris Callison-Burch') -> { first: 'Chris', last: 'Callison-Burch' }
		parseAuthor('Hanna M Wallach') -> { first: 'Hanna', middle: 'M', last: 'Wallach' }
		parseAuthor('Zhou') -> { last: 'Zhou' }
		parseAuthor('McCallum, Andrew') -> { first: 'Andrew', last: 'McCallum' }
		*/
		function parseName(input) {
		// 0. 'et al.' is a special case
		if (input === 'et al.') {
		return { last: input };
		}
		// 1. normalize the comma out
		input = input.split(/,\s*/).reverse().join(' ');
		// 2. split on whitespace
		var parts = input.split(/\s+/);
		var n = parts.length;
		// 3. TODO: handle 'van', 'von', 'da', etc.
		if (n >= 3) {
		return {
		first: parts[0],
		middle: parts.slice(1, n - 1).join(' '),
		last: parts[n - 1],
		};
		}
		else if (n == 2) {
		return {
		first: parts[0],
		last: parts[1],
		};
		}
		return {
		last: parts[0]
		};
		return true;
		}
		names.parseName = parseName;
		names_1.authorsMatch = authorsMatch;
		})(names = academia.names \|\| (academia.names = {}));
		@@ -189,0 +258,0 @@ var types;

171

names.js

		@@ -0,2 +1,44 @@
		/// <reference path="./type_declarations/index.d.ts" />
		var lexing = require('lexing');
		var Token = lexing.Token;
		/**
		Turn an ordered array of name parts (first ... last) into a name object.

		makeName(['Hanna', 'M', 'Wallach']) -> { first: 'Hanna', middle: 'M', last: 'Wallach' }
		makeName(['Chris', 'Callison-Burch']) -> { first: 'Chris', last: 'Callison-Burch' }
		makeName(['Zhou']) -> { last: 'Zhou' }

		Three or more parts: the parts strictly between the first and the final one
		are space-joined into `middle`. No parts at all: { last: undefined }.

		TODO: handle 'van', 'von', 'da', etc. -- particles are not merged into the
		last name here (['Leonardo', 'da', 'Vinci'] -> middle 'da', last 'Vinci').
		*/
		function makeName(parts) {
		  var total = parts.length;
		  var leading = parts[0];
		  if (total >= 3) {
		    var inner = parts.slice(1, total - 1);
		    return { first: leading, middle: inner.join(' '), last: parts[total - 1] };
		  }
		  if (total === 2) {
		    return { first: leading, last: parts[1] };
		  }
		  // zero or one part: treat it as a bare last name
		  return { last: leading };
		}
		var default_rules = [
		[/^$/, function (match) { return Token('EOF'); }],
		[/^\s+/, function (match) { return null; }],
		[/^,\s+/, function (match) { return Token('SEPARATOR', match[0]); }],
		[/^(and\|et\|&)/, function (match) { return Token('CONJUNCTION', match[0]); }],
		[/^[A-Z](\.\|\b)/, function (match) { return Token('INITIAL', match[0]); }],
		[/^((van\|von\|da\|de)\s+)?[A-Z][^,\s]+(\s+[IVX]+)?/i, function (match) { return Token('NAME', match[0]); }],
		];
		/**
		1. Typical list of 3+
		@@ -16,10 +58,72 @@ 'David Mimno, Hanna M Wallach, and Andrew McCallum' ->
		'Zhao et al.' ->
		['Zhao', 'et al.']
		['Zhao', 'al.']

		TODO: handle last-name-first swaps, e.g.,
		'Levy, R., & Daumé III, H.' -> 'R. Levy, H. Daumé III' -> ['R. Levy', 'H. Daumé III']
		Or:
		'Liu, F., Tian, F., & Zhu, Q.' -> 'F. Liu, F. Tian, & Q. Zhu' -> ['F. Liu', 'F. Tian', 'Q. Zhu']
		Technically, this is ambiguous, since we could support lists of only last names
		(e.g., 'Liu, Tian'; is this ['Tian Liu'] or ['Liu', 'Tian']?), but heuristics are better than nothing.

		Example chunks:

		[FIRST MIDDLE LAST] SEP
		[FIRST LAST] SEP
		[LAST SEP FIRST] SEP
		[LAST SEP INITIAL] [LAST2 SEP INITIAL2]

		*/
		function splitNames(input) {
		// three split options: (, and ) or ( and ) or (, )
		// TODO: fix the 'et al.' hack
		return input.replace(/\s+et al\./, ', et al.').split(/,\s(?:and\|&)\s+\|\s(?:and\|&)\s+\|,\s*/);
		/**
		Parse a string listing one or more people into structured names.

		The input is tokenized with `default_rules`, and the tokens drive a small
		state machine:

		- NAME / INITIAL tokens are collected in `buffer` as parts of the current name.
		- A SEPARATOR or CONJUNCTION after exactly one buffered part marks that part
		  as a possible leading last name ('Levy, R.'); after two or more parts it
		  ends the current name.
		- A NAME that arrives while a lone part is pending ends that part as a name
		  of its own ('Liu, Tian' -> two last names).

		Each finished buffer is turned into a name object by `makeName`.
		*/
		function parseNames(input) {
		  var input_iterable = new lexing.StringIterator(input);
		  var tokenizer = new lexing.Tokenizer(default_rules);
		  var token_iterator = tokenizer.map(input_iterable);
		  var names = [];
		  // parts of the name currently being assembled
		  var buffer = [];
		  // true when the first buffered part was followed by a separator, i.e. it
		  // may be a last name written first
		  var buffer_swap = false;
		  // emit the buffered parts as one name and reset the state
		  function flush() {
		    if (buffer_swap) {
		      // move the first item to the last position (last-name-first order)
		      buffer.push(buffer.shift());
		    }
		    names.push(makeName(buffer));
		    buffer = [];
		    buffer_swap = false;
		  }
		  while (true) {
		    var token = token_iterator.next();
		    // tokens: EOF NAME INITIAL SEPARATOR CONJUNCTION
		    if (token.name === 'EOF') {
		      break;
		    }
		    else if (token.name === 'NAME') {
		      // a full name following a separated lone part: that part was a
		      // complete name by itself
		      if (buffer.length > 0 && buffer_swap) {
		        flush();
		      }
		      buffer.push(token.value);
		    }
		    else if (token.name === 'INITIAL') {
		      buffer.push(token.value);
		    }
		    else if (token.name === 'SEPARATOR' || token.name === 'CONJUNCTION') {
		      if (buffer.length === 1) {
		        buffer_swap = true;
		      }
		      else if (buffer.length > 1) {
		        flush();
		      }
		      // an empty buffer means a second separator in a row: nothing to do
		    }
		  }
		  // finish up: whatever is still buffered is the final name
		  if (buffer.length > 0) {
		    flush();
		  }
		  return names;
		}
		exports.splitNames = splitNames;
		exports.parseNames = parseNames;
		/**
		@@ -42,51 +146,16 @@ Typically, in-paper citations (`Cite`s) only have the last names of the authors,
		function authorsMatch(citeAuthors, referenceAuthors) {
		return citeAuthors.every(function (citeAuthor, i) {
		if (referenceAuthors[i] && citeAuthor.last === referenceAuthors[i].last) {
		for (var i = 0, l = Math.max(citeAuthors.length, referenceAuthors.length); i < l; i++) {
		var citeAuthor = citeAuthors[i];
		var referenceAuthor = referenceAuthors[i];
		// the et al. handling has to precede the normal name-checking conditional below
		if (citeAuthor && citeAuthor.last === 'al.' && referenceAuthors.length > (i + 1)) {
		// early exit: ignore the rest of the reference authors
		return true;
		}
		if (citeAuthor.last === 'et al.' && referenceAuthors.length > (i + 1)) {
		return true;
		if (citeAuthor === undefined \|\| referenceAuthor === undefined \|\| citeAuthor.last !== referenceAuthor.last) {
		return false;
		}
		return false;
		});
		}
		return true;
		}
		exports.authorsMatch = authorsMatch;
		/**
		Parse a single person's name, given as one string, into first / middle / last.

		parseName('Chris Callison-Burch') -> { first: 'Chris', last: 'Callison-Burch' }
		parseName('Hanna M Wallach') -> { first: 'Hanna', middle: 'M', last: 'Wallach' }
		parseName('Zhou') -> { last: 'Zhou' }
		parseName('McCallum, Andrew') -> { first: 'Andrew', last: 'McCallum' }
		parseName('et al.') -> { last: 'et al.' }

		A comma-separated form is handled by reversing the comma-delimited pieces
		before splitting on whitespace.

		TODO: handle 'van', 'von', 'da', etc. -- currently
		parseName('Leonardo da Vinci') -> { first: 'Leonardo', middle: 'da', last: 'Vinci' }
		*/
		function parseName(input) {
		  // 'et al.' is passed through untouched as a pseudo last name
		  if (input === 'et al.') {
		    return { last: input };
		  }
		  // 'Last, First' -> 'First Last', then break into whitespace-delimited words
		  var words = input.split(/,\s*/).reverse().join(' ').split(/\s+/);
		  var total = words.length;
		  if (total >= 3) {
		    return {
		      first: words[0],
		      middle: words.slice(1, total - 1).join(' '),
		      last: words[total - 1],
		    };
		  }
		  return total === 2 ? { first: words[0], last: words[1] } : { last: words[0] };
		}
		exports.parseName = parseName;

178

names.ts

		@@ -0,4 +1,49 @@
		/// <reference path="./type_declarations/index.d.ts" />
		import lexing = require('lexing');
		var Token = lexing.Token;

		import types = require('./types');

		/**
		Build a structured name from an ordered list of name parts (first ... last).

		makeName(['Hanna', 'M', 'Wallach']) -> { first: 'Hanna', middle: 'M', last: 'Wallach' }
		makeName(['Chris', 'Callison-Burch']) -> { first: 'Chris', last: 'Callison-Burch' }
		makeName(['Zhou']) -> { last: 'Zhou' }

		With three or more parts, every part between the first and the final one is
		joined with single spaces into `middle`. An empty list yields
		{ last: undefined }.

		TODO: handle 'van', 'von', 'da', etc. -- nothing here merges particles, so
		['Leonardo', 'da', 'Vinci'] gives middle 'da' and last 'Vinci'.
		*/
		function makeName(parts: string[]): types.Name {
		  var count = parts.length;
		  var head = parts[0];
		  if (count >= 3) {
		    // first / everything in between / final part
		    return {
		      first: head,
		      middle: parts.slice(1, count - 1).join(' '),
		      last: parts[count - 1],
		    };
		  }
		  // two parts: first + last; otherwise the lone part (if any) is the last name
		  return count === 2 ? { first: head, last: parts[1] } : { last: head };
		}

		// Tokenizer rules for author lists: [pattern, action] pairs handed to
		// lexing.Tokenizer. NOTE(review): presumably the rules are tried in order
		// against the remaining input and a null action result emits no token --
		// confirm against the lexing package.
		var default_rules: lexing.RegexRule<string>[] = [
		  // end of input
		  [/^$/, match => Token('EOF') ],
		  // whitespace between tokens: the action returns null
		  [/^\s+/, match => null ],
		  // a comma followed by whitespace
		  [/^,\s+/, match => Token('SEPARATOR', match[0]) ],
		  // 'and', 'et' (from 'et al.') or '&'; the \b stops a longer word that
		  // merely starts with and/et from being cut in two
		  [/^(and\b|et\b|&)/, match => Token('CONJUNCTION', match[0]) ],
		  // a single capital letter, with or without a trailing period
		  [/^[A-Z](\.|\b)/, match => Token('INITIAL', match[0]) ],
		  // optional particle (van/von/da/de), a word, and an optional roman-numeral
		  // suffix; the trailing \b keeps the suffix from eating the first letters
		  // of a following word such as 'Viterbi' or 'Xu'
		  [/^((van|von|da|de)\s+)?[A-Z][^,\s]+(\s+[IVX]+\b)?/i, match => Token('NAME', match[0]) ],
		];

		/**
		1. Typical list of 3+
		@@ -18,8 +63,78 @@ 'David Mimno, Hanna M Wallach, and Andrew McCallum' ->
		'Zhao et al.' ->
		['Zhao', 'et al.']
		['Zhao', 'al.']

		TODO: handle last-name-first swaps, e.g.,
		'Levy, R., & Daumé III, H.' -> 'R. Levy, H. Daumé III' -> ['R. Levy', 'H. Daumé III']
		Or:
		'Liu, F., Tian, F., & Zhu, Q.' -> 'F. Liu, F. Tian, & Q. Zhu' -> ['F. Liu', 'F. Tian', 'Q. Zhu']
		Technically, this is ambiguous, since we could support lists of only last names
		(e.g., 'Liu, Tian'; is this ['Tian Liu'] or ['Liu', 'Tian']?), but heuristics are better than nothing.

		Example chunks:

		[FIRST MIDDLE LAST] SEP
		[FIRST LAST] SEP
		[LAST SEP FIRST] SEP
		[LAST SEP INITIAL] [LAST2 SEP INITIAL2]

		*/
		export function splitNames(input: string): string[] {
		// three split options: (, and ) or ( and ) or (, )
		// TODO: fix the 'et al.' hack
		return input.replace(/\s+et al\./, ', et al.').split(/,\s(?:and\|&)\s+\|\s(?:and\|&)\s+\|,\s*/);
		/**
		Parse a string listing one or more people into structured names.

		The input is tokenized with `default_rules`, and the tokens drive a small
		state machine:

		- NAME / INITIAL tokens are collected in `buffer` as parts of the current name.
		- A SEPARATOR or CONJUNCTION after exactly one buffered part marks that part
		  as a possible leading last name ('Levy, R.'); after two or more parts it
		  ends the current name.
		- A NAME that arrives while a lone part is pending ends that part as a name
		  of its own ('Liu, Tian' -> two last names).

		Each finished buffer is turned into a `types.Name` by `makeName`.
		*/
		export function parseNames(input: string): types.Name[] {
		  var input_iterable = new lexing.StringIterator(input);

		  var tokenizer = new lexing.Tokenizer(default_rules);
		  var token_iterator = tokenizer.map(input_iterable);

		  var names: types.Name[] = [];

		  // parts of the name currently being assembled
		  var buffer: string[] = [];
		  // true when the first buffered part was followed by a separator, i.e. it
		  // may be a last name written first
		  var buffer_swap = false;
		  // emit the buffered parts as one name and reset the state
		  function flush() {
		    if (buffer_swap) {
		      // move the first item to the last position (last-name-first order)
		      buffer.push(buffer.shift());
		    }
		    names.push(makeName(buffer));
		    buffer = [];
		    buffer_swap = false;
		  }

		  while (true) {
		    var token = token_iterator.next();

		    // tokens: EOF NAME INITIAL SEPARATOR CONJUNCTION
		    if (token.name === 'EOF') {
		      break;
		    }
		    else if (token.name === 'NAME') {
		      // a full name following a separated lone part: that part was a
		      // complete name by itself
		      if (buffer.length > 0 && buffer_swap) {
		        flush();
		      }
		      buffer.push(token.value);
		    }
		    else if (token.name === 'INITIAL') {
		      buffer.push(token.value);
		    }
		    else if (token.name === 'SEPARATOR' || token.name === 'CONJUNCTION') {
		      if (buffer.length === 1) {
		        buffer_swap = true;
		      }
		      else if (buffer.length > 1) {
		        flush();
		      }
		      // an empty buffer means a second separator without anything to separate
		    }
		  }

		  // finish up: whatever is still buffered is the final name
		  if (buffer.length > 0) {
		    flush();
		  }

		  return names;
		}
		@@ -44,50 +159,15 @@
		export function authorsMatch(citeAuthors: types.Name[], referenceAuthors: types.Name[]) {
		return citeAuthors.every((citeAuthor, i) => {
		if (referenceAuthors[i] && citeAuthor.last === referenceAuthors[i].last) {
		for (var i = 0, l = Math.max(citeAuthors.length, referenceAuthors.length); i < l; i++) {
		var citeAuthor = citeAuthors[i];
		var referenceAuthor = referenceAuthors[i];
		// the et al. handling has to precede the normal name-checking conditional below
		if (citeAuthor && citeAuthor.last === 'al.' && referenceAuthors.length > (i + 1)) {
		// early exit: ignore the rest of the reference authors
		return true;
		}
		if (citeAuthor.last === 'et al.' && referenceAuthors.length > (i + 1)) {
		return true;
		if (citeAuthor === undefined \|\| referenceAuthor === undefined \|\| citeAuthor.last !== referenceAuthor.last) {
		return false;
		}
		return false;
		});
		}

		/**
		Given a name represented by a single string, parse it into first name, middle
		name, and last name.

		parseAuthor('Leonardo da Vinci') -> { first: 'Leonardo', last: 'da Vinci' }
		parseAuthor('Chris Callison-Burch') -> { first: 'Chris', last: 'Callison-Burch' }
		parseAuthor('Hanna M Wallach') -> { first: 'Hanna', middle: 'M', last: 'Wallach' }
		parseAuthor('Zhou') -> { last: 'Zhou' }
		parseAuthor('McCallum, Andrew') -> { first: 'Andrew', last: 'McCallum' }
		*/
		export function parseName(input: string): types.Name {
		// 0. 'et al.' is a special case
		if (input === 'et al.') {
		return {last: input};
		}
		// 1. normalize the comma out
		input = input.split(/,\s*/).reverse().join(' ');
		// 2. split on whitespace
		var parts = input.split(/\s+/);
		var n = parts.length;
		// 3. TODO: handle 'van', 'von', 'da', etc.
		if (n >= 3) {
		return {
		first: parts[0],
		middle: parts.slice(1, n - 1).join(' '),
		last: parts[n - 1],
		};
		}
		else if (n == 2) {
		return {
		first: parts[0],
		last: parts[1],
		};
		}
		return {
		last: parts[0]
		};
		return true;
		}

10

package.json

		@@ -11,3 +11,3 @@ {
		],
		"version": "0.1.2",
		"version": "0.1.3",
		"homepage": "https://github.com/chbrown/academia",
		@@ -17,7 +17,13 @@ "repository": "git://github.com/chbrown/academia.git",
		"license": "MIT",
		"dependencies": {},
		"dependencies": {
		"lexing": "*"
		},
		"devDependencies": {
		"mocha": "*",
		"typescript-declare": "*",
		"typescript": "*"
		},
		"scripts": {
		"test": "make test"
		}
		}

12

styles/acl.js

		@@ -26,3 +26,3 @@ var types = require('../types');
		exports.citeRegExp = new RegExp(citeSources.join('\|'), 'g');
		var yearRegExp = new RegExp(year);
		exports.yearRegExp = new RegExp(year);
		var citeCleanRegExp = new RegExp("[(),]\|" + year, 'g');
		@@ -37,3 +37,3 @@ /**
		return (body.match(exports.citeRegExp) \|\| []).map(function (cite) {
		var year_match = cite.match(yearRegExp);
		var year_match = cite.match(exports.yearRegExp);
		// we cull it down to just the names by removing parentheses, commas,
		@@ -43,3 +43,3 @@ // and years (with optional suffixes), and trimming any extra whitespace
		return {
		authors: names.splitNames(names_string).map(names.parseName),
		authors: names.parseNames(names_string),
		year: year_match ? year_match[0] : null,
		@@ -51,3 +51,3 @@ style: types.CiteStyle.Textual,
		exports.parseCites = parseCites;
		var referenceRegExp = new RegExp("^(.+?)\\.\\s(" + year + ")\\.\\s(.+?)\\.");
		exports.referenceRegExp = new RegExp("^(.+?)[.,]\\s\$?(" + year + ")\$?\\.\\s(.+?)\\.");
		/**
		@@ -59,4 +59,4 @@ Given a list of strings representing individual references in a bibliography,
		return references.map(function (reference) {
		var match = reference.match(referenceRegExp);
		var authors = match ? names.splitNames(match[1]).map(names.parseName) : [];
		var match = reference.match(exports.referenceRegExp);
		var authors = match ? names.parseNames(match[1]) : [];
		return {
		@@ -63,0 +63,0 @@ authors: authors,

13

styles/acl.ts

		@@ -14,4 +14,4 @@ import types = require('../types');

		var name = '[A-Z][^()\\s]+(?: [IV]+)?';
		var year = '[0-9]{4}(?:[-–—][0-9]{4})?[a-z]?';
		const name = '[A-Z][^()\\s]+(?: [IV]+)?';
		const year = '[0-9]{4}(?:[-–—][0-9]{4})?[a-z]?';

		@@ -28,4 +28,5 @@ var citeSources = [
		];

		export const citeRegExp = new RegExp(citeSources.join('\|'), 'g');
		const yearRegExp = new RegExp(year);
		export const yearRegExp = new RegExp(year);
		const citeCleanRegExp = new RegExp(`[(),]\|${year}`, 'g');
		@@ -46,3 +47,3 @@
		return {
		authors: names.splitNames(names_string).map(names.parseName),
		authors: names.parseNames(names_string),
		year: year_match ? year_match[0] : null,
		@@ -54,3 +55,3 @@ style: types.CiteStyle.Textual,

		const referenceRegExp = new RegExp(`^(.+?)\\.\\s(${year})\\.\\s(.+?)\\.`);
		export const referenceRegExp = new RegExp(`^(.+?)[.,]\\s\$?(${year})\$?\\.\\s(.+?)\\.`);

		@@ -64,3 +65,3 @@ /**
		var match = reference.match(referenceRegExp);
		var authors = match ? names.splitNames(match[1]).map(names.parseName) : [];
		var authors = match ? names.parseNames(match[1]) : [];
		return {
		@@ -67,0 +68,0 @@ authors: authors,

294

tmp/index.ts

		@@ -0,1 +1,3 @@
		/// <reference path="./type_declarations/index.d.ts" />
		import lexing = require('lexing');
		module academia {
		@@ -15,17 +17,18 @@ export module styles {

		var name = '[A-Z][^()\\s]+';
		var year = '[0-9]{4}(?:[-–—][0-9]{4})?[a-z]?';
		const name = '[A-Z][^()\\s]+(?: [IV]+)?';
		const year = '[0-9]{4}(?:[-–—][0-9]{4})?[a-z]?';

		var citeSources = [
		// et al., duo, and single, with year in parens
		`${name} et al. \$${year}\$`,
		`${name} and ${name} \$${year}\$`,
		`${name} \$${year}\$`,
		`${name}\\s+et\\s+al.\\s+\$${year}\$`,
		`${name}\\s+(?:and\|&)\\s+${name}\\s+\$${year}\$`,
		`${name}\\s+\$${year}\$`,
		// et al., duo, and single, with year not in parens (note the commas)
		`${name} et al., ${year}\\b`,
		`${name} and ${name}, ${year}\\b`,
		`${name}, ${year}\\b`,
		`${name}\\s+et\\s+al.,\\s+${year}\\b`,
		`${name}\\s+(?:and\|&)\\s+${name},\\s+${year}\\b`,
		`${name},\\s+${year}\\b`,
		];

		export const citeRegExp = new RegExp(citeSources.join('\|'), 'g');
		const yearRegExp = new RegExp(year);
		export const yearRegExp = new RegExp(year);
		const citeCleanRegExp = new RegExp(`[(),]\|${year}`, 'g');
		@@ -46,3 +49,3 @@
		return {
		authors: names.splitNames(names_string).map(names.parseName),
		authors: names.parseNames(names_string),
		year: year_match ? year_match[0] : null,
		@@ -54,3 +57,3 @@ style: types.CiteStyle.Textual,

		const referenceRegExp = new RegExp(`^(.+?)\\.\\s(${year})\\.\\s(.+?)\\.`);
		export const referenceRegExp = new RegExp(`^(.+?)[.,]\\s\$?(${year})\$?\\.\\s(.+?)\\.`);

		@@ -64,3 +67,3 @@ /**
		var match = reference.match(referenceRegExp);
		var authors = match ? names.splitNames(match[1]).map(names.parseName) : [];
		var authors = match ? names.parseNames(match[1]) : [];
		return {
		@@ -90,5 +93,176 @@ authors: authors,
		}

		}
		}
		export module names {
		var Token = lexing.Token;


		/**
		Turn an ordered array of name parts (first ... last) into a name object.

		makeName(['Hanna', 'M', 'Wallach']) -> { first: 'Hanna', middle: 'M', last: 'Wallach' }
		makeName(['Chris', 'Callison-Burch']) -> { first: 'Chris', last: 'Callison-Burch' }
		makeName(['Zhou']) -> { last: 'Zhou' }

		Three or more parts: the parts strictly between the first and the final one
		are space-joined into `middle`. No parts at all: { last: undefined }.

		TODO: handle 'van', 'von', 'da', etc. -- particles are not merged into the
		last name here (['Leonardo', 'da', 'Vinci'] -> middle 'da', last 'Vinci').
		*/
		function makeName(parts: string[]): types.Name {
		  var total = parts.length;
		  var leading = parts[0];
		  if (total >= 3) {
		    var inner = parts.slice(1, total - 1);
		    return { first: leading, middle: inner.join(' '), last: parts[total - 1] };
		  }
		  if (total === 2) {
		    return { first: leading, last: parts[1] };
		  }
		  // zero or one part: treat it as a bare last name
		  return { last: leading };
		}

		// Tokenizer rules for author lists: [pattern, action] pairs handed to
		// lexing.Tokenizer. NOTE(review): presumably the rules are tried in order
		// against the remaining input and a null action result emits no token --
		// confirm against the lexing package.
		var default_rules: lexing.RegexRule<string>[] = [
		  // end of input
		  [/^$/, match => Token('EOF') ],
		  // whitespace between tokens: the action returns null
		  [/^\s+/, match => null ],
		  // a comma followed by whitespace
		  [/^,\s+/, match => Token('SEPARATOR', match[0]) ],
		  // 'and', 'et' (from 'et al.') or '&'; the \b stops a longer word that
		  // merely starts with and/et from being cut in two
		  [/^(and\b|et\b|&)/, match => Token('CONJUNCTION', match[0]) ],
		  // a single capital letter, with or without a trailing period
		  [/^[A-Z](\.|\b)/, match => Token('INITIAL', match[0]) ],
		  // optional particle (van/von/da/de), a word, and an optional roman-numeral
		  // suffix; the trailing \b keeps the suffix from eating the first letters
		  // of a following word such as 'Viterbi' or 'Xu'
		  [/^((van|von|da|de)\s+)?[A-Z][^,\s]+(\s+[IVX]+\b)?/i, match => Token('NAME', match[0]) ],
		];

		/**
		Parse a string listing one or more people into structured names. The
		examples below show each resulting name as a plain string for brevity; the
		function itself returns `types.Name` objects built by `makeName`.

		1. Typical list of 3+
		  'David Mimno, Hanna M Wallach, and Andrew McCallum' ->
		  ['David Mimno', 'Hanna M Wallach', 'Andrew McCallum']
		2. List of 3+ without the Oxford comma, in case that ever happens
		  'Aravind K Joshi, Ben King and Steven Abney' ->
		  ['Aravind K Joshi', 'Ben King', 'Steven Abney']
		3. Duo
		  'Daniel Ramage and Chris Callison-Burch' ->
		  ['Daniel Ramage', 'Chris Callison-Burch']
		4. Single author
		  'David Sankofl' ->
		  ['David Sankofl']
		5. Et al. abbreviation
		  'Zhao et al.' ->
		  ['Zhao', 'al.']

		Last-name-first lists are handled heuristically, e.g.,
		  'Levy, R., & Daumé III, H.' -> ['R. Levy', 'H. Daumé III']
		Or:
		  'Liu, F., Tian, F., & Zhu, Q.' -> ['F. Liu', 'F. Tian', 'Q. Zhu']
		Technically, this is ambiguous, since we could support lists of only last names
		(e.g., 'Liu, Tian'; is this ['Tian Liu'] or ['Liu', 'Tian']?), but heuristics are better than nothing.
		The code below resolves 'Liu, Tian' as two last names.

		Example chunks:

		  [FIRST MIDDLE LAST] SEP
		  [FIRST LAST] SEP
		  [LAST SEP FIRST] SEP
		  [LAST SEP INITIAL] [LAST2 SEP INITIAL2]

		*/
		export function parseNames(input: string): types.Name[] {
		  var input_iterable = new lexing.StringIterator(input);

		  var tokenizer = new lexing.Tokenizer(default_rules);
		  var token_iterator = tokenizer.map(input_iterable);

		  var names: types.Name[] = [];

		  // parts of the name currently being assembled
		  var buffer: string[] = [];
		  // true when the first buffered part was followed by a separator, i.e. it
		  // may be a last name written first
		  var buffer_swap = false;
		  // emit the buffered parts as one name and reset the state
		  function flush() {
		    if (buffer_swap) {
		      // move the first item to the last position (last-name-first order)
		      buffer.push(buffer.shift());
		    }
		    names.push(makeName(buffer));
		    buffer = [];
		    buffer_swap = false;
		  }

		  while (true) {
		    var token = token_iterator.next();

		    // tokens: EOF NAME INITIAL SEPARATOR CONJUNCTION
		    if (token.name === 'EOF') {
		      break;
		    }
		    else if (token.name === 'NAME') {
		      // a full name following a separated lone part: that part was a
		      // complete name by itself
		      if (buffer.length > 0 && buffer_swap) {
		        flush();
		      }
		      buffer.push(token.value);
		    }
		    else if (token.name === 'INITIAL') {
		      buffer.push(token.value);
		    }
		    else if (token.name === 'SEPARATOR' || token.name === 'CONJUNCTION') {
		      if (buffer.length === 1) {
		        buffer_swap = true;
		      }
		      else if (buffer.length > 1) {
		        flush();
		      }
		      // an empty buffer means a second separator without anything to separate
		    }
		  }

		  // finish up: whatever is still buffered is the final name
		  if (buffer.length > 0) {
		    flush();
		  }

		  return names;
		}

		/**
		Typically, in-paper citations (`Cite`s) only have the last names of the authors,
		while the `Reference`s in the Bibliography have full names, or at least first
		initials and last names.

		This method determines whether a `Cite`'s names match a `Reference`'s authors.
		Only the `last` field of each name is compared, position by position, and
		both lists must have the same length unless 'et al.' applies. (The examples
		abbreviate each name as a string; the arguments are name objects.)

		authorsMatch(['Joshi'], ['Aravind K Joshi']) -> true
		authorsMatch(['Diab', 'Kamboj'], ['Mona Diab', 'Ankit Kamboj']) -> true

		'et al.' gets special treatment. It reaches this function as a cite author
		whose `last` is 'al.', and is a match if and only if there are more reference
		authors beyond the one parallel to it. In other words, 'et al.' cannot stand
		in for a single author.

		authorsMatch(['Blei', 'al.'], ['David M Blei', 'Andrew Y Ng', 'Michael I Jordan']) -> true
		*/
		export function authorsMatch(citeAuthors: types.Name[], referenceAuthors: types.Name[]): boolean {
		  // walk the longer of the two lists so that a length mismatch is detected
		  for (var i = 0, l = Math.max(citeAuthors.length, referenceAuthors.length); i < l; i++) {
		    var citeAuthor = citeAuthors[i];
		    var referenceAuthor = referenceAuthors[i];
		    // the et al. handling has to precede the normal name-checking conditional below
		    if (citeAuthor && citeAuthor.last === 'al.' && referenceAuthors.length > (i + 1)) {
		      // early exit: ignore the rest of the reference authors
		      return true;
		    }
		    // one list ran out, or the last names at this position differ
		    if (citeAuthor === undefined || referenceAuthor === undefined || citeAuthor.last !== referenceAuthor.last) {
		      return false;
		    }
		  }
		  return true;
		}

		}
		export module types {
		@@ -117,3 +291,3 @@ /**
		export interface AuthorYearCite extends Cite {
		/** usually only last names, one of which may be 'et al.' */
		/** usually only last names, one of which may be 'al.' (from 'et al.') */
		authors: Name[];
		@@ -168,98 +342,8 @@ /** not necessarily a number, if there is a letter suffix */
		sections: Section[];
		// analysis
		references?: Reference[];
		}
		}

		export module names {
		/**
		Split a string listing several authors into one string per author.

		1. Typical list of 3+
		'David Mimno, Hanna M Wallach, and Andrew McCallum' ->
		['David Mimno', 'Hanna M Wallach', 'Andrew McCallum']
		2. List of 3+ without the Oxford comma, in case that ever happens
		'Aravind K Joshi, Ben King and Steven Abney' ->
		['Aravind K Joshi', 'Ben King', 'Steven Abney']
		3. Duo
		'Daniel Ramage and Chris Callison-Burch' ->
		['Daniel Ramage', 'Chris Callison-Burch']
		4. Single author
		'David Sankofl' ->
		['David Sankofl']
		5. Et al. abbreviation (with or without a preceding comma)
		'Zhao et al.' ->
		['Zhao', 'et al.']
		'Zhao, et al.' ->
		['Zhao', 'et al.']

		@param input the author list as it appears in the text
		@returns the individual author strings, in their original order
		*/
		export function splitNames(input: string): string[] {
			// TODO: fix the 'et al.' hack
			// Rewrite both 'Zhao et al.' and 'Zhao, et al.' as 'Zhao, et al.' so that the
			// comma split below isolates 'et al.' as its own element. The optional comma
			// is consumed here; otherwise 'Zhao, et al.' would become 'Zhao,, et al.'
			// and produce an empty element.
			var normalized = input.replace(/,?\s+et al\./, ', et al.');
			// three split options: (, and ) or ( and ) or (, )
			return normalized.split(/,\sand\s+|\sand\s+|,\s*/);
		}

		/**
		Decide whether the authors named in a `Cite` are compatible with the authors
		of a `Reference`.

		Citations usually carry last names only, whereas references carry fuller
		names, so each cite author is checked against the reference author at the
		same index using the `last` field alone:

		authorsMatch([{last: 'Joshi'}], [{first: 'Aravind', middle: 'K', last: 'Joshi'}]) -> true
		authorsMatch([{last: 'Diab'}, {last: 'Kamboj'}],
		             [{first: 'Mona', last: 'Diab'}, {first: 'Ankit', last: 'Kamboj'}]) -> true

		A cite author whose `last` is 'et al.' is also accepted, provided the
		reference lists at least one more author after that index; 'et al.' cannot
		stand in for a single author.

		authorsMatch([{last: 'Blei'}, {last: 'et al.'}],
		             [{last: 'Blei'}, {last: 'Ng'}, {last: 'Jordan'}]) -> true

		Only the indices present in `citeAuthors` are examined; reference authors
		beyond the end of the cite list are not looked at.
		*/
		export function authorsMatch(citeAuthors: types.Name[], referenceAuthors: types.Name[]) {
			var referenceCount = referenceAuthors.length;
			return citeAuthors.every((cited, position) => {
				var counterpart = referenceAuthors[position];
				// ordinary case: same last name at the same position
				var sameLastName = Boolean(counterpart) && cited.last === counterpart.last;
				// 'et al.' case: needs further reference authors past this position
				var coversRemainder = cited.last === 'et al.' && referenceCount > (position + 1);
				return sameLastName || coversRemainder;
			});
		}

		/**
		Given a name represented by a single string, parse it into first name, middle
		name, and last name.

		The first whitespace-separated token becomes `first`, the final token becomes
		`last`, and anything in between is joined with spaces into `middle`. Name
		particles are not treated specially, so they end up in `middle`.

		parseName('Leonardo da Vinci') -> { first: 'Leonardo', middle: 'da', last: 'Vinci' }
		parseName('Chris Callison-Burch') -> { first: 'Chris', last: 'Callison-Burch' }
		parseName('Hanna M Wallach') -> { first: 'Hanna', middle: 'M', last: 'Wallach' }
		parseName('Zhou') -> { last: 'Zhou' }
		parseName('McCallum, Andrew') -> { first: 'Andrew', last: 'McCallum' }
		parseName('et al.') -> { last: 'et al.' }

		@param input a single author's name; surrounding whitespace is ignored
		@returns the parsed name; `last` is the empty string when `input` has no tokens
		*/
		export function parseName(input: string): types.Name {
			// surrounding whitespace would otherwise produce an empty first token
			var trimmed = input.trim();
			// 0. 'et al.' is a special case
			if (trimmed === 'et al.') {
				return {last: trimmed};
			}
			// 1. normalize the comma out: 'Last, First' -> 'First Last'
			var normalized = trimmed.split(/,\s*/).reverse().join(' ');
			// 2. split on whitespace; drop empty tokens, which a trailing comma
			//    ('McCallum,') leaves at the front after the reversal above
			var parts = normalized.split(/\s+/).filter(part => part.length > 0);
			var n = parts.length;
			if (n >= 3) {
				return {
					first: parts[0],
					middle: parts.slice(1, n - 1).join(' '),
					last: parts[n - 1],
				};
			}
			if (n === 2) {
				return {
					first: parts[0],
					last: parts[1],
				};
			}
			return {
				// keep returning an empty last name for empty input
				last: n === 1 ? parts[0] : ''
			};
		}

		}
		}

		export = academia;

125

tmp/scratch.txt

		@@ -122,1 +122,126 @@ // the original text from the paper (for debugging)
		*/

		[format]
		pretty = %C(yellow)%h%C(reset) [%C(magenta)%an <%ae>%C(reset)] %C(green)%cr%C(reset): %s

		gulpfile:

		// var eventStream = require('event-stream');
		// var gulp = require('gulp');
		// var ts = require('gulp-typescript');

		// function compile() {
		// var result = gulp.src([
		// // 'typings/*/.d.ts',
		// 'lib/*/.ts'
		// ], {
		// base: './lib'
		// })
		// .pipe(ts(project));

		// return eventStream.merge(
		// result.dts.pipe(gulp.dest('d.ts')),
		// result.js
		// .pipe(gulp.dest('js'))
		// );
		// }

		// var project = ts.createProject({
		// target: 'es5',
		// module: 'commonjs',
		// declarationFiles: true,
		// noExternalResolve: true
		// });

		// gulp.task('compile', compile());

		// 1. normalize the comma out
		// input = input.split(/,\s*/).reverse().join(' ');

		// and currentName is partly filled, it must be first/middle names
		// but if currentName is empty, then this must be



		/**
		Parse a string listing several authors into `Name` objects.

		The input is first split on list separators: a comma followed by whitespace,
		or the words "and" / "et" or the character "&" surrounded by whitespace and
		optionally preceded by a comma. Each resulting part is then split on
		whitespace:

		- a part with several words is passed to `makeName` as one complete name;
		- a part with a single word is held back and appended to the words of the
		  next part (so that 'Levy, R.' is handed to `makeName` as ['R.', 'Levy']);
		- a single word still held back at the end is passed to `makeName` alone.

		@param input the author list as it appears in the text
		@returns one `Name` per author recognized, in input order
		*/
		export function parseNames(input: string): types.Name[] {
			var parts = input.split(/,?\s+(?:and|et|&)\s+|,\s+/);

			var names: types.Name[] = [];
			// a single-word part carried over to the next iteration
			var currentLastName: string | undefined = undefined;

			for (var i = 0, part; (part = parts[i]) !== undefined; i++) {
				// split on whitespace
				var subparts = part.split(/\s+/);
				// if we didn't use the last part, put it on the end
				if (currentLastName !== undefined) {
					subparts.push(currentLastName);
				}

				// if the part has multiple subparts, it must be the full name all together
				if (subparts.length > 1) {
					names.push(makeName(subparts));
					currentLastName = undefined;
				}
				// otherwise, it's got a single subpart, which must be a last name, which
				// we keep for the next iteration of this loop
				else {
					currentLastName = subparts[0];
				}
			}

			// finish up
			if (currentLastName !== undefined) {
				names.push(makeName([currentLastName]));
			}

			return names;
		}

		// Scratch variant of the part-by-part loop used by parseNames. It relies on
		// `input`, `names`, `currentLastName` and `makeName` being declared elsewhere.
		// NOTE(review): as written, `\|` inside the regex matches a literal pipe
		// character rather than acting as alternation — confirm the intended pattern.
		var parts = input.split(/,?\s+(?:and\|et\|&)\s+\|,\s+/);
		console.error('parts', parts);

		for (var i = 0, part; (part = parts[i]) !== undefined; i++) {
		// match(/.+/) does not split on whitespace: without the `g` flag and without
		// capture groups it yields a one-element array holding the first run of
		// non-line-terminator characters, or null when there is no such run
		// NOTE(review): a null result would make the `.push` / `.length` uses below throw
		var subparts = part.match(/.+/);
		console.error('subparts', subparts);
		// if we didn't use the last part, put it on the end
		if (currentLastName !== undefined) {
		subparts.push(currentLastName);
		}

		// if the part has multiple subparts, it must be the full name all together
		if (subparts.length > 1) {
		var name = makeName(subparts);
		names.push(name);
		currentLastName = undefined;
		}
		// otherwise, it's got a single subpart, which must be a last name, which
		// we keep for the next iteration of this loop
		else {
		currentLastName = subparts[0];
		}
		}

		console.log('SEPARATOR', buffer.length);
		console.log('NAME?', buffer.length > 1 && buffer_swap);

		/**
		Split a string listing several authors on its list separators.

		splitNames('Liu, F., Tian, F., & Zhu, Q.') -> ['Liu', 'F.', 'Tian', 'F.', 'Zhu', 'Q.']
		splitNames('Zhao et al.') -> ['Zhao', 'al.']

		@param input the author list as it appears in the text
		@returns the pieces between separators, in input order
		*/
		function splitNames(input: string): string[] {
			// seven split options:
			// 1a. ", and "
			// 1b. ", et "
			// 1c. ", & "
			// 2a. " and "
			// 2b. " et "
			// 2c. " & "
			// 3. ", "
			// The whitespace required on both sides already keeps "and" / "et" from
			// matching inside a longer word. No `\b` is used, because a word boundary
			// never exists between whitespace and "&", which would stop "&" from
			// ever matching.
			return input.split(/,?\s+(?:and|et|&)\s+|,\s+/);
		}

2

types.ts

		@@ -23,3 +23,3 @@ /**
		export interface AuthorYearCite extends Cite {
		/** usually only last names, one of which may be 'et al.' */
		/** usually only last names, one of which may be 'al.' (from 'et al.') */
		authors: Name[];
		@@ -26,0 +26,0 @@ /** not necessarily a number, if there is a letter suffix */

tmp/gulpfile.js

tmp/index.d.ts

tmp/index.d.ts-copy

tmp/index.js

Makefile

Sorry, the diff of this file is not supported yet

academia - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics

Worsened metrics

Dependency changes