Huge News!Announcing our $40M Series B led by Abstract Ventures.Learn More
Socket
Sign inDemoInstall
Socket

academia

Package Overview
Dependencies
Maintainers
1
Versions
14
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

academia - npm Package Compare versions

Comparing version 0.1.2 to 0.1.3

test/names.js

46

academia.d.ts

@@ -6,2 +6,3 @@ declare module "academia" {

const citeRegExp: RegExp;
const yearRegExp: RegExp;
/**

@@ -11,2 +12,3 @@ Given the text of a paper, extract the `Cite`s using regular expressions.

function parseCites(body: string): types.AuthorYearCite[];
const referenceRegExp: RegExp;
/**

@@ -20,3 +22,3 @@ Given a list of strings representing individual references in a bibliography,

a unique match from `references` is found.
TODO: handle multiple matches somehow.

@@ -43,5 +45,20 @@ */

'Zhao et al.' ->
['Zhao', 'et al.']
['Zhao', 'al.']
TODO: handle last-name-first swaps, e.g.,
'Levy, R., & Daumé III, H.' -> 'R. Levy, H. Daumé III' -> ['R. Levy', 'H. Daumé III']
Or:
'Liu, F., Tian, F., & Zhu, Q.' -> 'F. Liu, F. Tian, & Q. Zhu' -> ['F. Liu', 'F. Tian', 'Q. Zhu']
Technically, this is ambiguous, since we could support lists of only last names
(e.g., 'Liu, Tian'; is this ['Tian Liu'] or ['Liu', 'Tian']?), but heuristics are better than nothing.
Example chunks:
[FIRST MIDDLE LAST] SEP
[FIRST LAST] SEP
[LAST SEP FIRST] SEP
[LAST SEP INITIAL] [LAST2 SEP INITIAL2]
*/
function splitNames(input: string): string[];
function parseNames(input: string): types.Name[];
/**

@@ -51,26 +68,15 @@ Typically, in-paper citations (`Cite`s) only have the last names of the authors,

initials and last names.
This method determines whether a `Cite`'s names match a `Reference`'s authors.
authorsMatch(['Joshi'], ['Aravind K Joshi']) -> true
authorsMatch(['Diab', 'Kamboj'], ['Mona Diab', 'Ankit Kamboj']) -> true
'et al.' gets special treatment. 'et al.' is a match if and only if there are
more reference authors beyond the one parallel to the 'et al.' citation author.
In other words, 'et al.' cannot stand in for a single author.
authorsMatch(['Blei', 'et al.'], ['David M Blei', 'Andrew Y Ng', 'Michael I Jordan']) -> true
*/
function authorsMatch(citeAuthors: types.Name[], referenceAuthors: types.Name[]): boolean;
/**
Given a name represented by a single string, parse it into first name, middle
name, and last name.
parseAuthor('Leonardo da Vinci') -> { first: 'Leonardo', last: 'da Vinci' }
parseAuthor('Chris Callison-Burch') -> { first: 'Chris', last: 'Callison-Burch' }
parseAuthor('Hanna M Wallach') -> { first: 'Hanna', middle: 'M', last: 'Wallach' }
parseAuthor('Zhou') -> { last: 'Zhou' }
parseAuthor('McCallum, Andrew') -> { first: 'Andrew', last: 'McCallum' }
*/
function parseName(input: string): types.Name;
}

@@ -100,3 +106,3 @@ module types {

interface AuthorYearCite extends Cite {
/** usually only last names, one of which may be 'et al.' */
/** usually only last names, one of which may be 'al.' (from 'et al.') */
authors: Name[];

@@ -136,3 +142,3 @@ /** not necessarily a number, if there is a letter suffix */

paragraph distinctions.
`sections` is a flat list; abstracts / subsections / references all count at

@@ -139,0 +145,0 @@ the same level.

@@ -0,1 +1,3 @@

/// <reference path="./type_declarations/index.d.ts" />
var lexing = require('lexing');
var academia;

@@ -30,3 +32,3 @@ (function (academia) {

acl.citeRegExp = new RegExp(citeSources.join('|'), 'g');
var yearRegExp = new RegExp(year);
acl.yearRegExp = new RegExp(year);
var citeCleanRegExp = new RegExp("[(),]|" + year, 'g');

@@ -41,3 +43,3 @@ /**

return (body.match(acl.citeRegExp) || []).map(function (cite) {
var year_match = cite.match(yearRegExp);
var year_match = cite.match(acl.yearRegExp);
// we cull it down to just the names by removing parentheses, commas,

@@ -47,3 +49,3 @@ // and years (with optional suffixes), and trimming any extra whitespace

return {
authors: names.splitNames(names_string).map(names.parseName),
authors: names.parseNames(names_string),
year: year_match ? year_match[0] : null,

@@ -55,3 +57,3 @@ style: types.CiteStyle.Textual,

acl.parseCites = parseCites;
var referenceRegExp = new RegExp("^(.+?)\\.\\s*(" + year + ")\\.\\s*(.+?)\\.");
acl.referenceRegExp = new RegExp("^(.+?)[.,]\\s*\\(?(" + year + ")\\)?\\.\\s*(.+?)\\.");
/**

@@ -63,4 +65,4 @@ Given a list of strings representing individual references in a bibliography,

return references.map(function (reference) {
var match = reference.match(referenceRegExp);
var authors = match ? names.splitNames(match[1]).map(names.parseName) : [];
var match = reference.match(acl.referenceRegExp);
var authors = match ? names.parseNames(match[1]) : [];
return {

@@ -94,4 +96,44 @@ authors: authors,

var names;
(function (names) {
(function (names_1) {
var Token = lexing.Token;
/**
Build a name object from an ordered list of name parts (given names first,
surname last).
makeName(['Hanna', 'M', 'Wallach']) -> { first: 'Hanna', middle: 'M', last: 'Wallach' }
makeName(['Chris', 'Callison-Burch']) -> { first: 'Chris', last: 'Callison-Burch' }
makeName(['Zhou']) -> { last: 'Zhou' }
With three or more parts, everything between the first and the last part is
joined with single spaces into `middle`. With fewer than two parts only `last`
is set (it is `undefined` for an empty list).
Particles such as 'van', 'von', 'da' get no special treatment here; each part
is used exactly as given.
*/
function makeName(parts) {
    var count = parts.length;
    // zero or one part: all we can offer is a last name
    if (count < 2) {
        return {
            last: parts[0]
        };
    }
    var given = parts[0];
    var surname = parts[count - 1];
    if (count === 2) {
        return {
            first: given,
            last: surname,
        };
    }
    // three or more parts: the interior parts make up the middle name
    var interior = parts.slice(1, count - 1);
    return {
        first: given,
        middle: interior.join(' '),
        last: surname,
    };
}
// Lexer rules for author lists. Each entry pairs a regular expression anchored
// at the start of the remaining input with a function that builds the token
// for that match.
// NOTE(review): rule order presumably matters (first matching rule wins) --
// confirm against the `lexing` Tokenizer.
var default_rules = [
// nothing left to read -> EOF
[/^$/, function (match) { return Token('EOF'); }],
// a run of whitespace produces null instead of a token
// (presumably the tokenizer then emits nothing for it -- TODO confirm)
[/^\s+/, function (match) { return null; }],
// a comma followed by at least one whitespace character -> SEPARATOR
[/^,\s+/, function (match) { return Token('SEPARATOR', match[0]); }],
// 'and', 'et' or '&' -> CONJUNCTION (case-sensitive, and with no word boundary
// after the keyword)
[/^(and|et|&)/, function (match) { return Token('CONJUNCTION', match[0]); }],
// a single uppercase letter followed by '.' or a word boundary -> INITIAL
[/^[A-Z](\.|\b)/, function (match) { return Token('INITIAL', match[0]); }],
// optional particle (van/von/da/de), then a letter followed by one or more
// characters that are neither comma nor whitespace, then an optional
// roman-numeral suffix -> NAME. The `i` flag makes the whole pattern
// case-insensitive, which is how the lowercase 'al.' of 'et al.' matches.
// NOTE(review): the `i` flag also applies to [IVX], so the suffix group can
// absorb the beginning of a following word (e.g. ' Vi' in 'David Vinci') -- verify.
[/^((van|von|da|de)\s+)?[A-Z][^,\s]+(\s+[IVX]+)?/i, function (match) { return Token('NAME', match[0]); }],
];
/**
1. Typical list of 3+

@@ -111,10 +153,72 @@ 'David Mimno, Hanna M Wallach, and Andrew McCallum' ->

'Zhao et al.' ->
['Zhao', 'et al.']
['Zhao', 'al.']
TODO: handle last-name-first swaps, e.g.,
'Levy, R., & Daumé III, H.' -> 'R. Levy, H. Daumé III' -> ['R. Levy', 'H. Daumé III']
Or:
'Liu, F., Tian, F., & Zhu, Q.' -> 'F. Liu, F. Tian, & Q. Zhu' -> ['F. Liu', 'F. Tian', 'Q. Zhu']
Technically, this is ambiguous, since we could support lists of only last names
(e.g., 'Liu, Tian'; is this ['Tian Liu'] or ['Liu', 'Tian']?), but heuristics are better than nothing.
Example chunks:
[FIRST MIDDLE LAST] SEP
[FIRST LAST] SEP
[LAST SEP FIRST] SEP
[LAST SEP INITIAL] [LAST2 SEP INITIAL2]
*/
function splitNames(input) {
// three split options: (, and ) or ( and ) or (, )
// TODO: fix the 'et al.' hack
return input.replace(/\s+et al\./, ', et al.').split(/,\s*(?:and|&)\s+|\s*(?:and|&)\s+|,\s*/);
/**
Parse a string listing one or more authors into name objects.
The input is tokenized with `default_rules`; NAME and INITIAL token values are
collected into a pending list of parts, and each completed list is turned into
a name by `makeName`.
- A SEPARATOR or CONJUNCTION seen while exactly one part is pending flags that
  part as a leading surname ('Levy, R.' style).
- A SEPARATOR or CONJUNCTION seen while several parts are pending completes
  the name; with nothing pending it is ignored.
- A NAME seen after a leading surname was flagged completes the pending name
  before starting a new one.
When a flagged name is completed, its first part is moved to the end so that
the surname ends up in the `last` slot.
*/
function parseNames(input) {
    var source = new lexing.StringIterator(input);
    var lexer = new lexing.Tokenizer(default_rules);
    var tokens = lexer.map(source);
    var parsed = [];
    var pending = [];
    var surnameFirst = false;
    // turn the pending parts into a name and start over
    function emitPending() {
        if (surnameFirst) {
            // rotate the leading surname to the end
            pending.push(pending.shift());
        }
        parsed.push(makeName(pending));
        pending = [];
        surnameFirst = false;
    }
    for (;;) {
        var token = tokens.next();
        // tokens: EOF NAME INITIAL SEPARATOR CONJUNCTION
        if (token.name === 'EOF') {
            break;
        }
        switch (token.name) {
            case 'NAME':
                // a long name right after a flagged surname starts a new author
                if (surnameFirst && pending.length > 0) {
                    emitPending();
                }
                pending.push(token.value);
                break;
            case 'INITIAL':
                pending.push(token.value);
                break;
            case 'SEPARATOR':
            case 'CONJUNCTION':
                if (pending.length === 1) {
                    surnameFirst = true;
                }
                else if (pending.length > 1) {
                    emitPending();
                }
                // otherwise: a separator with nothing to separate
                break;
            default:
                break;
        }
    }
    // finish up whatever is still pending
    if (pending.length > 0) {
        emitPending();
    }
    return parsed;
}
names.splitNames = splitNames;
names_1.parseNames = parseNames;
/**

@@ -137,52 +241,17 @@ Typically, in-paper citations (`Cite`s) only have the last names of the authors,

function authorsMatch(citeAuthors, referenceAuthors) {
return citeAuthors.every(function (citeAuthor, i) {
if (referenceAuthors[i] && citeAuthor.last === referenceAuthors[i].last) {
for (var i = 0, l = Math.max(citeAuthors.length, referenceAuthors.length); i < l; i++) {
var citeAuthor = citeAuthors[i];
var referenceAuthor = referenceAuthors[i];
// the et al. handling has to precede the normal name-checking conditional below
if (citeAuthor && citeAuthor.last === 'al.' && referenceAuthors.length > (i + 1)) {
// early exit: ignore the rest of the reference authors
return true;
}
if (citeAuthor.last === 'et al.' && referenceAuthors.length > (i + 1)) {
return true;
if (citeAuthor === undefined || referenceAuthor === undefined || citeAuthor.last !== referenceAuthor.last) {
return false;
}
return false;
});
}
names.authorsMatch = authorsMatch;
/**
Given a name represented by a single string, parse it into first name, middle
name, and last name.
parseAuthor('Leonardo da Vinci') -> { first: 'Leonardo', last: 'da Vinci' }
parseAuthor('Chris Callison-Burch') -> { first: 'Chris', last: 'Callison-Burch' }
parseAuthor('Hanna M Wallach') -> { first: 'Hanna', middle: 'M', last: 'Wallach' }
parseAuthor('Zhou') -> { last: 'Zhou' }
parseAuthor('McCallum, Andrew') -> { first: 'Andrew', last: 'McCallum' }
*/
function parseName(input) {
// 0. 'et al.' is a special case
if (input === 'et al.') {
return { last: input };
}
// 1. normalize the comma out
input = input.split(/,\s*/).reverse().join(' ');
// 2. split on whitespace
var parts = input.split(/\s+/);
var n = parts.length;
// 3. TODO: handle 'van', 'von', 'da', etc.
if (n >= 3) {
return {
first: parts[0],
middle: parts.slice(1, n - 1).join(' '),
last: parts[n - 1],
};
}
else if (n == 2) {
return {
first: parts[0],
last: parts[1],
};
}
return {
last: parts[0]
};
return true;
}
names.parseName = parseName;
names_1.authorsMatch = authorsMatch;
})(names = academia.names || (academia.names = {}));

@@ -189,0 +258,0 @@ var types;

@@ -0,2 +1,44 @@

/// <reference path="./type_declarations/index.d.ts" />
var lexing = require('lexing');
var Token = lexing.Token;
/**
Build a name object from an ordered list of name parts (given names first,
surname last).
makeName(['Hanna', 'M', 'Wallach']) -> { first: 'Hanna', middle: 'M', last: 'Wallach' }
makeName(['Chris', 'Callison-Burch']) -> { first: 'Chris', last: 'Callison-Burch' }
makeName(['Zhou']) -> { last: 'Zhou' }
With three or more parts, everything between the first and the last part is
joined with single spaces into `middle`. With fewer than two parts only `last`
is set (it is `undefined` for an empty list).
Particles such as 'van', 'von', 'da' get no special treatment here; each part
is used exactly as given.
*/
function makeName(parts) {
    var count = parts.length;
    // zero or one part: all we can offer is a last name
    if (count < 2) {
        return {
            last: parts[0]
        };
    }
    var given = parts[0];
    var surname = parts[count - 1];
    if (count === 2) {
        return {
            first: given,
            last: surname,
        };
    }
    // three or more parts: the interior parts make up the middle name
    var interior = parts.slice(1, count - 1);
    return {
        first: given,
        middle: interior.join(' '),
        last: surname,
    };
}
// Lexer rules for author lists. Each entry pairs a regular expression anchored
// at the start of the remaining input with a function that builds the token
// for that match.
// NOTE(review): rule order presumably matters (first matching rule wins) --
// confirm against the `lexing` Tokenizer.
var default_rules = [
// nothing left to read -> EOF
[/^$/, function (match) { return Token('EOF'); }],
// a run of whitespace produces null instead of a token
// (presumably the tokenizer then emits nothing for it -- TODO confirm)
[/^\s+/, function (match) { return null; }],
// a comma followed by at least one whitespace character -> SEPARATOR
[/^,\s+/, function (match) { return Token('SEPARATOR', match[0]); }],
// 'and', 'et' or '&' -> CONJUNCTION (case-sensitive, and with no word boundary
// after the keyword)
[/^(and|et|&)/, function (match) { return Token('CONJUNCTION', match[0]); }],
// a single uppercase letter followed by '.' or a word boundary -> INITIAL
[/^[A-Z](\.|\b)/, function (match) { return Token('INITIAL', match[0]); }],
// optional particle (van/von/da/de), then a letter followed by one or more
// characters that are neither comma nor whitespace, then an optional
// roman-numeral suffix -> NAME. The `i` flag makes the whole pattern
// case-insensitive, which is how the lowercase 'al.' of 'et al.' matches.
// NOTE(review): the `i` flag also applies to [IVX], so the suffix group can
// absorb the beginning of a following word (e.g. ' Vi' in 'David Vinci') -- verify.
[/^((van|von|da|de)\s+)?[A-Z][^,\s]+(\s+[IVX]+)?/i, function (match) { return Token('NAME', match[0]); }],
];
/**
1. Typical list of 3+

@@ -16,10 +58,72 @@ 'David Mimno, Hanna M Wallach, and Andrew McCallum' ->

'Zhao et al.' ->
['Zhao', 'et al.']
['Zhao', 'al.']
TODO: handle last-name-first swaps, e.g.,
'Levy, R., & Daumé III, H.' -> 'R. Levy, H. Daumé III' -> ['R. Levy', 'H. Daumé III']
Or:
'Liu, F., Tian, F., & Zhu, Q.' -> 'F. Liu, F. Tian, & Q. Zhu' -> ['F. Liu', 'F. Tian', 'Q. Zhu']
Technically, this is ambiguous, since we could support lists of only last names
(e.g., 'Liu, Tian'; is this ['Tian Liu'] or ['Liu', 'Tian']?), but heuristics are better than nothing.
Example chunks:
[FIRST MIDDLE LAST] SEP
[FIRST LAST] SEP
[LAST SEP FIRST] SEP
[LAST SEP INITIAL] [LAST2 SEP INITIAL2]
*/
function splitNames(input) {
// three split options: (, and ) or ( and ) or (, )
// TODO: fix the 'et al.' hack
return input.replace(/\s+et al\./, ', et al.').split(/,\s*(?:and|&)\s+|\s*(?:and|&)\s+|,\s*/);
/**
Parse a string listing one or more authors into name objects.
The input is tokenized with `default_rules`; NAME and INITIAL token values are
collected into a pending list of parts, and each completed list is turned into
a name by `makeName`.
- A SEPARATOR or CONJUNCTION seen while exactly one part is pending flags that
  part as a leading surname ('Levy, R.' style).
- A SEPARATOR or CONJUNCTION seen while several parts are pending completes
  the name; with nothing pending it is ignored.
- A NAME seen after a leading surname was flagged completes the pending name
  before starting a new one.
When a flagged name is completed, its first part is moved to the end so that
the surname ends up in the `last` slot.
*/
function parseNames(input) {
    var source = new lexing.StringIterator(input);
    var lexer = new lexing.Tokenizer(default_rules);
    var tokens = lexer.map(source);
    var parsed = [];
    var pending = [];
    var surnameFirst = false;
    // turn the pending parts into a name and start over
    function emitPending() {
        if (surnameFirst) {
            // rotate the leading surname to the end
            pending.push(pending.shift());
        }
        parsed.push(makeName(pending));
        pending = [];
        surnameFirst = false;
    }
    for (;;) {
        var token = tokens.next();
        // tokens: EOF NAME INITIAL SEPARATOR CONJUNCTION
        if (token.name === 'EOF') {
            break;
        }
        switch (token.name) {
            case 'NAME':
                // a long name right after a flagged surname starts a new author
                if (surnameFirst && pending.length > 0) {
                    emitPending();
                }
                pending.push(token.value);
                break;
            case 'INITIAL':
                pending.push(token.value);
                break;
            case 'SEPARATOR':
            case 'CONJUNCTION':
                if (pending.length === 1) {
                    surnameFirst = true;
                }
                else if (pending.length > 1) {
                    emitPending();
                }
                // otherwise: a separator with nothing to separate
                break;
            default:
                break;
        }
    }
    // finish up whatever is still pending
    if (pending.length > 0) {
        emitPending();
    }
    return parsed;
}
exports.splitNames = splitNames;
exports.parseNames = parseNames;
/**

@@ -42,51 +146,16 @@ Typically, in-paper citations (`Cite`s) only have the last names of the authors,

function authorsMatch(citeAuthors, referenceAuthors) {
return citeAuthors.every(function (citeAuthor, i) {
if (referenceAuthors[i] && citeAuthor.last === referenceAuthors[i].last) {
for (var i = 0, l = Math.max(citeAuthors.length, referenceAuthors.length); i < l; i++) {
var citeAuthor = citeAuthors[i];
var referenceAuthor = referenceAuthors[i];
// the et al. handling has to precede the normal name-checking conditional below
if (citeAuthor && citeAuthor.last === 'al.' && referenceAuthors.length > (i + 1)) {
// early exit: ignore the rest of the reference authors
return true;
}
if (citeAuthor.last === 'et al.' && referenceAuthors.length > (i + 1)) {
return true;
if (citeAuthor === undefined || referenceAuthor === undefined || citeAuthor.last !== referenceAuthor.last) {
return false;
}
return false;
});
}
return true;
}
exports.authorsMatch = authorsMatch;
/**
Given a name represented by a single string, parse it into first name, middle
name, and last name.
parseName('Leonardo da Vinci') -> { first: 'Leonardo', last: 'da Vinci' }
parseName('Chris Callison-Burch') -> { first: 'Chris', last: 'Callison-Burch' }
parseName('Hanna M Wallach') -> { first: 'Hanna', middle: 'M', last: 'Wallach' }
parseName('Zhou') -> { last: 'Zhou' }
parseName('McCallum, Andrew') -> { first: 'Andrew', last: 'McCallum' }
The literal string 'et al.' is returned unchanged as a last name.
Lowercase particles ('van', 'von', 'da', 'de') directly before the final part
are folded into the last name. Leading, trailing, and repeated whitespace (and
a dangling comma) never produce empty name parts. An input with no name parts
at all yields { last: '' }.
*/
function parseName(input) {
    // 0. 'et al.' is a special case
    if (input === 'et al.') {
        return { last: input };
    }
    // 1. normalize the comma out: 'Last, First' becomes 'First Last'
    var reordered = input.split(/,\s*/).reverse().join(' ');
    // 2. split on whitespace, dropping the empty strings that stray
    //    whitespace or a dangling comma would otherwise leave behind
    var parts = reordered.split(/\s+/).filter(function (part) { return part !== ''; });
    if (parts.length === 0) {
        return { last: '' };
    }
    // 3. fold particles into the last name: [..., 'da', 'Vinci'] -> [..., 'da Vinci']
    var particle = /^(?:van|von|da|de)$/;
    while (parts.length > 1 && particle.test(parts[parts.length - 2])) {
        var surname = parts.pop();
        parts[parts.length - 1] += ' ' + surname;
    }
    var n = parts.length;
    if (n >= 3) {
        return {
            first: parts[0],
            middle: parts.slice(1, n - 1).join(' '),
            last: parts[n - 1],
        };
    }
    else if (n === 2) {
        return {
            first: parts[0],
            last: parts[1],
        };
    }
    return {
        last: parts[0]
    };
}
exports.parseName = parseName;

@@ -0,4 +1,49 @@

/// <reference path="./type_declarations/index.d.ts" />
import lexing = require('lexing');
var Token = lexing.Token;
import types = require('./types');
/**
Build a `Name` from an ordered list of name parts (given names first, surname
last).
makeName(['Hanna', 'M', 'Wallach']) -> { first: 'Hanna', middle: 'M', last: 'Wallach' }
makeName(['Chris', 'Callison-Burch']) -> { first: 'Chris', last: 'Callison-Burch' }
makeName(['Zhou']) -> { last: 'Zhou' }
With three or more parts, everything between the first and the last part is
joined with single spaces into `middle`. With fewer than two parts only `last`
is set (it is `undefined` for an empty list).
Particles such as 'van', 'von', 'da' get no special treatment here; each part
is used exactly as given.
*/
function makeName(parts: string[]): types.Name {
    var count = parts.length;
    // zero or one part: all we can offer is a last name
    if (count < 2) {
        return {
            last: parts[0]
        };
    }
    var given = parts[0];
    var surname = parts[count - 1];
    if (count === 2) {
        return {
            first: given,
            last: surname,
        };
    }
    // three or more parts: the interior parts make up the middle name
    var interior = parts.slice(1, count - 1);
    return {
        first: given,
        middle: interior.join(' '),
        last: surname,
    };
}
// Lexer rules for author lists. Each entry pairs a regular expression anchored
// at the start of the remaining input with a function that builds the token
// for that match.
// NOTE(review): rule order presumably matters (first matching rule wins) --
// confirm against the `lexing` Tokenizer.
var default_rules: lexing.RegexRule<string>[] = [
// nothing left to read -> EOF
[/^$/, match => Token('EOF') ],
// a run of whitespace produces null instead of a token
// (presumably the tokenizer then emits nothing for it -- TODO confirm)
[/^\s+/, match => null ],
// a comma followed by at least one whitespace character -> SEPARATOR
[/^,\s+/, match => Token('SEPARATOR', match[0]) ],
// 'and', 'et' or '&' -> CONJUNCTION (case-sensitive, and with no word boundary
// after the keyword)
[/^(and|et|&)/, match => Token('CONJUNCTION', match[0]) ],
// a single uppercase letter followed by '.' or a word boundary -> INITIAL
[/^[A-Z](\.|\b)/, match => Token('INITIAL', match[0]) ],
// optional particle (van/von/da/de), then a letter followed by one or more
// characters that are neither comma nor whitespace, then an optional
// roman-numeral suffix -> NAME. The `i` flag makes the whole pattern
// case-insensitive, which is how the lowercase 'al.' of 'et al.' matches.
// NOTE(review): the `i` flag also applies to [IVX], so the suffix group can
// absorb the beginning of a following word (e.g. ' Vi' in 'David Vinci') -- verify.
[/^((van|von|da|de)\s+)?[A-Z][^,\s]+(\s+[IVX]+)?/i, match => Token('NAME', match[0]) ],
];
/**
1. Typical list of 3+

@@ -18,8 +63,78 @@ 'David Mimno, Hanna M Wallach, and Andrew McCallum' ->

'Zhao et al.' ->
['Zhao', 'et al.']
['Zhao', 'al.']
TODO: handle last-name-first swaps, e.g.,
'Levy, R., & Daumé III, H.' -> 'R. Levy, H. Daumé III' -> ['R. Levy', 'H. Daumé III']
Or:
'Liu, F., Tian, F., & Zhu, Q.' -> 'F. Liu, F. Tian, & Q. Zhu' -> ['F. Liu', 'F. Tian', 'Q. Zhu']
Technically, this is ambiguous, since we could support lists of only last names
(e.g., 'Liu, Tian'; is this ['Tian Liu'] or ['Liu', 'Tian']?), but heuristics are better than nothing.
Example chunks:
[FIRST MIDDLE LAST] SEP
[FIRST LAST] SEP
[LAST SEP FIRST] SEP
[LAST SEP INITIAL] [LAST2 SEP INITIAL2]
*/
export function splitNames(input: string): string[] {
// three split options: (, and ) or ( and ) or (, )
// TODO: fix the 'et al.' hack
return input.replace(/\s+et al\./, ', et al.').split(/,\s*(?:and|&)\s+|\s*(?:and|&)\s+|,\s*/);
/**
Parse a string listing one or more authors into `Name` objects.
The input is tokenized with `default_rules`; NAME and INITIAL token values are
collected into a pending list of parts, and each completed list is turned into
a name by `makeName`.
- A SEPARATOR or CONJUNCTION seen while exactly one part is pending flags that
  part as a leading surname ('Levy, R.' style).
- A SEPARATOR or CONJUNCTION seen while several parts are pending completes
  the name; with nothing pending it is ignored.
- A NAME seen after a leading surname was flagged completes the pending name
  before starting a new one.
When a flagged name is completed, its first part is moved to the end so that
the surname ends up in the `last` slot.
*/
export function parseNames(input: string): types.Name[] {
    var source = new lexing.StringIterator(input);
    var lexer = new lexing.Tokenizer(default_rules);
    var tokens = lexer.map(source);
    var parsed: types.Name[] = [];
    var pending: string[] = [];
    var surnameFirst = false;
    // turn the pending parts into a name and start over
    function emitPending() {
        if (surnameFirst) {
            // rotate the leading surname to the end
            pending.push(pending.shift());
        }
        parsed.push(makeName(pending));
        pending = [];
        surnameFirst = false;
    }
    for (;;) {
        var token = tokens.next();
        // tokens: EOF NAME INITIAL SEPARATOR CONJUNCTION
        if (token.name === 'EOF') {
            break;
        }
        switch (token.name) {
            case 'NAME':
                // a long name right after a flagged surname starts a new author
                if (surnameFirst && pending.length > 0) {
                    emitPending();
                }
                pending.push(token.value);
                break;
            case 'INITIAL':
                pending.push(token.value);
                break;
            case 'SEPARATOR':
            case 'CONJUNCTION':
                if (pending.length === 1) {
                    surnameFirst = true;
                }
                else if (pending.length > 1) {
                    emitPending();
                }
                // otherwise: a second separator without anything to separate
                break;
            default:
                break;
        }
    }
    // finish up whatever is still pending
    if (pending.length > 0) {
        emitPending();
    }
    return parsed;
}

@@ -44,50 +159,15 @@

export function authorsMatch(citeAuthors: types.Name[], referenceAuthors: types.Name[]) {
return citeAuthors.every((citeAuthor, i) => {
if (referenceAuthors[i] && citeAuthor.last === referenceAuthors[i].last) {
for (var i = 0, l = Math.max(citeAuthors.length, referenceAuthors.length); i < l; i++) {
var citeAuthor = citeAuthors[i];
var referenceAuthor = referenceAuthors[i];
// the et al. handling has to precede the normal name-checking conditional below
if (citeAuthor && citeAuthor.last === 'al.' && referenceAuthors.length > (i + 1)) {
// early exit: ignore the rest of the reference authors
return true;
}
if (citeAuthor.last === 'et al.' && referenceAuthors.length > (i + 1)) {
return true;
if (citeAuthor === undefined || referenceAuthor === undefined || citeAuthor.last !== referenceAuthor.last) {
return false;
}
return false;
});
}
/**
Given a name represented by a single string, parse it into first name, middle
name, and last name.
parseAuthor('Leonardo da Vinci') -> { first: 'Leonardo', last: 'da Vinci' }
parseAuthor('Chris Callison-Burch') -> { first: 'Chris', last: 'Callison-Burch' }
parseAuthor('Hanna M Wallach') -> { first: 'Hanna', middle: 'M', last: 'Wallach' }
parseAuthor('Zhou') -> { last: 'Zhou' }
parseAuthor('McCallum, Andrew') -> { first: 'Andrew', last: 'McCallum' }
*/
export function parseName(input: string): types.Name {
// 0. 'et al.' is a special case
if (input === 'et al.') {
return {last: input};
}
// 1. normalize the comma out
input = input.split(/,\s*/).reverse().join(' ');
// 2. split on whitespace
var parts = input.split(/\s+/);
var n = parts.length;
// 3. TODO: handle 'van', 'von', 'da', etc.
if (n >= 3) {
return {
first: parts[0],
middle: parts.slice(1, n - 1).join(' '),
last: parts[n - 1],
};
}
else if (n == 2) {
return {
first: parts[0],
last: parts[1],
};
}
return {
last: parts[0]
};
return true;
}

@@ -11,3 +11,3 @@ {

],
"version": "0.1.2",
"version": "0.1.3",
"homepage": "https://github.com/chbrown/academia",

@@ -17,7 +17,13 @@ "repository": "git://github.com/chbrown/academia.git",

"license": "MIT",
"dependencies": {},
"dependencies": {
"lexing": "*"
},
"devDependencies": {
"mocha": "*",
"typescript-declare": "*",
"typescript": "*"
},
"scripts": {
"test": "make test"
}
}

@@ -26,3 +26,3 @@ var types = require('../types');

exports.citeRegExp = new RegExp(citeSources.join('|'), 'g');
var yearRegExp = new RegExp(year);
exports.yearRegExp = new RegExp(year);
var citeCleanRegExp = new RegExp("[(),]|" + year, 'g');

@@ -37,3 +37,3 @@ /**

return (body.match(exports.citeRegExp) || []).map(function (cite) {
var year_match = cite.match(yearRegExp);
var year_match = cite.match(exports.yearRegExp);
// we cull it down to just the names by removing parentheses, commas,

@@ -43,3 +43,3 @@ // and years (with optional suffixes), and trimming any extra whitespace

return {
authors: names.splitNames(names_string).map(names.parseName),
authors: names.parseNames(names_string),
year: year_match ? year_match[0] : null,

@@ -51,3 +51,3 @@ style: types.CiteStyle.Textual,

exports.parseCites = parseCites;
var referenceRegExp = new RegExp("^(.+?)\\.\\s*(" + year + ")\\.\\s*(.+?)\\.");
exports.referenceRegExp = new RegExp("^(.+?)[.,]\\s*\\(?(" + year + ")\\)?\\.\\s*(.+?)\\.");
/**

@@ -59,4 +59,4 @@ Given a list of strings representing individual references in a bibliography,

return references.map(function (reference) {
var match = reference.match(referenceRegExp);
var authors = match ? names.splitNames(match[1]).map(names.parseName) : [];
var match = reference.match(exports.referenceRegExp);
var authors = match ? names.parseNames(match[1]) : [];
return {

@@ -63,0 +63,0 @@ authors: authors,

@@ -14,4 +14,4 @@ import types = require('../types');

var name = '[A-Z][^()\\s]+(?: [IV]+)?';
var year = '[0-9]{4}(?:[-–—][0-9]{4})?[a-z]?';
const name = '[A-Z][^()\\s]+(?: [IV]+)?';
const year = '[0-9]{4}(?:[-–—][0-9]{4})?[a-z]?';

@@ -28,4 +28,5 @@ var citeSources = [

];
export const citeRegExp = new RegExp(citeSources.join('|'), 'g');
const yearRegExp = new RegExp(year);
export const yearRegExp = new RegExp(year);
const citeCleanRegExp = new RegExp(`[(),]|${year}`, 'g');

@@ -46,3 +47,3 @@

return {
authors: names.splitNames(names_string).map(names.parseName),
authors: names.parseNames(names_string),
year: year_match ? year_match[0] : null,

@@ -54,3 +55,3 @@ style: types.CiteStyle.Textual,

const referenceRegExp = new RegExp(`^(.+?)\\.\\s*(${year})\\.\\s*(.+?)\\.`);
export const referenceRegExp = new RegExp(`^(.+?)[.,]\\s*\\(?(${year})\\)?\\.\\s*(.+?)\\.`);

@@ -64,3 +65,3 @@ /**

var match = reference.match(referenceRegExp);
var authors = match ? names.splitNames(match[1]).map(names.parseName) : [];
var authors = match ? names.parseNames(match[1]) : [];
return {

@@ -67,0 +68,0 @@ authors: authors,

@@ -0,1 +1,3 @@

/// <reference path="./type_declarations/index.d.ts" />
import lexing = require('lexing');
module academia {

@@ -15,17 +17,18 @@ export module styles {

var name = '[A-Z][^()\\s]+';
var year = '[0-9]{4}(?:[-–—][0-9]{4})?[a-z]?';
const name = '[A-Z][^()\\s]+(?: [IV]+)?';
const year = '[0-9]{4}(?:[-–—][0-9]{4})?[a-z]?';
var citeSources = [
// et al., duo, and single, with year in parens
`${name} et al. \\(${year}\\)`,
`${name} and ${name} \\(${year}\\)`,
`${name} \\(${year}\\)`,
`${name}\\s+et\\s+al.\\s+\\(${year}\\)`,
`${name}\\s+(?:and|&)\\s+${name}\\s+\\(${year}\\)`,
`${name}\\s+\\(${year}\\)`,
// et al., duo, and single, with year not in parens (note the commas)
`${name} et al., ${year}\\b`,
`${name} and ${name}, ${year}\\b`,
`${name}, ${year}\\b`,
`${name}\\s+et\\s+al.,\\s+${year}\\b`,
`${name}\\s+(?:and|&)\\s+${name},\\s+${year}\\b`,
`${name},\\s+${year}\\b`,
];
export const citeRegExp = new RegExp(citeSources.join('|'), 'g');
const yearRegExp = new RegExp(year);
export const yearRegExp = new RegExp(year);
const citeCleanRegExp = new RegExp(`[(),]|${year}`, 'g');

@@ -46,3 +49,3 @@

return {
authors: names.splitNames(names_string).map(names.parseName),
authors: names.parseNames(names_string),
year: year_match ? year_match[0] : null,

@@ -54,3 +57,3 @@ style: types.CiteStyle.Textual,

const referenceRegExp = new RegExp(`^(.+?)\\.\\s*(${year})\\.\\s*(.+?)\\.`);
export const referenceRegExp = new RegExp(`^(.+?)[.,]\\s*\\(?(${year})\\)?\\.\\s*(.+?)\\.`);

@@ -64,3 +67,3 @@ /**

var match = reference.match(referenceRegExp);
var authors = match ? names.splitNames(match[1]).map(names.parseName) : [];
var authors = match ? names.parseNames(match[1]) : [];
return {

@@ -90,5 +93,176 @@ authors: authors,

}
}
}
export module names {
var Token = lexing.Token;
/**
Build a `Name` from an ordered list of name parts (given names first, surname
last).
makeName(['Hanna', 'M', 'Wallach']) -> { first: 'Hanna', middle: 'M', last: 'Wallach' }
makeName(['Chris', 'Callison-Burch']) -> { first: 'Chris', last: 'Callison-Burch' }
makeName(['Zhou']) -> { last: 'Zhou' }
With three or more parts, everything between the first and the last part is
joined with single spaces into `middle`. With fewer than two parts only `last`
is set (it is `undefined` for an empty list).
Particles such as 'van', 'von', 'da' get no special treatment here; each part
is used exactly as given.
*/
function makeName(parts: string[]): types.Name {
    var count = parts.length;
    // zero or one part: all we can offer is a last name
    if (count < 2) {
        return {
            last: parts[0]
        };
    }
    var given = parts[0];
    var surname = parts[count - 1];
    if (count === 2) {
        return {
            first: given,
            last: surname,
        };
    }
    // three or more parts: the interior parts make up the middle name
    var interior = parts.slice(1, count - 1);
    return {
        first: given,
        middle: interior.join(' '),
        last: surname,
    };
}
// Lexer rules for author lists. Each entry pairs a regular expression anchored
// at the start of the remaining input with a function that builds the token
// for that match.
// NOTE(review): rule order presumably matters (first matching rule wins) --
// confirm against the `lexing` Tokenizer.
var default_rules: lexing.RegexRule<string>[] = [
// nothing left to read -> EOF
[/^$/, match => Token('EOF') ],
// a run of whitespace produces null instead of a token
// (presumably the tokenizer then emits nothing for it -- TODO confirm)
[/^\s+/, match => null ],
// a comma followed by at least one whitespace character -> SEPARATOR
[/^,\s+/, match => Token('SEPARATOR', match[0]) ],
// 'and', 'et' or '&' -> CONJUNCTION (case-sensitive, and with no word boundary
// after the keyword)
[/^(and|et|&)/, match => Token('CONJUNCTION', match[0]) ],
// a single uppercase letter followed by '.' or a word boundary -> INITIAL
[/^[A-Z](\.|\b)/, match => Token('INITIAL', match[0]) ],
// optional particle (van/von/da/de), then a letter followed by one or more
// characters that are neither comma nor whitespace, then an optional
// roman-numeral suffix -> NAME. The `i` flag makes the whole pattern
// case-insensitive, which is how the lowercase 'al.' of 'et al.' matches.
// NOTE(review): the `i` flag also applies to [IVX], so the suffix group can
// absorb the beginning of a following word (e.g. ' Vi' in 'David Vinci') -- verify.
[/^((van|von|da|de)\s+)?[A-Z][^,\s]+(\s+[IVX]+)?/i, match => Token('NAME', match[0]) ],
];
/**
Parse a string listing one or more authors into `Name` objects. The intended
groupings, shown here as plain strings, are:
1. Typical list of 3+
'David Mimno, Hanna M Wallach, and Andrew McCallum' ->
['David Mimno', 'Hanna M Wallach', 'Andrew McCallum']
2. List of 3+ without the Oxford comma, in case that ever happens
'Aravind K Joshi, Ben King and Steven Abney' ->
['Aravind K Joshi', 'Ben King', 'Steven Abney']
3. Duo
'Daniel Ramage and Chris Callison-Burch' ->
['Daniel Ramage', 'Chris Callison-Burch']
4. Single author
'David Sankofl' ->
['David Sankofl']
5. Et al. abbreviation
'Zhao et al.' ->
['Zhao', 'al.']
Last-name-first lists are handled heuristically, e.g.,
'Levy, R., & Daumé III, H.' -> ['R. Levy', 'H. Daumé III']
'Liu, F., Tian, F., & Zhu, Q.' -> ['F. Liu', 'F. Tian', 'Q. Zhu']
Technically, this is ambiguous, since we could support lists of only last names
(e.g., 'Liu, Tian'; is this ['Tian Liu'] or ['Liu', 'Tian']?), but heuristics are better than nothing.
Example chunks:
[FIRST MIDDLE LAST] SEP
[FIRST LAST] SEP
[LAST SEP FIRST] SEP
[LAST SEP INITIAL] [LAST2 SEP INITIAL2]
How it works: the input is tokenized with `default_rules`; NAME and INITIAL
token values are collected into a pending list of parts, and each completed
list is turned into a name by `makeName`. A SEPARATOR or CONJUNCTION seen while
exactly one part is pending flags that part as a leading surname; seen while
several parts are pending it completes the name; with nothing pending it is
ignored. A NAME seen after the flag was set completes the pending name first.
When a flagged name is completed, its first part is moved to the end so that
the surname ends up in the `last` slot.
*/
export function parseNames(input: string): types.Name[] {
    var source = new lexing.StringIterator(input);
    var lexer = new lexing.Tokenizer(default_rules);
    var tokens = lexer.map(source);
    var parsed: types.Name[] = [];
    var pending: string[] = [];
    var surnameFirst = false;
    // turn the pending parts into a name and start over
    function emitPending() {
        if (surnameFirst) {
            // rotate the leading surname to the end
            pending.push(pending.shift());
        }
        parsed.push(makeName(pending));
        pending = [];
        surnameFirst = false;
    }
    for (;;) {
        var token = tokens.next();
        // tokens: EOF NAME INITIAL SEPARATOR CONJUNCTION
        if (token.name === 'EOF') {
            break;
        }
        switch (token.name) {
            case 'NAME':
                // a long name right after a flagged surname starts a new author
                if (surnameFirst && pending.length > 0) {
                    emitPending();
                }
                pending.push(token.value);
                break;
            case 'INITIAL':
                pending.push(token.value);
                break;
            case 'SEPARATOR':
            case 'CONJUNCTION':
                if (pending.length === 1) {
                    surnameFirst = true;
                }
                else if (pending.length > 1) {
                    emitPending();
                }
                // otherwise: a second separator without anything to separate
                break;
            default:
                break;
        }
    }
    // finish up whatever is still pending
    if (pending.length > 0) {
        emitPending();
    }
    return parsed;
}
/**
Decide whether the authors of an in-paper citation (`Cite`) line up with the
authors of a bibliography `Reference`. Citations typically carry only last
names, while references have full names or at least initials and last names,
so only the `last` fields are compared, position by position.
Shown by last name / full name for brevity:
authorsMatch(['Joshi'], ['Aravind K Joshi']) -> true
authorsMatch(['Diab', 'Kamboj'], ['Mona Diab', 'Ankit Kamboj']) -> true
A cite author whose last name is 'al.' (what remains of 'et al.') gets special
treatment: it matches if and only if there are more reference authors beyond
the one parallel to it. In other words, 'al.' cannot stand in for a single
author.
authorsMatch(['Blei', 'al.'], ['David M Blei', 'Andrew Y Ng', 'Michael I Jordan']) -> true
If either list runs out before the other (and no 'al.' rule applied), the
lists do not match.
*/
export function authorsMatch(citeAuthors: types.Name[], referenceAuthors: types.Name[]) {
    const referenceCount = referenceAuthors.length;
    const positions = Math.max(citeAuthors.length, referenceCount);
    let position = 0;
    while (position < positions) {
        const cited = citeAuthors[position];
        const listed = referenceAuthors[position];
        // the 'al.' check must come before the ordinary comparison below
        const referencesBeyondThisOne = referenceCount - (position + 1);
        if (cited && cited.last === 'al.' && referencesBeyondThisOne > 0) {
            // early exit: the remaining reference authors are covered by 'al.'
            return true;
        }
        const bothPresent = cited !== undefined && listed !== undefined;
        if (!bothPresent || cited.last !== listed.last) {
            return false;
        }
        position += 1;
    }
    return true;
}
}
export module types {

@@ -117,3 +291,3 @@ /**

export interface AuthorYearCite extends Cite {
/** usually only last names, one of which may be 'et al.' */
/** usually only last names, one of which may be 'al.' (from 'et al.') */
authors: Name[];

@@ -168,98 +342,8 @@ /** not necessarily a number, if there is a letter suffix */

sections: Section[];
// analysis
references?: Reference[];
}
}
export module names {
/**
Split a string listing one or more authors into one string per author.
1. Typical list of 3+
'David Mimno, Hanna M Wallach, and Andrew McCallum' ->
['David Mimno', 'Hanna M Wallach', 'Andrew McCallum']
2. List of 3+ without the Oxford comma, in case that ever happens
'Aravind K Joshi, Ben King and Steven Abney' ->
['Aravind K Joshi', 'Ben King', 'Steven Abney']
3. Duo
'Daniel Ramage and Chris Callison-Burch' ->
['Daniel Ramage', 'Chris Callison-Burch']
4. Single author
'David Sankofl' ->
['David Sankofl']
5. Et al. abbreviation
'Zhao et al.' ->
['Zhao', 'et al.']
The word 'and' only separates authors when it stands on its own (preceded by
a comma or whitespace), so names that merely end in "and", such as
'Bertrand Russell', are left intact.
*/
export function splitNames(input: string): string[] {
    // TODO: fix the 'et al.' hack (a comma is inserted so that 'et al.' splits
    // off as its own entry)
    const withSeparatedEtAl = input.replace(/\s+et al\./, ', et al.');
    // three split options: (, and ) or ( and ) or (, )
    // the bare ' and ' option requires whitespace BEFORE 'and'; otherwise the
    // tail of a word like 'Bertrand ' would be taken for a conjunction
    return withSeparatedEtAl.split(/,\s*and\s+|\s+and\s+|,\s*/);
}
/**
Decide whether the authors of an in-paper citation (`Cite`) line up with the
authors of a bibliography `Reference`. Citations typically carry only last
names, while references have full names or at least initials and last names,
so only the `last` fields are compared, position by position.
Shown by last name / full name for brevity:
authorsMatch(['Joshi'], ['Aravind K Joshi']) -> true
authorsMatch(['Diab', 'Kamboj'], ['Mona Diab', 'Ankit Kamboj']) -> true
'et al.' gets special treatment: a cite author whose last name is 'et al.'
matches if and only if there are more reference authors beyond the one
parallel to it. In other words, 'et al.' cannot stand in for a single author.
authorsMatch(['Blei', 'et al.'], ['David M Blei', 'Andrew Y Ng', 'Michael I Jordan']) -> true
Only the cite authors are walked, so reference authors beyond the end of the
cite list are not checked.
*/
export function authorsMatch(citeAuthors: types.Name[], referenceAuthors: types.Name[]) {
    const referenceCount = referenceAuthors.length;
    // "every cite author is acceptable" expressed as "no cite author fails"
    const someAuthorFails = citeAuthors.some((citeAuthor, position) => {
        const citedLast = citeAuthor.last;
        const counterpart = referenceAuthors[position];
        if (counterpart && citedLast === counterpart.last) {
            return false;
        }
        const standsForOthers = citedLast === 'et al.' && referenceCount > position + 1;
        return !standsForOthers;
    });
    return !someAuthorFails;
}
/**
Given a name represented by a single string, parse it into first name, middle
name(s), and last name.

parseName('Chris Callison-Burch') -> { first: 'Chris', last: 'Callison-Burch' }
parseName('Hanna M Wallach') -> { first: 'Hanna', middle: 'M', last: 'Wallach' }
parseName('Zhou') -> { last: 'Zhou' }
parseName('McCallum, Andrew') -> { first: 'Andrew', last: 'McCallum' }
parseName('et al.') -> { last: 'et al.' }

Surrounding and repeated whitespace is ignored:
parseName('  Zhou ') -> { last: 'Zhou' }

Only the final whitespace-separated token becomes the last name; everything
between the first and the final token is joined into `middle`. Lower-case
particles are therefore NOT merged into the last name:
parseName('Leonardo da Vinci') -> { first: 'Leonardo', middle: 'da', last: 'Vinci' }

An empty (or all-whitespace) input yields { last: '' }.
*/
export function parseName(input: string): types.Name {
  const trimmed = input.trim();
  // 0. 'et al.' is a special case
  if (trimmed === 'et al.') {
    return {last: trimmed};
  }
  // 1. normalize the comma out ('Last, First' -> 'First Last')
  const normalized = trimmed.split(/,\s*/).reverse().join(' ');
  // 2. split on whitespace, discarding the empty tokens that stray spaces
  //    around the name or around the comma would otherwise produce
  const parts = normalized.split(/\s+/).filter(part => part.length > 0);
  const n = parts.length;
  if (n >= 3) {
    return {
      first: parts[0],
      middle: parts.slice(1, n - 1).join(' '),
      last: parts[n - 1],
    };
  }
  if (n === 2) {
    return {
      first: parts[0],
      last: parts[1],
    };
  }
  return {
    // keep returning an empty last name for empty input
    last: n === 1 ? parts[0] : '',
  };
}
}
}
export = academia;

@@ -122,1 +122,126 @@ // the original text from the paper (for debugging)

*/
[format]
pretty = %C(yellow)%h%C(reset) [%C(magenta)%an <%ae>%C(reset)] %C(green)%cr%C(reset): %s
gulpfile:
// var eventStream = require('event-stream');
// var gulp = require('gulp');
// var ts = require('gulp-typescript');
// function compile() {
// var result = gulp.src([
// // 'typings/**/*.d.ts',
// 'lib/**/*.ts'
// ], {
// base: './lib'
// })
// .pipe(ts(project));
// return eventStream.merge(
// result.dts.pipe(gulp.dest('d.ts')),
// result.js
// .pipe(gulp.dest('js'))
// );
// }
// var project = ts.createProject({
// target: 'es5',
// module: 'commonjs',
// declarationFiles: true,
// noExternalResolve: true
// });
// gulp.task('compile', compile());
// 1. normalize the comma out
// input = input.split(/,\s*/).reverse().join(' ');
// and currentName is partly filled, it must be first/middle names
// but if currentName is empty, then this must be
/**
Parse a string listing several authors into `Name` objects.

The input is cut at ", ", and at "and" / "et" / "&" surrounded by whitespace
(optionally preceded by a comma). A chunk consisting of a single token is held
back as a last name and appended to the tokens of the following chunk; a chunk
with two or more tokens (after that append) is handed to `makeName` as one
complete name. A last name still held back at the end becomes a name by itself.
*/
export function parseNames(input: string): types.Name[] {
  const collected: types.Name[] = [];
  // last name seen in a previous chunk that has not been attached to a name yet
  let heldLastName: string | undefined = undefined;
  const chunks = input.split(/,?\s+(?:and|et|&)\s+|,\s+/);
  for (const chunk of chunks) {
    const tokens = chunk.split(/\s+/);
    // an unused last name from the previous chunk goes on the end
    if (heldLastName !== undefined) {
      tokens.push(heldLastName);
    }
    if (tokens.length <= 1) {
      // a lone token must be a last name; keep it for the next chunk
      heldLastName = tokens[0];
      continue;
    }
    // several tokens: this is the full name all together
    collected.push(makeName(tokens));
    heldLastName = undefined;
  }
  // finish up: a trailing lone last name is a name in its own right
  if (heldLastName !== undefined) {
    collected.push(makeName([heldLastName]));
  }
  return collected;
}
// NOTE(review): free-standing debugging variant of the loop in parseNames. It
// reads `input`, `currentLastName`, `names`, `makeName`, `buffer` and
// `buffer_swap`, none of which are declared within this fragment.
var parts = input.split(/,?\s+(?:and|et|&)\s+|,\s+/);
console.error('parts', parts);
for (var i = 0, part; (part = parts[i]) !== undefined; i++) {
// unlike parseNames, this does not split on whitespace: /.+/ matches the run
// of characters up to the first line break, so `subparts` starts out with a
// single element.
// NOTE(review): `match` returns null when nothing matches (e.g. an empty
// part), in which case the accesses below would throw.
var subparts = part.match(/.+/);
console.error('subparts', subparts);
// if we didn't use the last part, put it on the end
if (currentLastName !== undefined) {
subparts.push(currentLastName);
}
// if the part has multiple subparts, it must be the full name all together
if (subparts.length > 1) {
var name = makeName(subparts);
names.push(name);
currentLastName = undefined;
}
// otherwise, it's got a single subpart, which must be a last name, which
// we keep for the next iteration of this loop
else {
currentLastName = subparts[0];
}
}
// debug output about `buffer` / `buffer_swap`, which are defined elsewhere
console.log('SEPARATOR', buffer.length);
console.log('NAME?', buffer.length > 1 && buffer_swap);
/**
Split a string that lists several authors into one string per author.

splitNames('David Mimno, Hanna M Wallach, and Andrew McCallum') ->
['David Mimno', 'Hanna M Wallach', 'Andrew McCallum']
splitNames('Liu, Tian, & Zhu') -> ['Liu', 'Tian', 'Zhu']
splitNames('Zhao et al.') -> ['Zhao', 'al.']
*/
function splitNames(input: string): string[] {
  // seven split options:
  // 1a. ", and "
  // 1b. ", et "
  // 1c. ", & "
  // 2a. " and "
  // 2b. " et "
  // 2c. " & "
  // 3. ", "
  // The word boundaries apply to 'and' / 'et' only, so that the tail of a name
  // like 'Rand' is not taken for a separator. '&' is kept outside of them:
  // there is no word boundary between a space and '&', so `\b&\b` could never
  // match a free-standing ampersand.
  return input.split(/,?\s*(?:\b(?:and|et)\b|&)\s+|,\s*/);
}

@@ -23,3 +23,3 @@ /**

export interface AuthorYearCite extends Cite {
/** usually only last names, one of which may be 'et al.' */
/** usually only last names, one of which may be 'al.' (from 'et al.') */
authors: Name[];

@@ -26,0 +26,0 @@ /** not necessarily a number, if there is a letter suffix */

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc