Comparing version 0.1.0 to 0.2.1
@@ -1,199 +0,251 @@ | ||
// based on PHP Name Parser by Josh Fraser (joshfraser.com) | ||
// http://www.onlineaspect.com/2009/08/17/splitting-names/ | ||
// ported to JavaScript by Mark Pemburn (pemburnia.com) | ||
// released under Apache 2.0 license | ||
/* | ||
* Copyright Josh Fraser | ||
* based on PHP Name Parser by Josh Fraser (joshfraser.com) | ||
* http://www.onlineaspect.com/2009/08/17/splitting-names/ | ||
* | ||
* Copyright Mark Pemburn | ||
* ported to JavaScript by Mark Pemburn (pemburnia.com) | ||
* | ||
* Copyright 2014, Christoph Hartmann | ||
* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
'use strict'; | ||
Array.prototype.in_array = function (value) { | ||
for (var i = 0; i < this.length; i++) { | ||
if (this[i] == value) { | ||
return true; | ||
} | ||
} | ||
return false; | ||
for (var i = 0; i < this.length; i++) { | ||
if (this[i] === value) { | ||
return true; | ||
} | ||
} | ||
return false; | ||
}; | ||
Array.prototype.implode = function (separator) { | ||
var output = ""; | ||
var sep = ""; | ||
for (var i = 0; i < this.length; i++) { | ||
output += sep + this[i]; | ||
sep = separator; | ||
} | ||
return output; | ||
var output = ''; | ||
var sep = ''; | ||
for (var i = 0; i < this.length; i++) { | ||
output += sep + this[i]; | ||
sep = separator; | ||
} | ||
return output; | ||
}; | ||
String.prototype.trim = function() { | ||
return this.replace(/^\s+|\s+$|\,$/g,""); | ||
String.prototype.trim = function () { | ||
return this.replace(/^\s+|\s+$|\,$/g, ''); | ||
}; | ||
String.prototype.ucfirst = function() { | ||
return this.substr(0,1).toUpperCase() + this.substr(1,this.length - 1).toLowerCase(); | ||
String.prototype.ucfirst = function () { | ||
return this.substr(0, 1).toUpperCase() + this.substr(1, this.length - 1).toLowerCase(); | ||
}; | ||
function NameParse() { | ||
function NameParse() {} | ||
/**
 * Detect a standard salutation and return its normalized form.
 *
 * Only English honorifics are considered for now. Matching ignores case and
 * periods, so "mr", "MR." and "Mister" all yield "Mr.".
 *
 * @param {string} word - candidate word (typically the first word of a name)
 * @returns {string|boolean} the normalized salutation (e.g. "Mr."), or
 *     false when the word is not a known salutation or is not a string
 */
NameParse.prototype.is_salutation = function (word) {
  // parse() may hand over undefined when every word was filtered out
  if (typeof word !== 'string') {
    return false;
  }
  // ignore ALL periods (a string pattern would only drop the first one)
  switch (word.replace(/\./g, '').toLowerCase()) {
  case 'mr':
  case 'master':
  case 'mister':
    return 'Mr.';
  case 'mrs':
    return 'Mrs.';
  case 'miss':
  case 'ms':
    return 'Ms.';
  case 'dr':
    return 'Dr.';
  case 'rev':
    return 'Rev.';
  case 'fr':
    return 'Fr.';
  default:
    return false;
  }
};
/**
 * Detect a common name suffix and return its canonical spelling.
 *
 * Matching ignores case and periods, so "jr.", "JR" and "Jr" all yield "Jr",
 * and "Ph.D." yields "PhD".
 *
 * @param {string} word - candidate word (typically the last word of a name)
 * @returns {string|boolean} the canonical suffix, or false when the word is
 *     not a known suffix or is not a string
 */
NameParse.prototype.is_suffix = function (word) {
  // parse() may hand over undefined when every word was filtered out
  if (typeof word !== 'string') {
    return false;
  }
  // ignore periods
  var needle = word.replace(/\./g, '').toLowerCase();
  // some common suffixes, listed in their canonical spelling
  var suffixArray = [
    'I', 'II', 'III', 'IV', 'V', 'Senior', 'Junior', 'Jr', 'Sr',
    'PhD', 'APR', 'RPh', 'PE', 'MD', 'MA', 'DMD', 'CME'
  ];
  for (var i = 0; i < suffixArray.length; i++) {
    if (suffixArray[i].toLowerCase() === needle) {
      return suffixArray[i];
    }
  }
  return false;
};
/**
 * Detect a prefix that introduces a compound last name, like the "Von" in
 * "Von Fange". Matching is case-insensitive.
 *
 * @param {string} word - candidate word
 * @returns {boolean} true when the word is a known compound-name prefix
 */
NameParse.prototype.is_compound_lastName = function (word) {
  if (typeof word !== 'string') {
    return false;
  }
  // common prefixes that identify a compound last name
  var prefixes = [
    'vere', 'von', 'van', 'de', 'del', 'della', 'di', 'da', 'pietro',
    'vanden', 'du', 'st.', 'st', 'la', 'lo', 'ter'
  ];
  // native indexOf uses strict equality, so no patched Array helper is needed
  return prefixes.indexOf(word.toLowerCase()) !== -1;
};
/**
 * Tell whether a word is an initial: a single character, possibly
 * accompanied by periods (e.g. "J" or "J.").
 *
 * @param {string} word - candidate word
 * @returns {boolean} true when exactly one character remains once all
 *     periods are removed
 */
NameParse.prototype.is_initial = function (word) {
  if (typeof word !== 'string') {
    return false;
  }
  // ignore ALL periods; stripping only the first would let ".." pass as an
  // initial because a single "." would be left over
  return word.replace(/\./g, '').length === 1;
};
/**
 * Detect mixed-case words like "McDonald".
 *
 * @param {string} word - word to inspect
 * @returns {boolean} true when the word contains at least one ASCII
 *     uppercase AND at least one ASCII lowercase letter; false when it is
 *     all one case
 */
NameParse.prototype.is_camel_case = function (word) {
  // The ported PHP patterns kept their "|" delimiters, which in JavaScript
  // created an empty alternative that matched every string. Plain character
  // classes express the intended check.
  var ucReg = /[A-Z]/;
  var lcReg = /[a-z]/;
  return ucReg.test(word) && lcReg.test(word);
};

/**
 * Normalize the capitalization of a name word.
 *
 * Each part of a word split by dashes or periods gets an uppercase first
 * letter and lowercase remainder, while mixed-case parts are left alone.
 *
 * @param {string} word - word to normalize
 * @returns {string} the word with normalized capitalization
 */
NameParse.prototype.fix_case = function (word) {
  // uppercase words split by dashes, like "Kimura-Fay"
  word = this.safe_ucfirst('-', word);
  // uppercase words split by periods, like "J.P."
  word = this.safe_ucfirst('.', word);
  return word;
};

/**
 * Helper for fix_case: capitalize every part of a word split by a separator.
 *
 * @param {string} seperator - separator to split on (e.g. "-" or ".")
 * @param {string} word - word to process
 * @returns {string} the parts re-joined with the same separator; mixed-case
 *     parts are unchanged, all other parts are upper-first / lower-rest
 */
NameParse.prototype.safe_ucfirst = function (seperator, word) {
  var parts = word.split(seperator);
  var words = [];
  for (var i = 0; i < parts.length; i++) {
    var thisWord = parts[i];
    if (this.is_camel_case(thisWord)) {
      // leave words like "McDonald" untouched
      words[i] = thisWord;
    } else {
      // upper-case the first character and lower-case the rest, without
      // relying on a patched String.prototype helper
      words[i] = thisWord.charAt(0).toUpperCase() + thisWord.slice(1).toLowerCase();
    }
  }
  return words.join(seperator);
};
// split full names into the following parts: | ||
// - prefix / salutation (Mr., Mrs., etc) | ||
// - given name / first name | ||
// - middle initials | ||
// - surname / last name | ||
// - suffix (II, Phd, Jr, etc) | ||
NameParse.prototype.parse = function (fullastName) { | ||
var me = this; | ||
fullastName = fullastName.trim(); | ||
// split into words | ||
var unfilteredNameParts = fullastName.split(' '); | ||
var name = {}; | ||
var nameParts = []; | ||
var lastName = ''; | ||
var firstName = ''; | ||
var initials = ''; | ||
var j = 0; | ||
var i = 0; | ||
// completely ignore any words in parentheses | ||
for (i = 0; i < unfilteredNameParts.length; i++) { | ||
if (unfilteredNameParts[i].indexOf('(') === -1) { | ||
nameParts[j++] = unfilteredNameParts[i]; | ||
} | ||
var numWords = nameParts.length; | ||
// is the first word a title? (Mr. Mrs, etc) | ||
var salutation = me.is_salutation(nameParts[0]); | ||
var suffix = me.is_suffix(nameParts[nameParts.length - 1]); | ||
// set the range for the middle part of the name (trim prefixes & suffixes) | ||
var start = (salutation) ? 1 : 0; | ||
var end = (suffix) ? numWords - 1 : numWords; | ||
// concat the first name | ||
var word = ''; | ||
for (i=start; i<(end - 1); i++) { | ||
word = nameParts[i]; | ||
// move on to parsing the last name if we find an indicator of a compound last name (Von, Van, etc) | ||
// we use i != start to allow for rare cases where an indicator is actually the first name (like "Von Fabella") | ||
if (me.is_compound_lastName(word) && i != start) { | ||
break; | ||
} | ||
// is it a middle initial or part of their first name? | ||
// if we start off with an initial, we'll call it the first name | ||
if (me.is_initial(word)) { | ||
// is the initial the first word? | ||
if (i == start) { | ||
// if so, do a look-ahead to see if they go by their middle name | ||
// for ex: "R. Jason Smith" => "Jason Smith" & "R." is stored as an initial | ||
// but "R. J. Smith" => "R. Smith" and "J." is stored as an initial | ||
if (me.is_initial(nameParts[i + 1])) { | ||
firstName += " " + word.toUpperCase(); | ||
} else { | ||
initials += " " + word.toUpperCase(); | ||
} | ||
// otherwise, just go ahead and save the initial | ||
} | ||
var numWords = nameParts.length; | ||
// is the first word a title? (Mr. Mrs, etc) | ||
var salutation = me.is_salutation(nameParts[0]); | ||
var suffix = me.is_suffix(nameParts[nameParts.length - 1]); | ||
// set the range for the middle part of the name (trim prefixes & suffixes) | ||
var start = (salutation) ? 1 : 0; | ||
var end = (suffix) ? numWords - 1 : numWords; | ||
// concat the first name | ||
var word = ''; | ||
for (i = start; i < (end - 1); i++) { | ||
word = nameParts[i]; | ||
// move on to parsing the last name if we find an indicator of a compound | ||
// last name (Von, Van, etc) | ||
// we use i != start to allow for rare cases where an indicator is actually | ||
// the first name (like "Von Fabella") | ||
if (me.is_compound_lastName(word) && i !== start) { | ||
break; | ||
} | ||
// is it a middle initial or part of their first name? | ||
// if we start off with an initial, we'll call it the first name | ||
if (me.is_initial(word)) { | ||
// is the initial the first word? | ||
if (i === start) { | ||
// if so, do a look-ahead to see if they go by their middle name | ||
// for ex: "R. Jason Smith" => "Jason Smith" & "R." is stored as an initial | ||
// but "R. J. Smith" => "R. Smith" and "J." is stored as an initial | ||
if (me.is_initial(nameParts[i + 1])) { | ||
firstName += ' ' + word.toUpperCase(); | ||
} else { | ||
initials += " " + word.toUpperCase(); | ||
initials += ' ' + word.toUpperCase(); | ||
} | ||
// otherwise, just go ahead and save the initial | ||
} else { | ||
firstName += " " + me.fix_case(word); | ||
} | ||
} | ||
// check that we have more than 1 word in our string | ||
if ((end - start) > 1) { | ||
// concat the last name | ||
for (j=i; j<end; j++) { | ||
lastName += " " + me.fix_case(nameParts[j]); | ||
initials += ' ' + word.toUpperCase(); | ||
} | ||
} else { | ||
// otherwise, single word strings are assumed to be first names | ||
firstName = me.fix_case(nameParts[i]); | ||
firstName += ' ' + me.fix_case(word); | ||
} | ||
// return the various parts in an array | ||
name.salutation = (salutation != false) ? salutation : ""; | ||
name.firstName = (firstName != "") ? firstName.trim() : ""; | ||
name.initials = (initials != "") ? initials.trim() : ""; | ||
name.lastName = (lastName != "") ? lastName.trim() : ""; | ||
name.suffix = (suffix != false) ? suffix : ""; | ||
return name; | ||
}; | ||
// detect and format standard salutations | ||
// I'm only considering english honorifics for now & not words like | ||
this.is_salutation = function (word) { | ||
// ignore periods | ||
word = word.replace(".","").toLowerCase(); | ||
// returns normalized values | ||
if (word == "mr" || word == "master" || word == "mister") { | ||
return "Mr."; | ||
} else if (word == "mrs") { | ||
return "Mrs."; | ||
} else if (word == "miss" || word == "ms") { | ||
return "Ms."; | ||
} else if (word == "dr") { | ||
return "Dr."; | ||
} else if (word == "rev") { | ||
return "Rev."; | ||
} else if (word == "fr") { | ||
return "Fr."; | ||
} else { | ||
return false; | ||
} | ||
// check that we have more than 1 word in our string | ||
if ((end - start) > 1) { | ||
// concat the last name | ||
for (j = i; j < end; j++) { | ||
lastName += ' ' + me.fix_case(nameParts[j]); | ||
} | ||
}; | ||
// detect and format common suffixes | ||
this.is_suffix = function (word) { | ||
// ignore periods | ||
word = word.replace(/\./g,"").toLowerCase(); | ||
// these are some common suffixes - what am I missing? | ||
var suffixArray = ['I','II','III','IV','V','Senior','Junior','Jr','Sr','PhD','APR','RPh','PE','MD','MA','DMD','CME']; | ||
for (var i=0; i<suffixArray.length; i++) { | ||
if (suffixArray[i].toLowerCase() == word) { | ||
return suffixArray[i]; | ||
} | ||
} | ||
return false; | ||
}; | ||
// detect compound last names like "Von Fange" | ||
this.is_compound_lastName = function (word) { | ||
word = word.toLowerCase(); | ||
// these are some common prefixes that identify a compound last names - what am I missing? | ||
var words = ['vere','von','van','de','del','della','di','da','pietro','vanden','du','st.','st','la','lo','ter']; | ||
return words.in_array(word); | ||
}; | ||
// single letter, possibly followed by a period | ||
this.is_initial = function (word) { | ||
// ignore periods | ||
word = word.replace(".",""); | ||
return (word.length == 1); | ||
}; | ||
} else { | ||
// otherwise, single word strings are assumed to be first names | ||
firstName = me.fix_case(nameParts[i]); | ||
} | ||
// detect mixed case words like "McDonald" | ||
// returns false if the string is all one case | ||
this.is_camel_case = function (word) { | ||
var ucReg = /|[A-Z]+|s/; | ||
var lcReg = /|[a-z]+|s/; | ||
return (word.match(ucReg) != null && word.match(lcReg) != null); | ||
}; | ||
// ucfirst words split by dashes or periods | ||
// ucfirst all upper/lower strings, but leave camelcase words alone | ||
this.fix_case = function (word) { | ||
// uppercase words split by dashes, like "Kimura-Fay" | ||
word = me.safe_ucfirst("-",word); | ||
// uppercase words split by periods, like "J.P." | ||
word = me.safe_ucfirst(".",word); | ||
return word; | ||
}; | ||
// return the various parts in an array | ||
name.salutation = (salutation !== false) ? salutation : ''; | ||
name.firstName = (firstName !== '') ? firstName.trim() : ''; | ||
name.initials = (initials !== '') ? initials.trim() : ''; | ||
name.lastName = (lastName !== '') ? lastName.trim() : ''; | ||
name.suffix = (suffix !== false) ? suffix : ''; | ||
// helper this.for fix_case | ||
this.safe_ucfirst = function (seperator, word) { | ||
var words = []; | ||
// uppercase words split by the seperator (ex. dashes or periods) | ||
var parts = word.split(seperator); | ||
for (var i=0; i<parts.length; i++) { | ||
var thisWord = parts[i]; | ||
words[i] = (me.is_camel_case(thisWord)) ? thisWord : thisWord.ucfirst.toLowerCase(); | ||
} | ||
return words.implode(seperator); | ||
}; | ||
} | ||
return name; | ||
}; | ||
exports.NameParse = NameParse; | ||
exports.NameParse = NameParse; |
{ | ||
"name": "humanname", | ||
"version": "0.1.0", | ||
"version": "0.2.1", | ||
"description": "A human name parser for nodejs. Parses names and splits them up into parts", | ||
@@ -18,5 +18,8 @@ "author": { | ||
"devDependencies": { | ||
"mocha": "*" | ||
"grunt-contrib-jshint": "~0.10.0", | ||
"grunt-mocha-test": "~0.10.2" | ||
}, | ||
"scripts": {}, | ||
"scripts": { | ||
"test": "grunt test" | ||
}, | ||
"engines": { | ||
@@ -23,0 +26,0 @@ "node": ">= 0.8.0" |
@@ -0,10 +1,21 @@ | ||
# humanname | ||
JavaScript library to split names into their respective components (first, last, etc) | ||
**Usage:** | ||
## Usage: | ||
```javascript | ||
// in browser | ||
var parser = new NameParse(); | ||
var parsed = parser.parse("Mr. Chales P. Wooten, III"); | ||
``` | ||
**Results:** | ||
```javascript | ||
// in nodejs | ||
var humanname = require('humanname'); | ||
var parsed = humanname.parse("Mr. Chales P. Wooten, III"); | ||
``` | ||
## Results: | ||
parsed { | ||
@@ -18,10 +29,22 @@ salutation: "Mr.", | ||
**The algorithm:** | ||
## The algorithm: | ||
We start by splitting the full name into separate words. We then do a dictionary lookup on the first and last words to see if they are a common prefix or suffix. Next, we take the middle portion of the string (everything minus the prefix & suffix) and look at everything except the last word of that string. We then loop through each of those words, concatenating them together to make up the first name. While we’re doing that, we watch for any indication of a compound last name. It turns out that almost every compound last name starts with 1 of 15 prefixes (Von, Van, Vere, etc). If we see one of those prefixes, we break out of the first name loop and move on to concatenating the last name. We handle the capitalization issue by checking for camel-case before uppercasing the first letter of each word and lowercasing everything else. I wrote special cases for periods and dashes. We also have a couple of other special cases, like ignoring words in parentheses altogether. | ||
**Credits & license:** | ||
## Credits & license: | ||
* Based on [PHP Name Parser](http://www.onlineaspect.com/2009/08/17/splitting-names/) by [Josh Fraser](http://joshfraser.com) | ||
* Ported to JavaScript by [Mark Pemburn](http://pemburnia.com) | ||
* Released under Apache 2.0 license | ||
* Adapted for Nodejs by Christoph Hartmann | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. |
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
23886
10
350
50
2