utfstring - npm Package Compare versions

spec/visual/char_at_spec.js

spec/visual/find_byte_index_spec.js

spec/visual/find_char_index_spec.js

spec/visual/index_of_spec.js

spec/visual/last_index_of_spec.js

spec/visual/length_spec.js

spec/visual/slice_spec.js

spec/visual/string_to_char_array_spec.js

spec/visual/substr_spec.js

4

CHANGELOG.md

		@@ -0,1 +1,5 @@
		# 2.0.0
		- Abstract grapheme cluster identification in order to separate visual graphemes from individual code points.
		- The classic example of this is regional indicators, which are separate code points but are combined by display systems into one visible character. Automatically treating them as a single character can be confusing when using utfstring in other Unicode-aware libraries. Since a number of other programming languages (e.g. Ruby, Elixir) don't combine regional indicators when determining length, substrings, etc., I've decided to move regional indicator combination support from the existing utfstring functions to a separate implementation available in `UtfString.visual`, which supports regional indicators but otherwise behaves identically.

		# 1.3.1
		@@ -2,0 +6,0 @@ - Fix bug causing incorrect character index calculations for strings containing newlines.

2

package.json

		{
		"name": "utfstring",
		"version": "1.3.1",
		"version": "2.0.0",
		"description": "UTF-safe string operations",
		@@ -5,0 +5,0 @@ "repository": {

11

README.md

		@@ -82,2 +82,13 @@ utfstring

		## Regional Indicators

		Certain characters in the Unicode standard are meant to be combined by display systems, but are represented by multiple code points. A good example is the so-called regional indicators. By themselves, regional indicators U+1F1EB (regional indicator symbol letter F) and U+1F1F7 (regional indicator symbol letter R) don't mean much, but combined they form the French flag: 🇫🇷.

		Since regional indicators are semantically individual Unicode code points and because utfstring is a dependency of other Unicode-aware libraries, it doesn't make sense for utfstring to treat two regional indicators as a single character by default. That said, it can be useful to treat them as such from a display or layout perspective. In order to support both scenarios, two implementations are necessary. The first and default implementation is available via the instructions above. For visual grapheme clustering such as the grouping of regional indicators, use the `visual` property on `UtfString`. Display-aware versions of all the functions described above are available. The difference can be seen by way of the `length` function:

		```javascript
		UtfString.visual.length("🇫🇷"); // 1
		UtfString.length("🇫🇷"); // 2
		```

		## Running Tests
		@@ -84,0 +95,0 @@

6

spec/char_at_spec.js

		@@ -29,8 +29,2 @@ var UtfString = require('../utfstring.js');

		it('works with regional indicators', function() {
		var str = '🇸🇴🇫🇷';
		expect(UtfString.charAt(str, 0)).toEqual('🇸🇴');
		expect(UtfString.charAt(str, 1)).toEqual('🇫🇷');
		});

		it('returns an empty string for indices that are out of range', function() {
		@@ -37,0 +31,0 @@ var str = 'abc';

7

spec/find_byte_index_spec.js

		@@ -32,9 +32,8 @@ var UtfString = require('../utfstring.js');

		it('works with regional indicators', function() {
		var str = '🇸🇴🇫🇷';
		it('works with mixed characters', function() {
		var str = "\u{0001}\u{1F1E6}";
		expect(UtfString.findByteIndex(str, 0)).toEqual(0);
		expect(UtfString.findByteIndex(str, 1)).toEqual(4);
		expect(UtfString.findByteIndex(str, 2)).toEqual(-1);
		expect(UtfString.findByteIndex(str, 1)).toEqual(1);
		});
		});
		});

13

spec/find_char_index_spec.js

		@@ -36,15 +36,2 @@ var UtfString = require('../utfstring.js');

		it('works with regional indicators', function() {
		var str = '🇸🇴🇫🇷';
		expect(UtfString.findCharIndex(str, 0)).toEqual(0);
		expect(UtfString.findCharIndex(str, 1)).toEqual(0);
		expect(UtfString.findCharIndex(str, 2)).toEqual(0);
		expect(UtfString.findCharIndex(str, 3)).toEqual(0);
		expect(UtfString.findCharIndex(str, 4)).toEqual(1);
		expect(UtfString.findCharIndex(str, 5)).toEqual(1);
		expect(UtfString.findCharIndex(str, 6)).toEqual(1);
		expect(UtfString.findCharIndex(str, 7)).toEqual(1);
		expect(UtfString.findCharIndex(str, 8)).toEqual(-1);
		});

		it('works with a newline character', function() {
		@@ -51,0 +38,0 @@ var str = "\u{000D}\u{1F1E6}";

10

spec/index_of_spec.js

		@@ -37,12 +37,2 @@ var UtfString = require('../utfstring.js');

		it('works with regional indicators', function() {
		var str = '🇸🇴🇫🇷';
		expect(UtfString.indexOf(str, '🇸🇴')).toEqual(0);
		expect(UtfString.indexOf(str, '🇫🇷')).toEqual(1);
		expect(UtfString.indexOf(str, '🇸')).toEqual(0);
		expect(UtfString.indexOf(str, '🇴')).toEqual(0);
		expect(UtfString.indexOf(str, '🇫')).toEqual(1);
		expect(UtfString.indexOf(str, '🇷')).toEqual(1);
		});

		it('works with mixed characters', function() {
		@@ -49,0 +39,0 @@ var str = 'あaりbがc𤔣dとeうf';

8

spec/last_index_of_spec.js

		@@ -37,10 +37,2 @@ var UtfString = require('../utfstring.js');

		it('works with regional indicators', function() {
		var str = '🇫🇷🇸🇴🇫🇷';
		expect(UtfString.lastIndexOf(str, '🇫🇷')).toEqual(2);
		expect(UtfString.lastIndexOf(str, '🇫')).toEqual(2);
		expect(UtfString.lastIndexOf(str, '🇷')).toEqual(2);
		expect(UtfString.lastIndexOf(str, '🇸🇴')).toEqual(1);
		});

		it('works with mixed characters', function() {
		@@ -47,0 +39,0 @@ var str = 'あaりbがc𤔣dとeうf';

20

spec/length_spec.js

		@@ -31,3 +31,3 @@ var UtfString = require('../utfstring.js');
		var str = 'あaりbがc𤔣dとeうf🇫🇷g'
		expect(UtfString.length(str)).toEqual(14);
		expect(UtfString.length(str)).toEqual(15);
		});
		@@ -41,20 +41,2 @@

		it('correctly counts single regional indicator characters', function() {
		var str = '🇸'
		expect(str.length).toEqual(2);
		expect(UtfString.length(str)).toEqual(1);
		});

		it('correctly counts pairs of regional indicator characters', function() {
		var str = '🇸🇴'
		expect(str.length).toEqual(4);
		expect(UtfString.length(str)).toEqual(1);
		});

		it('correctly counts multiple pairs of regional indicator characters', function() {
		var str = '🇸🇴🇫🇷'
		expect(str.length).toEqual(8);
		expect(UtfString.length(str)).toEqual(2);
		});

		it('returns zero when the string is empty', function() {
		@@ -61,0 +43,0 @@ expect(UtfString.length('')).toEqual(0);

19

spec/slice_spec.js

		@@ -82,22 +82,3 @@ var UtfString = require('../utfstring.js');
		});

		describe('with regional indicators', function() {
		var str = '🇸🇴🇫🇷';

		it('works when given start and end indices', function() {
		expect(UtfString.slice(str, 0, 1)).toEqual('🇸🇴');
		expect(UtfString.slice(str, 1, 2)).toEqual('🇫🇷');
		});

		it('works when not given an end index', function() {
		expect(UtfString.slice(str, 0)).toEqual('🇸🇴🇫🇷');
		expect(UtfString.slice(str, 1)).toEqual('🇫🇷');
		});

		it('returns an empty string when given out-of-bounds indices', function() {
		expect(UtfString.slice(str, 4)).toEqual('');
		expect(UtfString.slice(str, 4, 5)).toEqual('');
		});
		});
		});
		});

5

spec/string_to_char_array_spec.js

		@@ -23,8 +23,3 @@ var UtfString = require('../utfstring.js');
		});

		it('works with regional indicators', function() {
		var str = '🇸🇴🇫🇷';
		expect(UtfString.stringToCharArray(str)).toEqual(['🇸🇴', '🇫🇷']);
		});
		});
		});

31

spec/substr_spec.js

		@@ -115,34 +115,3 @@ var UtfString = require('../utfstring.js');
		});

		describe('with regional indicators', function() {
		var str = '🇸🇴🇫🇷';

		it('works when given a start and a length', function() {
		expect(UtfString.substr(str, 0, 1)).toEqual('🇸🇴');
		expect(UtfString.substr(str, 1, 1)).toEqual('🇫🇷');
		});

		it('works when not given a length', function() {
		expect(UtfString.substr(str, 0)).toEqual('🇸🇴🇫🇷');
		expect(UtfString.substr(str, 1)).toEqual('🇫🇷');
		});

		it('returns an empty string if given an out-of-bounds start', function() {
		expect(UtfString.substr(str, 4, 1)).toEqual('');
		});

		it('returns up to the length of the string if given an out-of-bounds length', function() {
		expect(UtfString.substr(str, 1, 10)).toEqual('🇫🇷');
		});

		it('accepts a negative start value', function() {
		expect(UtfString.substr(str, -1, 1)).toEqual('🇫🇷');
		expect(UtfString.substr(str, -2, 1)).toEqual('🇸🇴');
		});

		it('returns an empty string if the negative start value is out-of-bounds', function() {
		expect(UtfString.substr(str, -3, 1)).toEqual('');
		});
		});
		});
		});

2

tester.js

		// Ad-hoc script: loads utfstring and prints the results of two index lookups.
		// NOTE(review): this requires the `utfstring` package by name, whereas the specs
		// require '../utfstring.js' — confirm which build this is meant to exercise.
		var utfstring = require('utfstring');
		// Character index for byte index 1 in the string U+000D followed by U+1F1E6.
		console.log(utfstring.findCharIndex("\u{000D}\u{1F1E6}", 1));
		// Byte index for character index 1 in the string U+0001 followed by U+1F1E6.
		console.log(utfstring.findByteIndex("\u{0001}\u{1F1E6}", 1));

433

utfstring.js

		@@ -14,288 +14,305 @@ (function() {

		UtfString.findCharIndex = function(string, byteIndex) {
		if (byteIndex >= string.length) {
		return -1;
		}
		function factory(obj, graphemeClusterRegexes) {
		obj.findCharIndex = function(string, byteIndex) {
		if (byteIndex >= string.length) {
		return -1;
		}

		// optimization: don't iterate unless necessary
		if (!containsUnsupportedCharacters(string)) {
		return byteIndex;
		}
		// optimization: don't iterate unless necessary
		if (!containsGraphemeClusterGroup(string)) {
		return byteIndex;
		}

		var regStr = unsupportedPairs.source + '\|[^]';
		var scanner = new RegExp(regStr, 'g');
		var charCount = 0;
		var scanner = createScanner();
		var charCount = 0;

		while (scanner.exec(string) !== null) {
		if (scanner.lastIndex > byteIndex) {
		break;
		while (scanner.exec(string) !== null) {
		if (scanner.lastIndex > byteIndex) {
		break;
		}

		charCount ++;
		}

		charCount ++;
		}
		return charCount;
		};

		return charCount;
		};
		obj.findByteIndex = function(string, charIndex) {
		if (charIndex >= this.length(string)) {
		return -1;
		}

		UtfString.findByteIndex = function(string, charIndex) {
		if (charIndex >= this.length(string)) {
		return -1;
		}
		return scan(string, createScanner(), charIndex);
		};

		return scan(string, createScanner(), charIndex);
		};
		obj.charAt = function(string, index) {
		var byteIndex = this.findByteIndex(string, index);

		UtfString.charAt = function(string, index) {
		var byteIndex = this.findByteIndex(string, index);
		if ((byteIndex < 0) \|\| (byteIndex >= string.length)) {
		return '';
		}

		if ((byteIndex < 0) \|\| (byteIndex >= string.length)) {
		return '';
		}
		var characters = string.slice(byteIndex, byteIndex + 8);
		var match = graphemeClusterRegex.exec(characters);

		var characters = string.slice(byteIndex, byteIndex + 8);
		var match = unsupportedPairs.exec(characters);
		if (match === null) {
		return characters[0];
		} else {
		return match[0];
		}
		};

		if (match === null) {
		return characters[0];
		} else {
		return match[0];
		}
		};
		obj.charCodeAt = function(string, index) {
		var byteIndex = findSurrogateByteIndex(string, index);

		UtfString.charCodeAt = function(string, index) {
		var byteIndex = findSurrogateByteIndex(string, index);
		if (byteIndex < 0) {
		return NaN;
		}

		if (byteIndex < 0) {
		return NaN;
		}
		var code = string.charCodeAt(byteIndex);

		var code = string.charCodeAt(byteIndex);
		if ((0xD800 <= code) && (code <= 0xDBFF)) {
		var hi = code;
		var low = string.charCodeAt(byteIndex + 1);
		return ((hi - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000;
		}

		if ((0xD800 <= code) && (code <= 0xDBFF)) {
		var hi = code;
		var low = string.charCodeAt(byteIndex + 1);
		return ((hi - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000;
		}
		return code;
		};

		return code;
		};
		obj.fromCharCode = function(charCode) {
		if (charCode > 0xFFFF) {
		charCode -= 0x10000;

		UtfString.fromCharCode = function(charCode) {
		if (charCode > 0xFFFF) {
		charCode -= 0x10000;
		return String.fromCharCode(
		0xD800 + (charCode >> 10), 0xDC00 + (charCode & 0x3FF)
		);
		} else {
		return String.fromCharCode(charCode);
		}
		};

		return String.fromCharCode(
		0xD800 + (charCode >> 10), 0xDC00 + (charCode & 0x3FF)
		);
		} else {
		return String.fromCharCode(charCode);
		}
		};
		obj.indexOf = function(string, searchValue, start) {
		if ((typeof start === 'undefined') \|\| (start === null)) {
		start = 0;
		}

		UtfString.indexOf = function(string, searchValue, start) {
		if ((typeof start === 'undefined') \|\| (start === null)) {
		start = 0;
		}
		var startByteIndex = this.findByteIndex(string, start);
		var index = string.indexOf(searchValue, startByteIndex);

		var startByteIndex = this.findByteIndex(string, start);
		var index = string.indexOf(searchValue, startByteIndex);
		if (index < 0) {
		return -1
		} else {
		return this.findCharIndex(string, index);
		}
		};

		if (index < 0) {
		return -1
		} else {
		return this.findCharIndex(string, index);
		}
		};
		obj.lastIndexOf = function(string, searchValue, start) {
		var index;

		UtfString.lastIndexOf = function(string, searchValue, start) {
		var index;
		if ((typeof start === 'undefined') \|\| (start === null)) {
		index = string.lastIndexOf(searchValue);
		} else {
		var startByteIndex = this.findByteIndex(string, start);
		index = string.lastIndexOf(searchValue, startByteIndex);
		}

		if ((typeof start === 'undefined') \|\| (start === null)) {
		index = string.lastIndexOf(searchValue);
		} else {
		if (index < 0) {
		return -1;
		} else {
		return this.findCharIndex(string, index);
		}
		};

		obj.slice = function(string, start, finish) {
		var startByteIndex = this.findByteIndex(string, start);
		index = string.lastIndexOf(searchValue, startByteIndex);
		}
		var finishByteIndex;

		if (index < 0) {
		return -1;
		} else {
		return this.findCharIndex(string, index);
		}
		};
		if (startByteIndex < 0) {
		startByteIndex = string.length;
		}

		UtfString.slice = function(string, start, finish) {
		var startByteIndex = this.findByteIndex(string, start);
		var finishByteIndex;
		if ((typeof finish === 'undefined') \|\| (finish === null)) {
		finishByteIndex = string.length;
		} else {
		finishByteIndex = this.findByteIndex(string, finish);

		if (startByteIndex < 0) {
		startByteIndex = string.length;
		}
		if (finishByteIndex < 0) {
		finishByteIndex = string.length;
		}
		}

		if ((typeof finish === 'undefined') \|\| (finish === null)) {
		finishByteIndex = string.length;
		} else {
		finishByteIndex = this.findByteIndex(string, finish);
		return string.slice(startByteIndex, finishByteIndex);
		};

		if (finishByteIndex < 0) {
		finishByteIndex = string.length;
		obj.substr = function(string, start, length) {
		if (start < 0) {
		start = this.length(string) + start;
		}
		}

		return string.slice(startByteIndex, finishByteIndex);
		};
		if ((typeof length === 'undefined') \|\| (length === null)) {
		return this.slice(string, start);
		} else {
		return this.slice(string, start, start + length);
		}
		};

		UtfString.substr = function(string, start, length) {
		if (start < 0) {
		start = this.length(string) + start;
		}
		// they do the same thing
		obj.substring = obj.slice;

		if ((typeof length === 'undefined') \|\| (length === null)) {
		return this.slice(string, start);
		} else {
		return this.slice(string, start, start + length);
		}
		};
		obj.length = function(string) {
		// findCharIndex will return -1 if string is empty, so add 1
		return this.findCharIndex(string, string.length - 1) + 1;
		};

		// they do the same thing
		UtfString.substring = UtfString.slice;
		obj.stringToCodePoints = function(string) {
		var result = [];

		UtfString.length = function(string) {
		// findCharIndex will return -1 if string is empty, so add 1
		return this.findCharIndex(string, string.length - 1) + 1;
		};
		for (var i = 0; i < string.length; i ++) {
		codePoint = this.charCodeAt(string, i);

		UtfString.stringToCodePoints = function(string) {
		var result = [];
		if (!codePoint) {
		break;
		}

		for (var i = 0; i < string.length; i ++) {
		codePoint = this.charCodeAt(string, i);

		if (!codePoint) {
		break;
		result.push(codePoint);
		}

		result.push(codePoint);
		}
		return result;
		};

		return result;
		};
		obj.codePointsToString = function(arr) {
		var chars = [];

		UtfString.codePointsToString = function(arr) {
		var chars = [];
		for (var i = 0; i < arr.length; i ++) {
		chars.push(this.fromCharCode(arr[i]));
		}

		for (var i = 0; i < arr.length; i ++) {
		chars.push(this.fromCharCode(arr[i]));
		}
		return chars.join('');
		};

		return chars.join('');
		};
		obj.stringToBytes = function(string) {
		var result = [];

		UtfString.stringToBytes = function(string) {
		var result = [];
		for (var i = 0; i < string.length; i ++) {
		var chr = string.charCodeAt(i);
		var byteArray = [];

		for (var i = 0; i < string.length; i ++) {
		var chr = string.charCodeAt(i);
		var byteArray = [];
		while (chr > 0) {
		byteArray.push(chr & 0xFF);
		chr >>= 8;
		}

		while (chr > 0) {
		byteArray.push(chr & 0xFF);
		chr >>= 8;
		// all utf-16 characters are two bytes
		if (byteArray.length == 1) {
		byteArray.push(0);
		}

		// assume big-endian
		result = result.concat(byteArray.reverse());
		}

		// all utf-16 characters are two bytes
		if (byteArray.length == 1) {
		byteArray.push(0);
		return result;
		};

		obj.bytesToString = function(arr) {
		var result = [];

		for (var i = 0; i < arr.length; i += 2) {
		var hi = arr[i];
		var low = arr[i + 1];
		var combined = (hi << 8) \| low;
		result.push(String.fromCharCode(combined));
		}

		// assume big-endian
		result = result.concat(byteArray.reverse());
		}
		return result.join('');
		};

		return result;
		};
		obj.stringToCharArray = function(string) {
		var result = [];
		var scanner = createScanner();

		UtfString.bytesToString = function(arr) {
		var result = [];
		do {
		var match = scanner.exec(string);

		for (var i = 0; i < arr.length; i += 2) {
		var hi = arr[i];
		var low = arr[i + 1];
		var combined = (hi << 8) \| low;
		result.push(String.fromCharCode(combined));
		}
		if (match === null) {
		break;
		}

		return result.join('');
		};
		result.push(match[0]);
		} while(match !== null);

		UtfString.stringToCharArray = function(string) {
		var result = [];
		var regStr = unsupportedPairs.source + '\|.';
		var scanner = new RegExp(regStr, 'g');
		return result;
		};

		do {
		var match = scanner.exec(string);
		function findSurrogateByteIndex(string, charIndex) {
		return scan(string, new RegExp(surrogatePairs.source, 'g'), charIndex);
		}

		if (match === null) {
		break;
		function scan(string, scanner, charIndex) {
		// optimization: don't iterate unless it's necessary
		if (!containsGraphemeClusterGroup(string)) {
		return charIndex;
		}

		result.push(match[0]);
		} while(match !== null);
		var byteIndex = 0;
		var charCount = 0;

		return result;
		};
		do {
		var match = scanner.exec(string);

		function findSurrogateByteIndex(string, charIndex) {
		return scan(string, new RegExp(surrogatePairs.source, 'g'), charIndex);
		}
		if (match === null) {
		break;
		}

		function scan(string, scanner, charIndex) {
		// optimization: don't iterate unless it's necessary
		if (!containsUnsupportedCharacters(string)) {
		return charIndex;
		if (charCount < charIndex) {
		byteIndex += match[0].length;
		charCount ++;
		} else {
		break;
		}
		} while (match !== null);

		if (byteIndex >= string.length) {
		return -1;
		}

		return byteIndex;
		}

		var byteIndex = 0;
		var charCount = 0;
		function containsGraphemeClusterGroup(string) {
		return graphemeClusterRegex.test(string);
		}

		do {
		var match = scanner.exec(string);

		if (match === null) {
		break;
		function createScanner(extraSources, modifiers) {
		if (extraSources == undefined) {
		extraSources = ['[^]'];
		}

		if (charCount < charIndex) {
		byteIndex += match[0].length;
		charCount ++;
		} else {
		break;
		if (modifiers == undefined) {
		modifiers = 'g';
		}
		} while (match !== null);

		if (byteIndex >= string.length) {
		return -1;
		}
		var sources = [];

		return byteIndex;
		}
		graphemeClusterRegexes.forEach(function(re) {
		sources.push(re.source);
		});

		function containsUnsupportedCharacters(string) {
		return unsupportedPairs.test(string);
		}
		sources.push(surrogatePairs.source);
		sources = sources.concat(extraSources);

		function createScanner(modifiers) {
		if ((typeof modifiers === 'undefined') \|\| (modifiers === null)) {
		modifiers = '';
		return new RegExp(sources.join('\|'), modifiers);
		}

		var regStr = [regionalIndicatorPairs.source, surrogatePairs.source].join('\|');
		return new RegExp(regStr, modifiers);
		var surrogatePairs = /[\uD800-\uDBFF][\uDC00-\uDFFF]/;
		var graphemeClusterRegex = createScanner([], '');
		}

		// "unsupported" means "handled incorrectly by javascript"
		var surrogatePairs = /[\uD800-\uDBFF][\uDC00-\uDFFF]/;
		var regionalIndicatorPairs = /\uD83C[\uDDE6-\uDDFF]\uD83C[\uDDE6-\uDDFF]/;
		var unsupportedPairs = createScanner();

		UtfString.visual = {};

		factory(UtfString, []);
		factory(UtfString.visual, [regionalIndicatorPairs]);
		})();

utfstring - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics