graphemesplit
Advanced tools
Comparing version 2.0.1 to 2.0.3
120
generate.js
@@ -1,9 +0,9 @@ | ||
const fs = require('fs') | ||
const https = require('https') | ||
const stream = require('stream') | ||
const fs = require("fs"); | ||
const https = require("https"); | ||
const stream = require("stream"); | ||
const linesStream = require('@orisano/lines-stream') | ||
const UnicodeTrieBuilder = require('unicode-trie/builder') | ||
const linesStream = require("@orisano/lines-stream"); | ||
const UnicodeTrieBuilder = require("unicode-trie/builder"); | ||
const types = require('./types') | ||
const types = require("./types"); | ||
@@ -16,63 +16,73 @@ function parseLine() { | ||
transform(line, encoding, callback) { | ||
const body = line.split('#')[0] | ||
const body = line.split("#")[0]; | ||
if (body.trim().length === 0) { | ||
callback() | ||
return | ||
callback(); | ||
return; | ||
} | ||
const [rawRange, type] = body.split(';').map(x => x.trim()) | ||
const range = rawRange.split('..').map(x => parseInt(x, 16)) | ||
const [rawRange, type] = body.split(";").map(x => x.trim()); | ||
const range = rawRange.split("..").map(x => parseInt(x, 16)); | ||
if (range.length > 1) { | ||
this.push({start: range[0], end: range[1], type}) | ||
this.push({ start: range[0], end: range[1], type }); | ||
} else { | ||
this.push({start: range[0], end: range[0], type}) | ||
this.push({ start: range[0], end: range[0], type }); | ||
} | ||
callback() | ||
}, | ||
}) | ||
callback(); | ||
} | ||
}); | ||
} | ||
https.get('https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt', res => { | ||
const {statusCode} = res | ||
if (statusCode !== 200) { | ||
console.error(`failed to request: ${statusCode}`) | ||
res.resume() | ||
return | ||
https.get( | ||
"https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt", | ||
res => { | ||
const { statusCode } = res; | ||
if (statusCode !== 200) { | ||
console.error(`failed to request: ${statusCode}`); | ||
res.resume(); | ||
return; | ||
} | ||
const trie = new UnicodeTrieBuilder(types.Other); | ||
res | ||
.setEncoding("utf8") | ||
.pipe(linesStream()) | ||
.pipe(parseLine()) | ||
.on("data", ({ start, end, type }) => { | ||
trie.setRange(start, end, types[type]); | ||
}) | ||
.on("end", () => { | ||
fs.writeFileSync( | ||
"./typeTrie.json", | ||
JSON.stringify({ | ||
data: trie.toBuffer().toString("base64") | ||
}) | ||
); | ||
}); | ||
} | ||
); | ||
const trie = new UnicodeTrieBuilder(types.Other) | ||
res | ||
.setEncoding('utf8') | ||
.pipe(linesStream()) | ||
.pipe(parseLine()) | ||
.on('data', ({start, end, type}) => { | ||
trie.setRange(start, end, types[type]) | ||
}) | ||
.on('end', () => { | ||
fs.writeFileSync('./typeTrie.json', JSON.stringify({ | ||
data: trie.toBuffer().toString('base64'), | ||
})) | ||
}) | ||
}) | ||
https.get('https://unicode.org/Public/emoji/11.0/emoji-data.txt', res => { | ||
const {statusCode} = res | ||
https.get("https://unicode.org/Public/emoji/11.0/emoji-data.txt", res => { | ||
const { statusCode } = res; | ||
if (statusCode !== 200) { | ||
console.error(`failed to request: ${statusCode}`) | ||
res.resume() | ||
return | ||
console.error(`failed to request: ${statusCode}`); | ||
res.resume(); | ||
return; | ||
} | ||
const trie = new UnicodeTrieBuilder() | ||
const trie = new UnicodeTrieBuilder(); | ||
res | ||
.setEncoding('utf8') | ||
.pipe(linesStream()) | ||
.pipe(parseLine()) | ||
.on('data', ({start, end, type}) => { | ||
if (type === 'Extended_Pictographic') trie.setRange(start, end, types.Extended_Pictographic) | ||
}) | ||
.on('end', () => { | ||
fs.writeFileSync('./extPict.json', JSON.stringify({ | ||
data: trie.toBuffer().toString('base64'), | ||
})) | ||
}) | ||
}) | ||
.setEncoding("utf8") | ||
.pipe(linesStream()) | ||
.pipe(parseLine()) | ||
.on("data", ({ start, end, type }) => { | ||
if (type === "Extended_Pictographic") | ||
trie.setRange(start, end, types.Extended_Pictographic); | ||
}) | ||
.on("end", () => { | ||
fs.writeFileSync( | ||
"./extPict.json", | ||
JSON.stringify({ | ||
data: trie.toBuffer().toString("base64") | ||
}) | ||
); | ||
}); | ||
}); |
115
index.js
@@ -1,12 +0,12 @@ | ||
const types = require('./types') | ||
const typeTrieData = require('./typeTrie').data | ||
const extPictData = require('./extPict').data | ||
const types = require("./types"); | ||
const typeTrieData = require("./typeTrie").data; | ||
const extPictData = require("./extPict").data; | ||
const UnicodeTrie = require('unicode-trie') | ||
const UnicodeTrie = require("unicode-trie"); | ||
const typeTrie = new UnicodeTrie(Buffer.from(typeTrieData, 'base64')) | ||
const extPict = new UnicodeTrie(Buffer.from(extPictData, 'base64')) | ||
const typeTrie = new UnicodeTrie(Buffer.from(typeTrieData, "base64")); | ||
const extPict = new UnicodeTrie(Buffer.from(extPictData, "base64")); | ||
function is(type, bit) { | ||
return (type & bit) !== 0 | ||
return (type & bit) !== 0; | ||
} | ||
@@ -17,9 +17,9 @@ | ||
ExtendOrZWJ: 1, | ||
NotBoundary: 2, | ||
} | ||
NotBoundary: 2 | ||
}; | ||
function nextGraphemeClusterSize(ts, start) { | ||
const L = ts.length | ||
const L = ts.length; | ||
let ri = 0 | ||
let ri = 0; | ||
let gb11State = GB11State.Initial; | ||
@@ -29,8 +29,8 @@ | ||
for (let i = start; i + 1 < L; i++) { | ||
const curr = ts[i + 0] | ||
const next = ts[i + 1] | ||
const curr = ts[i + 0]; | ||
const next = ts[i + 1]; | ||
// for GB12, GB13 | ||
if (!is(curr, types.Regional_Indicator)) { | ||
ri = 0 | ||
ri = 0; | ||
} | ||
@@ -43,16 +43,19 @@ | ||
if (is(curr, types.Extended_Pictographic)) { | ||
gb11State = GB11State.ExtendOrZWJ | ||
gb11State = GB11State.ExtendOrZWJ; | ||
} else { | ||
gb11State = GB11State.Initial | ||
gb11State = GB11State.Initial; | ||
} | ||
break | ||
break; | ||
case GB11State.ExtendOrZWJ: | ||
if (is(curr, types.Extend)) { | ||
gb11State = GB11State.ExtendOrZWJ | ||
} else if (is(curr, types.ZWJ) && is(next, types.Extended_Pictographic)) { | ||
gb11State = GB11State.NotBoundary | ||
gb11State = GB11State.ExtendOrZWJ; | ||
} else if ( | ||
is(curr, types.ZWJ) && | ||
is(next, types.Extended_Pictographic) | ||
) { | ||
gb11State = GB11State.NotBoundary; | ||
} else { | ||
gb11State = GB11State.Initial | ||
gb11State = GB11State.Initial; | ||
} | ||
break | ||
break; | ||
} | ||
@@ -62,65 +65,81 @@ | ||
if (is(curr, types.CR) && is(next, types.LF)) { | ||
continue | ||
continue; | ||
} | ||
// GB4: (Control | CR | LF) ÷ | ||
if (is(curr, types.Control | types.CR | types.LF)) { | ||
return (i + 1) - start | ||
return i + 1 - start; | ||
} | ||
// GB5: ÷ (Control | CR | LF) | ||
if (is(next, types.Control | types.CR | types.LF)) { | ||
return (i + 1) - start | ||
return i + 1 - start; | ||
} | ||
// GB6: L x (L | V | LV | LVT) | ||
if (is(curr, types.L) && is(next, types.L | types.V | types.LV | types.LVT)) { | ||
continue | ||
if ( | ||
is(curr, types.L) && | ||
is(next, types.L | types.V | types.LV | types.LVT) | ||
) { | ||
continue; | ||
} | ||
// GB7: (LV | V) x (V | T) | ||
if (is(curr, types.LV | types.V) && is(next, types.V | types.T)) { | ||
continue | ||
continue; | ||
} | ||
// GB8: (LVT | T) x T | ||
if (is(curr, types.LVT | types.T) && is(next, types.T)) { | ||
continue | ||
continue; | ||
} | ||
// GB9: x (Extend | ZWJ) | ||
if (is(next, types.Extend | types.ZWJ)) { | ||
continue | ||
continue; | ||
} | ||
// GB9a: x SpacingMark | ||
if (is(next, types.SpacingMark)) { | ||
continue | ||
continue; | ||
} | ||
// GB9b: Prepend x | ||
if (is(curr, types.Prepend)) { | ||
continue | ||
continue; | ||
} | ||
// GB11: \p{Extended_Pictographic} Extend* ZWJ x \p{Extended_Pictographic} | ||
if (gb11State === GB11State.NotBoundary) { | ||
continue | ||
continue; | ||
} | ||
// GB12: sot (RI RI)* RI x RI | ||
// GB13: [^RI] (RI RI)* RI x RI | ||
if (is(curr, types.Regional_Indicator) && is(next, types.Regional_Indicator) && ri % 2 === 0) { | ||
ri++ | ||
continue | ||
if ( | ||
is(curr, types.Regional_Indicator) && | ||
is(next, types.Regional_Indicator) && | ||
ri % 2 === 0 | ||
) { | ||
ri++; | ||
continue; | ||
} | ||
// GB999: Any ÷ Any | ||
return (i + 1) - start | ||
return i + 1 - start; | ||
} | ||
// GB2: Any ÷ eot | ||
return L - start | ||
return L - start; | ||
} | ||
module.exports = function split(str) { | ||
const graphemeClusters = [] | ||
const graphemeClusters = []; | ||
const codePoints = [...str].map(x => x.codePointAt(0)) | ||
const ts = codePoints.map(c => typeTrie.get(c) | extPict.get(c)) | ||
for (let offset = 0; offset < codePoints.length;) { | ||
const size = nextGraphemeClusterSize(ts, offset) | ||
const graphemeCluster = codePoints.slice(offset, offset + size) | ||
graphemeClusters.push(String.fromCodePoint(...graphemeCluster)) | ||
offset += size | ||
const map = [0]; | ||
const ts = []; | ||
for (let i = 0; i < str.length; ) { | ||
const code = str.codePointAt(i); | ||
ts.push(typeTrie.get(code) | extPict.get(code)); | ||
i += code > 65535 ? 2 : 1; | ||
map.push(i); | ||
} | ||
return graphemeClusters | ||
} | ||
for (let offset = 0; offset < ts.length; ) { | ||
const size = nextGraphemeClusterSize(ts, offset); | ||
const start = map[offset]; | ||
const end = map[offset + size]; | ||
graphemeClusters.push(str.slice(start, end)); | ||
offset += size; | ||
} | ||
return graphemeClusters; | ||
}; |
{ | ||
"name": "graphemesplit", | ||
"version": "2.0.1", | ||
"version": "2.0.3", | ||
"main": "index.js", | ||
@@ -8,4 +8,8 @@ "author": "Nao YONASHIRO", | ||
"repository": "github:orisano/graphemesplit", | ||
"scripts": { | ||
"fmt": "prettier --write **/*.js" | ||
}, | ||
"devDependencies": { | ||
"@orisano/lines-stream": "^0.1.2" | ||
"@orisano/lines-stream": "^0.1.2", | ||
"prettier": "^1.16.4" | ||
}, | ||
@@ -12,0 +16,0 @@ "dependencies": { |
88
test.js
@@ -1,44 +0,54 @@ | ||
const assert = require('assert') | ||
const https = require('https') | ||
const stream = require('stream') | ||
const assert = require("assert"); | ||
const https = require("https"); | ||
const stream = require("stream"); | ||
const linesStream = require('@orisano/lines-stream') | ||
const linesStream = require("@orisano/lines-stream"); | ||
const graphemesplit = require('./') | ||
const graphemesplit = require("./"); | ||
https.get('https://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakTest.txt', res => { | ||
const {statusCode} = res | ||
if (statusCode !== 200) { | ||
console.error(`failed to http request: ${statusCode}`) | ||
res.resume() | ||
return | ||
} | ||
res | ||
.pipe(linesStream()) | ||
.pipe(new stream.Transform({ | ||
decodeStrings: false, | ||
readableObjectMode: true, | ||
https.get( | ||
"https://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakTest.txt", | ||
res => { | ||
const { statusCode } = res; | ||
if (statusCode !== 200) { | ||
console.error(`failed to http request: ${statusCode}`); | ||
res.resume(); | ||
return; | ||
} | ||
res | ||
.pipe(linesStream()) | ||
.pipe( | ||
new stream.Transform({ | ||
decodeStrings: false, | ||
readableObjectMode: true, | ||
transform(line, encoding, callback) { | ||
callback() | ||
transform(line, encoding, callback) { | ||
callback(); | ||
if (line.trim().length === 0) return | ||
const [body, description] = line.split('#') | ||
const test = body.trim() | ||
if (test.length === 0) return | ||
const graphemeClusters = test | ||
.split('÷') | ||
.filter(x => x.length > 0) | ||
.map(x => { | ||
const codePoints = x.split('×') | ||
.map(y => parseInt(y.trim(), 16)) | ||
return String.fromCodePoint(...codePoints) | ||
}) | ||
this.push({expected: graphemeClusters, description}) | ||
} | ||
})) | ||
.on('data', ({expected, description}) => { | ||
const got = graphemesplit(expected.join('')) | ||
assert.deepStrictEqual(got, expected, `unexpected grapheme clusters. expected: ${expected}, but got: ${got} # ${description}`) | ||
}) | ||
}) | ||
if (line.trim().length === 0) return; | ||
const [body, description] = line.split("#"); | ||
const test = body.trim(); | ||
if (test.length === 0) return; | ||
const graphemeClusters = test | ||
.split("÷") | ||
.filter(x => x.length > 0) | ||
.map(x => { | ||
const codePoints = x | ||
.split("×") | ||
.map(y => parseInt(y.trim(), 16)); | ||
return String.fromCodePoint(...codePoints); | ||
}); | ||
this.push({ expected: graphemeClusters, description }); | ||
} | ||
}) | ||
) | ||
.on("data", ({ expected, description }) => { | ||
const got = graphemesplit(expected.join("")); | ||
assert.deepStrictEqual( | ||
got, | ||
expected, | ||
`unexpected grapheme clusters. expected: ${expected}, but got: ${got} # ${description}` | ||
); | ||
}); | ||
} | ||
); |
@@ -16,3 +16,3 @@ module.exports = { | ||
LVT: 1 << 12, | ||
Extended_Pictographic: 1 << 13, | ||
} | ||
Extended_Pictographic: 1 << 13 | ||
}; |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
Network access
Supply chain risk: This module accesses the network.
Found 1 instance in 1 package
Debug access
Supply chain risk: Uses debug, reflection and dynamic code execution features.
Found 1 instance in 1 package
Filesystem access
Supply chain risk: Accesses the file system, and could potentially read sensitive data.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Network access
Supply chain risk: This module accesses the network.
Found 1 instance in 1 package
Filesystem access
Supply chain risk: Accesses the file system, and could potentially read sensitive data.
Found 1 instance in 1 package
294
14216
2
10
3
3