@orchidjs/unicode-variants
Advanced tools
Comparing version 1.0.1 to 1.0.2
@@ -13,13 +13,11 @@ /*! @orchidjs/unicode-variants | https://github.com/orchidjs/unicode-variants | Apache License (v2) */ | ||
*/ | ||
const arrayToPattern = (chars) =>{ | ||
const arrayToPattern = chars => { | ||
chars = chars.filter(Boolean); | ||
chars = chars.filter( Boolean ); | ||
if (chars.length < 2) { | ||
return chars[0] || ''; | ||
} | ||
if( chars.length < 2 ){ | ||
return chars[0] || ''; | ||
} | ||
return (maxValueLength(chars) == 1) ? '['+chars.join('')+']' : '(?:'+chars.join('|')+')'; | ||
return maxValueLength(chars) == 1 ? '[' + chars.join('') + ']' : '(?:' + chars.join('|') + ')'; | ||
}; | ||
/** | ||
@@ -29,38 +27,30 @@ * @param {string[]} array | ||
*/ | ||
const sequencePattern = (array)=>{ | ||
if( !hasDuplicates(array) ){ | ||
return array.join(''); | ||
} | ||
const sequencePattern = array => { | ||
if (!hasDuplicates(array)) { | ||
return array.join(''); | ||
} | ||
let pattern = ''; | ||
let prev_char_count = 0; | ||
let pattern = ''; | ||
let prev_char_count = 0; | ||
const prev_pattern = ()=>{ | ||
if( prev_char_count > 1 ){ | ||
pattern += '{'+prev_char_count+'}'; | ||
} | ||
}; | ||
const prev_pattern = () => { | ||
if (prev_char_count > 1) { | ||
pattern += '{' + prev_char_count + '}'; | ||
} | ||
}; | ||
array.forEach((char,i)=>{ | ||
array.forEach((char, i) => { | ||
if (char === array[i - 1]) { | ||
prev_char_count++; | ||
return; | ||
} | ||
if( char === array[i-1] ){ | ||
prev_char_count++; | ||
return; | ||
} | ||
prev_pattern(); | ||
pattern += char; | ||
prev_char_count = 1; | ||
}); | ||
prev_pattern(); | ||
return pattern; | ||
prev_pattern(); | ||
pattern += char; | ||
prev_char_count = 1; | ||
}); | ||
prev_pattern(); | ||
return pattern; | ||
}; | ||
/** | ||
@@ -73,9 +63,7 @@ * Convert array of strings to a regular expression | ||
*/ | ||
const setToPattern = (chars)=>{ | ||
let array = toArray(chars); | ||
return arrayToPattern(array); | ||
const setToPattern = chars => { | ||
let array = toArray(chars); | ||
return arrayToPattern(array); | ||
}; | ||
/** | ||
@@ -86,7 +74,6 @@ * | ||
*/ | ||
const hasDuplicates = (array) => { | ||
return (new Set(array)).size !== array.length; | ||
const hasDuplicates = array => { | ||
return new Set(array).size !== array.length; | ||
}; | ||
/** | ||
@@ -97,6 +84,6 @@ * https://stackoverflow.com/questions/63006601/why-does-u-throw-an-invalid-escape-error | ||
*/ | ||
const escape_regex = (str) => { | ||
return (str + '').replace(/([\$\(\)\*\+\.\?\[\]\^\{\|\}\\])/gu, '\\$1'); | ||
const escape_regex = str => { | ||
return (str + '').replace(/([\$\(\)\*\+\.\?\[\]\^\{\|\}\\])/gu, '\\$1'); | ||
}; | ||
/** | ||
@@ -107,14 +94,13 @@ * Return the max length of array values | ||
*/ | ||
const maxValueLength = (array) => { | ||
return array.reduce( (longest, value) => Math.max(longest,unicodeLength(value)),0); | ||
const maxValueLength = array => { | ||
return array.reduce((longest, value) => Math.max(longest, unicodeLength(value)), 0); | ||
}; | ||
/** | ||
* @param {string} str | ||
*/ | ||
const unicodeLength = (str) => { | ||
return toArray(str).length; | ||
const unicodeLength = str => { | ||
return toArray(str).length; | ||
}; | ||
/** | ||
@@ -124,4 +110,5 @@ * @param {any} p | ||
*/ | ||
const toArray = (p) => Array.from(p); | ||
const toArray = p => Array.from(p); | ||
/** | ||
@@ -133,51 +120,49 @@ * Get all possible combinations of substrings that add up to the given string | ||
*/ | ||
const allSubstrings = (input) => { | ||
const allSubstrings = input => { | ||
if (input.length === 1) return [[input]]; | ||
/** @type {string[][]} */ | ||
if( input.length === 1) return [[input]]; | ||
/** @type {string[][]} */ | ||
let result = []; | ||
const start = input.substring(1); | ||
const suba = allSubstrings(start); | ||
suba.forEach(function(subresult) { | ||
let tmp = subresult.slice(0); | ||
tmp[0] = input.charAt(0) + tmp[0]; | ||
result.push(tmp); | ||
tmp = subresult.slice(0); | ||
tmp.unshift(input.charAt(0)); | ||
result.push(tmp); | ||
}); | ||
return result; | ||
let result = []; | ||
const start = input.substring(1); | ||
const suba = allSubstrings(start); | ||
suba.forEach(function (subresult) { | ||
let tmp = subresult.slice(0); | ||
tmp[0] = input.charAt(0) + tmp[0]; | ||
result.push(tmp); | ||
tmp = subresult.slice(0); | ||
tmp.unshift(input.charAt(0)); | ||
result.push(tmp); | ||
}); | ||
return result; | ||
}; | ||
/** | ||
* @typedef {{[key:string]:string}} TUnicodeMap | ||
* @typedef {{[key:string]:Set<string>}} TUnicodeSets | ||
* @typedef {[[number,number]]} TCodePoints | ||
* @typedef {{folded:string,composed:string,code_point:number}} TCodePointObj | ||
* @typedef {{start:number,end:number,length:number,substr:string}} TSequencePart | ||
*/ | ||
/** @type {TCodePoints} */ | ||
const code_points = [[ 0, 65535 ]]; | ||
const code_points = [[0, 65535]]; | ||
const accent_pat = '[\u0300-\u036F\u{b7}\u{2be}]'; // \u{2bc} | ||
/** @type {TUnicodeMap} */ | ||
exports.unicode_map = void 0; | ||
/** @type {RegExp} */ | ||
/** @type {RegExp} */ | ||
let multi_char_reg; | ||
const max_char_length = 3; | ||
/** @type {TUnicodeMap} */ | ||
/** @type {TUnicodeMap} */ | ||
const latin_convert = { | ||
'æ': 'ae', | ||
'ⱥ': 'a', | ||
'ø': 'o', | ||
'⁄': '/', | ||
'∕': '/', | ||
'æ': 'ae', | ||
'ⱥ': 'a', | ||
'ø': 'o', | ||
'⁄': '/', | ||
'∕': '/' | ||
}; | ||
const convert_pat = new RegExp(Object.keys(latin_convert).join('|')+'|'+accent_pat,'gu'); | ||
const convert_pat = new RegExp(Object.keys(latin_convert).join('|') + '|' + accent_pat, 'gu'); | ||
/** | ||
@@ -188,8 +173,7 @@ * Initialize the unicode_map from the give code point ranges | ||
*/ | ||
const initialize = (_code_points) => { | ||
if( exports.unicode_map !== undefined ) return; | ||
exports.unicode_map = generateMap(_code_points || code_points ); | ||
const initialize = _code_points => { | ||
if (exports.unicode_map !== undefined) return; | ||
exports.unicode_map = generateMap(_code_points || code_points); | ||
}; | ||
/** | ||
@@ -201,5 +185,4 @@ * Helper method for normalize a string | ||
*/ | ||
const normalize = (str,form='NFKD') => str.normalize(form); | ||
const normalize = (str, form = 'NFKD') => str.normalize(form); | ||
/** | ||
@@ -210,23 +193,17 @@ * Compatibility Decomposition without reordering string | ||
*/ | ||
const decompose = (str) =>{ | ||
if( str.match(/[\u0f71-\u0f81]/) ){ | ||
return toArray(str).reduce( | ||
/** | ||
* @param {string} result | ||
* @param {string} char | ||
*/ | ||
(result, char) =>{ | ||
return result + normalize(char) | ||
}, | ||
'' | ||
); | ||
} | ||
const decompose = str => { | ||
if (str.match(/[\u0f71-\u0f81]/)) { | ||
return toArray(str).reduce( | ||
/** | ||
* @param {string} result | ||
* @param {string} char | ||
*/ | ||
(result, char) => { | ||
return result + normalize(char); | ||
}, ''); | ||
} | ||
return normalize(str); | ||
return normalize(str); | ||
}; | ||
/** | ||
@@ -238,15 +215,10 @@ * Remove accents | ||
*/ | ||
const asciifold = (str) => { | ||
return decompose(str) | ||
.toLowerCase() | ||
.replace(convert_pat,(/** @type {string} */ char) => { | ||
return latin_convert[char] || ''; | ||
}); | ||
const asciifold = str => { | ||
return decompose(str).toLowerCase().replace(convert_pat, ( | ||
/** @type {string} */ | ||
char) => { | ||
return latin_convert[char] || ''; | ||
}); | ||
}; | ||
/** | ||
@@ -257,42 +229,41 @@ * Generate a list of unicode variants from the list of code points | ||
*/ | ||
function* generator(code_points){ | ||
for(const [code_point_min, code_point_max] of code_points){ | ||
for(let i = code_point_min; i <= code_point_max; i++){ | ||
function* generator(code_points) { | ||
for (const [code_point_min, code_point_max] of code_points) { | ||
for (let i = code_point_min; i <= code_point_max; i++) { | ||
let composed = String.fromCharCode(i); | ||
let folded = asciifold(composed); | ||
let composed = String.fromCharCode(i); | ||
let folded = asciifold(composed); | ||
if (folded == composed.toLowerCase()) { | ||
continue; | ||
} // skip when folded is a string longer than 3 characters long | ||
// bc the resulting regex patterns will be long | ||
// eg: | ||
// folded صلى الله عليه وسلم length 18 code point 65018 | ||
// folded جل جلاله length 8 code point 65019 | ||
if( folded == composed.toLowerCase() ){ | ||
continue; | ||
} | ||
if (folded.length > max_char_length) { | ||
continue; | ||
} | ||
// skip when folded is a string longer than 3 characters long | ||
// bc the resulting regex patterns will be long | ||
// eg: | ||
// folded صلى الله عليه وسلم length 18 code point 65018 | ||
// folded جل جلاله length 8 code point 65019 | ||
if( folded.length > max_char_length ){ | ||
continue; | ||
} | ||
if (folded.length == 0) { | ||
continue; | ||
} | ||
if( folded.length == 0 ){ | ||
continue | ||
} | ||
let decomposed = normalize(composed); | ||
let recomposed = normalize(decomposed, 'NFC'); | ||
let decomposed = normalize(composed); | ||
let recomposed = normalize(decomposed,'NFC'); | ||
if (recomposed === composed && folded === decomposed) { | ||
continue; | ||
} | ||
if( recomposed === composed && folded === decomposed ){ | ||
continue; | ||
} | ||
yield {folded:folded,composed:composed,code_point:i}; | ||
} | ||
} | ||
yield { | ||
folded: folded, | ||
composed: composed, | ||
code_point: i | ||
}; | ||
} | ||
} | ||
} | ||
/** | ||
@@ -303,35 +274,31 @@ * Generate a unicode map from the list of code points | ||
*/ | ||
const generateSets = (code_points) => { | ||
/** @type {{[key:string]:Set<string>}} */ | ||
const unicode_sets = {}; | ||
const generateSets = code_points => { | ||
/** @type {{[key:string]:Set<string>}} */ | ||
const unicode_sets = {}; | ||
/** | ||
* @param {string} folded | ||
* @param {string} to_add | ||
*/ | ||
const addMatching = (folded, to_add) => { | ||
/** @type {Set<string>} */ | ||
const folded_set = unicode_sets[folded] || new Set(); | ||
const patt = new RegExp('^' + setToPattern(folded_set) + '$', 'iu'); | ||
/** | ||
* @param {string} folded | ||
* @param {string} to_add | ||
*/ | ||
const addMatching = (folded,to_add) => { | ||
if (to_add.match(patt)) { | ||
return; | ||
} | ||
/** @type {Set<string>} */ | ||
const folded_set = unicode_sets[folded] || new Set(); | ||
folded_set.add(escape_regex(to_add)); | ||
unicode_sets[folded] = folded_set; | ||
}; | ||
const patt = new RegExp( '^'+setToPattern(folded_set)+'$','iu'); | ||
if( to_add.match(patt) ){ | ||
return; | ||
} | ||
for (let value of generator(code_points)) { | ||
addMatching(value.folded, value.folded); | ||
addMatching(value.folded, value.composed); | ||
} | ||
folded_set.add(escape_regex(to_add)); | ||
unicode_sets[folded] = folded_set; | ||
}; | ||
for( let value of generator(code_points) ){ | ||
addMatching(value.folded,value.folded); | ||
addMatching(value.folded,value.composed); | ||
} | ||
return unicode_sets; | ||
return unicode_sets; | ||
}; | ||
/** | ||
@@ -344,33 +311,30 @@ * Generate a unicode map from the list of code points | ||
*/ | ||
const generateMap = (code_points) => { | ||
/** @type {TUnicodeSets} */ | ||
const unicode_sets = generateSets(code_points); | ||
const generateMap = code_points => { | ||
/** @type {TUnicodeSets} */ | ||
const unicode_sets = generateSets(code_points); | ||
/** @type {TUnicodeMap} */ | ||
/** @type {TUnicodeMap} */ | ||
const unicode_map = {}; | ||
const unicode_map = {}; | ||
/** @type {string[]} */ | ||
/** @type {string[]} */ | ||
let multi_char = []; | ||
let multi_char = []; | ||
for( let folded in unicode_sets ){ | ||
for (let folded in unicode_sets) { | ||
let set = unicode_sets[folded]; | ||
let set = unicode_sets[folded]; | ||
if( set ){ | ||
unicode_map[folded] = setToPattern(set); | ||
} | ||
if (set) { | ||
unicode_map[folded] = setToPattern(set); | ||
} | ||
if( folded.length > 1 ){ | ||
multi_char.push(escape_regex(folded)); | ||
} | ||
} | ||
if (folded.length > 1) { | ||
multi_char.push(escape_regex(folded)); | ||
} | ||
} | ||
multi_char.sort((a, b) => b.length - a.length ); | ||
const multi_char_patt = arrayToPattern(multi_char); | ||
multi_char_reg = new RegExp('^'+multi_char_patt,'u'); | ||
return unicode_map; | ||
multi_char.sort((a, b) => b.length - a.length); | ||
const multi_char_patt = arrayToPattern(multi_char); | ||
multi_char_reg = new RegExp('^' + multi_char_patt, 'u'); | ||
return unicode_map; | ||
}; | ||
/** | ||
@@ -382,20 +346,19 @@ * Map each element of an array from it's folded value to all possible unicode matches | ||
*/ | ||
const mapSequence = (strings,min_replacement=1) =>{ | ||
let chars_replaced = 0; | ||
const mapSequence = (strings, min_replacement = 1) => { | ||
let chars_replaced = 0; | ||
strings = strings.map(str => { | ||
if (exports.unicode_map[str]) { | ||
chars_replaced += str.length; | ||
} | ||
strings = strings.map((str)=>{ | ||
if( exports.unicode_map[str] ){ | ||
chars_replaced += str.length; | ||
} | ||
return exports.unicode_map[str] || str; | ||
}); | ||
return exports.unicode_map[str] || str; | ||
}); | ||
if( chars_replaced >= min_replacement ){ | ||
return sequencePattern(strings); | ||
} | ||
if (chars_replaced >= min_replacement) { | ||
return sequencePattern(strings); | ||
} | ||
return ''; | ||
return ''; | ||
}; | ||
/** | ||
@@ -414,13 +377,9 @@ * Convert a short string and split it into all possible patterns | ||
*/ | ||
const substringsToPattern = (str,min_replacement=1) => { | ||
min_replacement = Math.max(min_replacement,str.length-1); | ||
return arrayToPattern( | ||
allSubstrings(str).map( (sub_pat) =>{ | ||
return mapSequence(sub_pat,min_replacement) | ||
}) | ||
); | ||
const substringsToPattern = (str, min_replacement = 1) => { | ||
min_replacement = Math.max(min_replacement, str.length - 1); | ||
return arrayToPattern(allSubstrings(str).map(sub_pat => { | ||
return mapSequence(sub_pat, min_replacement); | ||
})); | ||
}; | ||
/** | ||
@@ -433,18 +392,16 @@ * Convert an array of sequences into a pattern | ||
*/ | ||
const sequencesToPattern = (sequences,all=true) => { | ||
let min_replacement = sequences.length > 1 ? 1 : 0; | ||
return arrayToPattern( | ||
sequences.map( (sequence) =>{ | ||
let seq = []; | ||
const len = all ? sequence.length() : sequence.length() - 1; | ||
for( let j = 0; j < len; j++){ | ||
seq.push(substringsToPattern(sequence.substrs[j]||'',min_replacement)); | ||
} | ||
const sequencesToPattern = (sequences, all = true) => { | ||
let min_replacement = sequences.length > 1 ? 1 : 0; | ||
return arrayToPattern(sequences.map(sequence => { | ||
let seq = []; | ||
const len = all ? sequence.length() : sequence.length() - 1; | ||
return sequencePattern(seq); | ||
}) | ||
); | ||
for (let j = 0; j < len; j++) { | ||
seq.push(substringsToPattern(sequence.substrs[j] || '', min_replacement)); | ||
} | ||
return sequencePattern(seq); | ||
})); | ||
}; | ||
/** | ||
@@ -455,118 +412,116 @@ * Return true if the sequence is already in the sequences | ||
*/ | ||
const inSequences = (needle_seq, sequences) => { | ||
for(const seq of sequences){ | ||
if( seq.start != needle_seq.start || seq.end != needle_seq.end ){ | ||
continue; | ||
} | ||
const inSequences = (needle_seq, sequences) => { | ||
for (const seq of sequences) { | ||
if (seq.start != needle_seq.start || seq.end != needle_seq.end) { | ||
continue; | ||
} | ||
if( seq.substrs.join('') !== needle_seq.substrs.join('') ){ | ||
continue; | ||
} | ||
if (seq.substrs.join('') !== needle_seq.substrs.join('')) { | ||
continue; | ||
} | ||
let needle_parts = needle_seq.parts; | ||
/** | ||
* @param {TSequencePart} part | ||
*/ | ||
let needle_parts = needle_seq.parts; | ||
const filter = part => { | ||
for (const needle_part of needle_parts) { | ||
if (needle_part.start === part.start && needle_part.substr === part.substr) { | ||
return false; | ||
} | ||
/** | ||
* @param {TSequencePart} part | ||
*/ | ||
const filter = (part) =>{ | ||
if (part.length == 1 || needle_part.length == 1) { | ||
continue; | ||
} // check for overlapping parts | ||
// a = ['::=','=='] | ||
// b = ['::','==='] | ||
// a = ['r','sm'] | ||
// b = ['rs','m'] | ||
for(const needle_part of needle_parts){ | ||
if( needle_part.start === part.start && needle_part.substr === part.substr ){ | ||
return false; | ||
} | ||
if (part.start < needle_part.start && part.end > needle_part.start) { | ||
return true; | ||
} | ||
if( part.length == 1 || needle_part.length == 1 ){ | ||
continue; | ||
} | ||
if (needle_part.start < part.start && needle_part.end > part.start) { | ||
return true; | ||
} | ||
} | ||
return false; | ||
}; | ||
// check for overlapping parts | ||
// a = ['::=','=='] | ||
// b = ['::','==='] | ||
// a = ['r','sm'] | ||
// b = ['rs','m'] | ||
if( part.start < needle_part.start && part.end > needle_part.start ){ | ||
return true; | ||
} | ||
let filtered = seq.parts.filter(filter); | ||
if( needle_part.start < part.start && needle_part.end > part.start ){ | ||
return true; | ||
} | ||
if (filtered.length > 0) { | ||
continue; | ||
} | ||
} | ||
return true; | ||
} | ||
return false; | ||
}; | ||
let filtered = seq.parts.filter(filter); | ||
if( filtered.length > 0 ){ | ||
continue; | ||
} | ||
return true; | ||
} | ||
return false; | ||
return false; | ||
}; | ||
class Sequence{ | ||
class Sequence { | ||
constructor() { | ||
/** @type {TSequencePart[]} */ | ||
this.parts = []; | ||
/** @type {string[]} */ | ||
constructor(){ | ||
this.substrs = []; | ||
this.start = 0; | ||
this.end = 0; | ||
} | ||
/** | ||
* @param {TSequencePart|undefined} part | ||
*/ | ||
/** @type {TSequencePart[]} */ | ||
this.parts = []; | ||
/** @type {string[]} */ | ||
this.substrs = []; | ||
this.start = 0; | ||
this.end = 0; | ||
} | ||
add(part) { | ||
if (part) { | ||
this.parts.push(part); | ||
this.substrs.push(part.substr); | ||
this.start = Math.min(part.start, this.start); | ||
this.end = Math.max(part.end, this.end); | ||
} | ||
} | ||
/** | ||
* @param {TSequencePart|undefined} part | ||
*/ | ||
add(part){ | ||
if( part ){ | ||
this.parts.push(part); | ||
this.substrs.push(part.substr); | ||
this.start = Math.min(part.start,this.start); | ||
this.end = Math.max(part.end,this.end); | ||
} | ||
} | ||
last() { | ||
return this.parts[this.parts.length - 1]; | ||
} | ||
last(){ | ||
return this.parts[this.parts.length-1]; | ||
} | ||
length() { | ||
return this.parts.length; | ||
} | ||
/** | ||
* @param {number} position | ||
* @param {TSequencePart} last_piece | ||
*/ | ||
length(){ | ||
return this.parts.length; | ||
} | ||
/** | ||
* @param {number} position | ||
* @param {TSequencePart} last_piece | ||
*/ | ||
clone(position, last_piece){ | ||
let clone = new Sequence(); | ||
clone(position, last_piece) { | ||
let clone = new Sequence(); | ||
let parts = JSON.parse(JSON.stringify(this.parts)); | ||
let last_part = parts.pop(); | ||
let parts = JSON.parse(JSON.stringify(this.parts)); | ||
let last_part = parts.pop(); | ||
for( const part of parts ){ | ||
clone.add(part); | ||
} | ||
for (const part of parts) { | ||
clone.add(part); | ||
} | ||
let last_substr = last_piece.substr.substring(0,position-last_part.start); | ||
let clone_last_len = last_substr.length; | ||
clone.add({start:last_part.start,end:last_part.start+clone_last_len,length:clone_last_len,substr:last_substr}); | ||
let last_substr = last_piece.substr.substring(0, position - last_part.start); | ||
let clone_last_len = last_substr.length; | ||
clone.add({ | ||
start: last_part.start, | ||
end: last_part.start + clone_last_len, | ||
length: clone_last_len, | ||
substr: last_substr | ||
}); | ||
return clone; | ||
} | ||
return clone; | ||
} | ||
} | ||
/** | ||
@@ -587,98 +542,98 @@ * Expand a regular expression pattern to include unicode variants | ||
*/ | ||
const getPattern = (str) => { | ||
initialize(); | ||
str = asciifold(str); | ||
let pattern = ''; | ||
let sequences = [new Sequence()]; | ||
const getPattern = str => { | ||
initialize(); | ||
str = asciifold(str); | ||
let pattern = ''; | ||
let sequences = [new Sequence()]; | ||
for( let i = 0; i < str.length; i++ ){ | ||
for (let i = 0; i < str.length; i++) { | ||
let substr = str.substring(i); | ||
let match = substr.match(multi_char_reg); | ||
const char = str.substring(i, i + 1); | ||
const match_str = match ? match[0] : null; // loop through sequences | ||
// add either the char or multi_match | ||
let substr = str.substring(i); | ||
let match = substr.match(multi_char_reg); | ||
const char = str.substring(i,i+1); | ||
const match_str = match ? match[0] : null; | ||
let overlapping = []; | ||
let added_types = new Set(); | ||
for (const sequence of sequences) { | ||
const last_piece = sequence.last(); | ||
// loop through sequences | ||
// add either the char or multi_match | ||
let overlapping = []; | ||
let added_types = new Set(); | ||
for(const sequence of sequences){ | ||
if (!last_piece || last_piece.length == 1 || last_piece.end <= i) { | ||
// if we have a multi match | ||
if (match_str) { | ||
const len = match_str.length; | ||
sequence.add({ | ||
start: i, | ||
end: i + len, | ||
length: len, | ||
substr: match_str | ||
}); | ||
added_types.add('1'); | ||
} else { | ||
sequence.add({ | ||
start: i, | ||
end: i + 1, | ||
length: 1, | ||
substr: char | ||
}); | ||
added_types.add('2'); | ||
} | ||
} else if (match_str) { | ||
let clone = sequence.clone(i, last_piece); | ||
const len = match_str.length; | ||
clone.add({ | ||
start: i, | ||
end: i + len, | ||
length: len, | ||
substr: match_str | ||
}); | ||
overlapping.push(clone); | ||
} else { | ||
// don't add char | ||
// adding would create invalid patterns: 234 => [2,34,4] | ||
added_types.add('3'); | ||
} | ||
} // if we have overlapping | ||
const last_piece = sequence.last(); | ||
if (overlapping.length > 0) { | ||
// ['ii','iii'] before ['i','i','iii'] | ||
overlapping = overlapping.sort((a, b) => { | ||
return a.length() - b.length(); | ||
}); | ||
if( !last_piece || last_piece.length == 1 || last_piece.end <= i ){ | ||
for (let clone of overlapping) { | ||
// don't add if we already have an equivalent sequence | ||
if (inSequences(clone, sequences)) { | ||
continue; | ||
} | ||
// if we have a multi match | ||
if( match_str ){ | ||
const len = match_str.length; | ||
sequence.add({start:i,end:i+len,length:len,substr:match_str}); | ||
added_types.add('1'); | ||
}else { | ||
sequence.add({start:i,end:i+1,length:1,substr:char}); | ||
added_types.add('2'); | ||
} | ||
sequences.push(clone); | ||
} | ||
}else if( match_str ){ | ||
continue; | ||
} // if we haven't done anything unique | ||
// clean up the patterns | ||
// helps keep patterns smaller | ||
// if str = 'r₨㎧aarss', pattern will be 446 instead of 655 | ||
let clone = sequence.clone(i,last_piece); | ||
const len = match_str.length; | ||
clone.add({start:i,end:i+len,length:len,substr:match_str}); | ||
if (i > 0 && added_types.size == 1 && !added_types.has('3')) { | ||
pattern += sequencesToPattern(sequences, false); | ||
let new_seq = new Sequence(); | ||
const old_seq = sequences[0]; | ||
overlapping.push(clone); | ||
if (old_seq) { | ||
new_seq.add(old_seq.last()); | ||
} | ||
}else { | ||
// don't add char | ||
// adding would create invalid patterns: 234 => [2,34,4] | ||
added_types.add('3'); | ||
} | ||
sequences = [new_seq]; | ||
} | ||
} | ||
} | ||
// if we have overlapping | ||
if( overlapping.length > 0 ){ | ||
// ['ii','iii'] before ['i','i','iii'] | ||
overlapping = overlapping.sort((a,b)=>{ | ||
return a.length() - b.length(); | ||
}); | ||
for( let clone of overlapping){ | ||
// don't add if we already have an equivalent sequence | ||
if( inSequences(clone, sequences) ){ | ||
continue; | ||
} | ||
sequences.push(clone); | ||
} | ||
continue; | ||
} | ||
// if we haven't done anything unique | ||
// clean up the patterns | ||
// helps keep patterns smaller | ||
// if str = 'r₨㎧aarss', pattern will be 446 instead of 655 | ||
if( i > 0 && added_types.size == 1 && !added_types.has('3') ){ | ||
pattern += sequencesToPattern(sequences,false); | ||
let new_seq = new Sequence(); | ||
const old_seq = sequences[0]; | ||
if( old_seq ){ | ||
new_seq.add(old_seq.last()); | ||
} | ||
sequences = [new_seq]; | ||
} | ||
} | ||
pattern += sequencesToPattern(sequences,true); | ||
return pattern; | ||
pattern += sequencesToPattern(sequences, true); | ||
return pattern; | ||
}; | ||
@@ -685,0 +640,0 @@ |
@@ -15,13 +15,11 @@ /*! @orchidjs/unicode-variants | https://github.com/orchidjs/unicode-variants | Apache License (v2) */ | ||
*/ | ||
const arrayToPattern = (chars) =>{ | ||
const arrayToPattern = chars => { | ||
chars = chars.filter(Boolean); | ||
chars = chars.filter( Boolean ); | ||
if (chars.length < 2) { | ||
return chars[0] || ''; | ||
} | ||
if( chars.length < 2 ){ | ||
return chars[0] || ''; | ||
} | ||
return (maxValueLength(chars) == 1) ? '['+chars.join('')+']' : '(?:'+chars.join('|')+')'; | ||
return maxValueLength(chars) == 1 ? '[' + chars.join('') + ']' : '(?:' + chars.join('|') + ')'; | ||
}; | ||
/** | ||
@@ -31,38 +29,30 @@ * @param {string[]} array | ||
*/ | ||
const sequencePattern = (array)=>{ | ||
if( !hasDuplicates(array) ){ | ||
return array.join(''); | ||
} | ||
const sequencePattern = array => { | ||
if (!hasDuplicates(array)) { | ||
return array.join(''); | ||
} | ||
let pattern = ''; | ||
let prev_char_count = 0; | ||
let pattern = ''; | ||
let prev_char_count = 0; | ||
const prev_pattern = ()=>{ | ||
if( prev_char_count > 1 ){ | ||
pattern += '{'+prev_char_count+'}'; | ||
} | ||
}; | ||
const prev_pattern = () => { | ||
if (prev_char_count > 1) { | ||
pattern += '{' + prev_char_count + '}'; | ||
} | ||
}; | ||
array.forEach((char,i)=>{ | ||
array.forEach((char, i) => { | ||
if (char === array[i - 1]) { | ||
prev_char_count++; | ||
return; | ||
} | ||
if( char === array[i-1] ){ | ||
prev_char_count++; | ||
return; | ||
} | ||
prev_pattern(); | ||
pattern += char; | ||
prev_char_count = 1; | ||
}); | ||
prev_pattern(); | ||
return pattern; | ||
prev_pattern(); | ||
pattern += char; | ||
prev_char_count = 1; | ||
}); | ||
prev_pattern(); | ||
return pattern; | ||
}; | ||
/** | ||
@@ -75,9 +65,7 @@ * Convert array of strings to a regular expression | ||
*/ | ||
const setToPattern = (chars)=>{ | ||
let array = toArray(chars); | ||
return arrayToPattern(array); | ||
const setToPattern = chars => { | ||
let array = toArray(chars); | ||
return arrayToPattern(array); | ||
}; | ||
/** | ||
@@ -88,7 +76,6 @@ * | ||
*/ | ||
const hasDuplicates = (array) => { | ||
return (new Set(array)).size !== array.length; | ||
const hasDuplicates = array => { | ||
return new Set(array).size !== array.length; | ||
}; | ||
/** | ||
@@ -99,6 +86,6 @@ * https://stackoverflow.com/questions/63006601/why-does-u-throw-an-invalid-escape-error | ||
*/ | ||
const escape_regex = (str) => { | ||
return (str + '').replace(/([\$\(\)\*\+\.\?\[\]\^\{\|\}\\])/gu, '\\$1'); | ||
const escape_regex = str => { | ||
return (str + '').replace(/([\$\(\)\*\+\.\?\[\]\^\{\|\}\\])/gu, '\\$1'); | ||
}; | ||
/** | ||
@@ -109,14 +96,13 @@ * Return the max length of array values | ||
*/ | ||
const maxValueLength = (array) => { | ||
return array.reduce( (longest, value) => Math.max(longest,unicodeLength(value)),0); | ||
const maxValueLength = array => { | ||
return array.reduce((longest, value) => Math.max(longest, unicodeLength(value)), 0); | ||
}; | ||
/** | ||
* @param {string} str | ||
*/ | ||
const unicodeLength = (str) => { | ||
return toArray(str).length; | ||
const unicodeLength = str => { | ||
return toArray(str).length; | ||
}; | ||
/** | ||
@@ -126,4 +112,5 @@ * @param {any} p | ||
*/ | ||
const toArray = (p) => Array.from(p); | ||
const toArray = p => Array.from(p); | ||
/** | ||
@@ -135,51 +122,49 @@ * Get all possible combinations of substrings that add up to the given string | ||
*/ | ||
const allSubstrings = (input) => { | ||
const allSubstrings = input => { | ||
if (input.length === 1) return [[input]]; | ||
/** @type {string[][]} */ | ||
if( input.length === 1) return [[input]]; | ||
/** @type {string[][]} */ | ||
let result = []; | ||
const start = input.substring(1); | ||
const suba = allSubstrings(start); | ||
suba.forEach(function(subresult) { | ||
let tmp = subresult.slice(0); | ||
tmp[0] = input.charAt(0) + tmp[0]; | ||
result.push(tmp); | ||
tmp = subresult.slice(0); | ||
tmp.unshift(input.charAt(0)); | ||
result.push(tmp); | ||
}); | ||
return result; | ||
let result = []; | ||
const start = input.substring(1); | ||
const suba = allSubstrings(start); | ||
suba.forEach(function (subresult) { | ||
let tmp = subresult.slice(0); | ||
tmp[0] = input.charAt(0) + tmp[0]; | ||
result.push(tmp); | ||
tmp = subresult.slice(0); | ||
tmp.unshift(input.charAt(0)); | ||
result.push(tmp); | ||
}); | ||
return result; | ||
}; | ||
/** | ||
* @typedef {{[key:string]:string}} TUnicodeMap | ||
* @typedef {{[key:string]:Set<string>}} TUnicodeSets | ||
* @typedef {[[number,number]]} TCodePoints | ||
* @typedef {{folded:string,composed:string,code_point:number}} TCodePointObj | ||
* @typedef {{start:number,end:number,length:number,substr:string}} TSequencePart | ||
*/ | ||
/** @type {TCodePoints} */ | ||
const code_points = [[ 0, 65535 ]]; | ||
const code_points = [[0, 65535]]; | ||
const accent_pat = '[\u0300-\u036F\u{b7}\u{2be}]'; // \u{2bc} | ||
/** @type {TUnicodeMap} */ | ||
exports.unicode_map = void 0; | ||
/** @type {RegExp} */ | ||
/** @type {RegExp} */ | ||
let multi_char_reg; | ||
const max_char_length = 3; | ||
/** @type {TUnicodeMap} */ | ||
/** @type {TUnicodeMap} */ | ||
const latin_convert = { | ||
'æ': 'ae', | ||
'ⱥ': 'a', | ||
'ø': 'o', | ||
'⁄': '/', | ||
'∕': '/', | ||
'æ': 'ae', | ||
'ⱥ': 'a', | ||
'ø': 'o', | ||
'⁄': '/', | ||
'∕': '/' | ||
}; | ||
const convert_pat = new RegExp(Object.keys(latin_convert).join('|')+'|'+accent_pat,'gu'); | ||
const convert_pat = new RegExp(Object.keys(latin_convert).join('|') + '|' + accent_pat, 'gu'); | ||
/** | ||
@@ -190,8 +175,7 @@ * Initialize the unicode_map from the give code point ranges | ||
*/ | ||
const initialize = (_code_points) => { | ||
if( exports.unicode_map !== undefined ) return; | ||
exports.unicode_map = generateMap(_code_points || code_points ); | ||
const initialize = _code_points => { | ||
if (exports.unicode_map !== undefined) return; | ||
exports.unicode_map = generateMap(_code_points || code_points); | ||
}; | ||
/** | ||
@@ -203,5 +187,4 @@ * Helper method for normalize a string | ||
*/ | ||
const normalize = (str,form='NFKD') => str.normalize(form); | ||
const normalize = (str, form = 'NFKD') => str.normalize(form); | ||
/** | ||
@@ -212,23 +195,17 @@ * Compatibility Decomposition without reordering string | ||
*/ | ||
const decompose = (str) =>{ | ||
if( str.match(/[\u0f71-\u0f81]/) ){ | ||
return toArray(str).reduce( | ||
/** | ||
* @param {string} result | ||
* @param {string} char | ||
*/ | ||
(result, char) =>{ | ||
return result + normalize(char) | ||
}, | ||
'' | ||
); | ||
} | ||
const decompose = str => { | ||
if (str.match(/[\u0f71-\u0f81]/)) { | ||
return toArray(str).reduce( | ||
/** | ||
* @param {string} result | ||
* @param {string} char | ||
*/ | ||
(result, char) => { | ||
return result + normalize(char); | ||
}, ''); | ||
} | ||
return normalize(str); | ||
return normalize(str); | ||
}; | ||
/** | ||
@@ -240,15 +217,10 @@ * Remove accents | ||
*/ | ||
const asciifold = (str) => { | ||
return decompose(str) | ||
.toLowerCase() | ||
.replace(convert_pat,(/** @type {string} */ char) => { | ||
return latin_convert[char] || ''; | ||
}); | ||
const asciifold = str => { | ||
return decompose(str).toLowerCase().replace(convert_pat, ( | ||
/** @type {string} */ | ||
char) => { | ||
return latin_convert[char] || ''; | ||
}); | ||
}; | ||
/** | ||
@@ -259,42 +231,41 @@ * Generate a list of unicode variants from the list of code points | ||
*/ | ||
function* generator(code_points){ | ||
for(const [code_point_min, code_point_max] of code_points){ | ||
for(let i = code_point_min; i <= code_point_max; i++){ | ||
function* generator(code_points) { | ||
for (const [code_point_min, code_point_max] of code_points) { | ||
for (let i = code_point_min; i <= code_point_max; i++) { | ||
let composed = String.fromCharCode(i); | ||
let folded = asciifold(composed); | ||
let composed = String.fromCharCode(i); | ||
let folded = asciifold(composed); | ||
if (folded == composed.toLowerCase()) { | ||
continue; | ||
} // skip when folded is a string longer than 3 characters long | ||
// bc the resulting regex patterns will be long | ||
// eg: | ||
// folded صلى الله عليه وسلم length 18 code point 65018 | ||
// folded جل جلاله length 8 code point 65019 | ||
if( folded == composed.toLowerCase() ){ | ||
continue; | ||
} | ||
if (folded.length > max_char_length) { | ||
continue; | ||
} | ||
// skip when folded is a string longer than 3 characters long | ||
// bc the resulting regex patterns will be long | ||
// eg: | ||
// folded صلى الله عليه وسلم length 18 code point 65018 | ||
// folded جل جلاله length 8 code point 65019 | ||
if( folded.length > max_char_length ){ | ||
continue; | ||
} | ||
if (folded.length == 0) { | ||
continue; | ||
} | ||
if( folded.length == 0 ){ | ||
continue | ||
} | ||
let decomposed = normalize(composed); | ||
let recomposed = normalize(decomposed, 'NFC'); | ||
let decomposed = normalize(composed); | ||
let recomposed = normalize(decomposed,'NFC'); | ||
if (recomposed === composed && folded === decomposed) { | ||
continue; | ||
} | ||
if( recomposed === composed && folded === decomposed ){ | ||
continue; | ||
} | ||
yield {folded:folded,composed:composed,code_point:i}; | ||
} | ||
} | ||
yield { | ||
folded: folded, | ||
composed: composed, | ||
code_point: i | ||
}; | ||
} | ||
} | ||
} | ||
/** | ||
@@ -305,35 +276,31 @@ * Generate a unicode map from the list of code points | ||
*/ | ||
const generateSets = (code_points) => { | ||
/** @type {{[key:string]:Set<string>}} */ | ||
const unicode_sets = {}; | ||
const generateSets = code_points => { | ||
/** @type {{[key:string]:Set<string>}} */ | ||
const unicode_sets = {}; | ||
/** | ||
* @param {string} folded | ||
* @param {string} to_add | ||
*/ | ||
const addMatching = (folded, to_add) => { | ||
/** @type {Set<string>} */ | ||
const folded_set = unicode_sets[folded] || new Set(); | ||
const patt = new RegExp('^' + setToPattern(folded_set) + '$', 'iu'); | ||
/** | ||
* @param {string} folded | ||
* @param {string} to_add | ||
*/ | ||
const addMatching = (folded,to_add) => { | ||
if (to_add.match(patt)) { | ||
return; | ||
} | ||
/** @type {Set<string>} */ | ||
const folded_set = unicode_sets[folded] || new Set(); | ||
folded_set.add(escape_regex(to_add)); | ||
unicode_sets[folded] = folded_set; | ||
}; | ||
const patt = new RegExp( '^'+setToPattern(folded_set)+'$','iu'); | ||
if( to_add.match(patt) ){ | ||
return; | ||
} | ||
for (let value of generator(code_points)) { | ||
addMatching(value.folded, value.folded); | ||
addMatching(value.folded, value.composed); | ||
} | ||
folded_set.add(escape_regex(to_add)); | ||
unicode_sets[folded] = folded_set; | ||
}; | ||
for( let value of generator(code_points) ){ | ||
addMatching(value.folded,value.folded); | ||
addMatching(value.folded,value.composed); | ||
} | ||
return unicode_sets; | ||
return unicode_sets; | ||
}; | ||
/** | ||
@@ -346,33 +313,30 @@ * Generate a unicode map from the list of code points | ||
*/ | ||
const generateMap = (code_points) => { | ||
/** @type {TUnicodeSets} */ | ||
const unicode_sets = generateSets(code_points); | ||
const generateMap = code_points => { | ||
/** @type {TUnicodeSets} */ | ||
const unicode_sets = generateSets(code_points); | ||
/** @type {TUnicodeMap} */ | ||
/** @type {TUnicodeMap} */ | ||
const unicode_map = {}; | ||
const unicode_map = {}; | ||
/** @type {string[]} */ | ||
/** @type {string[]} */ | ||
let multi_char = []; | ||
let multi_char = []; | ||
for( let folded in unicode_sets ){ | ||
for (let folded in unicode_sets) { | ||
let set = unicode_sets[folded]; | ||
let set = unicode_sets[folded]; | ||
if( set ){ | ||
unicode_map[folded] = setToPattern(set); | ||
} | ||
if (set) { | ||
unicode_map[folded] = setToPattern(set); | ||
} | ||
if( folded.length > 1 ){ | ||
multi_char.push(escape_regex(folded)); | ||
} | ||
} | ||
if (folded.length > 1) { | ||
multi_char.push(escape_regex(folded)); | ||
} | ||
} | ||
multi_char.sort((a, b) => b.length - a.length ); | ||
const multi_char_patt = arrayToPattern(multi_char); | ||
multi_char_reg = new RegExp('^'+multi_char_patt,'u'); | ||
return unicode_map; | ||
multi_char.sort((a, b) => b.length - a.length); | ||
const multi_char_patt = arrayToPattern(multi_char); | ||
multi_char_reg = new RegExp('^' + multi_char_patt, 'u'); | ||
return unicode_map; | ||
}; | ||
/** | ||
@@ -384,20 +348,19 @@ * Map each element of an array from it's folded value to all possible unicode matches | ||
*/ | ||
const mapSequence = (strings,min_replacement=1) =>{ | ||
let chars_replaced = 0; | ||
const mapSequence = (strings, min_replacement = 1) => { | ||
let chars_replaced = 0; | ||
strings = strings.map(str => { | ||
if (exports.unicode_map[str]) { | ||
chars_replaced += str.length; | ||
} | ||
strings = strings.map((str)=>{ | ||
if( exports.unicode_map[str] ){ | ||
chars_replaced += str.length; | ||
} | ||
return exports.unicode_map[str] || str; | ||
}); | ||
return exports.unicode_map[str] || str; | ||
}); | ||
if( chars_replaced >= min_replacement ){ | ||
return sequencePattern(strings); | ||
} | ||
if (chars_replaced >= min_replacement) { | ||
return sequencePattern(strings); | ||
} | ||
return ''; | ||
return ''; | ||
}; | ||
/** | ||
@@ -416,13 +379,9 @@ * Convert a short string and split it into all possible patterns | ||
*/ | ||
const substringsToPattern = (str,min_replacement=1) => { | ||
min_replacement = Math.max(min_replacement,str.length-1); | ||
return arrayToPattern( | ||
allSubstrings(str).map( (sub_pat) =>{ | ||
return mapSequence(sub_pat,min_replacement) | ||
}) | ||
); | ||
const substringsToPattern = (str, min_replacement = 1) => { | ||
min_replacement = Math.max(min_replacement, str.length - 1); | ||
return arrayToPattern(allSubstrings(str).map(sub_pat => { | ||
return mapSequence(sub_pat, min_replacement); | ||
})); | ||
}; | ||
/** | ||
@@ -435,18 +394,16 @@ * Convert an array of sequences into a pattern | ||
*/ | ||
const sequencesToPattern = (sequences,all=true) => { | ||
let min_replacement = sequences.length > 1 ? 1 : 0; | ||
return arrayToPattern( | ||
sequences.map( (sequence) =>{ | ||
let seq = []; | ||
const len = all ? sequence.length() : sequence.length() - 1; | ||
for( let j = 0; j < len; j++){ | ||
seq.push(substringsToPattern(sequence.substrs[j]||'',min_replacement)); | ||
} | ||
const sequencesToPattern = (sequences, all = true) => { | ||
let min_replacement = sequences.length > 1 ? 1 : 0; | ||
return arrayToPattern(sequences.map(sequence => { | ||
let seq = []; | ||
const len = all ? sequence.length() : sequence.length() - 1; | ||
return sequencePattern(seq); | ||
}) | ||
); | ||
for (let j = 0; j < len; j++) { | ||
seq.push(substringsToPattern(sequence.substrs[j] || '', min_replacement)); | ||
} | ||
return sequencePattern(seq); | ||
})); | ||
}; | ||
/** | ||
@@ -457,118 +414,116 @@ * Return true if the sequence is already in the sequences | ||
*/ | ||
const inSequences = (needle_seq, sequences) => { | ||
for(const seq of sequences){ | ||
if( seq.start != needle_seq.start || seq.end != needle_seq.end ){ | ||
continue; | ||
} | ||
const inSequences = (needle_seq, sequences) => { | ||
for (const seq of sequences) { | ||
if (seq.start != needle_seq.start || seq.end != needle_seq.end) { | ||
continue; | ||
} | ||
if( seq.substrs.join('') !== needle_seq.substrs.join('') ){ | ||
continue; | ||
} | ||
if (seq.substrs.join('') !== needle_seq.substrs.join('')) { | ||
continue; | ||
} | ||
let needle_parts = needle_seq.parts; | ||
/** | ||
* @param {TSequencePart} part | ||
*/ | ||
let needle_parts = needle_seq.parts; | ||
const filter = part => { | ||
for (const needle_part of needle_parts) { | ||
if (needle_part.start === part.start && needle_part.substr === part.substr) { | ||
return false; | ||
} | ||
/** | ||
* @param {TSequencePart} part | ||
*/ | ||
const filter = (part) =>{ | ||
if (part.length == 1 || needle_part.length == 1) { | ||
continue; | ||
} // check for overlapping parts | ||
// a = ['::=','=='] | ||
// b = ['::','==='] | ||
// a = ['r','sm'] | ||
// b = ['rs','m'] | ||
for(const needle_part of needle_parts){ | ||
if( needle_part.start === part.start && needle_part.substr === part.substr ){ | ||
return false; | ||
} | ||
if (part.start < needle_part.start && part.end > needle_part.start) { | ||
return true; | ||
} | ||
if( part.length == 1 || needle_part.length == 1 ){ | ||
continue; | ||
} | ||
if (needle_part.start < part.start && needle_part.end > part.start) { | ||
return true; | ||
} | ||
} | ||
return false; | ||
}; | ||
// check for overlapping parts | ||
// a = ['::=','=='] | ||
// b = ['::','==='] | ||
// a = ['r','sm'] | ||
// b = ['rs','m'] | ||
if( part.start < needle_part.start && part.end > needle_part.start ){ | ||
return true; | ||
} | ||
let filtered = seq.parts.filter(filter); | ||
if( needle_part.start < part.start && needle_part.end > part.start ){ | ||
return true; | ||
} | ||
if (filtered.length > 0) { | ||
continue; | ||
} | ||
} | ||
return true; | ||
} | ||
return false; | ||
}; | ||
let filtered = seq.parts.filter(filter); | ||
if( filtered.length > 0 ){ | ||
continue; | ||
} | ||
return true; | ||
} | ||
return false; | ||
return false; | ||
}; | ||
class Sequence{ | ||
class Sequence { | ||
constructor() { | ||
/** @type {TSequencePart[]} */ | ||
this.parts = []; | ||
/** @type {string[]} */ | ||
constructor(){ | ||
this.substrs = []; | ||
this.start = 0; | ||
this.end = 0; | ||
} | ||
/** | ||
* @param {TSequencePart|undefined} part | ||
*/ | ||
/** @type {TSequencePart[]} */ | ||
this.parts = []; | ||
/** @type {string[]} */ | ||
this.substrs = []; | ||
this.start = 0; | ||
this.end = 0; | ||
} | ||
add(part) { | ||
if (part) { | ||
this.parts.push(part); | ||
this.substrs.push(part.substr); | ||
this.start = Math.min(part.start, this.start); | ||
this.end = Math.max(part.end, this.end); | ||
} | ||
} | ||
/** | ||
* @param {TSequencePart|undefined} part | ||
*/ | ||
add(part){ | ||
if( part ){ | ||
this.parts.push(part); | ||
this.substrs.push(part.substr); | ||
this.start = Math.min(part.start,this.start); | ||
this.end = Math.max(part.end,this.end); | ||
} | ||
} | ||
last() { | ||
return this.parts[this.parts.length - 1]; | ||
} | ||
last(){ | ||
return this.parts[this.parts.length-1]; | ||
} | ||
length() { | ||
return this.parts.length; | ||
} | ||
/** | ||
* @param {number} position | ||
* @param {TSequencePart} last_piece | ||
*/ | ||
length(){ | ||
return this.parts.length; | ||
} | ||
/** | ||
* @param {number} position | ||
* @param {TSequencePart} last_piece | ||
*/ | ||
clone(position, last_piece){ | ||
let clone = new Sequence(); | ||
clone(position, last_piece) { | ||
let clone = new Sequence(); | ||
let parts = JSON.parse(JSON.stringify(this.parts)); | ||
let last_part = parts.pop(); | ||
let parts = JSON.parse(JSON.stringify(this.parts)); | ||
let last_part = parts.pop(); | ||
for( const part of parts ){ | ||
clone.add(part); | ||
} | ||
for (const part of parts) { | ||
clone.add(part); | ||
} | ||
let last_substr = last_piece.substr.substring(0,position-last_part.start); | ||
let clone_last_len = last_substr.length; | ||
clone.add({start:last_part.start,end:last_part.start+clone_last_len,length:clone_last_len,substr:last_substr}); | ||
let last_substr = last_piece.substr.substring(0, position - last_part.start); | ||
let clone_last_len = last_substr.length; | ||
clone.add({ | ||
start: last_part.start, | ||
end: last_part.start + clone_last_len, | ||
length: clone_last_len, | ||
substr: last_substr | ||
}); | ||
return clone; | ||
} | ||
return clone; | ||
} | ||
} | ||
/** | ||
@@ -589,98 +544,98 @@ * Expand a regular expression pattern to include unicode variants | ||
*/ | ||
const getPattern = (str) => { | ||
initialize(); | ||
str = asciifold(str); | ||
let pattern = ''; | ||
let sequences = [new Sequence()]; | ||
const getPattern = str => { | ||
initialize(); | ||
str = asciifold(str); | ||
let pattern = ''; | ||
let sequences = [new Sequence()]; | ||
for( let i = 0; i < str.length; i++ ){ | ||
for (let i = 0; i < str.length; i++) { | ||
let substr = str.substring(i); | ||
let match = substr.match(multi_char_reg); | ||
const char = str.substring(i, i + 1); | ||
const match_str = match ? match[0] : null; // loop through sequences | ||
// add either the char or multi_match | ||
let substr = str.substring(i); | ||
let match = substr.match(multi_char_reg); | ||
const char = str.substring(i,i+1); | ||
const match_str = match ? match[0] : null; | ||
let overlapping = []; | ||
let added_types = new Set(); | ||
for (const sequence of sequences) { | ||
const last_piece = sequence.last(); | ||
// loop through sequences | ||
// add either the char or multi_match | ||
let overlapping = []; | ||
let added_types = new Set(); | ||
for(const sequence of sequences){ | ||
if (!last_piece || last_piece.length == 1 || last_piece.end <= i) { | ||
// if we have a multi match | ||
if (match_str) { | ||
const len = match_str.length; | ||
sequence.add({ | ||
start: i, | ||
end: i + len, | ||
length: len, | ||
substr: match_str | ||
}); | ||
added_types.add('1'); | ||
} else { | ||
sequence.add({ | ||
start: i, | ||
end: i + 1, | ||
length: 1, | ||
substr: char | ||
}); | ||
added_types.add('2'); | ||
} | ||
} else if (match_str) { | ||
let clone = sequence.clone(i, last_piece); | ||
const len = match_str.length; | ||
clone.add({ | ||
start: i, | ||
end: i + len, | ||
length: len, | ||
substr: match_str | ||
}); | ||
overlapping.push(clone); | ||
} else { | ||
// don't add char | ||
// adding would create invalid patterns: 234 => [2,34,4] | ||
added_types.add('3'); | ||
} | ||
} // if we have overlapping | ||
const last_piece = sequence.last(); | ||
if (overlapping.length > 0) { | ||
// ['ii','iii'] before ['i','i','iii'] | ||
overlapping = overlapping.sort((a, b) => { | ||
return a.length() - b.length(); | ||
}); | ||
if( !last_piece || last_piece.length == 1 || last_piece.end <= i ){ | ||
for (let clone of overlapping) { | ||
// don't add if we already have an equivalent sequence | ||
if (inSequences(clone, sequences)) { | ||
continue; | ||
} | ||
// if we have a multi match | ||
if( match_str ){ | ||
const len = match_str.length; | ||
sequence.add({start:i,end:i+len,length:len,substr:match_str}); | ||
added_types.add('1'); | ||
}else { | ||
sequence.add({start:i,end:i+1,length:1,substr:char}); | ||
added_types.add('2'); | ||
} | ||
sequences.push(clone); | ||
} | ||
}else if( match_str ){ | ||
continue; | ||
} // if we haven't done anything unique | ||
// clean up the patterns | ||
// helps keep patterns smaller | ||
// if str = 'r₨㎧aarss', pattern will be 446 instead of 655 | ||
let clone = sequence.clone(i,last_piece); | ||
const len = match_str.length; | ||
clone.add({start:i,end:i+len,length:len,substr:match_str}); | ||
if (i > 0 && added_types.size == 1 && !added_types.has('3')) { | ||
pattern += sequencesToPattern(sequences, false); | ||
let new_seq = new Sequence(); | ||
const old_seq = sequences[0]; | ||
overlapping.push(clone); | ||
if (old_seq) { | ||
new_seq.add(old_seq.last()); | ||
} | ||
}else { | ||
// don't add char | ||
// adding would create invalid patterns: 234 => [2,34,4] | ||
added_types.add('3'); | ||
} | ||
sequences = [new_seq]; | ||
} | ||
} | ||
} | ||
// if we have overlapping | ||
if( overlapping.length > 0 ){ | ||
// ['ii','iii'] before ['i','i','iii'] | ||
overlapping = overlapping.sort((a,b)=>{ | ||
return a.length() - b.length(); | ||
}); | ||
for( let clone of overlapping){ | ||
// don't add if we already have an equivalent sequence | ||
if( inSequences(clone, sequences) ){ | ||
continue; | ||
} | ||
sequences.push(clone); | ||
} | ||
continue; | ||
} | ||
// if we haven't done anything unique | ||
// clean up the patterns | ||
// helps keep patterns smaller | ||
// if str = 'r₨㎧aarss', pattern will be 446 instead of 655 | ||
if( i > 0 && added_types.size == 1 && !added_types.has('3') ){ | ||
pattern += sequencesToPattern(sequences,false); | ||
let new_seq = new Sequence(); | ||
const old_seq = sequences[0]; | ||
if( old_seq ){ | ||
new_seq.add(old_seq.last()); | ||
} | ||
sequences = [new_seq]; | ||
} | ||
} | ||
pattern += sequencesToPattern(sequences,true); | ||
return pattern; | ||
pattern += sequencesToPattern(sequences, true); | ||
return pattern; | ||
}; | ||
@@ -687,0 +642,0 @@ |
{ | ||
"name": "@orchidjs/unicode-variants", | ||
"version": "1.0.1", | ||
"version": "1.0.2", | ||
"description": "Unicode variant string matching", | ||
@@ -5,0 +5,0 @@ "main": "dist/umd/index.js", |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
202346
2176