wink-eng-lite-web-model
Advanced tools
Comparing version 1.6.0 to 1.7.0
@@ -0,1 +1,12 @@ | ||
# [Enable accented chars & word joiner char handling](https://github.com/winkjs/wink-eng-lite-web-model/releases/tag/1.7.0)
## Version 1.7.0 April 12, 2024
### ✨ Features
- Accented-character handling now also covers diacritical marks. 🙌
- A token containing a word joiner is treated as a single token. ✅
- The shape of a token containing accented chars, diacritical marks & word joiner char(s) is now determined after removing them. 👏
# [Enable non-breaking space (U+00A0) handling](https://github.com/winkjs/wink-eng-lite-web-model/releases/tag/1.6.0)
@@ -2,0 +13,0 @@ ## Version 1.6.0 April 03, 2024 |
@@ -1,1 +0,1 @@ | ||
// Feature extraction for the wink English lite web model (v1.6.0 side of the
// diff): maps token categories and common English affixes to part-of-speech
// (PoS) codes and exposes a factory that builds per-token feature methods.
var tcat = require("./token-categories.js");

// Token-category index → default PoS code; 0 means "no default".
var tcat2pos = new Array(tcat.list.length);
tcat2pos.fill(0);
tcat2pos[0] = 17;
tcat2pos[2] = 9;
tcat2pos[3] = 15;
// NOTE(review): the original minified source assigned index 4 twice in a row;
// a single assignment is behaviorally identical.
tcat2pos[4] = 15;
tcat2pos[5] = 12;
tcat2pos[6] = 15;
tcat2pos[7] = 15;
tcat2pos[8] = 15;
tcat2pos[9] = 15;
tcat2pos[10] = 11;
tcat2pos[11] = 15;
tcat2pos[12] = 13;
tcat2pos[13] = 15;
tcat2pos[14] = 14;
tcat2pos[18] = 9;

// Common English suffixes → PoS code (prototype-less map so lookups can
// never hit Object.prototype properties).
var suffix2pos = Object.assign(Object.create(null), {
  ing: 16, ed: 16,
  ly: 3,
  able: 1, tish: 1, like: 1, ous: 1, ful: 1, ary: 1, less: 1,
  ier: 1, est: 1, fy: 1, ky: 1,
  es: 8, er: 8, or: 8, ity: 8, ion: 8,
  llah: 12, stan: 12, gton: 12, abad: 12, land: 12, pur: 12, tnam: 12
});

// Common English prefixes → PoS code.
var prefix2pos = Object.assign(Object.create(null), {
  anti: 1, post: 1, non: 1, cross: 8
});

/**
 * Builds the table of feature methods used by the tokenizer/tagger.
 *
 * @param {object} config - exposes numeric `suffix` & `prefix` lengths.
 * @param {object|null} lang - language pack supplying `trex.lex` token-type
 *   regex specs and `xpos.hash` PoS codes; may be null.
 * @param {object} featuresData - hashes for `lexeme`, `lemma` and `pos`.
 * @param {boolean} isLexicographer - when true, `pos()` answers straight from
 *   the PoS hash instead of the contextual heuristics.
 * @returns {object} prototype-less object of feature methods.
 */
var feature = function (config, lang, featuresData, isLexicographer) {
  // Case patterns: lower/upper/title-case words, allowing -, –, — and ".".
  const rgxLC = /^[a-z][a-z\-\–\—\.]*$/;
  const rgxUC = /^[A-Z][A-Z\-\–\—\.]*$/;
  const rgxTC = /^[A-Z][a-z\-\–\—\.]*$/;
  const fd = featuresData;
  const regexes = lang ? lang.trex.lex : null;
  const detectorCount = lang ? regexes.length : 0;
  const rgxCatDetectors = [];
  const methods = Object.create(null);
  // Previous word seen by `pos()` — used to detect sentence starts.
  let prevWord = null;

  for (let k = 0; k < detectorCount; k += 1) {
    rgxCatDetectors.push([new RegExp(regexes[k][0], regexes[k][1]), regexes[k][2]]);
  }

  // Affix-based PoS guess for words absent from the cache; noun (8) default.
  const posViaAffixes = function (word) {
    if (!rgxLC.test(word)) return 12;
    const wlc = word.toLowerCase();
    return suffix2pos[wlc.slice(-4)] ||
      suffix2pos[wlc.slice(-3)] ||
      suffix2pos[wlc.slice(-2)] ||
      prefix2pos[wlc.slice(0, 5)] ||
      prefix2pos[wlc.slice(0, 4)] ||
      prefix2pos[wlc.slice(0, 3)] ||
      prefix2pos[wlc.slice(0, 2)] ||
      8;
  };

  // Orthographic shape: runs of 4+ letters/digits collapse to 4 symbols,
  // e.g. "Hello12" → "Xxxxxdd", "WINK" → "XXXX".
  methods.shape = function (word) {
    return word
      .replace(/[A-Z]{4,}/g, "XXXX")
      .replace(/[A-Z]/g, "X")
      .replace(/[a-z]{4,}/g, "xxxx")
      .replace(/[a-z]/g, "x")
      .replace(/\d{4,}/g, "dddd")
      .replace(/\d/g, "d");
  };

  // Last `config.suffix` characters of the word.
  methods.suffix = function (word) {
    return word.slice(-config.suffix);
  };

  // First `config.prefix` characters of the word.
  methods.prefix = function (word) {
    return word.slice(0, config.prefix);
  };

  // Identity features.
  methods.lexeme = function (word) {
    return word;
  };

  methods.lexemeCID = function (word) {
    return word;
  };

  // 1 when the token looks like an abbreviation: contains a letter and
  // ends with a period.
  methods.isAbbrev = function (word) {
    return (/[a-z].*\.$/i).test(word) ? 1 : 0;
  };

  // Offset of `word`'s lexeme id from its lowercase form's id (0–3);
  // logs and returns 0 when the lowercase entry is absent.
  methods.normal = function (word) {
    const lcwHash = fd.lexeme.hash[word.toLowerCase()];
    if (lcwHash === undefined) {
      console.log("[41m%s[0m entry is missing! (feature.normal)", JSON.stringify(word.toLowerCase()));
      return 0;
    }
    const offset = lcwHash - fd.lexeme.hash[word];
    if (offset < 0 || offset > 3) {
      throw new Error("feature.normal: offset of " + offset + " for " + JSON.stringify(word));
    }
    return offset;
  };

  // First matching token category; logs and falls back to the generic
  // word category when nothing matches.
  methods.tokenType = function (word) {
    for (let c = 0; c < rgxCatDetectors.length; c += 1) {
      if (rgxCatDetectors[c][0].test(word)) return rgxCatDetectors[c][1];
    }
    console.log("[41m%s[0m has unknown token type! (feature.tokenType)", JSON.stringify(word));
    return tcat.hash.word;
  };

  // Contextual PoS: cache lookup for known words (demoting words that are
  // capitalized mid-sentence to proper noun, code 12), affix heuristics for
  // unknown words, category defaults otherwise.
  methods.pos = function (word, category, cache) {
    if (isLexicographer) {
      const tags = fd.pos.hash[word];
      if (!tags) return lang.xpos.hash.UNK;
      return lang.xpos.hash[tags.length === 1 ? tags[0] : "UNK"];
    }
    let pos;
    if (category === tcat.hash.word) {
      const wordInLC = word.toLowerCase();
      const hash = cache.lookup(wordInLC)[0];
      if (hash < cache.intrinsicSize()) {
        const posOfWLC = cache.posOf(hash);
        const isFirstToken = prevWord === null || /^[\t\r\n.?!]+$/.test(prevWord);
        if (isFirstToken || (!rgxTC.test(word) && !rgxUC.test(word))) {
          pos = posOfWLC;
        } else {
          pos = 12;
        }
      } else {
        pos = posViaAffixes(word);
      }
    }
    prevWord = word;
    return pos || tcat2pos[category] || (rgxTC.test(word) ? 12 : 8);
  };

  // 1 when the word maps to exactly one PoS tag.
  methods.isSPoS = function (word) {
    const tags = fd.pos.hash[word];
    return (tags && tags.length === 1) ? 1 : 0;
  };

  // Lexeme id of the word's lemma; logs & returns 0 on missing entries.
  // NOTE(review): when `lmh` is undefined, `lmh[0]` below throws a TypeError
  // — this reproduces the original behavior exactly.
  methods.lemma = function (word) {
    if (fd.lexeme.hash[word] === 0) return 0;
    const lmh = fd.lemma.hash[word];
    if (lmh === undefined || fd.lexeme.hash[lmh[0]] === undefined) {
      console.log("[41m%s[0m entry is missing! (feature.lemma)", JSON.stringify(lmh[0]));
      return 0;
    }
    return fd.lexeme.hash[lmh[0]];
  };

  // 1 when the word has exactly one lemma that is itself a known lexeme.
  methods.isSLemma = function (word) {
    if (fd.lexeme.hash[word] === 0) return 0;
    const lmh = fd.lemma.hash[word];
    return (lmh && fd.lexeme.hash[lmh[0]] && lmh.length === 1) ? 1 : 0;
  };

  // 1 → lower case, 2 → upper case, 3 → title case, 0 → none of these.
  methods.lutCase = function (word) {
    if (rgxLC.test(word)) return 1;
    if (rgxUC.test(word)) return 2;
    if (rgxTC.test(word)) return 3;
    return 0;
  };

  return methods;
};

module.exports = feature;
// Feature extraction for the wink English lite web model (v1.7.0): maps token
// categories and common English affixes to part-of-speech (PoS) codes and
// exposes a factory that builds per-token feature methods. This version
// strips combining diacritical marks and word-joiner chars before computing
// a token's shape.
var tcat = require("./token-categories.js");

// Token-category index → default PoS code; 0 means "no default".
var tcat2pos = new Array(tcat.list.length);
tcat2pos.fill(0);
tcat2pos[0] = 17;
tcat2pos[2] = 9;
tcat2pos[3] = 15;
tcat2pos[4] = 15; // Fix: original assigned index 4 twice; once suffices.
tcat2pos[5] = 12;
tcat2pos[6] = 15;
tcat2pos[7] = 15;
tcat2pos[8] = 15;
tcat2pos[9] = 15;
tcat2pos[10] = 11;
tcat2pos[11] = 15;
tcat2pos[12] = 13;
tcat2pos[13] = 15;
tcat2pos[14] = 14;
tcat2pos[18] = 9;

// Common English suffixes → PoS code (prototype-less map so lookups can
// never hit Object.prototype properties).
var suffix2pos = Object.assign(Object.create(null), {
  ing: 16, ed: 16,
  ly: 3,
  able: 1, tish: 1, like: 1, ous: 1, ful: 1, ary: 1, less: 1,
  ier: 1, est: 1, fy: 1, ky: 1,
  es: 8, er: 8, or: 8, ity: 8, ion: 8,
  llah: 12, stan: 12, gton: 12, abad: 12, land: 12, pur: 12, tnam: 12
});

// Common English prefixes → PoS code.
var prefix2pos = Object.assign(Object.create(null), {
  anti: 1, post: 1, non: 1, cross: 8
});

/**
 * Builds the table of feature methods used by the tokenizer/tagger.
 *
 * @param {object} config - exposes numeric `suffix` & `prefix` lengths.
 * @param {object|null} lang - language pack supplying `trex.lex` token-type
 *   regex specs and `xpos.hash` PoS codes; may be null.
 * @param {object} featuresData - hashes for `lexeme`, `lemma` and `pos`.
 * @param {boolean} isLexicographer - when true, `pos()` answers straight from
 *   the PoS hash instead of the contextual heuristics.
 * @returns {object} prototype-less object of feature methods.
 */
var feature = function (config, lang, featuresData, isLexicographer) {
  // Case patterns: lower/upper/title-case words, allowing -, –, — and ".".
  const rgxLC = /^[a-z][a-z\-\–\—\.]*$/;
  const rgxUC = /^[A-Z][A-Z\-\–\—\.]*$/;
  const rgxTC = /^[A-Z][a-z\-\–\—\.]*$/;
  // Combining diacritical marks (U+0300–U+036F) and the word joiner (U+2060),
  // removed after NFD normalization so shape reflects the base characters.
  const rgxDiacriticalWordJoiner = /[\u0300-\u036f\u2060]/g;
  const fd = featuresData;
  const regexes = lang ? lang.trex.lex : null;
  const detectorCount = lang ? regexes.length : 0;
  const rgxCatDetectors = [];
  const methods = Object.create(null);
  // Previous word seen by `pos()` — used to detect sentence starts.
  let prevWord = null;

  for (let k = 0; k < detectorCount; k += 1) {
    rgxCatDetectors.push([new RegExp(regexes[k][0], regexes[k][1]), regexes[k][2]]);
  }

  // Affix-based PoS guess for words absent from the cache; noun (8) default.
  const posViaAffixes = function (word) {
    if (!rgxLC.test(word)) return 12;
    const wlc = word.toLowerCase();
    return suffix2pos[wlc.slice(-4)] ||
      suffix2pos[wlc.slice(-3)] ||
      suffix2pos[wlc.slice(-2)] ||
      prefix2pos[wlc.slice(0, 5)] ||
      prefix2pos[wlc.slice(0, 4)] ||
      prefix2pos[wlc.slice(0, 3)] ||
      prefix2pos[wlc.slice(0, 2)] ||
      8;
  };

  // Orthographic shape after decomposing accents (NFD) and dropping
  // diacritics/word joiners: runs of 4+ letters/digits collapse to 4
  // symbols, e.g. "Café" → "Xxxx", "Hello12" → "Xxxxxdd".
  methods.shape = function (word) {
    return word
      .normalize("NFD")
      .replace(rgxDiacriticalWordJoiner, "")
      .replace(/[A-Z]{4,}/g, "XXXX")
      .replace(/[A-Z]/g, "X")
      .replace(/[a-z]{4,}/g, "xxxx")
      .replace(/[a-z]/g, "x")
      .replace(/\d{4,}/g, "dddd")
      .replace(/\d/g, "d");
  };

  // Last `config.suffix` characters of the word.
  methods.suffix = function (word) {
    return word.slice(-config.suffix);
  };

  // First `config.prefix` characters of the word.
  methods.prefix = function (word) {
    return word.slice(0, config.prefix);
  };

  // Identity features.
  methods.lexeme = function (word) {
    return word;
  };

  methods.lexemeCID = function (word) {
    return word;
  };

  // 1 when the token looks like an abbreviation: contains a letter and
  // ends with a period.
  methods.isAbbrev = function (word) {
    return (/[a-z].*\.$/i).test(word) ? 1 : 0;
  };

  // Offset of `word`'s lexeme id from its lowercase form's id (0–3);
  // logs and returns 0 when the lowercase entry is absent.
  methods.normal = function (word) {
    const lcwHash = fd.lexeme.hash[word.toLowerCase()];
    if (lcwHash === undefined) {
      console.log("[41m%s[0m entry is missing! (feature.normal)", JSON.stringify(word.toLowerCase()));
      return 0;
    }
    const offset = lcwHash - fd.lexeme.hash[word];
    if (offset < 0 || offset > 3) {
      throw new Error("feature.normal: offset of " + offset + " for " + JSON.stringify(word));
    }
    return offset;
  };

  // First matching token category; logs and falls back to the generic
  // word category when nothing matches.
  methods.tokenType = function (word) {
    for (let c = 0; c < rgxCatDetectors.length; c += 1) {
      if (rgxCatDetectors[c][0].test(word)) return rgxCatDetectors[c][1];
    }
    console.log("[41m%s[0m has unknown token type! (feature.tokenType)", JSON.stringify(word));
    return tcat.hash.word;
  };

  // Contextual PoS: cache lookup for known words (demoting words that are
  // capitalized mid-sentence to proper noun, code 12), affix heuristics for
  // unknown words, category defaults otherwise.
  methods.pos = function (word, category, cache) {
    if (isLexicographer) {
      const tags = fd.pos.hash[word];
      if (!tags) return lang.xpos.hash.UNK;
      return lang.xpos.hash[tags.length === 1 ? tags[0] : "UNK"];
    }
    let pos;
    if (category === tcat.hash.word) {
      const wordInLC = word.toLowerCase();
      const hash = cache.lookup(wordInLC)[0];
      if (hash < cache.intrinsicSize()) {
        const posOfWLC = cache.posOf(hash);
        const isFirstToken = prevWord === null || /^[\t\r\n.?!]+$/.test(prevWord);
        if (isFirstToken || (!rgxTC.test(word) && !rgxUC.test(word))) {
          pos = posOfWLC;
        } else {
          pos = 12;
        }
      } else {
        pos = posViaAffixes(word);
      }
    }
    prevWord = word;
    return pos || tcat2pos[category] || (rgxTC.test(word) ? 12 : 8);
  };

  // 1 when the word maps to exactly one PoS tag.
  methods.isSPoS = function (word) {
    const tags = fd.pos.hash[word];
    return (tags && tags.length === 1) ? 1 : 0;
  };

  // Lexeme id of the word's lemma; logs & returns 0 on missing entries.
  methods.lemma = function (word) {
    if (fd.lexeme.hash[word] === 0) return 0;
    const lmh = fd.lemma.hash[word];
    if (lmh === undefined || fd.lexeme.hash[lmh[0]] === undefined) {
      // Fix: the original evaluated lmh[0] even when lmh was undefined,
      // throwing a TypeError on the very path meant to log-and-recover;
      // report the word itself in that case instead.
      console.log("[41m%s[0m entry is missing! (feature.lemma)", JSON.stringify(lmh ? lmh[0] : word));
      return 0;
    }
    return fd.lexeme.hash[lmh[0]];
  };

  // 1 when the word has exactly one lemma that is itself a known lexeme.
  methods.isSLemma = function (word) {
    if (fd.lexeme.hash[word] === 0) return 0;
    const lmh = fd.lemma.hash[word];
    return (lmh && fd.lexeme.hash[lmh[0]] && lmh.length === 1) ? 1 : 0;
  };

  // 1 → lower case, 2 → upper case, 3 → title case, 0 → none of these.
  methods.lutCase = function (word) {
    if (rgxLC.test(word)) return 1;
    if (rgxUC.test(word)) return 2;
    if (rgxTC.test(word)) return 3;
    return 0;
  };

  return methods;
};

module.exports = feature;
{ | ||
"name": "wink-eng-lite-web-model", | ||
"version": "1.6.0", | ||
"version": "1.7.0", | ||
"description": "Wink's English Language Light Web Model for Web Browsers", | ||
@@ -5,0 +5,0 @@ "keywords": [ |
Sorry, the diff of this file is too big to display
3820860