suffix-thumb
Advanced tools
Comparing version 3.1.5 to 4.0.0
@@ -1,1 +0,1 @@ | ||
import{unpack as e}from"efrt";const t=/^.([0-9]+)/,n=function(e,n){if(n.exceptions.hasOwnProperty(e))return function(e,n){let r=n.exceptions[e],o=r.match(t);if(null===o)return n.exceptions[e];let l=Number(o[1])||0;return e.substr(0,l)+r.replace(t,"")}(e,n);const r=function(e,t){let n=e[e.length-1],r=t.rules[n]||[];return 0===r.length&&(r=t.rules[""]||r),r}(e,n);for(let t=0;t<r.length;t+=1){let n=r[t][0];if(e.endsWith(n)){let o=new RegExp(n+"$");return e.replace(o,r[t][1])}}return e},r=function(e,t={}){let n={},r={};return e=e.filter((e=>void 0!==n[e[0]]?(t.verbose&&(console.warn("Duplicate left side:"),console.log(" 1.",[e[0],n[e[0]]]),console.log(" 2.",e)),!1):void 0!==r[e[1]]?(t.verbose&&(console.warn("Duplicate right side:"),console.log(" 1.",[r[e[1]],e[1]]),console.log(" 2.",e)),!1===t.inverse):(n[e[0]]=e[1],r[e[1]]=e[0],!0)))},o=function(e){let t={};return e.forEach((e=>{let n=e[0]||"",r=n[n.length-1]||"";t[r]=t[r]||[],t[r].push(e)})),t},l=function(e){return e=e.sort(((e,t)=>e[0].length>t[0].length?-1:e[0].length<t[0].length?1:0))},s=/^.([0-9]+)/,c=function(e){return Object.keys(e).forEach((t=>{let n=e[t],r=n.match(s);if(null!==r){let o=Number(r[1])||0,l=t.substring(0,o)+n.replace(s,"");e[t]=l}})),e},i=function(t={}){var n;return"string"==typeof t.exceptions&&(t.exceptions=e(t.exceptions),t.exceptions=c(t.exceptions)),"string"==typeof t.rules&&(t.rules=(n=t.rules)?(n=e(n),n=c(n),n=Object.entries(n),n=l(n),n=o(n)):{}),t},u=function(e){let t=[];var n;return Object.keys(e.rules).forEach((n=>{t=t.concat(e.rules[n].map((e=>[e[1],e[0]])))})),t=l(t),{rules:o(t),exceptions:(n=e.exceptions,Object.entries(n).reduce(((e,t)=>(e[t[1]]=t[0],e)),{}))}};export{n as convert,u as reverse,i as uncompress,r as validate}; | ||
const e=/^.([0-9]+)/,t=function(t,n,r){if(n.exceptions.hasOwnProperty(t))return r&&console.log("exception, ",t,n.exceptions[t]),function(t,n){let r=n.exceptions[t],l=r.match(e);if(null===l)return n.exceptions[t];let o=Number(l[1])||0;return t.substr(0,o)+r.replace(e,"")}(t,n);let l=n.rules;n.reversed&&(l=n.rev),l=function(e,t={}){let n=t[e[e.length-1]]||[];return t[""]&&(n=n.concat(t[""])),n}(t,l);for(let e=0;e<l.length;e+=1){let n=l[e][0];if(t.endsWith(n)){r&&console.log("rule, ",l[e]);let o=new RegExp(n+"$");return t.replace(o,l[e][1])}}return r&&console.log(" x - "+t),t},n=function(e,t={}){let n={},r={};return e=e.filter((e=>void 0!==n[e[0]]?(t.debug&&(console.warn("Duplicate left side:"),console.log(" 1.",[e[0],n[e[0]]]),console.log(" 2.",e)),!1):void 0!==r[e[1]]?(t.debug&&(console.warn("Duplicate right side:"),console.log(" 1.",[r[e[1]],e[1]]),console.log(" 2.",e)),!1===t.inverse):(n[e[0]]=e[1],r[e[1]]=e[0],!0)))},r=function(e){let t={};return e.forEach((e=>{let n=e[0]||"",r=n[n.length-1]||"";t[r]=t[r]||[],t[r].push(e)})),t},l=/^([0-9]+)/,o=function(e){const t=/\|/;return e.split(/,/).map((e=>{let n=e.split(t);return function(e="",t=""){let n=(t=String(t)).match(l);if(null===n)return[e,t];let r=Number(n[1])||0,o=e.substring(0,r);return[e,o+t.replace(l,"")]}(n[0],n[1])}))},s=function(e={}){return(e=Object.assign({},e)).rules=o(e.rules),e.rules=r(e.rules),e.rev=o(e.rev),e.rev=r(e.rev),e.exceptions=o(e.exceptions),e.exceptions=e.exceptions.reduce(((e,t)=>(e[t[0]]=t[1],e)),{}),e},u=function(e){let{rules:t,exceptions:n,rev:r}=e;var l;return l=n,n=Object.entries(l).reduce(((e,t)=>(e[t[1]]=t[0],e)),{}),{reversed:!Boolean(e.reversed),rules:t,exceptions:n,rev:r}},i=function(e,t){return t[e[e.length-1]]||[]},c=function(e,t,n){const r="Left",l="Right";if(t.exceptions.hasOwnProperty(e))return r;let o=Object.entries(t.exceptions);for(let t=0;t<o.length;t+=1)if(o[t][1]===e)return l;let s=i(e,t.rules);for(let t=0;t<s.length;t+=1)if(e.endsWith(s[t][0]))return r;s=i(e,t.rev);for(let t=0;t<s.length;t+=1)if(e.endsWith(s[t][0]))return l;s=i(e,t.rules);for(let t=0;t<s.length;t+=1)if(e.endsWith(s[t][1]))return l;s=i(e,t.rev);for(let t=0;t<s.length;t+=1)if(e.endsWith(s[t][1]))return r;return null};export{c as classify,t as convert,u as reverse,s as uncompress,n as validate}; |
@@ -1,7 +0,7 @@ | ||
/* suffix-thumb 3.1.5 MIT */ | ||
/* suffix-thumb 4.0.0 MIT */ | ||
(function (global, factory) { | ||
typeof exports === 'object' && typeof module !== 'undefined' ? factory(exports, require('efrt')) : | ||
typeof define === 'function' && define.amd ? define(['exports', 'efrt'], factory) : | ||
(global = typeof globalThis !== 'undefined' ? globalThis : global || self, factory(global.suffixThumb = {}, global.efrt)); | ||
})(this, (function (exports, efrt) { 'use strict'; | ||
typeof exports === 'object' && typeof module !== 'undefined' ? factory(exports) : | ||
typeof define === 'function' && define.amd ? define(['exports'], factory) : | ||
(global = typeof globalThis !== 'undefined' ? globalThis : global || self, factory(global.suffixThumb = {})); | ||
})(this, (function (exports) { 'use strict'; | ||
@@ -25,22 +25,33 @@ const prefix$1 = /^.([0-9]+)/; | ||
// get suffix-rules according to last char of word | ||
const getRules$1 = function (word, model) { | ||
const getRules$1 = function (word, rules = {}) { | ||
let char = word[word.length - 1]; | ||
let rules = model.rules[char] || []; | ||
if (rules.length === 0) { | ||
// do we have a generic suffix? | ||
rules = model.rules[''] || rules; | ||
let list = rules[char] || []; | ||
// do we have a generic suffix? | ||
if (rules['']) { | ||
list = list.concat(rules['']); | ||
} | ||
return rules | ||
return list | ||
}; | ||
const convert = function (word, model) { | ||
const convert = function (word, model, debug) { | ||
// check list of irregulars | ||
if (model.exceptions.hasOwnProperty(word)) { | ||
if (debug) { | ||
console.log("exception, ", word, model.exceptions[word]); | ||
} | ||
return getKeyVal(word, model) | ||
} | ||
// if model is reversed, try rev rules | ||
let rules = model.rules; | ||
if (model.reversed) { | ||
rules = model.rev; | ||
} | ||
// try suffix rules | ||
const rules = getRules$1(word, model); | ||
rules = getRules$1(word, rules); | ||
for (let i = 0; i < rules.length; i += 1) { | ||
let suffix = rules[i][0]; | ||
if (word.endsWith(suffix)) { | ||
if (debug) { | ||
console.log("rule, ", rules[i]); | ||
} | ||
let reg = new RegExp(suffix + '$'); | ||
@@ -50,2 +61,5 @@ return word.replace(reg, rules[i][1]) | ||
} | ||
if (debug) { | ||
console.log(' x - ' + word); | ||
} | ||
// return the original word unchanged | ||
@@ -55,299 +69,290 @@ return word | ||
const max = 6; | ||
const getSuffixes = function (str = '') { | ||
let list = []; | ||
for (let i = max; i >= 0; i -= 1) { | ||
if (str.length - 1 <= i) { | ||
continue | ||
// longest common prefix | ||
const findOverlap = (from, to) => { | ||
let all = []; | ||
for (let i = 0; i < from.length; i += 1) { | ||
if (from[i] === to[i]) { | ||
all.push(from[i]); | ||
} else { | ||
break | ||
} | ||
let n = str.length - i - 1; | ||
let suffix = str.substring(n, n + str.length - 1); | ||
list.push(suffix); | ||
} | ||
return list | ||
return all.join('') | ||
}; | ||
const getAll = function (arr) { | ||
const suffixes = {}; | ||
arr.forEach((a) => { | ||
let [from, to] = a; | ||
let fromList = getSuffixes(from); | ||
fromList.push(''); //add a prepend-only option | ||
fromList.forEach((left) => { | ||
suffixes[left] = suffixes[left] || {}; | ||
let toList = getSuffixes(to); | ||
toList.forEach((right) => { | ||
suffixes[left][right] = suffixes[left][right] || 0; | ||
suffixes[left][right] += 1; | ||
}); | ||
}); | ||
let compress$1 = function (key, val) { | ||
let prefix = findOverlap(key, val); | ||
if (prefix.length < 1) { | ||
return [key, val] | ||
} | ||
let out = prefix.length + val.substr(prefix.length); | ||
return [key, out] | ||
}; | ||
// console.log(compress('fixture', 'fixturing')) | ||
// index rules by last-char | ||
const indexRules = function (rules) { | ||
let byChar = {}; | ||
rules.forEach((a) => { | ||
let suff = a[0] || ''; | ||
let char = suff[suff.length - 1] || ''; | ||
byChar[char] = byChar[char] || []; | ||
byChar[char].push(a); | ||
}); | ||
return suffixes | ||
return byChar | ||
}; | ||
const topChange = function (obj, from) { | ||
let keys = Object.keys(obj); | ||
let arr = keys.map((to) => { | ||
return { | ||
from: from, | ||
to: to, | ||
yes: obj[to], | ||
} | ||
const unIndex = function (byChar) { | ||
let arr = []; | ||
Object.keys(byChar).forEach(k => { | ||
arr = arr.concat(byChar[k]); | ||
}); | ||
arr = arr.sort((a, b) => { | ||
if (a.yes > b.yes) { | ||
return -1 | ||
} else if (a.yes < b.yes) { | ||
return 1 | ||
} | ||
return 0 | ||
}); | ||
return arr | ||
}; | ||
const findBest = function (suffixes) { | ||
let good = []; | ||
Object.keys(suffixes).forEach((left) => { | ||
let top = topChange(suffixes[left], left); | ||
if (top[0] && top[0].yes > 1) { | ||
good.push(top[0]); | ||
} | ||
// remove shared data in key-val pairs | ||
// uses an ad-hoc run-length encoding format | ||
// {walk: walking} -> {walk: '.4ing'} | ||
const pressPairs = function (pairs) { | ||
pairs = pairs.map(a => { | ||
return compress$1(a[0], a[1]).join('|') | ||
}); | ||
good = good.sort((a, b) => { | ||
if (a.yes > b.yes) { | ||
return -1 | ||
} else if (a.yes < b.yes) { | ||
return 1 | ||
} | ||
return 0 | ||
}); | ||
return good | ||
return pairs.join(',') | ||
}; | ||
const getScores = function (arr, pairs) { | ||
return arr.map((obj) => { | ||
let yes = 0; | ||
let no = 0; | ||
let exceptions = {}; | ||
pairs.forEach((pair) => { | ||
if (pair[0].endsWith(obj.from)) { | ||
let reg = new RegExp(obj.from + '$');//unsafe | ||
let have = pair[0].replace(reg, obj.to); | ||
if (have === pair[1]) { | ||
yes += 1; | ||
} else { | ||
no += 1; | ||
exceptions[pair[0]] = pair[1]; | ||
} | ||
} | ||
}); | ||
return { | ||
from: obj.from, | ||
to: obj.to, | ||
yes: yes, | ||
no: no, | ||
percent: yes / (yes + no), | ||
exceptions: exceptions, | ||
} | ||
}) | ||
const compress = function (model = {}) { | ||
model = Object.assign({}, model); | ||
// compress fwd rules | ||
model.rules = unIndex(model.rules); | ||
model.rules = pressPairs(model.rules); | ||
// compress reverse rules | ||
model.rev = unIndex(model.rev); | ||
model.rev = pressPairs(model.rev); | ||
// compress exceptions | ||
model.exceptions = Object.entries(model.exceptions); | ||
model.exceptions = pressPairs(model.exceptions); | ||
return model | ||
}; | ||
const rank = function (arr, pairs) { | ||
let scored = getScores(arr, pairs); | ||
// baseline filter | ||
scored = scored.filter((o) => { | ||
return o.yes > 1 && o.yes > o.no | ||
}); | ||
// sort by # of positive | ||
scored = scored.sort((a, b) => { | ||
if (a.yes > b.yes) { | ||
return -1 | ||
} else if (a.yes < b.yes) { | ||
return 1 | ||
} | ||
return 0 | ||
}); | ||
return scored | ||
const prefix = /^([0-9]+)/; | ||
const expand = function (key = '', val = '') { | ||
val = String(val); | ||
let m = val.match(prefix); | ||
if (m === null) { | ||
return [key, val] | ||
} | ||
let num = Number(m[1]) || 0; | ||
let pre = key.substring(0, num); | ||
let full = pre + val.replace(prefix, ''); | ||
return [key, full] | ||
}; | ||
// remove any redundant rules | ||
const squeeze = function (arr) { | ||
let redundant = {}; | ||
arr.forEach((o, i) => { | ||
let downstream = arr.slice(i + 1, arr.length); | ||
downstream.forEach((d) => { | ||
if (d.from.endsWith(o.from)) { | ||
// also ensure the surviving one has no exceptions | ||
if (d.no === 0) { | ||
redundant[d.from] = true; | ||
} | ||
} | ||
}); | ||
}); | ||
// actually remove any redundant suffixes | ||
arr = arr.filter((o) => { | ||
return redundant.hasOwnProperty(o.from) === false | ||
}); | ||
return arr | ||
const toArray = function (txt) { | ||
const pipe = /\|/; | ||
return txt.split(/,/).map(str => { | ||
let a = str.split(pipe); | ||
return expand(a[0], a[1]) | ||
}) | ||
}; | ||
function reverse$1(str) { | ||
return str.split('').reverse().join('') | ||
} | ||
const uncompress = function (model = {}) { | ||
model = Object.assign({}, model); | ||
const fmtRules = function (rules) { | ||
// sort by length, then by suffix | ||
rules = rules.sort((a, b) => { | ||
if (a.from.length > b.from.length) { | ||
return -1 | ||
} else if (a.from.length < b.from.length) { | ||
return 1 | ||
} | ||
a = reverse$1(a.from); | ||
b = reverse$1(b.from); | ||
if (a > b) { | ||
return 1 | ||
} else if (a < b) { | ||
return -1 | ||
} | ||
return 0 | ||
}); | ||
return rules.map((o) => [o.from, o.to]) | ||
// compress fwd rules | ||
model.rules = toArray(model.rules); | ||
model.rules = indexRules(model.rules); | ||
// compress reverse rules | ||
model.rev = toArray(model.rev); | ||
model.rev = indexRules(model.rev); | ||
// compress exceptions | ||
model.exceptions = toArray(model.exceptions); | ||
model.exceptions = model.exceptions.reduce((h, a) => { | ||
h[a[0]] = a[1]; | ||
return h | ||
}, {}); | ||
return model | ||
}; | ||
const format = function (rules, pairs) { | ||
let exceptions = {}; | ||
rules.forEach((rule) => { | ||
Object.assign(exceptions, rule.exceptions); | ||
}); | ||
rules = fmtRules(rules); | ||
// console.log(expand('fixture', '6ing')) | ||
// console.log(toArray('heard|4')) | ||
// find remaining pairs with no rule | ||
let remaining = pairs.filter((pair) => { | ||
if (exceptions.hasOwnProperty(pair[0])) { | ||
return false | ||
} | ||
// console.log(rules.find((rule) => pair[0].endsWith(rule.from))) | ||
if (rules.find((rule) => pair[0].endsWith(rule.from))) { | ||
return false | ||
} | ||
return true | ||
}); | ||
// add them to exceptions list | ||
remaining.forEach(a => { | ||
exceptions[a[0]] = a[1]; | ||
}); | ||
const reverseObj = function (obj) { | ||
return Object.entries(obj).reduce((h, a) => { | ||
h[a[1]] = a[0]; | ||
return h | ||
}, {}) | ||
}; | ||
const reverse = function (model) { | ||
let { rules, exceptions, rev } = model; | ||
exceptions = reverseObj(exceptions); | ||
return { | ||
reversed: !Boolean(model.reversed),//toggle this | ||
rules, | ||
exceptions: exceptions, | ||
exceptions, | ||
rev | ||
} | ||
}; | ||
const firstPass = function (pairs) { | ||
pairs = pairs.filter((a) => a && a[0] && a[1]); | ||
// look at all patterns | ||
const suffixes = getAll(pairs); | ||
// look for the greatest patterns | ||
let best = findBest(suffixes); | ||
// run pattern against the pairs | ||
let rules = rank(best, pairs); | ||
// console.log(rules) | ||
// remove duplicates | ||
rules = squeeze(rules); | ||
// nice result format | ||
let res = format(rules, pairs); | ||
// console.log(res) | ||
return res | ||
// approximate file-size of given text | ||
const fileSize = (txt) => { | ||
if (!txt) { | ||
return '0kb' | ||
} | ||
if (typeof txt === 'object') { | ||
txt = JSON.stringify(txt); | ||
} | ||
let unit = 'kb'; | ||
let num = Buffer.byteLength(txt, 'utf8'); | ||
num = num / 1000; | ||
if (num > 1000) { | ||
unit = 'mb'; | ||
num = num / 1000; | ||
} | ||
num = Math.round(num * 10) / 10;//round it | ||
return num.toLocaleString() + unit | ||
}; | ||
const reduceExceptions = function (res) { | ||
let final = {}; | ||
let { rules, exceptions } = res; | ||
Object.keys(exceptions).forEach(k => { | ||
let found = rules.find((rule) => { | ||
return k.endsWith(rule[0]) | ||
}); | ||
// no rule applies | ||
if (!found) { | ||
final[k] = exceptions[k]; | ||
return | ||
} | ||
let tmp = k.replace(found[0], found[1]); | ||
// did we do it wrong? | ||
if (tmp !== exceptions[k]) { | ||
final[k] = exceptions[k]; //still an exception then | ||
} | ||
}); | ||
return final | ||
// get suffix-rules according to last char of word | ||
const getRules = function (word, modelRules) { | ||
let char = word[word.length - 1]; | ||
let rules = modelRules[char] || []; | ||
// if (modelRules['']) { | ||
// // do we have a generic suffix? | ||
// rules = rules.concat(modelRules['']) | ||
// } | ||
return rules | ||
}; | ||
const postProcess = function (res) { | ||
// some exceptions are not anymore | ||
res.exceptions = reduceExceptions(res); | ||
return res | ||
const classify = function (str, model, debug) { | ||
const l = 'Left'; | ||
const r = 'Right'; | ||
// check known exceptions | ||
if (model.exceptions.hasOwnProperty(str)) { | ||
return l | ||
} | ||
let list = Object.entries(model.exceptions); | ||
for (let i = 0; i < list.length; i += 1) { | ||
if (list[i][1] === str) { | ||
return r | ||
} | ||
} | ||
// check rules | ||
let rules = getRules(str, model.rules); | ||
for (let i = 0; i < rules.length; i += 1) { | ||
if (str.endsWith(rules[i][0])) { | ||
return l | ||
} | ||
} | ||
rules = getRules(str, model.rev); | ||
for (let i = 0; i < rules.length; i += 1) { | ||
if (str.endsWith(rules[i][0])) { | ||
return r | ||
} | ||
} | ||
// check weak-side of rules | ||
rules = getRules(str, model.rules); | ||
for (let i = 0; i < rules.length; i += 1) { | ||
if (str.endsWith(rules[i][1])) { | ||
return r | ||
} | ||
} | ||
rules = getRules(str, model.rev); | ||
for (let i = 0; i < rules.length; i += 1) { | ||
if (str.endsWith(rules[i][1])) { | ||
return l | ||
} | ||
} | ||
return null | ||
}; | ||
// index rules by last-char | ||
const indexRules = function (rules) { | ||
let byChar = {}; | ||
rules.forEach((a) => { | ||
let suff = a[0] || ''; | ||
let char = suff[suff.length - 1] || ''; | ||
byChar[char] = byChar[char] || []; | ||
byChar[char].push(a); | ||
}); | ||
return byChar | ||
}; | ||
const green = str => '\x1b[32m' + str + '\x1b[0m'; | ||
const red = str => '\x1b[31m' + str + '\x1b[0m'; | ||
const blue = str => '\x1b[34m' + str + '\x1b[0m'; | ||
const yellow = str => '\x1b[33m' + str + '\x1b[0m'; | ||
const dim = str => '\x1b[2m' + str + '\x1b[0m'; | ||
const unIndex = function (byChar) { | ||
let arr = []; | ||
Object.keys(byChar).forEach(k => { | ||
arr = arr.concat(byChar[k]); | ||
const testFwd = function (pairs, model) { | ||
let wrong = 0; | ||
pairs.forEach((a) => { | ||
let created = convert(a[0], model); | ||
if (created !== a[1]) { | ||
wrong += 1; | ||
console.log(red('error:'), yellow(a[0] + ' →' + created)); | ||
} | ||
}); | ||
return arr | ||
if (wrong === 0) { | ||
console.log(green(` ✓ forward`)); | ||
} else { | ||
console.log(red(` ✗ ${wrong} `) + 'errors\n'); | ||
} | ||
return wrong | ||
}; | ||
const sortRules = function (rules) { | ||
rules = rules.sort((a, b) => { | ||
if (a[0].length > b[0].length) { | ||
return -1 | ||
} else if (a[0].length < b[0].length) { | ||
return 1 | ||
const testBack = function (pairs, model) { | ||
let wrong = 0; | ||
let rev = reverse(model); | ||
pairs.forEach((a) => { | ||
let created = convert(a[1], rev); | ||
if (created !== a[0]) { | ||
wrong += 1; | ||
console.log(red(' rev ✗: '), yellow(a[1] + ' → ' + created)); | ||
} | ||
return 0 | ||
}); | ||
return rules | ||
if (wrong === 0) { | ||
console.log(green(` ✓ backward`)); | ||
} else { | ||
console.log(red(` ✗ ${wrong} `) + 'errors reversed\n'); | ||
} | ||
return wrong | ||
}; | ||
// add all reverse-exceptions | ||
const addInverse = function (model, pairs) { | ||
// create a reverse model | ||
let tmp = Object.assign({}, model); | ||
tmp.rules = indexRules(model.rules); | ||
let rev = reverse(tmp); | ||
// look for exceptions | ||
pairs.forEach(a => { | ||
let [left, right] = a; | ||
if (convert(right, rev) !== left) { | ||
// console.log(a) | ||
model.exceptions[a[0]] = a[1]; | ||
} | ||
}); | ||
// console.log(convert('relearn', rev)) | ||
return model | ||
const testSize = function (pairs, model) { | ||
let before = fileSize(pairs); | ||
let smol = compress(model); | ||
let after = fileSize(smol); | ||
console.log(` ${dim(before)} -> ${blue(after)}`); | ||
}; | ||
const secondPass = function (res, pairs, opts) { | ||
// remove redundant exceptions | ||
res = postProcess(res); | ||
// turn some exceptions into singleton suffix-rules | ||
// res = toRules(res, pairs) | ||
if (opts.inverse !== false) { | ||
res = addInverse(res, pairs); | ||
} | ||
return res | ||
const stats = function (model) { | ||
let rules = 0; | ||
Object.keys(model.rules).forEach(k => rules += model.rules[k].length); | ||
let rev = 0; | ||
Object.keys(model.rev || {}).forEach(k => rev += model.rev[k].length); | ||
let exc = Object.keys(model.exceptions).length; | ||
console.log(` ${blue(rules.toLocaleString())} rules, ${yellow(rev.toLocaleString())} reversed, ${blue(exc.toLocaleString())} exceptions`); | ||
}; | ||
const test = function (pairs, opts) { | ||
console.log('\n'); | ||
console.log(yellow(pairs.length.toLocaleString()) + ` pairs - ${dim(fileSize(pairs))}`); | ||
let begin = new Date(); | ||
let model = learnBoth(pairs, opts); | ||
let end = new Date(); | ||
console.log(' ', (end.getTime() - begin.getTime()) / 1000, 'seconds'); | ||
console.log(yellow('\nSize:')); | ||
stats(model); | ||
testSize(pairs, model); | ||
model = compress(model); | ||
model = uncompress(model); | ||
console.log(yellow('\nForward:')); | ||
testFwd(pairs, model); | ||
console.log(yellow('\nBackward:')); | ||
testBack(pairs, model); | ||
// hmm | ||
// console.log(yellow('\nClassify:')) | ||
// testSide(pairs, model, 'Left') | ||
// testSide(pairs, model, 'Right') | ||
}; | ||
// make sure inputs are not impossible to square-up | ||
@@ -359,3 +364,3 @@ const validate = function (pairs, opts = {}) { | ||
if (left[a[0]] !== undefined) { | ||
if (opts.verbose) { | ||
if (opts.debug) { | ||
console.warn('Duplicate left side:'); | ||
@@ -368,3 +373,3 @@ console.log(' 1.', [a[0], left[a[0]]]); | ||
if (right[a[1]] !== undefined) { | ||
if (opts.verbose) { | ||
if (opts.debug) { | ||
console.warn('Duplicate right side:'); | ||
@@ -386,134 +391,203 @@ console.log(' 1.', [right[a[1]], a[1]]); | ||
const learn = function (pairs, opts = {}) { | ||
// ensure input pairs are possible | ||
pairs = validate(pairs, opts); | ||
// create basic {rules, exceptions} | ||
let res = firstPass(pairs); | ||
// optimize it further | ||
res = secondPass(res, pairs, opts); | ||
// organize rules by their suffix char | ||
res.rules = indexRules(res.rules); | ||
return res | ||
}; | ||
const max = 6; | ||
// longest common prefix | ||
const findOverlap = (from, to) => { | ||
let all = []; | ||
for (let i = 0; i < from.length; i += 1) { | ||
if (from[i] === to[i]) { | ||
all.push(from[i]); | ||
} else { | ||
break | ||
const getSuffixes = function (str = '') { | ||
let list = []; | ||
for (let i = max; i >= 0; i -= 1) { | ||
if (str.length - 1 <= i) { | ||
continue | ||
} | ||
let n = str.length - i - 1; | ||
let suffix = str.substring(n, n + str.length - 1); | ||
list.push(suffix); | ||
} | ||
return all.join('') | ||
return list.reverse() | ||
}; | ||
// remove shared data in key-val pairs | ||
// uses an ad-hoc run-length encoding format | ||
// {walk: walking} -> {walk: '.4ing'} | ||
const pressObj = function (obj) { | ||
let res = {}; | ||
Object.keys(obj).forEach((k) => { | ||
let val = obj[k]; | ||
let prefix = findOverlap(k, val); | ||
if (prefix.length < 2) { | ||
res[k] = val; | ||
return | ||
const getDiff = function (left, right, suff) { | ||
suff = suff.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); // $& means the whole matched string | ||
let reg = new RegExp(suff + '$'); | ||
let stem = left.replace(reg, ''); | ||
if (!right.startsWith(stem)) { | ||
return | ||
} | ||
stem = stem.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); | ||
let start = new RegExp('^' + stem); | ||
let rest = right.replace(start, ''); | ||
return { from: suff, to: rest, id: suff + '|' + rest, reg } | ||
}; | ||
const unique = function (arr) { | ||
let set = new Set(); | ||
return arr.filter(a => { | ||
if (set.has(a.id)) { | ||
return false | ||
} | ||
let out = '.' + prefix.length + val.substr(prefix.length); | ||
res[k] = out; | ||
set.add(a.id); | ||
return true | ||
}) | ||
}; | ||
const getAll = function (arr) { | ||
let res = []; | ||
arr.forEach((a) => { | ||
let [left, right] = a; | ||
let list = getSuffixes(left); | ||
list.forEach(suff => { | ||
let diff = getDiff(left, right, suff); | ||
if (diff) { | ||
res.push(diff); | ||
} | ||
}); | ||
}); | ||
res = unique(res); | ||
return res | ||
}; | ||
const toObj = (rules) => { | ||
return rules.reduce((h, a) => { | ||
h[a[0]] = a[1]; | ||
return h | ||
}, {}) | ||
// console.log(getAll([['laughed', 'laughing']])) | ||
const noDupes = function (rules) { | ||
let already = new Set(); | ||
rules = rules.filter(r => { | ||
if (already.has(r.from)) { | ||
return false | ||
} | ||
already.add(r.from); | ||
return true | ||
}); | ||
return rules | ||
}; | ||
const packRules = function (rules) { | ||
rules = unIndex(rules); | ||
rules = toObj(rules); | ||
rules = pressObj(rules); | ||
rules = efrt.pack(rules); | ||
const cleanup = function (rules) { | ||
// only helpful ones | ||
rules = rules.filter(r => r.yes > 0 && r.yes > r.no); | ||
// one rule per suffix | ||
rules = noDupes(rules); | ||
return rules | ||
}; | ||
const compress = function (model = {}) { | ||
model.rules = packRules(model.rules); | ||
// compress exceptions | ||
model.exceptions = pressObj(model.exceptions); | ||
model.exceptions = efrt.pack(model.exceptions); | ||
return model | ||
const getCounts = function (rule, pairs) { | ||
let yes = 0; | ||
let no = 0; | ||
pairs.forEach(pair => { | ||
let [left, right] = pair; | ||
if (!rule.reg.test(left)) { | ||
return | ||
} | ||
// console.log(replace(left, rule.from, rule.to), left.replace(rule.reg, rule.to)) | ||
// if (replace(left, rule.from, rule.to) === right) { | ||
if (left.replace(rule.reg, rule.to) === right) { | ||
yes += 1; | ||
} else { | ||
no += 1; | ||
} | ||
}); | ||
return { yes, no } | ||
}; | ||
const prefix = /^.([0-9]+)/; | ||
const unEncode = function (obj) { | ||
Object.keys(obj).forEach(k => { | ||
let val = obj[k]; | ||
let m = val.match(prefix); | ||
if (m !== null) { | ||
let num = Number(m[1]) || 0; | ||
let pre = k.substring(0, num); | ||
let full = pre + val.replace(prefix, ''); | ||
obj[k] = full; | ||
const score = function (rules, pairs, opts = {}) { | ||
rules = rules.map(rule => { | ||
let { yes, no } = getCounts(rule, pairs); | ||
rule.yes = yes; | ||
rule.no = no; | ||
delete rule.id; | ||
return rule | ||
}); | ||
// worst-to-best | ||
rules = rules.sort((a, b) => { | ||
if (a.yes > b.yes) { | ||
return 1 | ||
} else if (a.yes < b.yes) { | ||
return -1 | ||
} | ||
return 0 | ||
}); | ||
return obj | ||
return rules | ||
}; | ||
const unpackRules = function (rules) { | ||
if (!rules) { | ||
return {} | ||
} | ||
// un-do our trie compression | ||
rules = efrt.unpack(rules); | ||
// un-do our run-length encoding | ||
rules = unEncode(rules); | ||
// turn into an array | ||
rules = Object.entries(rules); | ||
// ensure they are longest-first order | ||
rules = sortRules(rules); | ||
// index by end-char | ||
rules = indexRules(rules); | ||
const findRules = function (pairs, opts = {}) { | ||
let rules = getAll(pairs); | ||
rules = score(rules, pairs, opts); | ||
rules = cleanup(rules); | ||
return rules | ||
}; | ||
const updateRules = function (rules, pairs, opts) { | ||
rules = score(rules, pairs, opts); | ||
rules = cleanup(rules); | ||
return rules | ||
}; | ||
const uncompress = function (model = {}) { | ||
if (typeof model.exceptions === 'string') { | ||
model.exceptions = efrt.unpack(model.exceptions); | ||
model.exceptions = unEncode(model.exceptions); | ||
} | ||
if (typeof model.rules === 'string') { | ||
model.rules = unpackRules(model.rules); | ||
} | ||
return model | ||
const trimPairs = function (pairs, rule) { | ||
let { reg, to } = rule; | ||
let done = []; | ||
let remain = pairs.filter(pair => { | ||
let [left, right] = pair; | ||
if (left.match(reg)) { | ||
if (left.replace(reg, to) === right) { | ||
done.push(pair); | ||
return false //done with it | ||
} | ||
} | ||
return true // keep it | ||
}); | ||
return { remain, done } | ||
}; | ||
// remove any rules that challenge existing pairs | ||
const trimRules = function (rules, pairsDone) { | ||
return rules.filter(r => { | ||
for (let i = 0; i < pairsDone.length; i += 1) { | ||
let pair = pairsDone[i]; | ||
if (r.reg.test(pair[0]) && pair[0].replace(r.reg, r.to) !== pair[1]) { | ||
// console.log('banned rule:', r) | ||
return false | ||
} | ||
} | ||
return true | ||
}) | ||
}; | ||
const reverseObj = function (obj) { | ||
return Object.entries(obj).reduce((h, a) => { | ||
h[a[1]] = a[0]; | ||
const learn = function (pairs, opts = {}) { | ||
pairs = validate(pairs, opts); | ||
let rules = findRules(pairs); | ||
let pairsLeft = pairs; | ||
let pairsDone = []; | ||
let chosen = []; | ||
// pick our top rule | ||
while (pairsLeft.length > 0 && rules.length > 0) { | ||
let rule = rules.pop(); | ||
chosen.push([rule.from, rule.to]); | ||
// remove now-covered pairs | ||
let res = trimPairs(pairsLeft, rule); | ||
pairsLeft = res.remain; | ||
pairsDone = pairsDone.concat(res.done); | ||
// remove now-unsafe rules | ||
rules = trimRules(rules, pairsDone); | ||
// re-rank our rules | ||
rules = updateRules(rules, pairsLeft, opts); | ||
// logging | ||
if (opts.debug) { | ||
console.log(`\n${rule.from} -> ${rule.to || "''"}`); | ||
console.log(` \x1b[32m +${res.done.length.toLocaleString()} pairs\x1b[0m`); | ||
console.log(' ', pairsLeft.length, 'remaining'); | ||
console.log(' ', rules.length, 'rules left'); | ||
} | ||
} | ||
// turn em upside-down | ||
chosen = chosen.reverse(); | ||
// remaining pairs are exceptions | ||
let exceptions = pairsLeft.reduce((h, a) => { | ||
h[a[0]] = a[1]; | ||
return h | ||
}, {}) | ||
}; | ||
}, {}); | ||
const reverseArr = function (arr) { | ||
return arr.map(a => [a[1], a[0]]) | ||
}; | ||
const reverse = function (model) { | ||
let allRules = []; | ||
Object.keys(model.rules).forEach(k => { | ||
allRules = allRules.concat(reverseArr(model.rules[k])); | ||
}); | ||
allRules = sortRules(allRules); | ||
let rules = indexRules(allRules); | ||
let exceptions = reverseObj(model.exceptions); | ||
return { | ||
rules, | ||
rules: chosen, | ||
exceptions | ||
@@ -523,34 +597,30 @@ } | ||
// get suffix-rules according to last char of word | ||
const getRules = function (word, model) { | ||
let char = word[word.length - 1]; | ||
let rules = model.rules[char] || []; | ||
if (rules.length === 0) { | ||
// do we have a generic suffix? | ||
rules = model.rules[''] || rules; | ||
} | ||
return rules | ||
const mergeExceptions = function (fwd, bkwd) { | ||
Object.entries(bkwd).forEach(b => { | ||
fwd[b[1]] = b[0]; //reverse | ||
}); | ||
return fwd | ||
}; | ||
const debug = function (word, model) { | ||
if (model.exceptions.hasOwnProperty(word)) { | ||
let obj = {}; | ||
obj[word] = model.exceptions[word]; | ||
return { found: 'exception', exception: obj } | ||
const learnBoth = function (pairs, opts = {}) { | ||
let fwd = learn(pairs, opts); | ||
// learn backward too? | ||
if (opts.reverse !== false) { | ||
pairs = pairs.map(a => [a[1], a[0]]); | ||
let bkwd = learn(pairs, opts); | ||
// merge exceptions | ||
fwd.exceptions = mergeExceptions(fwd.exceptions, bkwd.exceptions); | ||
// add rules | ||
fwd.rev = indexRules(bkwd.rules); | ||
} | ||
const rules = getRules(word, model); | ||
for (let i = 0; i < rules.length; i += 1) { | ||
let suffix = rules[i][0]; | ||
if (word.endsWith(suffix)) { | ||
return { found: 'rule', rule: rules[i] } | ||
} | ||
} | ||
return { found: null } | ||
fwd.rules = indexRules(fwd.rules); | ||
return fwd | ||
}; | ||
exports.classify = classify; | ||
exports.compress = compress; | ||
exports.convert = convert; | ||
exports.debug = debug; | ||
exports.learn = learn; | ||
exports.learn = learnBoth; | ||
exports.reverse = reverse; | ||
exports.test = test; | ||
exports.uncompress = uncompress; | ||
@@ -557,0 +627,0 @@ exports.validate = validate; |
{ | ||
"name": "suffix-thumb", | ||
"description": "learn transformations between two sets of words", | ||
"version": "3.1.5", | ||
"version": "4.0.0", | ||
"author": "Spencer Kelly <spencermountain@gmail.com> (http://spencermounta.in)", | ||
"main": "./src/index.js", | ||
"unpkg": "./builds/suffix-thumb-client.js", | ||
"types": "types/index.d.ts", | ||
"types": "/builds/types.d.ts", | ||
"type": "module", | ||
@@ -41,16 +41,14 @@ "sideEffects": false, | ||
], | ||
"dependencies": { | ||
"efrt": "2.5.0" | ||
}, | ||
"dependencies": {}, | ||
"devDependencies": { | ||
"@rollup/plugin-commonjs": "21.0.1", | ||
"@rollup/plugin-node-resolve": "13.1.2", | ||
"@rollup/plugin-node-resolve": "13.1.3", | ||
"amble": "1.3.0", | ||
"rollup": "2.63.0", | ||
"rollup": "2.67.2", | ||
"rollup-plugin-filesize-check": "0.0.1", | ||
"rollup-plugin-terser": "7.0.2", | ||
"tap-dancer": "0.3.4", | ||
"tape": "5.4.0" | ||
"tape": "5.5.2" | ||
}, | ||
"license": "MIT" | ||
} |
@@ -83,3 +83,3 @@ <div align="center"> | ||
```js | ||
learn(pairs, {inverse: false}) | ||
learn(pairs, {reverse: false}) | ||
``` | ||
@@ -137,3 +137,3 @@ you can expect the model to be 5% smaller or so - not much. | ||
] | ||
let model = learn(pairs, {inverse: false}) | ||
let model = learn(pairs, {reverse: false}) | ||
let out = convert('ok', model) | ||
@@ -143,2 +143,21 @@ // 'right' | ||
### Classify | ||
the model can also be used to classify whether a given word belongs to either Left or Right sides. | ||
```js | ||
import { learn, classify } from 'suffix-thumb' | ||
let pairs = [ | ||
['walk', 'walked'], | ||
['talk', 'talked'], | ||
['go', 'went'], | ||
] | ||
let model = learn(pairs) | ||
let out = classify('stalked', model) | ||
// 'Right' | ||
out = classify('waited', model) | ||
// null | ||
``` | ||
Unlike convert, the classifier is not guarnteed to return 100% on the training data. | ||
The classifier will generally hit high-90s on the given dataset, but how-well it generalizes to novel input is up-to the dataset. | ||
<!-- spacer --> | ||
@@ -163,8 +182,2 @@ <img height="50px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/> | ||
if you find an issue, you can use debug(): | ||
```js | ||
import { debug } from 'suffix-thumb' | ||
let out = debug('walk', model) | ||
// --which rule/exception was triggered-- | ||
``` | ||
<!-- spacer --> | ||
@@ -175,3 +188,3 @@ <img height="50px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/> | ||
### See also | ||
* [efrt](https://github.com/spencermountain/efrt) - trie-based JSON compression | ||
* [efrt](https://github.com/spencermountain/efrt) - trie-based prefix compression for JSON | ||
@@ -178,0 +191,0 @@ <!-- spacer --> |
import convert from './convert/index.js' | ||
import validate from './learn/validate.js' | ||
import uncompress from './uncompress/index.js' | ||
import validate from './validate/index.js' | ||
import uncompress from './compress/uncompress.js' | ||
import reverse from './reverse/index.js' | ||
import classify from './classify/index.js' | ||
export { convert, uncompress, reverse, validate } | ||
export { convert, uncompress, reverse, validate, classify } |
@@ -1,27 +0,29 @@ | ||
import { pack } from 'efrt' | ||
import pressObj from './pressObj.js' | ||
import press from './press.js' | ||
import { unIndex } from '../_lib.js' | ||
const toObj = (rules) => { | ||
return rules.reduce((h, a) => { | ||
h[a[0]] = a[1] | ||
return h | ||
}, {}) | ||
// remove shared data in key-val pairs | ||
// uses an ad-hoc run-length encoding format | ||
// {walk: walking} -> {walk: '.4ing'} | ||
const pressPairs = function (pairs) { | ||
pairs = pairs.map(a => { | ||
return press(a[0], a[1]).join('|') | ||
}) | ||
return pairs.join(',') | ||
} | ||
const packRules = function (rules) { | ||
rules = unIndex(rules) | ||
rules = toObj(rules) | ||
rules = pressObj(rules) | ||
rules = pack(rules) | ||
return rules | ||
} | ||
const compress = function (model = {}) { | ||
model = Object.assign({}, model) | ||
const compress = function (model = {}) { | ||
model.rules = packRules(model.rules) | ||
// compress fwd rules | ||
model.rules = unIndex(model.rules) | ||
model.rules = pressPairs(model.rules) | ||
// compress reverse rules | ||
model.rev = unIndex(model.rev) | ||
model.rev = pressPairs(model.rev) | ||
// compress exceptions | ||
model.exceptions = pressObj(model.exceptions) | ||
model.exceptions = pack(model.exceptions) | ||
model.exceptions = Object.entries(model.exceptions) | ||
model.exceptions = pressPairs(model.exceptions) | ||
return model | ||
} | ||
export default compress |
@@ -18,22 +18,33 @@ const prefix = /^.([0-9]+)/ | ||
// get suffix-rules according to last char of word | ||
const getRules = function (word, model) { | ||
const getRules = function (word, rules = {}) { | ||
let char = word[word.length - 1] | ||
let rules = model.rules[char] || [] | ||
if (rules.length === 0) { | ||
// do we have a generic suffix? | ||
rules = model.rules[''] || rules | ||
let list = rules[char] || [] | ||
// do we have a generic suffix? | ||
if (rules['']) { | ||
list = list.concat(rules['']) | ||
} | ||
return rules | ||
return list | ||
} | ||
const convert = function (word, model) { | ||
const convert = function (word, model, debug) { | ||
// check list of irregulars | ||
if (model.exceptions.hasOwnProperty(word)) { | ||
if (debug) { | ||
console.log("exception, ", word, model.exceptions[word]) | ||
} | ||
return getKeyVal(word, model) | ||
} | ||
// if model is reversed, try rev rules | ||
let rules = model.rules | ||
if (model.reversed) { | ||
rules = model.rev | ||
} | ||
// try suffix rules | ||
const rules = getRules(word, model) | ||
rules = getRules(word, rules) | ||
for (let i = 0; i < rules.length; i += 1) { | ||
let suffix = rules[i][0] | ||
if (word.endsWith(suffix)) { | ||
if (debug) { | ||
console.log("rule, ", rules[i]) | ||
} | ||
let reg = new RegExp(suffix + '$') | ||
@@ -43,2 +54,5 @@ return word.replace(reg, rules[i][1]) | ||
} | ||
if (debug) { | ||
console.log(' x - ' + word) | ||
} | ||
// return the original word unchanged | ||
@@ -45,0 +59,0 @@ return word |
import convert from './convert/index.js' | ||
import learn from './learn/index.js' | ||
import validate from './learn/validate.js' | ||
import compress from './compress/index.js' | ||
import uncompress from './uncompress/index.js' | ||
import uncompress from './compress/uncompress.js' | ||
import reverse from './reverse/index.js' | ||
import debug from './convert/debug.js' | ||
import test from './test/index.js' | ||
import learn from './learn/index.js' | ||
import validate from './validate/index.js' | ||
import classify from './classify/index.js' | ||
export { learn, convert, compress, uncompress, reverse, validate, debug } | ||
export { learn, convert, compress, uncompress, reverse, validate, test, classify } |
@@ -1,17 +0,25 @@ | ||
import firstPass from './1st-pass/index.js' | ||
import secondPass from './2nd-pass/index.js' | ||
import learn from './learn.js' | ||
import { indexRules } from '../_lib.js' | ||
import validate from './validate.js' | ||
const learn = function (pairs, opts = {}) { | ||
// ensure input pairs are possible | ||
pairs = validate(pairs, opts) | ||
// create basic {rules, exceptions} | ||
let res = firstPass(pairs) | ||
// optimize it further | ||
res = secondPass(res, pairs, opts) | ||
// organize rules by their suffix char | ||
res.rules = indexRules(res.rules) | ||
return res | ||
const mergeExceptions = function (fwd, bkwd) { | ||
Object.entries(bkwd).forEach(b => { | ||
fwd[b[1]] = b[0] //reverse | ||
}) | ||
return fwd | ||
} | ||
export default learn | ||
const learnBoth = function (pairs, opts = {}) { | ||
let fwd = learn(pairs, opts) | ||
// learn backward too? | ||
if (opts.reverse !== false) { | ||
pairs = pairs.map(a => [a[1], a[0]]) | ||
let bkwd = learn(pairs, opts) | ||
// merge exceptions | ||
fwd.exceptions = mergeExceptions(fwd.exceptions, bkwd.exceptions) | ||
// add rules | ||
fwd.rev = indexRules(bkwd.rules) | ||
} | ||
fwd.rules = indexRules(fwd.rules) | ||
return fwd | ||
} | ||
export default learnBoth |
@@ -1,3 +0,1 @@ | ||
import { sortRules, indexRules } from '../_lib.js' | ||
const reverseObj = function (obj) { | ||
@@ -10,19 +8,12 @@ return Object.entries(obj).reduce((h, a) => { | ||
const reverseArr = function (arr) { | ||
return arr.map(a => [a[1], a[0]]) | ||
} | ||
const reverse = function (model) { | ||
let allRules = [] | ||
Object.keys(model.rules).forEach(k => { | ||
allRules = allRules.concat(reverseArr(model.rules[k])) | ||
}) | ||
allRules = sortRules(allRules) | ||
let rules = indexRules(allRules) | ||
let exceptions = reverseObj(model.exceptions) | ||
let { rules, exceptions, rev } = model | ||
exceptions = reverseObj(exceptions) | ||
return { | ||
reversed: !Boolean(model.reversed),//toggle this | ||
rules, | ||
exceptions | ||
exceptions, | ||
rev | ||
} | ||
} | ||
export default reverse |
Sorry, the diff of this file is not supported yet
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Major refactor
Supply chain riskPackage has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
60995
0
28
1785
191
1
- Removedefrt@2.5.0
- Removedefrt@2.5.0(transitive)