suffix-thumb
Advanced tools
Comparing version 2.0.0 to 3.0.0
@@ -1,1 +0,1 @@ | ||
!function(r,n){"object"==typeof exports&&"undefined"!=typeof module?n(exports):"function"==typeof define&&define.amd?define(["exports"],n):n((r="undefined"!=typeof globalThis?globalThis:r||self).suffixThumb={})}(this,(function(r){"use strict";var n=/^.([0-9]+)/,e=function(r,n){if(e=n.rules,"[object Array]"===Object.prototype.toString.call(e))return n.rules;var e,t=r[r.length-1],o=n.rules[t]||[];return 0===o.length&&(o=n.rules[""]||o),o};function t(r,n){return function(r){if(Array.isArray(r))return r}(r)||function(r,n){var e=null==r?null:"undefined"!=typeof Symbol&&r[Symbol.iterator]||r["@@iterator"];if(null==e)return;var t,o,u=[],i=!0,f=!1;try{for(e=e.call(r);!(i=(t=e.next()).done)&&(u.push(t.value),!n||u.length!==n);i=!0);}catch(r){f=!0,o=r}finally{try{i||null==e.return||e.return()}finally{if(f)throw o}}return u}(r,n)||function(r,n){if(!r)return;if("string"==typeof r)return o(r,n);var e=Object.prototype.toString.call(r).slice(8,-1);"Object"===e&&r.constructor&&(e=r.constructor.name);if("Map"===e||"Set"===e)return Array.from(r);if("Arguments"===e||/^(?:Ui|I)nt(?:8|16|32)(?:Clamped)?Array$/.test(e))return o(r,n)}(r,n)||function(){throw new TypeError("Invalid attempt to destructure non-iterable instance.\nIn order to be iterable, non-array objects must have a [Symbol.iterator]() method.")}()}function o(r,n){(null==n||n>r.length)&&(n=r.length);for(var e=0,t=new Array(n);e<n;e++)t[e]=r[e];return t}var u=function(){for(var r=arguments.length>0&&void 0!==arguments[0]?arguments[0]:"",n=[],e=4;e>=0;e-=1)if(!(r.length-1<=e)){var t=r.substr(r.length-e-1,r.length-1);n.push(t)}return n},i=function(r){var n=[];return Object.keys(r).forEach((function(e){var t,o,u=(t=r[e],o=e,Object.keys(t).map((function(r){return{from:o,to:r,yes:t[r]}})).sort((function(r,n){return r.yes>n.yes?-1:r.yes<n.yes?1:0})));u[0]&&u[0].yes>1&&n.push(u[0])})),n=n.sort((function(r,n){return r.yes>n.yes?-1:r.yes<n.yes?1:0}))},f=function(r,n){var e=function(r,n){return r.map((function(r){var 
e=0,t=0,o={};return n.forEach((function(n){if(n[0].endsWith(r.from)){var u=new RegExp(r.from+"$");n[0].replace(u,r.to)===n[1]?e+=1:(t+=1,o[n[0]]=n[1])}})),{from:r.from,to:r.to,yes:e,no:t,percent:e/(e+t),exceptions:o}}))}(r,n);return e=(e=e.filter((function(r){return r.yes>1&&r.yes>r.no}))).sort((function(r,n){return r.yes>n.yes?-1:r.yes<n.yes?1:0}))};function c(r){return r.split("").reverse().join("")}var s=function(r){return(r=r.sort((function(r,n){return r.from.length>n.from.length?-1:r.from.length<n.from.length||(r=c(r.from))>(n=c(n.from))?1:r<n?-1:0}))).map((function(r){return[r.from,r.to,r.yes]}))},l=function(r){var n,e,t,o;return r.rules=(n=r.rules,e={},n.forEach((function(r){var n=r[0]||"",t=n[n.length-1]||"";e[t]=e[t]||[],e[t].push(r)})),e),r.exceptions=(t=r.exceptions,o={},Object.keys(t).forEach((function(r){var n=t[r],e=function(r,n){for(var e=[],t=0;t<r.length&&r[t]===n[t];t+=1)e.push(r[t]);return e.join("")}(r,n);if(e.length<2)o[r]=n;else{var u="."+e.length+n.substr(e.length);o[r]=u}})),o),r},a=function(r){var n,e,o=function(r){var n={};return r.forEach((function(r){var e=t(r,2),o=e[0],i=e[1],f=u(o);f.push(""),f.forEach((function(r){n[r]=n[r]||{},u(i).forEach((function(e){n[r][e]=n[r][e]||0,n[r][e]+=1}))}))})),n}(r=r.filter((function(r){return r&&r[0]&&r[1]}))),c=i(o),l=f(c,r);return e={},(n=l).forEach((function(r,t){n.slice(t+1,n.length).forEach((function(n){n.from.endsWith(r.from)&&(e[n.from]=!0)}))})),function(r,n){var e={};r.forEach((function(r){Object.assign(e,r.exceptions)}));var t=n.filter((function(n){return!e.hasOwnProperty(n[0])&&!r.find((function(r){return n[0].endsWith(r.from)}))})),o=(n.length-t.length)/n.length;return{rules:s(r),exceptions:e,coverage:o,remaining:t}}(l=n=n.filter((function(r){return!1===e.hasOwnProperty(r.from)})),r)},h=function(r,n){var e,t=0;return r.rules=r.rules.map((function(r){return t+=r[2],r.slice(0,2)})),r.exceptions=r.exceptions.reduce((function(r,n){return 
r[n[0]]=n[1],r}),{}),r.rules=r.rules.sort((function(r,n){return r[0].length>n[0].length?-1:r[0].length<n[0].length?1:0})),r.coverage=(e=t/n,Math.round(1e3*e)/1e3),r};r.compress=l,r.convert=function(r,t){if(t.exceptions.hasOwnProperty(r))return function(r,e){var t=e.exceptions[r],o=t.match(n);if(null===o)return e.exceptions[r];var u=Number(o[1])||0;return r.substr(0,u)+t.replace(n,"")}(r,t);for(var o=e(r,t),u=0;u<o.length;u+=1){var i=o[u][0];if(r.endsWith(i)){var f=new RegExp(i+"$");return r.replace(f,o[u][1])}}return null},r.find=function(r){var n=r.length,e={},t=a(r);return e.rules=t.rules||[],e.exceptions=t.remaining.concat(Object.entries(t.exceptions)),e=h(e,n),e=l(e)},Object.defineProperty(r,"__esModule",{value:!0})})); | ||
!function(e,t){"object"==typeof exports&&"undefined"!=typeof module?t(exports,require("efrt")):"function"==typeof define&&define.amd?define(["exports","efrt"],t):t((e="undefined"!=typeof globalThis?globalThis:e||self).suffixThumb={},e.efrt)}(this,(function(e,t){"use strict";const n=/^.([0-9]+)/,r=function(e,t){if(n=t.rules,"[object Array]"===Object.prototype.toString.call(n))return t.rules;var n;let r=e[e.length-1],o=t.rules[r]||[];return 0===o.length&&(o=t.rules[""]||o),o},o=function(e,t){if(t.exceptions.hasOwnProperty(e))return function(e,t){let r=t.exceptions[e],o=r.match(n);if(null===o)return t.exceptions[e];let s=Number(o[1])||0;return e.substr(0,s)+r.replace(n,"")}(e,t);const o=r(e,t);for(let t=0;t<o.length;t+=1){let n=o[t][0];if(e.endsWith(n)){let r=new RegExp(n+"$");return e.replace(r,o[t][1])}}return e},s=function(e=""){let t=[];for(let n=6;n>=0;n-=1){if(e.length-1<=n)continue;let r=e.length-n-1,o=e.substring(r,r+e.length-1);t.push(o)}return t},c=function(e){let t=[];return Object.keys(e).forEach((n=>{let r=function(e,t){let n=Object.keys(e).map((n=>({from:t,to:n,yes:e[n]})));return n=n.sort(((e,t)=>e.yes>t.yes?-1:e.yes<t.yes?1:0)),n}(e[n],n);r[0]&&r[0].yes>1&&t.push(r[0])})),t=t.sort(((e,t)=>e.yes>t.yes?-1:e.yes<t.yes?1:0)),t},l=function(e,t){let n=function(e,t){return e.map((e=>{let n=0,r=0,o={};return t.forEach((t=>{if(t[0].endsWith(e.from)){let s=new RegExp(e.from+"$");t[0].replace(s,e.to)===t[1]?n+=1:(r+=1,o[t[0]]=t[1])}})),{from:e.from,to:e.to,yes:n,no:r,percent:n/(n+r),exceptions:o}}))}(e,t);return n=n.filter((e=>e.yes>1&&e.yes>e.no)),n=n.sort(((e,t)=>e.yes>t.yes?-1:e.yes<t.yes?1:0)),n};function u(e){return e.split("").reverse().join("")}const i=function(e,t){let n={};return 
e.forEach((e=>{Object.assign(n,e.exceptions)})),e=function(e){return(e=e.sort(((e,t)=>e.from.length>t.from.length?-1:e.from.length<t.from.length||(e=u(e.from))>(t=u(t.from))?1:e<t?-1:0))).map((e=>[e.from,e.to]))}(e),t.filter((t=>!n.hasOwnProperty(t[0])&&!e.find((e=>t[0].endsWith(e.from))))).forEach((e=>{n[e[0]]=e[1]})),{rules:e,exceptions:n}},f=function(e){const t=function(e){const t={};return e.forEach((e=>{let[n,r]=e,o=s(n);o.push(""),o.forEach((e=>{t[e]=t[e]||{},s(r).forEach((n=>{t[e][n]=t[e][n]||0,t[e][n]+=1}))}))})),t}(e=e.filter((e=>e&&e[0]&&e[1])));let n=c(t),r=l(n,e);return r=function(e){let t={};return e.forEach(((n,r)=>{e.slice(r+1,e.length).forEach((e=>{e.from.endsWith(n.from)&&0===e.no&&(t[e.from]=!0)}))})),e=e.filter((e=>!1===t.hasOwnProperty(e.from)))}(r),i(r,e)},p=function(e){return e.exceptions=function(e){let t={},{rules:n,exceptions:r}=e;return Object.keys(r).forEach((e=>{let o=n.find((t=>e.endsWith(t[0])));o?e.replace(o[0],o[1])!==r[e]&&(t[e]=r[e]):t[e]=r[e]})),t}(e),e},h=function(e){let t={};return e.forEach((e=>{let n=e[0]||"",r=n[n.length-1]||"";t[r]=t[r]||[],t[r].push(e)})),t},a=function(e){return e=e.sort(((e,t)=>e[0].length>t[0].length?-1:e[0].length<t[0].length?1:0))},g=function(e,t,n){return e=p(e),!1!==n.inverse&&(e=function(e,t){let n=Object.assign({},e);n.rules=h(e.rules);let r=E(n);return t.forEach((t=>{let[n,s]=t;o(s,r)!==n&&(e.exceptions[t[0]]=t[1])})),e}(e,t)),e},y=function(e,t={}){let n={},r={};return e=e.filter((e=>void 0!==n[e[0]]?(t.verbose&&(console.warn("Duplicate left side:"),console.log(" 1.",[e[0],n[e[0]]]),console.log(" 2.",e)),!1):void 0!==r[e[1]]?(t.verbose&&(console.warn("Duplicate right side:"),console.log(" 1.",[r[e[1]],e[1]]),console.log(" 2.",e)),!1===t.inverse):(n[e[0]]=e[1],r[e[1]]=e[0],!0)))},m=function(e){let t={};return Object.keys(e).forEach((n=>{let r=e[n],o=((e,t)=>{let n=[];for(let r=0;r<e.length&&e[r]===t[r];r+=1)n.push(e[r]);return n.join("")})(n,r);if(o.length<2)return void(t[n]=r);let 
s="."+o.length+r.substr(o.length);t[n]=s})),t},b=function(e){return e=(e=>e.reduce(((e,t)=>(e[t[0]]=t[1],e)),{}))(e=function(e){let t=[];return Object.keys(e).forEach((n=>{t=t.concat(e[n])})),t}(e)),e=m(e),e=t.pack(e)},d=/^.([0-9]+)/,x=function(e){return Object.keys(e).forEach((t=>{let n=e[t],r=n.match(d);if(null!==r){let o=Number(r[1])||0,s=t.substring(0,o)+n.replace(d,"");e[t]=s}})),e},E=function(e){let t=[];var n;return Object.keys(e.rules).forEach((n=>{t=t.concat(e.rules[n].map((e=>[e[1],e[0]])))})),t=a(t),{rules:h(t),exceptions:(n=e.exceptions,Object.entries(n).reduce(((e,t)=>(e[t[1]]=t[0],e)),{}))}};e.compress=function(e={}){return e.rules=b(e.rules),e.exceptions=m(e.exceptions),e.exceptions=t.pack(e.exceptions),e},e.convert=o,e.learn=function(e,t={}){e=y(e,t);let n=f(e);return n=g(n,e,t),n.rules=h(n.rules),n},e.reverse=E,e.uncompress=function(e={}){var n;return"string"==typeof e.exceptions&&(e.exceptions=t.unpack(e.exceptions),e.exceptions=x(e.exceptions)),"string"==typeof e.rules&&(e.rules=(n=e.rules,n=t.unpack(n),n=x(n),n=Object.entries(n),n=a(n),h(n))),e},e.validate=y,Object.defineProperty(e,"__esModule",{value:!0})})); |
{ | ||
"name": "suffix-thumb", | ||
"description": "learn transformations between two sets of words", | ||
"version": "2.0.0", | ||
"version": "3.0.0", | ||
"author": "Spencer Kelly <spencermountain@gmail.com> (http://spencermounta.in)", | ||
@@ -16,7 +16,10 @@ "main": "./builds/suffix-thumb.mjs", | ||
}, | ||
"./find": { | ||
"import": "./src/find/index.js" | ||
"./learn": { | ||
"import": "./src/learn/index.js" | ||
}, | ||
"./convert": { | ||
"import": "./src/convert/index.js" | ||
}, | ||
"./uncompress": { | ||
"import": "./src/uncompress/index.js" | ||
} | ||
@@ -49,18 +52,13 @@ }, | ||
"devDependencies": { | ||
"@babel/core": "7.15.0", | ||
"@babel/preset-env": "7.15.0", | ||
"@rollup/plugin-commonjs": "20.0.0", | ||
"@rollup/plugin-json": "4.1.0", | ||
"@rollup/plugin-node-resolve": "13.0.4", | ||
"@rollup/plugin-commonjs": "21.0.1", | ||
"@rollup/plugin-node-resolve": "13.1.1", | ||
"amble": "1.3.0", | ||
"efrt": "2.3.2", | ||
"eslint": "7.32.0", | ||
"rollup": "2.56.3", | ||
"rollup-plugin-babel": "4.4.0", | ||
"efrt": "^2.3.2", | ||
"rollup": "2.62.0", | ||
"rollup-plugin-filesize-check": "0.0.1", | ||
"rollup-plugin-terser": "7.0.2", | ||
"tap-dancer": "0.3.4", | ||
"tape": "5.3.1" | ||
"tape": "5.4.0" | ||
}, | ||
"license": "MIT" | ||
} |
124
README.md
@@ -20,3 +20,5 @@ <div align="center"> | ||
<hr/> | ||
<!-- spacer --> | ||
<img height="50px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/> | ||
@@ -29,10 +31,14 @@ </div> | ||
The assumption is that a word's _suffix_ is the most changeable part of a word. | ||
The assumption is that a word's _suffix_ is the most-changed part of a word. | ||
![carbon(1)](https://user-images.githubusercontent.com/399657/79898840-e7e66780-83d9-11ea-9ff3-099bf39cf892.png) | ||
<!-- ![carbon(1)](https://user-images.githubusercontent.com/399657/79898840-e7e66780-83d9-11ea-9ff3-099bf39cf892.png) --> | ||
### Learn → Convert | ||
<!-- spacer --> | ||
<img height="50px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/> | ||
```js | ||
import { find, convert } from 'suffix-thumb' | ||
import { learn, convert } from 'suffix-thumb' | ||
const pairs = [ | ||
let pairs = [ | ||
['walk', 'walked'], | ||
@@ -42,10 +48,9 @@ ['talk', 'talked'], | ||
] | ||
let model = find(pairs) | ||
let model = learn(pairs) | ||
/* { | ||
rules: { k: [ [ 'alk', 'alked' ] ] }, | ||
exceptions: { go: 'went' }, | ||
coverage: 0.667 | ||
}*/ | ||
const pairs = [ | ||
let pairs = [ | ||
['aail', 'aael'], | ||
@@ -57,7 +62,6 @@ ['bbil', 'bbel'], | ||
] | ||
let model = find(pairs) | ||
let model = learn(pairs) | ||
/* { | ||
rules: { o: [ [ 'foo', 'bar' ] ], l: [ [ 'il', 'el' ] ] }, | ||
exceptions: {}, | ||
coverage: 1 | ||
} | ||
@@ -69,17 +73,105 @@ */ | ||
``` | ||
<!-- spacer --> | ||
<img height="50px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/> | ||
### Reverse | ||
The model can also be used to transform words in the other direction: | ||
```js | ||
import { learn, reverse, convert } from 'suffix-thumb' | ||
let pairs = [ | ||
['walk', 'walked'], | ||
['talk', 'talked'], | ||
['go', 'went'], | ||
] | ||
let model = learn(pairs) | ||
let rev = reverse(model) | ||
let out = convert('walked', rev) | ||
// 'walk' | ||
``` | ||
By default, the model ensures that every transformation also works in reverse (two-way). If you only require one-way conversion, you can do: | ||
```js | ||
learn(pairs, {inverse: false}) | ||
``` | ||
you can expect the model to be 5% smaller or so - not much. | ||
<!-- spacer --> | ||
<img height="50px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/> | ||
### Compress | ||
by default, the model is small, but remains human-readable (and human-editable). | ||
We can compress it further, turning it into a snowball of inscrutable characters: | ||
```js | ||
import { learn, compress, uncompress, convert } from 'suffix-thumb' | ||
let pairs = [ | ||
['walk', 'walked'], | ||
['talk', 'talked'], | ||
['go', 'went'], | ||
] | ||
let model = learn(pairs) | ||
// shrink it | ||
model = compress(model) | ||
// {rules:'LSKs3H2-LNL.S3DH'} | ||
// pop it back | ||
model = uncompress(model) | ||
let out = convert('walk', model) | ||
// 'walked' | ||
``` | ||
Models must be uncompressed before they are used or reversed. | ||
<!-- spacer --> | ||
<img height="50px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/> | ||
### Validation | ||
sometimes you can accidentally send in an impossible set of transformations. This library quietly ignores duplicates, by default. | ||
You can use `{verbose:true}` to log warnings about this, or validate your input manually: | ||
```js | ||
import { validate } from 'suffix-thumb' | ||
let pairs = [ | ||
['left', 'right'], | ||
['left', 'right-two'], | ||
['ok', 'right'], | ||
] | ||
pairs = validate(pairs) //remove dupes (on both sides) | ||
``` | ||
If you are just doing one-way transformation, and not reverse, you may want to allow duplicates on the right side: | ||
```js | ||
let pairs = [ | ||
['left', 'right'], | ||
['ok', 'right'], | ||
] | ||
let model = learn(pairs, {inverse: false}) | ||
let out = convert('ok', model) | ||
// 'right' | ||
``` | ||
<!-- spacer --> | ||
<img height="50px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/> | ||
## How it works | ||
For each word-pair, it generates all **n-suffixes** of the left, and **n-suffixes** of the right. | ||
For each word-pair, it generates all **n-suffixes** of the left-side, and **n-suffixes** of the right-side. | ||
any pattern between the two sets of words begins to pop out. | ||
Any good correlations between the two suffix pairs begin to pop out. Exceptions to these rules are remembered. It then exhaustively reduces any redundancies in these rules. | ||
it reduces any redundancies in this list. | ||
There are some compromises, magic numbers, and opinionated decisions - in order to allow productive, but imperfect, rules. | ||
it then runs the patterns on the dataset, to get a score, and any exceptions. | ||
* The library is meant to optimize for the file-size of the model | ||
* compression is slow, uncompression is fast | ||
* it should always return a perfect result | ||
There may be wordlists with no helpful patterns. | ||
The library drops case information, and numbers and some characters ([1](https://github.com/spencermountain/efrt)) will not compress properly. | ||
Ideally, you should be able to take a list of word-pairs, create a model for them, and then delete the 2nd half of the word pairs. | ||
There may be wordlists with few helpful patterns. Conjugation datasets in English and French tend to get ~85% filesize compression. | ||
<!-- spacer --> | ||
<img height="50px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/> | ||
### See also | ||
* [efrt](https://github.com/spencermountain/efrt) - trie-based JSON compression | ||
MIT |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Minified code
Quality: This package contains minified code. This may be harmless in some cases where minified code is included in packaged libraries; however, packages on npm should not minify code.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
40764
9
8
992
173
0
2