Comparing version 2.1.0 to 3.0.0
@@ -1,2 +0,2 @@ | ||
var Regex=(()=>{var y=Object.defineProperty;var ue=Object.getOwnPropertyDescriptor;var le=Object.getOwnPropertyNames;var ce=Object.prototype.hasOwnProperty;var fe=(e,t)=>{for(var r in t)y(e,r,{get:t[r],enumerable:!0})},pe=(e,t,r,n)=>{if(t&&typeof t=="object"||typeof t=="function")for(let s of le(t))!ce.call(e,s)&&s!==r&&y(e,s,{get:()=>t[s],enumerable:!(n=ue(t,s))||n.enumerable});return e};var Ee=e=>pe(y({},"__esModule",{value:!0}),e);var Ue={};fe(Ue,{partial:()=>k,regex:()=>Ie});var E={DEFAULT:"DEFAULT",CHAR_CLASS:"CHAR_CLASS"};function h(e,t,r,n){let s=new RegExp(String.raw`${t}|(?<skip>\\?.)`,"gsu"),a=0,o="";for(let u of e.matchAll(s)){let{0:l,groups:{skip:f}}=u;if(!f&&(!n||n===E.DEFAULT==!a)){r instanceof Function?o+=r(u):o+=r;continue}l==="["?a++:l==="]"&&a&&a--,o+=l}return o}function I(e,t,r,n){h(e,t,r,n)}function P(e,t,r=0,n){if(!new RegExp(t,"su").test(e))return null;let s=new RegExp(String.raw`${t}|(?<skip>\\?.)`,"gsu");s.lastIndex=r;let a=0,o;for(;o=s.exec(e);){let{0:u,groups:{skip:l}}=o;if(!l&&(!n||n===E.DEFAULT==!a))return o;u==="["?a++:u==="]"&&a&&a--,s.lastIndex==o.index&&s.lastIndex++}return null}function L(e,t,r){return!!P(e,t,0,r)}function q(e,t){let r=/\\?./gsu;r.lastIndex=t;let n=e.length,s=0,a=1,o;for(;o=r.exec(e);){let[u]=o;if(u==="[")s++;else if(s)u==="]"&&s--;else if(u==="(")a++;else if(u===")"&&(a--,!a)){n=o.index;break}}return e.slice(t,n)}var S=class{#e;constructor(t){this.#e=t}toString(){return String(this.#e)}};function k(e,...t){if(Array.isArray(e?.raw))return new S(e.raw.flatMap((r,n)=>n<e.raw.length-1?[r,t[n]]:r).join(""));if(!t.length)return new S(e??"");throw new Error(`Unexpected arguments: ${JSON.stringify([e,...t])}`)}var 
c={DEFAULT:"R_DEFAULT",CHAR_CLASS:"R_CHAR_CLASS",GROUP_NAME:"R_GROUP_NAME",ENCLOSED_TOKEN:"R_ENCLOSED_TOKEN",INTERVAL_QUANTIFIER:"R_INTERVAL_QUANTIFIER",INVALID_INCOMPLETE_TOKEN:"R_INVALID_INCOMPLETE_TOKEN"},p={DEFAULT:"CC_DEFAULT",RANGE:"CC_RANGE",ENCLOSED_TOKEN:"CC_ENCLOSED_TOKEN",Q_TOKEN:"CC_Q_TOKEN",INVALID_INCOMPLETE_TOKEN:"CC_INVALID_INCOMPLETE_TOKEN"},F=(()=>{try{new RegExp("(?i:)")}catch{return!1}return!0})(),B=(()=>{try{new RegExp("","v")}catch{return!1}return!0})(),R="&!#$%*+,.:;<=>?@^`~",O=String.raw`\(\?(?:[:=!>A-Za-z\-]|<[=!])`;function M(e,t){return t===E.CHAR_CLASS?e.replace(new RegExp(String.raw`[()\[\]{}|\\/\-${R}]`,"g"),"\\$&"):e.replace(/[()\[\]{}|\\^$*+?.]/g,"\\$&")}function G(e){return e.replace(new RegExp(String.raw`^([${R}])(?!\1)`),(t,r,n)=>`\\${t}${n+1===e.length?"":t}`)}function Z(e){return e.replace(/^\^/,"\\^^")}function D(e,t){return h(e,String.raw`\\0(?!\d)`,"\\u{0}",t)}function W(e,t,r){let n=0;for(let[s]of e.matchAll(new RegExp(`[${M(t+r,E.CHAR_CLASS)}]`,"g")))if(n+=s===t?1:-1,n<0)return r;return n>0?t:""}function J(e,t,r){let n=e.replace(/\\./gsu,"");if(n.endsWith("\\"))return"\\";if(t===c.DEFAULT)return W(n,"(",")");if(t===c.CHAR_CLASS&&!(r===p.ENCLOSED_TOKEN||r===p.Q_TOKEN))return W(n,"[","]");if(t===c.ENCLOSED_TOKEN||t===c.INTERVAL_QUANTIFIER||r===p.ENCLOSED_TOKEN||r===p.Q_TOKEN){if(n.includes("}"))return"}"}else if(t===c.GROUP_NAME&&n.includes(">"))return">";return""}var z=new RegExp(String.raw` | ||
var Regex=(()=>{var M=Object.defineProperty;var le=Object.getOwnPropertyDescriptor;var ce=Object.getOwnPropertyNames;var fe=Object.prototype.hasOwnProperty;var pe=(e,t)=>{for(var n in t)M(e,n,{get:t[n],enumerable:!0})},Ee=(e,t,n,r)=>{if(t&&typeof t=="object"||typeof t=="function")for(let o of ce(t))!fe.call(e,o)&&o!==n&&M(e,o,{get:()=>t[o],enumerable:!(r=le(t,o))||r.enumerable});return e};var de=e=>Ee(M({},"__esModule",{value:!0}),e);var Fe={};pe(Fe,{pattern:()=>x,regex:()=>Oe});var d={DEFAULT:"DEFAULT",CHAR_CLASS:"CHAR_CLASS"};function w(e,t,n,r){let o=new RegExp(String.raw`${t}|(?<skip>\\?.)`,"gsu"),i=0,s="";for(let u of e.matchAll(o)){let{0:a,groups:{skip:p}}=u;if(!p&&(!r||r===d.DEFAULT==!i)){n instanceof Function?s+=n(u):s+=n;continue}a==="["?i++:a==="]"&&i&&i--,s+=a}return s}function $(e,t,n,r){w(e,t,n,r)}function F(e,t,n=0,r){if(!new RegExp(t,"su").test(e))return null;let o=new RegExp(String.raw`${t}|(?<skip>\\?.)`,"gsu");o.lastIndex=n;let i=0,s;for(;s=o.exec(e);){let{0:u,groups:{skip:a}}=s;if(!a&&(!r||r===d.DEFAULT==!i))return s;u==="["?i++:u==="]"&&i&&i--,o.lastIndex==s.index&&o.lastIndex++}return null}function C(e,t,n){return!!F(e,t,0,n)}function V(e,t){let n=/\\?./gsu;n.lastIndex=t;let r=e.length,o=0,i=1,s;for(;s=n.exec(e);){let[u]=s;if(u==="[")o++;else if(o)u==="]"&&o--;else if(u==="(")i++;else if(u===")"&&(i--,!i)){r=s.index;break}}return e.slice(t,r)}var _=class{#e;constructor(t){this.#e=t}toString(){return String(this.#e)}};function x(e,...t){if(Array.isArray(e?.raw))return new _(e.raw.flatMap((n,r)=>r<e.raw.length-1?[n,t[r]]:n).join(""));if(!t.length)return new _(e??"");throw new Error(`Unexpected arguments: ${JSON.stringify([e,...t])}`)}var 
f={DEFAULT:"R_DEFAULT",CHAR_CLASS:"R_CHAR_CLASS",GROUP_NAME:"R_GROUP_NAME",ENCLOSED_TOKEN:"R_ENCLOSED_TOKEN",INTERVAL_QUANTIFIER:"R_INTERVAL_QUANTIFIER",INVALID_INCOMPLETE_TOKEN:"R_INVALID_INCOMPLETE_TOKEN"},E={DEFAULT:"CC_DEFAULT",RANGE:"CC_RANGE",ENCLOSED_TOKEN:"CC_ENCLOSED_TOKEN",Q_TOKEN:"CC_Q_TOKEN",INVALID_INCOMPLETE_TOKEN:"CC_INVALID_INCOMPLETE_TOKEN"},b=(()=>{try{new RegExp("(?i:)")}catch{return!1}return!0})(),B=(()=>{try{new RegExp("","v")}catch{return!1}return!0})(),I="&!#$%*+,.:;<=>?@^`~",G=String.raw`\(\?<(?![=!])(?<captureName>[^>]+)>`,P=String.raw`\((?!\?)|${G}`,R=String.raw`\(\?(?:[:=!>A-Za-z\-]|<[=!]|\(DEFINE\))`;function j(e,t){return t===d.CHAR_CLASS?e.replace(new RegExp(String.raw`[()\[\]{}|\\/\-${I}]`,"g"),"\\$&"):e.replace(/[()\[\]{}|\\^$*+?.]/g,"\\$&")}function K(e){return e.replace(new RegExp(String.raw`^([${I}])(?!\1)`),(t,n,r)=>`\\${t}${r+1===e.length?"":t}`)}function Z(e){return e.replace(/^\^/,"\\^^")}function k(e,t){return w(e,String.raw`\\0(?!\d)`,"\\u{0}",t)}function W(e,t,n){let r=0;for(let[o]of e.matchAll(new RegExp(`[${j(t+n,d.CHAR_CLASS)}]`,"g")))if(r+=o===t?1:-1,r<0)return n;return r>0?t:""}function J(e,t,n){let r=e.replace(/\\./gsu,"");if(r.endsWith("\\"))return"\\";if(t===f.DEFAULT)return W(r,"(",")");if(t===f.CHAR_CLASS&&!(n===E.ENCLOSED_TOKEN||n===E.Q_TOKEN))return W(r,"[","]");if(t===f.ENCLOSED_TOKEN||t===f.INTERVAL_QUANTIFIER||n===E.ENCLOSED_TOKEN||n===E.Q_TOKEN){if(r.includes("}"))return"}"}else if(t===f.GROUP_NAME&&r.includes(">"))return">";return""}var z=new RegExp(String.raw` | ||
(?<groupN>\(\?<(?![=!])|\\[gk]<) | ||
@@ -14,5 +14,5 @@ | (?<enclosedT>\\[pPu]\{) | ||
| \\?. | ||
`.replace(/\s+/g,""),"gsu");function U(e,{regexContext:t=c.DEFAULT,charClassContext:r=p.DEFAULT,charClassDepth:n=0,lastPos:s=0}){z.lastIndex=s;let a;for(;a=z.exec(e);){let{0:o,groups:{groupN:u,enclosedT:l,qT:f,intervalQ:d,incompleteT:i}}=a;o==="["?(n++,t=c.CHAR_CLASS,r=p.DEFAULT):o==="]"&&t===c.CHAR_CLASS?(n&&n--,n||(t=c.DEFAULT),r=p.DEFAULT):t===c.CHAR_CLASS?i?r=p.INVALID_INCOMPLETE_TOKEN:o==="-"?r=p.RANGE:l?r=p.ENCLOSED_TOKEN:f?r=p.Q_TOKEN:(o==="}"&&(r===p.ENCLOSED_TOKEN||r===p.Q_TOKEN)||r===p.INVALID_INCOMPLETE_TOKEN||r===p.RANGE)&&(r=p.DEFAULT):i?t=c.INVALID_INCOMPLETE_TOKEN:u?t=c.GROUP_NAME:l?t=c.ENCLOSED_TOKEN:d?t=c.INTERVAL_QUANTIFIER:(o===">"&&t===c.GROUP_NAME||o==="}"&&(t===c.ENCLOSED_TOKEN||t===c.INTERVAL_QUANTIFIER)||t===c.INVALID_INCOMPLETE_TOKEN)&&(t=c.DEFAULT)}return{regexContext:t,charClassContext:r,charClassDepth:n,lastPos:e.length}}function x(e){let t=0;return I(e,String.raw`\((?:(?!\?)|\?<[^>]+>)`,()=>t++,E.DEFAULT),t}function X(e,t){return h(e,String.raw`\\(?<num>[1-9]\d*)`,({groups:{num:r}})=>`\\${+r+t}`,E.DEFAULT)}var de=["Basic_Emoji","Emoji_Keycap_Sequence","RGI_Emoji_Modifier_Sequence","RGI_Emoji_Flag_Sequence","RGI_Emoji_Tag_Sequence","RGI_Emoji_ZWJ_Sequence","RGI_Emoji"].join("|"),ge=new RegExp(String.raw` | ||
`.replace(/\s+/g,""),"gsu");function D(e,{regexContext:t=f.DEFAULT,charClassContext:n=E.DEFAULT,charClassDepth:r=0,lastPos:o=0}){z.lastIndex=o;let i;for(;i=z.exec(e);){let{0:s,groups:{groupN:u,enclosedT:a,qT:p,intervalQ:c,incompleteT:l}}=i;s==="["?(r++,t=f.CHAR_CLASS,n=E.DEFAULT):s==="]"&&t===f.CHAR_CLASS?(r&&r--,r||(t=f.DEFAULT),n=E.DEFAULT):t===f.CHAR_CLASS?l?n=E.INVALID_INCOMPLETE_TOKEN:s==="-"?n=E.RANGE:a?n=E.ENCLOSED_TOKEN:p?n=E.Q_TOKEN:(s==="}"&&(n===E.ENCLOSED_TOKEN||n===E.Q_TOKEN)||n===E.INVALID_INCOMPLETE_TOKEN||n===E.RANGE)&&(n=E.DEFAULT):l?t=f.INVALID_INCOMPLETE_TOKEN:u?t=f.GROUP_NAME:a?t=f.ENCLOSED_TOKEN:c?t=f.INTERVAL_QUANTIFIER:(s===">"&&t===f.GROUP_NAME||s==="}"&&(t===f.ENCLOSED_TOKEN||t===f.INTERVAL_QUANTIFIER)||t===f.INVALID_INCOMPLETE_TOKEN)&&(t=f.DEFAULT)}return{regexContext:t,charClassContext:n,charClassDepth:r,lastPos:e.length}}function O(e){let t=0;return $(e,P,()=>t++,d.DEFAULT),t}function X(e,t){return w(e,String.raw`\\(?<num>[1-9]\d*)`,({groups:{num:n}})=>`\\${+n+t}`,d.DEFAULT)}var ge=["Basic_Emoji","Emoji_Keycap_Sequence","RGI_Emoji_Modifier_Sequence","RGI_Emoji_Flag_Sequence","RGI_Emoji_Tag_Sequence","RGI_Emoji_ZWJ_Sequence","RGI_Emoji"].join("|"),me=new RegExp(String.raw` | ||
\\(?: c[A-Za-z] | ||
| p\{(?<pStrProp>${de})\} | ||
| p\{(?<pStrProp>${ge})\} | ||
| [pP]\{[^\}]+\} | ||
@@ -27,8 +27,8 @@ | (?<qStrProp>q) | ||
| . | ||
`.replace(/\s+/g,""),"gsu");function V(e){let t=!1,r;for(let{0:n,groups:s}of e.matchAll(ge)){if(s.pStrProp||s.qStrProp||n==="["&&t)return!0;if(["-","--","&&"].includes(n))t=!1;else if(!["[","]"].includes(n)){if(t||r==="]")return!0;t=!0}r=n}return!1}function j(e,t,r){let n={raw:[]},s=[],a={};return e.raw.forEach((o,u)=>{let l=r(o,{...a,lastPos:0});if(n.raw.push(l.transformed),a=l.runningContext,u<e.raw.length-1){let f=t[u];if(f instanceof S){let d=r(f,{...a,lastPos:0});s.push(k(d.transformed)),a=d.runningContext}else s.push(f)}}),{template:n,values:s}}var me=new RegExp(String.raw` | ||
${O} | ||
`.replace(/\s+/g,""),"gsu");function v(e){let t=!1,n;for(let{0:r,groups:o}of e.matchAll(me)){if(o.pStrProp||o.qStrProp||r==="["&&t)return!0;if(["-","--","&&"].includes(r))t=!1;else if(!["[","]"].includes(r)){if(t||n==="]")return!0;t=!0}n=r}return!1}function H(e,t,n){let r={raw:[]},o=[],i={};return e.raw.forEach((s,u)=>{let a=n(s,{...i,lastPos:0});if(r.raw.push(a.transformed),i=a.runningContext,u<e.raw.length-1){let p=t[u];if(p instanceof _){let c=n(p,{...i,lastPos:0});o.push(x(c.transformed)),i=c.runningContext}else o.push(p)}}),{template:r,substitutions:o}}var he=new RegExp(String.raw` | ||
${R} | ||
| \(\?< | ||
| (?<backrefNum>\\[1-9]\d*) | ||
| \\?. | ||
`.replace(/\s+/g,""),"gsu");function Y(e,t){e=String(e);let r="",n="";for(let{0:s,groups:{backrefNum:a}}of e.matchAll(me)){r+=s,t=U(r,t);let{regexContext:o}=t;if(o===c.DEFAULT)if(s==="(")n+="(?:";else{if(a)throw new Error(`Invalid decimal escape "${s}" with implicit flag n; replace with named backreference`);n+=s}else n+=s}return{transformed:n,runningContext:t}}var ee=/^\s$/,Ae=/^\\[\s#]$/,te=/^[ \t]$/,he=/^\\[ \t]$/,Ne=new RegExp(String.raw` | ||
`.replace(/\s+/g,""),"gsu");function Y(e,t){e=String(e);let n="",r="";for(let{0:o,groups:{backrefNum:i}}of e.matchAll(he)){n+=o,t=D(n,t);let{regexContext:s}=t;if(s===f.DEFAULT)if(o==="(")r+="(?:";else{if(i)throw new Error(`Invalid decimal escape "${o}" with implicit flag n; replace with named backreference`);r+=o}else r+=o}return{transformed:r,runningContext:t}}var ee=/^\s$/,Ae=/^\\[\s#]$/,te=/^[ \t]$/,Ne=/^\\[ \t]$/,we=new RegExp(String.raw` | ||
\\(?: [gk]< | ||
@@ -42,22 +42,22 @@ | [pPu]\{ | ||
| \[\^ | ||
| ${O} | ||
| ${R} | ||
| \(\?< | ||
| (?<dp>[${R}])\k<dp> | ||
| (?<dp>[${I}])\k<dp> | ||
| -- | ||
| \\?. | ||
`.replace(/\s+/g,""),"gsu");function re(e,t){e=String(e);let r=!1,n=!1,s=!1,a="",o="",u="",l="",f=!1,d=(i,{prefix:m=!0,postfix:g=!1}={})=>(i=(f&&m?"(?:)":"")+i+(g?"(?:)":""),f=!1,i);for(let[i]of e.matchAll(Ne)){if(s){i===` | ||
`&&(s=!1,f=!0);continue}if(r){if(ee.test(i))continue;r=!1,f=!0}else if(n){if(te.test(i))continue;n=!1}a+=i,t=U(a,t);let{regexContext:m,charClassContext:g}=t;if(i==="-"&&m===c.CHAR_CLASS&&l===p.RANGE)throw new Error("Invalid unescaped hyphen as the end value for a range");if(m===c.DEFAULT&&/^(?:[?*+]|\?\?)$/.test(i)||m===c.INTERVAL_QUANTIFIER&&i==="{")o+=d(i,{prefix:!1,postfix:u==="("&&i==="?"});else if(m===c.DEFAULT)ee.test(i)?r=!0:i.startsWith("#")?s=!0:Ae.test(i)?o+=d(i[1],{prefix:!1}):o+=d(i);else if(m===c.CHAR_CLASS&&i!=="["&&i!=="[^")if(te.test(i)&&(g===p.DEFAULT||g===p.RANGE||g===p.Q_TOKEN))n=!0;else{if(g===p.INVALID_INCOMPLETE_TOKEN)throw new Error(`Invalid incomplete token in character class: "${i}"`);he.test(i)&&(g===p.DEFAULT||g===p.Q_TOKEN)?o+=d(i[1],{prefix:!1}):g===p.DEFAULT?o+=d(G(D(i))):o+=d(i)}else o+=d(i);r||n||s||(u=i,l=g)}return{transformed:o,runningContext:t}}function ne(e){let t=String.raw`\(\?:\)`;return e=h(e,`(?:${t}){2,}`,"(?:)",E.DEFAULT),e=h(e,String.raw`^${t}(?![?*+{])|${t}$|${t}(?=[()|$\\])|(?<=[()|>^]|\(\?(?:[:=!]|<[=!]))${t}`,"",E.DEFAULT),e}function oe(e){if(!L(e,"\\(\\?>",E.DEFAULT))return e;let t=new RegExp(String.raw`(?<noncapturingStart>${O})|(?<capturingStart>\((?:\?<[^>]+>)?)|(?<backrefNum>\\[1-9]\d*)|\\?.`,"gsu"),r="(?>",n="(?:(?=(",s=0,a=0,o=NaN,u;do{u=!1;let l=0,f=0,d=!1,i;for(t.lastIndex=Number.isNaN(o)?0:o+n.length;i=t.exec(e);){let{0:m,index:g,groups:{backrefNum:T,capturingStart:w,noncapturingStart:_}}=i;if(m==="[")l++;else if(l)m==="]"&&l--;else if(m===r&&!d)o=g,d=!0;else if(d&&_)f++;else if(w)d&&f++,s++;else if(m===")"&&d){if(!f){a++,e=`${e.slice(0,o)}${n}${e.slice(o+r.length,g)}))\\k<$$${a+s}>)${e.slice(g+1)}`,u=!0;break}f--}else if(T)throw new Error(`Invalid decimal escape "${m}" in interpolated regex; cannot be used with atomic group`)}}while(u);return e=h(e,String.raw`\\k<\$\$(?<backrefNum>\d+)>`,({groups:{backrefNum:l}})=>`\\${l}`,E.DEFAULT),e}var 
se=String.raw`\\g<(?<subroutineName>[^>&]+)>`,Q=String.raw`\((?:(?!\?)|\?<(?![=!])(?<captureName>[^>]+)>)`,$=new RegExp(String.raw` | ||
${se} | ||
| (?<capturingStart>${Q}) | ||
`.replace(/\s+/g,""),"gsu");function ne(e,t){e=String(e);let n=!1,r=!1,o=!1,i="",s="",u="",a="",p=!1,c=(l,{prefix:g=!0,postfix:m=!1}={})=>(l=(p&&g?"(?:)":"")+l+(m?"(?:)":""),p=!1,l);for(let[l]of e.matchAll(we)){if(o){l===` | ||
`&&(o=!1,p=!0);continue}if(n){if(ee.test(l))continue;n=!1,p=!0}else if(r){if(te.test(l))continue;r=!1}i+=l,t=D(i,t);let{regexContext:g,charClassContext:m}=t;if(l==="-"&&g===f.CHAR_CLASS&&a===E.RANGE)throw new Error("Invalid unescaped hyphen as the end value for a range");if(g===f.DEFAULT&&/^(?:[?*+]|\?\?)$/.test(l)||g===f.INTERVAL_QUANTIFIER&&l==="{")s+=c(l,{prefix:!1,postfix:u==="("&&l==="?"});else if(g===f.DEFAULT)ee.test(l)?n=!0:l.startsWith("#")?o=!0:Ae.test(l)?s+=c(l[1],{prefix:!1}):s+=c(l);else if(g===f.CHAR_CLASS&&l!=="["&&l!=="[^")if(te.test(l)&&(m===E.DEFAULT||m===E.RANGE||m===E.Q_TOKEN))r=!0;else{if(m===E.INVALID_INCOMPLETE_TOKEN)throw new Error(`Invalid incomplete token in character class: "${l}"`);Ne.test(l)&&(m===E.DEFAULT||m===E.Q_TOKEN)?s+=c(l[1],{prefix:!1}):m===E.DEFAULT?s+=c(K(k(l))):s+=c(l)}else s+=c(l);n||r||o||(u=l,a=m)}return{transformed:s,runningContext:t}}function re(e){let t=String.raw`\(\?:\)`;return e=w(e,`(?:${t}){2,}`,"(?:)",d.DEFAULT),e=w(e,String.raw`^${t}(?![?*+{])|${t}$|${t}(?=[()|$\\])|(?<=[()|>^]|\(\?(?:[:=!]|<[=!]))${t}`,"",d.DEFAULT),e}function oe(e){if(!C(e,"\\(\\?>",d.DEFAULT))return e;let t=new RegExp(String.raw`(?<noncapturingStart>${R})|(?<capturingStart>\((?:\?<[^>]+>)?)|(?<backrefNum>\\[1-9]\d*)|\\?.`,"gsu"),n="(?>",r="(?:(?=(",o=0,i=0,s=NaN,u;do{u=!1;let a=0,p=0,c=!1,l;for(t.lastIndex=Number.isNaN(s)?0:s+r.length;l=t.exec(e);){let{0:g,index:m,groups:{backrefNum:T,capturingStart:S,noncapturingStart:L}}=l;if(g==="[")a++;else if(a)g==="]"&&a--;else if(g===n&&!c)s=m,c=!0;else if(c&&L)p++;else if(S)c&&p++,o++;else if(g===")"&&c){if(!p){i++,e=`${e.slice(0,s)}${r}${e.slice(s+n.length,m)}))\\k<$$${i+o}>)${e.slice(m+1)}`,u=!0;break}p--}else if(T)throw new Error(`Invalid decimal escape "${g}" in interpolated regex; cannot be used with atomic group`)}}while(u);return e=w(e,String.raw`\\k<\$\$(?<backrefNum>\d+)>`,({groups:{backrefNum:a}})=>`\\${a}`,d.DEFAULT),e}function ie(e){let t=ae(e,!0);return _e(Te(e,t),t)}var 
Se=String.raw`\\g<(?<subroutineName>[^>&]+)>`,U=new RegExp(String.raw` | ||
${Se} | ||
| (?<capturingStart>${P}) | ||
| \\(?<backrefNum>[1-9]\d*) | ||
| \\k<(?<backrefName>[^>]+)> | ||
| \\?. | ||
`.replace(/\s+/g,""),"gsu");function ie(e){if(!L(e,"\\\\g<",E.DEFAULT))return e;let t=Te(e),r=[0],n=we(e),s=0,a=0,o=new Map,u=[],l=0,f=e,d;for($.lastIndex=0;d=$.exec(f);){let{0:i,index:m,groups:{subroutineName:g,capturingStart:T,backrefNum:w,backrefName:_}}=d;if(i==="[")l++;else if(l)i==="]"&&l--;else{let C=o.size?o.get(H(u)):null;if(g){if(!t.has(g))throw new Error(`Invalid named capture referenced by subroutine ${i}`);if(o.has(g))throw new Error(`Subroutine ${i} followed a recursive reference`);let N=t.get(g),A=x(N)+1;a+=A;let b=`(${N})`;o.set(g,{contents:N,unclosedGroupCount:_e(b),numCaptures:A}),u.push(g),f=K(f,m,i,b),$.lastIndex-=i.length}else if(T)o.size?(i!=="("&&(f=K(f,m,i,"("),$.lastIndex-=i.length),r.push(H(r)+C.numCaptures)):(s++,r.length===s&&r.push(H(r)));else if(w){let N=+w,A;if(o.size?N>n&&(A=s+a-n-C.numCaptures):A=r[N],A){let b=`\\${N+A}`;f=K(f,m,i,b),$.lastIndex+=b.length-i.length}}else if(_){if(o.size){let N=!1;for(let A of u)if(L(o.get(A).contents,String.raw`\(\?<${_}>`,E.DEFAULT)){N=!0;break}if(N){let A=`\\${Se(e,_)}`;f=K(f,m,i,A),$.lastIndex-=i.length}}}else i===")"&&o.size&&(C.unclosedGroupCount--,C.unclosedGroupCount||o.delete(u.pop()))}}return f}function we(e){let t=new Set;I(e,se,({groups:{subroutineName:a}})=>{t.add(a)},E.DEFAULT);let r=0,n=0,s;for(;s=P(e,Q,n,E.DEFAULT);){let{0:a,index:o,groups:{captureName:u}}=s;if(t.has(u))break;r++,n=o+a.length}return r}function Se(e,t){let r=0,n=0,s;for(;s=P(e,Q,n,E.DEFAULT);){let{0:a,index:o,groups:{captureName:u}}=s;if(r++,u===t)break;n=o+a.length}return r}function K(e,t,r,n){return e.slice(0,t)+n+e.slice(t+r.length)}function Te(e){let t=new Map;return I(e,String.raw`\(\?<(?<captureName>[^>]+)>`,({0:r,index:n,groups:{captureName:s}})=>{t.has(s)||t.set(s,q(e,n+r.length))},E.DEFAULT),t}function _e(e){let t=0;return I(e,String.raw`\(`,()=>t++,E.DEFAULT),t}function H(e){return e[e.length-1]}var Le="&!#%,:;<=>@`~",Ce=new RegExp(String.raw` | ||
`.replace(/\s+/g,""),"gsu");function Te(e,t){if(!C(e,"\\\\g<",d.DEFAULT))return e;let n=[0],r=new Map,o=[],i=0,s=0,u=0,a=e,p;for(U.lastIndex=0;p=U.exec(a);){let{0:c,index:l,groups:{subroutineName:g,capturingStart:m,backrefNum:T,backrefName:S}}=p;if(c==="[")u++;else if(u)c==="]"&&u--;else{let L=r.size?r.get(Q(o)):null;if(g){if(!t.has(g))throw new Error(`Invalid named capture referenced by subroutine ${c}`);if(r.has(g))throw new Error(`Subroutine ${c} followed a recursive reference`);let A=t.get(g).contents,N=O(A)+1;s+=N;let h=`(${A})`;r.set(g,{contents:A,unclosedGroupCount:Ie(h),numCaptures:N}),o.push(g),a=y(a,l,c,h),U.lastIndex-=c.length}else if(m)r.size?(c!=="("&&(a=y(a,l,c,"("),U.lastIndex-=c.length),n.push(Q(n)+L.numCaptures)):(i++,n.length===i&&n.push(Q(n)));else if(T){let A=+T,N=0;if(r.size){let h=Le(e,o[0]);A>h&&(N=i+s-h-L.numCaptures)}else N=n[A];if(N){let h=`\\${A+N}`;a=y(a,l,c,h),U.lastIndex+=h.length-c.length}}else if(S){if(r.size){let A=!1;if(S===o[0])A=!0;else for(let N of o)if(C(r.get(N).contents,String.raw`\(\?<${S}>`,d.DEFAULT)){A=!0;break}if(A){let N=`\\${Ce(e,S)}`;a=y(a,l,c,N),U.lastIndex-=c.length}}}else c===")"&&r.size&&(L.unclosedGroupCount--,L.unclosedGroupCount||r.delete(o.pop()))}}return a}function _e(e,t){let n=F(e,String.raw`\(\?\(DEFINE\)`,0,d.DEFAULT);if(!n)return e;let r=se(e,n);if(r.afterPos<e.length)throw new Error("DEFINE group allowed only at the end of a regex");if(r.afterPos>e.length)throw new Error("DEFINE group is unclosed");let o=new RegExp(String.raw`${G}|\(\?:\)|(?<unsupported>\\?.)`,"gsu"),i;for(;i=o.exec(r.contents);){let{captureName:s,unsupported:u}=i.groups;if(s){let a=se(r.contents,i),p;if(!t.get(s).isUnique)p=s;else{let c=ae(a.contents);for(let l of c.keys())if(!t.get(l).isUnique){p=l;break}}if(p)throw new Error(`Duplicate group name "${p}" within DEFINE"`);o.lastIndex=a.afterPos;continue}if(u)throw new Error("DEFINE group includes unsupported syntax at top level")}return e.slice(0,n.index)}function se(e,t){let 
n=t.index+t[0].length,r=V(e,n),o=n+r.length+1;return{contents:r,afterPos:o}}function Le(e,t){let n=0,r=0,o;for(;o=F(e,P,r,d.DEFAULT);){let{0:i,index:s,groups:{captureName:u}}=o;if(u===t)break;n++,r=s+i.length}return n}function Ce(e,t){let n=0,r=0,o;for(;o=F(e,P,r,d.DEFAULT);){let{0:i,index:s,groups:{captureName:u}}=o;if(n++,u===t)break;r=s+i.length}return n}function y(e,t,n,r){return e.slice(0,t)+r+e.slice(t+n.length)}function ae(e,t){let n=new Map;return $(e,G,({0:r,index:o,groups:{captureName:i}})=>{n.has(i)?n.get(i).isUnique=!1:n.set(i,{isUnique:!0,...t?{contents:V(e,o+r.length)}:null})},d.DEFAULT),n}function Ie(e){let t=0;return $(e,String.raw`\(`,()=>t++,d.DEFAULT),t}function Q(e){return e[e.length-1]}var Re="&!#%,:;<=>@`~",De=new RegExp(String.raw` | ||
\[\^?-? | ||
| --?\] | ||
| (?<dp>[${R}])\k<dp> | ||
| (?<dp>[${I}])\k<dp> | ||
| -- | ||
| \\(?<vOnlyEscape>[${Le}]) | ||
| \\(?<vOnlyEscape>[${Re}]) | ||
| \\[pPu]\{[^}]+\} | ||
| \\?. | ||
`.replace(/\s+/g,""),"gsu");function ae(e,t){let r='Invalid unescaped "-" in character class',n=!1,s=!1,a="";for(let{0:o,groups:{dp:u,vOnlyEscape:l}}of e.matchAll(Ce)){if(o[0]==="["){if(n)throw new Error("Invalid nested character class when flag v not supported; possibly from interpolation");if(o.endsWith("-"))throw new Error(r);n=!0,s=o[1]==="^"}else if(o.endsWith("]")){if(o[0]==="-")throw new Error(r);n=s=!1}else if(n){if(o==="&&"||o==="--")throw new Error(`Invalid set operator "${o}" when flag v not supported`);if(u)throw new Error(`Invalid double punctuator "${o}", reserved by flag v`);if("(){}/|".includes(o))throw new Error(`Invalid unescaped "${o}" in character class`);if(s&&o.startsWith("\\P")&&t.includes("i"))throw new Error("Negated \\P in negated character class with flag i works differently with flag v");if(l){a+=l;continue}}a+=o}return a}var Ie=function(e,...t){let r=this instanceof Function?this:RegExp;if(Array.isArray(e?.raw))return v(r,{flags:""},e,...t);if((typeof e=="string"||e===void 0)&&!t.length)return v.bind(null,r,{flags:e});if({}.toString.call(e)==="[object Object]"&&!t.length)return v.bind(null,r,e);throw new Error(`Unexpected arguments: ${JSON.stringify([e,...t])}`)};function v(e,t,r,...n){let{flags:s="",postprocessors:a=[],__flagN:o=!0,__flagV:u=B,__flagX:l=!0,__rake:f=!0}=t;if(/[vu]/.test(s))throw new Error("Flags v/u cannot be explicitly added");l&&({template:r,values:n}=j(r,n,re)),o&&({template:r,values:n}=j(r,n,Y));let d=0,i="",m={};r.raw.forEach((T,w)=>{let _=r.raw[w]||r.raw[w+1];d+=x(T),i+=D(T,E.CHAR_CLASS),m=U(i,m);let{regexContext:C,charClassContext:N}=m;if(w<r.raw.length-1){let A=n[w];i+=Re(A,s,C,N,_,d),(A instanceof RegExp||A instanceof S)&&(d+=x(A.source||String(A)))}});let g=[...a,oe,ie];return u||g.push(ae),f&&g.push(ne),g.forEach(T=>i=T(i,s)),new e(i,(u?"v":"u")+s)}function Re(e,t,r,n,s,a){if(e instanceof RegExp&&r!==c.DEFAULT)throw new Error("Cannot interpolate a RegExp at this position because the syntax context does not 
match");if(r===c.INVALID_INCOMPLETE_TOKEN||n===p.INVALID_INCOMPLETE_TOKEN)throw new Error("Interpolation preceded by invalid incomplete token");let o=e instanceof S,u;if(!(e instanceof RegExp)){e=String(e),o||(u=M(e,r===c.CHAR_CLASS?E.CHAR_CLASS:E.DEFAULT));let l=J(u||e,r,n);if(l)throw new Error(`Unescaped stray "${l}" in the interpolated value would have side effects outside it`)}if(r===c.ENCLOSED_TOKEN||r===c.INTERVAL_QUANTIFIER||r===c.GROUP_NAME||n===p.ENCLOSED_TOKEN||n===p.Q_TOKEN)return o?e:u;if(r===c.CHAR_CLASS){if(o){if(L(e,"^-|^&&|-$|&&$"))throw new Error("Cannot use range or set operator at boundary of partial; move the operation into the partial or the operator outside of it");let l=Z(G(e));return V(e)?`[${l}]`:D(l)}return V(u)?`[${u}]`:u}if(e instanceof RegExp){let l=Oe(e,t),f=X(l.value,a);return l.usedModifier?f:`(?:${f})`}return o?`(?:${e})`:s?`(?:${u})`:u}function Oe(e,t){let r={i:null,m:null,s:null},n="\\n\\r\\u2028\\u2029",s=e.source;if(e.ignoreCase!==t.includes("i"))if(F)r.i=e.ignoreCase;else throw new Error("Pattern modifiers not supported, so the value of flag i on the interpolated RegExp must match the outer regex");if(e.dotAll!==t.includes("s")&&(F?r.s=e.dotAll:s=h(s,"\\.",e.dotAll?"[^]":`[^${n}]`,E.DEFAULT)),e.multiline!==t.includes("m")&&(F?r.m=e.multiline:(s=h(s,"\\^",e.multiline?`(?<=^|[${n}])`:"(?<![^])",E.DEFAULT),s=h(s,"\\$",e.multiline?`(?=$|[${n}])`:"(?![^])",E.DEFAULT))),F){let a=Object.keys(r),o=a.filter(l=>r[l]===!0).join(""),u=a.filter(l=>r[l]===!1).join("");if(u&&(o+=`-${u}`),o)return{value:`(?${o}:${s})`,usedModifier:!0}}return{value:s}}return Ee(Ue);})(); | ||
`.replace(/\s+/g,""),"gsu");function ue(e,t){let n='Invalid unescaped "-" in character class',r=!1,o=!1,i="";for(let{0:s,groups:{dp:u,vOnlyEscape:a}}of e.matchAll(De)){if(s[0]==="["){if(r)throw new Error("Invalid nested character class when flag v not supported; possibly from interpolation");if(s.endsWith("-"))throw new Error(n);r=!0,o=s[1]==="^"}else if(s.endsWith("]")){if(s[0]==="-")throw new Error(n);r=o=!1}else if(r){if(s==="&&"||s==="--")throw new Error(`Invalid set operator "${s}" when flag v not supported`);if(u)throw new Error(`Invalid double punctuator "${s}", reserved by flag v`);if("(){}/|".includes(s))throw new Error(`Invalid unescaped "${s}" in character class`);if(o&&s.startsWith("\\P")&&t.includes("i"))throw new Error("Negated \\P in negated character class with flag i works differently with flag v");if(a){i+=a;continue}}i+=s}return i}function Oe(e,...t){let n=this instanceof Function?this:RegExp;if(Array.isArray(e?.raw))return q(n,{flags:""},e,...t);if((typeof e=="string"||e===void 0)&&!t.length)return q.bind(null,n,{flags:e});if({}.toString.call(e)==="[object Object]"&&!t.length)return q.bind(null,n,e);throw new Error(`Unexpected arguments: ${JSON.stringify([e,...t])}`)}function q(e,t,n,...r){let{flags:o="",postprocessors:i=[],__flagN:s=!0,__flagV:u=B,__flagX:a=!0,__rake:p=!0}=t;if(/[vu]/.test(o))throw new Error("Flags v/u cannot be explicitly added");a&&({template:n,substitutions:r}=H(n,r,ne)),s&&({template:n,substitutions:r}=H(n,r,Y));let c=0,l="",g={};n.raw.forEach((T,S)=>{let L=!!(n.raw[S]||n.raw[S+1]);c+=O(T),l+=k(T,d.CHAR_CLASS),g=D(l,g);let{regexContext:A,charClassContext:N}=g;if(S<n.raw.length-1){let h=r[S];l+=Ue(h,o,A,N,L,c),h instanceof RegExp?c+=O(h.source):h instanceof _&&(c+=O(String(h)))}});let m=[...i,oe,ie];return u||m.push(ue),p&&m.push(re),m.forEach(T=>l=T(l,o)),new e(l,(u?"v":"u")+o)}function Ue(e,t,n,r,o,i){if(e instanceof RegExp&&n!==f.DEFAULT)throw new Error("Cannot interpolate a RegExp at this position because the syntax 
context does not match");if(n===f.INVALID_INCOMPLETE_TOKEN||r===E.INVALID_INCOMPLETE_TOKEN)throw new Error("Interpolation preceded by invalid incomplete token");let s=e instanceof _,u;if(!(e instanceof RegExp)){e=String(e),s||(u=j(e,n===f.CHAR_CLASS?d.CHAR_CLASS:d.DEFAULT));let a=J(u||e,n,r);if(a)throw new Error(`Unescaped stray "${a}" in the interpolated value would have side effects outside it`)}if(n===f.ENCLOSED_TOKEN||n===f.INTERVAL_QUANTIFIER||n===f.GROUP_NAME||r===E.ENCLOSED_TOKEN||r===E.Q_TOKEN)return s?e:u;if(n===f.CHAR_CLASS){if(s){if(C(e,"^-|^&&|-$|&&$"))throw new Error("Cannot use range or set operator at boundary of interpolated pattern; move the operation into the pattern or the operator outside of it");let a=Z(K(e));return v(e)?`[${a}]`:k(a)}return v(u)?`[${u}]`:u}if(e instanceof RegExp){let a=$e(e,t),p=X(a.value,i);return a.usedModifier?p:`(?:${p})`}return s?`(?:${e})`:o?`(?:${u})`:u}function $e(e,t){let n={i:null,m:null,s:null},r="\\n\\r\\u2028\\u2029",o=e.source;if(e.ignoreCase!==t.includes("i"))if(b)n.i=e.ignoreCase;else throw new Error("Pattern modifiers not supported, so the value of flag i on the interpolated RegExp must match the outer regex");if(e.dotAll!==t.includes("s")&&(b?n.s=e.dotAll:o=w(o,"\\.",e.dotAll?"[^]":`[^${r}]`,d.DEFAULT)),e.multiline!==t.includes("m")&&(b?n.m=e.multiline:(o=w(o,"\\^",e.multiline?`(?<=^|[${r}])`:"(?<![^])",d.DEFAULT),o=w(o,"\\$",e.multiline?`(?=$|[${r}])`:"(?![^])",d.DEFAULT))),b){let i=Object.keys(n),s=i.filter(a=>n[a]===!0).join(""),u=i.filter(a=>n[a]===!1).join("");if(u&&(s+=`-${u}`),s)return{value:`(?${s}:${o})`,usedModifier:!0}}return{value:o}}return de(Fe);})(); |
{ | ||
"name": "regex", | ||
"version": "2.1.0", | ||
"description": "Context-aware regex template tag with advanced features and best practices built-in", | ||
"version": "3.0.0", | ||
"description": "Regex template tag with extended syntax, context-aware interpolation, and always-on best practices", | ||
"author": "Steven Levithan", | ||
"license": "MIT", | ||
"type": "module", | ||
"exports": "./src/index.js", | ||
"exports": { | ||
".": { | ||
"types": "./types/index.d.ts", | ||
"import": "./src/index.js" | ||
} | ||
}, | ||
"scripts": { | ||
"build": "esbuild src/index.js --bundle --minify --outfile=dist/regex.min.js --global-name=Regex", | ||
"esbuild": "esbuild src/index.js --bundle --minify --outfile=dist/regex.min.js --global-name=Regex", | ||
"prebuild": "rimraf dist types", | ||
"build": "npm run esbuild && npm run types", | ||
"pretest": "npm run build", | ||
"test": "jasmine", | ||
"prepublish": "npm test" | ||
"types": "tsc src/index.js --rootDir src --declaration --allowJs --emitDeclarationOnly --outdir types", | ||
"prepare": "npm test" | ||
}, | ||
"files": [ | ||
"src", | ||
"dist" | ||
"dist", | ||
"types" | ||
], | ||
@@ -31,5 +40,7 @@ "repository": { | ||
"devDependencies": { | ||
"esbuild": "^0.21.5", | ||
"jasmine": "^5.1.0" | ||
"esbuild": "^0.23.0", | ||
"jasmine": "^5.2.0", | ||
"rimraf": "^6.0.1", | ||
"typescript": "^5.5.3" | ||
} | ||
} |
228
README.md
@@ -0,9 +1,16 @@ | ||
<div align="center"> | ||
<a href="https://github.com/slevithan/regex#readme"><img src="https://github.com/slevithan/regex/raw/main/regex-logo.svg" height="130" alt="regex logo"></a> | ||
`regex` creates **readable, high-performance, *native* JavaScript regular expressions** with advanced features and best practices built-in. It's lightweight (6.5 KB minified and brotlied) and supports all ES2024+ regex functionality. | ||
[![build status](https://github.com/slevithan/regex/workflows/CI/badge.svg)](https://github.com/slevithan/regex/actions) | ||
[![npm](https://img.shields.io/npm/v/regex)](https://www.npmjs.com/package/regex) | ||
[![bundle size](https://deno.bundlejs.com/badge?q=regex&treeshake=[*])](https://bundlejs.com/?q=regex&treeshake=[*]) | ||
</div> | ||
Highlights include support for free spacing and comments, atomic groups via `(?>…)` which can help you avoid [ReDoS](https://en.wikipedia.org/wiki/ReDoS), subroutines via `\g<name>` which enable powerful composition, and context-aware interpolation of `RegExp` instances, escaped strings, and partial patterns. | ||
`regex` is a template tag that extends JavaScript regular expressions with features that make them more powerful and dramatically more readable. It returns native `RegExp` instances that equal or exceed native performance. It's also lightweight, supports all ES2024+ regex features, and can be used as a [Babel plugin](https://github.com/slevithan/babel-plugin-transform-regex) to avoid any dependencies or runtime cost. | ||
With the `regex` package, JavaScript steps up as one of the very best regex flavors. | ||
Highlights include support for free spacing and comments, atomic groups via `(?>…)` that can help you avoid [ReDoS](https://en.wikipedia.org/wiki/ReDoS), subroutines via `\g<name>` that enable powerful composition, and context-aware interpolation of regexes, escaped strings, and partial patterns. | ||
With the `regex` package, JavaScript steps up as one of the best regex flavors alongside PCRE and Perl, possibly surpassing C++, Java, .NET, and Python. | ||
<details> | ||
@@ -19,2 +26,3 @@ <summary><b>Table of contents</b></summary> | ||
- [Subroutines](#subroutines) | ||
- [Definition groups](#definition-groups) | ||
- [Recursion](#recursion) | ||
@@ -34,2 +42,3 @@ - [Flags](#-flags) | ||
- [Compatibility](#-compatibility) | ||
- [FAQ](#-faq) | ||
</details> | ||
@@ -40,5 +49,5 @@ | ||
- **A modern regex baseline** so you don't need to continually opt-in to best practices. | ||
- Always-on flag <kbd>v</kbd> gives you the best level of Unicode support and strict errors. In environments without <kbd>v</kbd>, it uses flag <kbd>u</kbd> with <kbd>v</kbd>'s rules applied. | ||
- Always-on implicit flag <kbd>x</kbd> allows you to freely add whitespace and comments to your regexes. | ||
- Always-on implicit flag <kbd>n</kbd> (*named capture only* mode) improves regex readability and efficiency. | ||
- Always-on flag <kbd>v</kbd> gives you the best level of Unicode support and strict errors. In environments without native <kbd>v</kbd>, flag <kbd>u</kbd> is used with <kbd>v</kbd>'s rules applied. | ||
- Always-on flag <kbd>x</kbd> allows you to freely add whitespace and comments to your regexes. | ||
- Always-on flag <kbd>n</kbd> (*named capture only* mode) improves regex readability and efficiency. | ||
- No unreadable escaped backslashes `\\\\` since it's a raw string template tag. | ||
@@ -48,2 +57,3 @@ - **New regex syntax**. | ||
- Subroutines via `\g<name>` enable powerful composition, improving readability and maintainability. | ||
- Definition groups via `(?(DEFINE)…)` allow defining subpatterns for use by reference only. | ||
- Recursive matching is enabled by an extension. | ||
@@ -57,12 +67,18 @@ - **Context-aware and safe interpolation** of regexes, strings, and partial patterns. | ||
```js | ||
import {regex, partial} from 'regex'; | ||
import {regex, pattern} from 'regex'; | ||
// Subroutines | ||
const record = regex('gm')`^ | ||
Born: (?<date> \d{4}-\d{2}-\d{2} ) \n | ||
Admitted: \g<date> \n | ||
Released: \g<date> | ||
$`; | ||
// Definition group and subroutines | ||
const record = regex` | ||
^ Admitted:\ (?<admitted> \g<date>) \n | ||
Released:\ (?<released> \g<date>) $ | ||
// Atomic groups; avoid ReDoS from the nested, overlapping quantifier | ||
(?(DEFINE) | ||
(?<date> \g<year>-\g<month>-\g<day>) | ||
(?<year> \d{4}) | ||
(?<month> \d{2}) | ||
(?<day> \d{2}) | ||
) | ||
`; | ||
// Atomic group. Avoids ReDoS from the nested, overlapping quantifier | ||
const words = regex`^(?>\w+\s?)+$`; | ||
@@ -80,8 +96,9 @@ | ||
# This string is contextually sandboxed but not escaped | ||
${partial('^ a.b $')} | ||
${pattern('^ a.b $')} | ||
`; | ||
// Adjusts backreferences in interpolated regexes | ||
regex`^ ${/(dog)\1/} ${/(cat)\1/} $`; | ||
// → /^(dog)\1(cat)\2$/v | ||
// Adjusts numbered backreferences in interpolated regexes | ||
const double = /(\w)\1/; | ||
regex`^ ${double} ${double} $`; | ||
// → /^(\w)\1(\w)\2$/v | ||
``` | ||
@@ -96,3 +113,3 @@ | ||
```js | ||
import {regex, partial} from 'regex'; | ||
import {regex, pattern} from 'regex'; | ||
``` | ||
@@ -105,3 +122,3 @@ | ||
<script> | ||
const {regex, partial} = Regex; | ||
const {regex, pattern} = Regex; | ||
</script> | ||
@@ -119,13 +136,13 @@ ``` | ||
2. Named capture mode changes the meaning of `\k` when a named capture appears anywhere in a regex. | ||
3. Unicode mode with flag <kbd>u</kbd> adds strict errors (for unreserved letter escapes, octal escapes, escaped literal digits, and unescaped special characters in some contexts), switches to code-point-based matching (changing the potential handling of the dot, negated sets like `\W`, character class ranges, and quantifiers), changes the behavior of case-insensitive matching, and adds new features/syntax. | ||
4. UnicodeSets mode with flag <kbd>v</kbd>, an upgrade to <kbd>u</kbd>, changes escaping rules within character classes, fixes case-insensitive matching for doubly-negated `[^\P{…}]`, and adds new features/syntax. | ||
3. Unicode mode with flag <kbd>u</kbd> adds strict errors (for unreserved letter escapes, octal escapes, escaped literal digits, and unescaped special characters in some contexts), switches to code-point-based matching (changing the potential handling of the dot, negated sets like `\W`, character class ranges, and quantifiers), makes flag <kbd>i</kbd> use Unicode case-folding, and adds new features/syntax. | ||
4. UnicodeSets mode with flag <kbd>v</kbd> (an upgrade to <kbd>u</kbd>) incompatibly changes escaping rules within character classes, fixes case-insensitive matching for doubly-negated `[^\P{…}]`, and adds new features/syntax. | ||
</details> | ||
Additionally, JavaScript regex syntax is hard to write and even harder to read and refactor. But it doesn't have to be that way! With a few key features — raw multiline template strings, insignificant whitespace, comments, subroutines, interpolation, and *named capture only* mode — even long and complex regexes can be **beautiful, grammatical, and easy to understand**. | ||
Additionally, JavaScript regex syntax is hard to write and even harder to read and refactor. But it doesn't have to be that way! With a few key features — raw multiline strings, insignificant whitespace, comments, subroutines, definition groups, interpolation, and *named capture only* mode — even long and complex regexes can be beautiful, grammatical, and easy to understand. | ||
`regex` adds all of these features and returns native `RegExp` instances. It always uses flag <kbd>v</kbd> (already a best practice for new regexes) so you never forget to turn it on and don't have to worry about the differences in other parsing modes (and in environments without native flag <kbd>v</kbd>, it enforces <kbd>v</kbd>'s rules so your regexes are forward compatible). It supports atomic groups via `(?>…)` to help you improve the performance of your regexes and avoid catastrophic backtracking. And it gives you best-in-class, context-aware interpolation of `RegExp` instances, escaped strings, and partial patterns. | ||
`regex` adds all of these features and returns native `RegExp` instances. It always uses flag <kbd>v</kbd> (already a best practice for new regexes) so you never forget to turn it on and don't have to worry about the differences in other parsing modes (and, in environments without native flag <kbd>v</kbd>, it enforces <kbd>v</kbd>'s rules so your regexes are forward and backward compatible). It also supports atomic groups via `(?>…)` to help you improve the performance of your regexes and avoid catastrophic backtracking. And it gives you best-in-class, context-aware interpolation of `RegExp` instances, escaped strings, and partial patterns. | ||
## 🦾 New regex syntax | ||
Historically, JavaScript regexes were not as powerful as other major regex flavors like PCRE, Perl, .NET, Java, Ruby, and Python. With recent advancements and the `regex` package, those days are over. Modern JavaScript regexes have [significantly improved](https://github.com/slevithan/awesome-regex#javascript-regex-evolution) (adding lookbehind, named capture, Unicode properties, character class subtraction and intersection, etc.). The `regex` package, with its extended syntax and flags, adds the remaining pieces needed to compete with or surpass other major flavors. | ||
Historically, JavaScript regexes were not as powerful or readable as other major regex flavors like PCRE, Perl, Java, .NET, and Python. With recent advancements and the `regex` package, those days are over. Modern JavaScript regexes have [significantly improved](https://github.com/slevithan/awesome-regex#javascript-regex-evolution) (adding lookbehind, named capture, Unicode properties, character class subtraction and intersection, etc.). The `regex` package, with its extended syntax and implicit flags, adds the key remaining pieces needed to stand alongside or surpass other major flavors. | ||
@@ -194,8 +211,7 @@ ### Atomic groups | ||
// Matches a record with several date fields | ||
// Matches a record with several date fields, and captures each value | ||
regex` | ||
^ Name:\ (?<name>.*) \n | ||
Born:\ \g<date> \n | ||
Admitted:\ \g<date> \n | ||
Released:\ \g<date> $ | ||
^ Born:\ (?<born> \g<date>) \n | ||
Admitted:\ (?<admitted> \g<date>) \n | ||
Released:\ (?<released> \g<date>) $ | ||
@@ -211,2 +227,4 @@ # Define subpatterns | ||
See the next section on definition groups for another way to do this. | ||
> [!NOTE] | ||
@@ -220,6 +238,42 @@ > Subroutines are based on the feature in PCRE and Perl. PCRE allows several syntax options including `\g<name>`, whereas Perl uses `(?&name)`. Ruby also supports subroutines (and uses the `\g<name>` syntax), but it has behavior differences that make its subroutines not always act as independent subpatterns. | ||
- If there are [duplicate capture names](https://github.com/tc39/proposal-duplicate-named-capturing-groups), subroutines refer to the first instance of the given group (matching the behavior of PCRE and Perl). | ||
- Although subroutines can be chained to any depth, a descriptive error is thrown if they're used recursively. Support for recursion can be added via an extension (see the next section). | ||
- Although subroutines can be chained to any depth, a descriptive error is thrown if they're used recursively. Support for recursion can be added via an extension (see [*Recursion*](#recursion)). | ||
- As with all new syntax in `regex`, subroutines are applied after interpolation, giving them maximal flexibility. | ||
</details> | ||
### Definition groups | ||
The syntax `(?(DEFINE)…)` can be used at the end of a regex to define subpatterns for use by reference only. Compared to the `(…){0}` syntax described in the preceding section on subroutines, definition groups have the advantage that the named groups within them don't appear on a match's `groups` object. | ||
Example: | ||
```js | ||
const record = 'Admitted: 2024-01-01\nReleased: 2024-01-02'; | ||
const match = regex` | ||
^ Admitted:\ (?<admitted> \g<date>) \n | ||
Released:\ (?<released> \g<date>) $ | ||
(?(DEFINE) | ||
(?<date> \g<year>-\g<month>-\g<day>) | ||
(?<year> \d{4}) | ||
(?<month> \d{2}) | ||
(?<day> \d{2}) | ||
) | ||
`.exec(record); | ||
console.log(match.groups); | ||
// → {admitted: '2024-01-01', released: '2024-01-02'} | ||
``` | ||
> [!NOTE] | ||
> Definition groups are based on the feature in PCRE and Perl. However, `regex` supports a stricter version of definition groups since it limits their placement, quantity, and the top-level syntax that can be used within them. | ||
<details> | ||
<summary>👉 <b>Show more details</b></summary> | ||
- Only one definition group is allowed per regex, and it must appear at the end of its pattern. Trailing whitespace and comments are allowed by implicit flag <kbd>x</kbd>. | ||
- At the top level of definition groups, only named groups, whitespace, and comments are allowed. | ||
- Within definition groups, all named groups must use unique names, and all are excluded from the `groups` object of resulting matches. | ||
- The word `DEFINE` must appear in uppercase. | ||
</details> | ||
### Recursion | ||
@@ -255,3 +309,3 @@ | ||
> In environments without native support for flag <kbd>v</kbd>, flag <kbd>u</kbd> is automatically used as a fallback and flag <kbd>v</kbd>'s rules are enforced so your regexes are forward compatible. | ||
> In environments without native support for flag <kbd>v</kbd>, flag <kbd>u</kbd> is automatically used instead while still enforcing flag <kbd>v</kbd>'s rules. So your regexes are forward and backward compatible. | ||
@@ -281,4 +335,4 @@ ### Flag `x` | ||
# Partials are directly embedded, so they use free spacing | ||
${partial`\d + | [a - z]`} | ||
# Patterns are directly embedded, so they use free spacing | ||
${pattern`\d + | [a - z]`} | ||
@@ -319,3 +373,3 @@ # Interpolated regexes use their own flags, so they preserve their whitespace | ||
> [!NOTE] | ||
> Flag <kbd>n</kbd> is based on .NET, C++, PCRE, Perl, and XRegExp, which share the <kbd>n</kbd> flag letter but call it *explicit capture*, *no auto capture*, or *nosubs*. In `regex`, the implicit flag <kbd>n</kbd> also prevents using numbered backreferences to refer to named groups in the outer regex, which follows the behavior of C++ (Ruby also prevents this even without flag <kbd>n</kbd>). Referring to named groups by number is a footgun, and the way that named groups are numbered is inconsistent across regex flavors. | ||
> Flag <kbd>n</kbd> is based on .NET, C++, PCRE, Perl, and XRegExp, which share the <kbd>n</kbd> flag letter but call it *explicit capture*, *no auto capture*, or *nosubs*. In `regex`, the implicit flag <kbd>n</kbd> also prevents using numbered backreferences to refer to named groups in the outer regex, which follows the behavior of C++ (Ruby also always prevents this, despite not having flag <kbd>n</kbd>). Referring to named groups by number is a footgun, and the way that named groups are numbered is inconsistent across regex flavors. | ||
@@ -387,3 +441,3 @@ > Aside: Flag <kbd>n</kbd>'s behavior also enables `regex` to emulate atomic groups, subroutines, and recursion. | ||
These and other issues (including the effects of current and [future](https://github.com/tc39/proposal-regexp-x-mode) flags like `x`) make escaping without context unsafe to use at arbitrary positions in a regex, or at least complicated to get right. The existing popular regex escaping libraries don't even attempt to handle these kinds of issues. | ||
These and other issues (including the effects of current and potential future flags like <kbd>x</kbd>) make escaping without context unsafe to use at arbitrary positions in a regex, or at least complicated to get right. The existing popular regex escaping libraries don't even attempt to handle these kinds of issues. | ||
@@ -401,9 +455,9 @@ `regex` solves all of this via context awareness. So instead of remembering anything above, you should just switch to always safely escaping regex syntax via `regex`. | ||
For all of these cases, you can interpolate `partial(str)` to avoid escaping special characters in the string or creating an intermediary `RegExp` instance. You can also use `` partial`…` `` as a tag, as shorthand for ``partial(String.raw`…`)``. | ||
For all of these cases, you can interpolate `pattern(str)` to avoid escaping special characters in the string or creating an intermediary `RegExp` instance. You can also use `` pattern`…` `` as a tag, as shorthand for ``pattern(String.raw`…`)``. | ||
Apart from edge cases, `partial` just embeds the provided string or other value directly. But because it handles the edge cases, partial patterns can safely be interpolated anywhere in a regex without worrying about their meaning being changed by (or making unintended changes in meaning to) the surrounding pattern. | ||
Apart from edge cases, `pattern` just embeds the provided string or other value directly. But because it handles the edge cases, patterns can safely be interpolated anywhere in a regex without worrying about their meaning being changed by (or making unintended changes in meaning to) the surrounding pattern. | ||
> As with all interpolation in `regex`, partials are sandboxed and treated as complete units. This is relevant e.g. if a partial is followed by a quantifier, if it contains top-level alternation, or if it's bordered by a character class range or set operator. | ||
> As with all interpolation in `regex`, patterns are sandboxed and treated as complete units. This is relevant e.g. if a pattern is followed by a quantifier, if it contains top-level alternation, or if it's bordered by a character class range, subtraction, or intersection operator. | ||
If you want to understand the handling of partial patterns more deeply, let's look at some edge cases… | ||
If you want to understand the handling of interpolated patterns more deeply, let's look at some edge cases… | ||
@@ -416,4 +470,4 @@ <details> | ||
```js | ||
regex`[${partial`^`}]` | ||
regex`[a${partial`^`}]` | ||
regex`[${pattern`^`}]` | ||
regex`[a${pattern`^`}]` | ||
``` | ||
@@ -423,12 +477,12 @@ | ||
Both of these examples therefore match a literal `^`. They don't change the meaning of the surrounding character class. However, note that the `^` is not simply escaped. `` partial`^^` `` embedded in character class context would still correctly lead to an "invalid set operation" error due to the use of a reserved double-punctuator. | ||
Both of these examples therefore match a literal `^`. They don't change the meaning of the surrounding character class. However, note that the `^` is not simply escaped. `` pattern`^^` `` embedded in character class context would still correctly lead to an "invalid set operation" error due to the use of a reserved double-punctuator. | ||
> If you wanted to dynamically choose whether to negate a character class, you could put the whole character class inside the partial. | ||
> If you wanted to dynamically choose whether to negate a character class, you could put the whole character class inside the pattern. | ||
Moving on, the following lines all throw because otherwise the partial patterns would break out of their interpolation sandboxes and change the meaning of their surrounding patterns: | ||
Moving on, the following lines all throw because otherwise the embedded patterns would break out of their interpolation sandboxes and change the meaning of surrounding syntax: | ||
```js | ||
regex`(${partial(')')})` | ||
regex`[${partial(']')}]` | ||
regex`[${partial('a\\')}]]` | ||
regex`(${pattern(')')})` | ||
regex`[${pattern(']')}]` | ||
regex`[${pattern('a\\')}]]` | ||
``` | ||
@@ -439,12 +493,12 @@ | ||
```js | ||
regex`(${partial('()')})` | ||
regex`[\w--${partial('[_]')}]` | ||
regex`[${partial('\\\\')}]` | ||
regex`(${pattern('()')})` | ||
regex`[\w--${pattern('[_]')}]` | ||
regex`[${pattern('\\\\')}]` | ||
``` | ||
Partials can be embedded within any token scope: | ||
Patterns can be embedded within any token scope: | ||
```js | ||
// Not using `partial` for values that are not escaped anyway, but the behavior | ||
// would be the same if providing a partial | ||
// Not using `pattern` for values that are not escaped anyway, but the behavior | ||
// would be the same if you did | ||
regex`.{1,${6}}` | ||
@@ -461,6 +515,6 @@ regex`\p{${'Letter'}}` | ||
```js | ||
// Not using `partial` for values that are not escaped anyway | ||
// Not using `pattern` for values that are not escaped anyway | ||
/* 1.*/ regex`\u${'000A'}` | ||
/* 2.*/ regex`\u{${partial`A}`}` | ||
/* 3.*/ regex`(${partial`?:`}…)` | ||
/* 2.*/ regex`\u{${pattern`A}`}` | ||
/* 3.*/ regex`(${pattern`?:`}…)` | ||
``` | ||
@@ -471,6 +525,6 @@ | ||
1. This is an uncompleted `\u` token (which is an error) followed by the tokens `0`, `0`, `0`, `A`. That's because the interpolation doesn't happen within an enclosed `\u{…}` context. | ||
2. The unescaped `}` within the partial is not allowed to break out of its interpolation sandbox. | ||
2. The unescaped `}` within the interpolated pattern is not allowed to break out of its sandbox. | ||
3. The group opening `(` can't be quantified with `?`. | ||
> Characters outside the interpolation such as a preceding, unescaped `\` or an escaped number also can't change the meaning of tokens inside the partial. | ||
> Characters outside the interpolation such as a preceding, unescaped `\` or an escaped number also can't change the meaning of tokens inside the embedded pattern. | ||
@@ -481,3 +535,3 @@ And since interpolated values are handled as complete units, consider the following: | ||
// This works fine | ||
regex`[\0-${partial`\cZ`}]` | ||
regex`[\0-${pattern`\cZ`}]` | ||
@@ -500,13 +554,15 @@ // But this is an error since you can't create a range from 'a' to the set 'de' | ||
// Instead of | ||
new RegExp(`^(?:${arr.map(RegExp.escape).join('|')})$`) | ||
new RegExp(`^(?:${ | ||
arr.map(RegExp.escape).join('|') | ||
})$`) | ||
// You can say | ||
regex`^${partial( | ||
regex`^${pattern( | ||
arr.map(a => regex`${a}`.source).join('|') | ||
)}$` | ||
// And you could add your own sugar that returns a partial | ||
// And you could add your own sugar that returns a `pattern` value | ||
regex`^${anyOfEscaped(arr)}$` | ||
// You could do the same thing without `partial` by calling `regex` as a | ||
// You could do the same thing without `pattern` by calling `regex` as a | ||
// function instead of using it with backticks, then assembling the arguments | ||
@@ -518,2 +574,4 @@ // list dynamically and holding your nose | ||
> Implementation note: `pattern` returns an object with a custom `toString` that simply returns `String(value)`. So, if you wanted to, you could use it anywhere values are coerced to strings. | ||
### Interpolation principles | ||
@@ -536,3 +594,3 @@ | ||
<th>String / coerced</th> | ||
<th>Partial pattern</th> | ||
<th>Pattern</th> | ||
<th>RegExp</th> | ||
@@ -571,5 +629,5 @@ </tr> | ||
- *Atomized* means that something is treated as a complete unit; it isn't related to the *atomic groups* feature. Example: In default context, `${x}*` matches any number of the value specified by `x`, and not just its last token. In character class context, set operators (union, subtraction, intersection) apply to the entire atom. | ||
- *Atomized* means that the value is treated as a complete unit; it isn't related to the *atomic groups* feature. Example: In default context, `${x}*` matches any number of the value specified by `x`, and not just its last token. In character class context, subtraction and intersection operators apply to the entire atom. | ||
- *Sandboxed* means that the value can't change the meaning or error status of characters outside of the interpolation, and vice versa. | ||
- Character classes have a sub-context on the borders of ranges, explained in [*Interpolating partial patterns*](#interpolating-partial-patterns). Only one character node (ex: `a` or `\u0061`) can be interpolated at these positions. | ||
- Character classes have a sub-context on the borders of ranges. Only one character node (ex: `a` or `\u0061`) can be interpolated at these positions. | ||
@@ -580,3 +638,3 @@ > The implementation details vary for how `regex` accomplishes sandboxing and atomization, based on the details of the specific pattern. But the concepts should always hold up. | ||
`regex` transpiles its input to native `RegExp` instances. Therefore regexes built with `regex` perform just as fast as native regular expressions. | ||
`regex` transpiles its input to native `RegExp` instances. Therefore regexes created by `regex` perform just as fast as native regular expressions. `regex` calls can also be transpiled via a [Babel plugin](https://github.com/slevithan/babel-plugin-transform-regex), avoiding the tiny overhead of transpiling at runtime. | ||
@@ -587,15 +645,37 @@ For regexes that rely on or have the potential to trigger heavy backtracking, you can dramatically improve beyond native performance via the [atomic groups](#atomic-groups) feature built into `regex`. | ||
If you want `regex` to use a `RegExp` subclass or other constructor, you can do so by modifying `this`: `` regex.bind(RegExpSubclass)`…` ``. | ||
`regex` uses flag <kbd>v</kbd> (`unicodeSets`) when it's supported natively. Flag <kbd>v</kbd> is supported by 2023-era browsers ([compat table](https://caniuse.com/mdn-javascript_builtins_regexp_unicodesets)) and Node.js 20. When <kbd>v</kbd> isn't available, flag <kbd>u</kbd> is automatically used instead (while still enforcing <kbd>v</kbd>'s rules), which extends support to Node.js 14 and 2020-era browsers (2017-era with a build step that transpiles private class fields, the string `matchAll` method, and the `?.` operator). | ||
Following are edge cases that rely on modern JavaScript features: | ||
The following edge cases rely on modern JavaScript features: | ||
- `regex` uses flag <kbd>v</kbd> (`unicodeSets`), which has had universal browser support since ~mid-2023 ([compat table](https://caniuse.com/mdn-javascript_builtins_regexp_unicodesets)) and is available in Node.js 20+. In environments without native flag <kbd>v</kbd>, flag <kbd>u</kbd> is automatically used as a fallback while enforcing <kbd>v</kbd>'s rules, which extends support backward to Node.js 12+ and old browsers. | ||
- Note that `regex` generates nested character classes (which require native flag <kbd>v</kbd>) when interpolating more than one token at a time *inside character classes*. A descriptive error is thrown when this isn't supported, which you can avoid by not interpolating multi-token partials/strings into character classes. | ||
- Using an interpolated `RegExp` instance with a different value for flag <kbd>i</kbd> than its outer regex relies on [regex modifiers](https://github.com/tc39/proposal-regexp-modifiers), a bleeding-edge feature available in Chrome, Edge, and Opera 125+. A descriptive error is thrown in environments without support, which you can avoid by aligning the use of flag <kbd>i</kbd> on inner and outer regexes. Local-only application of other flags doesn't rely on this feature. | ||
- To ensure atomization, `regex` uses nested character classes (which require native flag <kbd>v</kbd>) when interpolating more than one token at a time *inside character classes*. A descriptive error is thrown when this isn't supported, which you can avoid by not interpolating multi-token patterns or strings into character classes. | ||
- Using an interpolated `RegExp` instance with a different value for flag <kbd>i</kbd> than its outer regex relies on [regex modifiers](https://github.com/tc39/proposal-regexp-modifiers), a bleeding-edge feature available in Chrome/Edge 125 and Opera 111. A descriptive error is thrown in environments without support, which you can avoid by aligning the use of flag <kbd>i</kbd> on inner and outer regexes. Local-only application of other flags doesn't rely on this feature. | ||
## 🙋 FAQ | ||
<details> | ||
<summary><b>How are you comparing regex flavors?</b></summary> | ||
The claim that JavaScript with the `regex` package is among the best regex flavors is based on a holistic view. Following are some of the aspects considered: | ||
1. **Performance:** An important aspect, but not the main one since mature regex implementations are generally pretty fast. JavaScript is strong on regex performance (at least considering V8's Irregexp engine and JavaScriptCore), but it uses a backtracking engine that is missing any syntax for backtracking control—a major limitation that makes ReDoS vulnerability more common. The `regex` package adds atomic groups to native JavaScript regexes, which is a solution to this problem and therefore can dramatically improve performance. | ||
2. **Support for advanced features** that enable easily creating patterns for common or important use cases: Here, JavaScript stepped up its game with ES2018 and ES2024. JavaScript is now best in class for some features like lookbehind (with its infinite-length support) and Unicode properties (with multicharacter "properties of strings", character class subtraction and intersection, and Script_Extensions). These features are either not supported or not as robust in many other flavors. | ||
3. **Ability to write readable and maintainable patterns:** Here, native JavaScript has long been the worst of the major flavors, since it lacks the `x` (extended) flag that allows insignificant whitespace and comments. The `regex` package not only adds `x` and turns it on by default, but it additionally adds regex subroutines (matched only by PCRE and Perl, although some other flavors have inferior versions) which enable powerful subpattern composition and reuse. And it includes context-aware interpolation of `RegExp` instances, escaped strings, and partial patterns, all of which can also help with composition and readability. | ||
</details> | ||
<details> | ||
<summary><b>Does <code>regex</code> support extensions?</b></summary> | ||
Yes. There are two approaches for this: | ||
1. **Alternative constructors:** If you want `regex` to use a `RegExp` subclass or other constructor, you can do so by modifying `this`: `` regex.bind(RegExpSubclass)`…` ``. The constructor is expected to accept two arguments (the pattern and flags) and return a `RegExp` instance. | ||
2. **Postprocessors:** `regex` can be called with an options object that includes an array of postprocessor functions. Ex: `` regex({flags: 'g', postprocessors: [myExtension]})`…` ``. Postprocessors are called in order after applying emulated flags and interpolation. They're called with two arguments (the pattern and flags) and are expected to return an updated pattern string. The final result is provided to the `RegExp` (or alternative) constructor. | ||
You can make extensions easier to use by wrapping the use of these features in your own function or template tag. See extension [`regex-recursion`](https://github.com/slevithan/regex-recursion) for an example of using all of these features. For a much simpler example of a postprocessor, see `regex`'s built-in `rakePostprocessor`. | ||
</details> | ||
## 🏷️ About | ||
`regex` was partly inspired by [`XRegExp`](https://github.com/slevithan/xregexp)`.tag` and [regexp-make-js](https://github.com/mikesamuel/regexp-make-js). `regex`'s only dependency is the ultra-lightweight [`regex-utilities`](https://github.com/slevithan/regex-utilities), which was separated so it can be reused by `regex` extensions. | ||
`regex` was partly inspired by [XRegExp](https://github.com/slevithan/xregexp)'s `.tag` and [regexp-make-js](https://github.com/mikesamuel/regexp-make-js). `regex`'s only dependency is the ultra-lightweight [`regex-utilities`](https://github.com/slevithan/regex-utilities), which was separated so it can be reused by `regex` extensions. | ||
Crafted by Steven Levithan with ❤︎ for regular expressions and their enthusiasts.<br> | ||
MIT License. |
import {Context, hasUnescaped, replaceUnescaped} from 'regex-utilities'; | ||
import {noncapturingStart} from './utils.js'; | ||
import {noncapturingDelim} from './utils.js'; | ||
/** | ||
@param {string} pattern | ||
@param {string} expression | ||
@returns {string} | ||
*/ | ||
export function atomicGroupsPostprocessor(pattern) { | ||
if (!hasUnescaped(pattern, '\\(\\?>', Context.DEFAULT)) { | ||
return pattern; | ||
export function atomicGroupsPostprocessor(expression) { | ||
if (!hasUnescaped(expression, '\\(\\?>', Context.DEFAULT)) { | ||
return expression; | ||
} | ||
const token = new RegExp(String.raw`(?<noncapturingStart>${noncapturingStart})|(?<capturingStart>\((?:\?<[^>]+>)?)|(?<backrefNum>\\[1-9]\d*)|\\?.`, 'gsu'); | ||
const token = new RegExp(String.raw`(?<noncapturingStart>${noncapturingDelim})|(?<capturingStart>\((?:\?<[^>]+>)?)|(?<backrefNum>\\[1-9]\d*)|\\?.`, 'gsu'); | ||
const aGDelim = '(?>'; | ||
@@ -26,3 +26,3 @@ const emulatedAGDelim = '(?:(?=('; | ||
token.lastIndex = Number.isNaN(aGPos) ? 0 : aGPos + emulatedAGDelim.length; | ||
while (match = token.exec(pattern)) { | ||
while (match = token.exec(expression)) { | ||
const {0: m, index: pos, groups: {backrefNum, capturingStart, noncapturingStart}} = match; | ||
@@ -46,7 +46,7 @@ if (m === '[') { | ||
aGCount++; | ||
// Replace pattern and use `\k<$$N>` as a temporary shield for the backref since | ||
// numbered backrefs are prevented separately | ||
pattern = `${pattern.slice(0, aGPos)}${emulatedAGDelim}${ | ||
pattern.slice(aGPos + aGDelim.length, pos) | ||
}))\\k<$$${aGCount + capturingGroupCount}>)${pattern.slice(pos + 1)}`; | ||
// Replace `expression` and use `\k<$$N>` as a temporary shield for the backref | ||
// since numbered backrefs are prevented separately | ||
expression = `${expression.slice(0, aGPos)}${emulatedAGDelim}${ | ||
expression.slice(aGPos + aGDelim.length, pos) | ||
}))\\k<$$${aGCount + capturingGroupCount}>)${expression.slice(pos + 1)}`; | ||
hasProcessedAG = true; | ||
@@ -72,4 +72,4 @@ break; | ||
// Replace `\k<$$N>` added as a shield from the check for invalid numbered backrefs | ||
pattern = replaceUnescaped( | ||
pattern, | ||
expression = replaceUnescaped( | ||
expression, | ||
String.raw`\\k<\$\$(?<backrefNum>\d+)>`, | ||
@@ -79,3 +79,3 @@ ({groups: {backrefNum}}) => `\\${backrefNum}`, | ||
); | ||
return pattern; | ||
return expression; | ||
} |
@@ -17,7 +17,7 @@ import {doublePunctuatorChars} from './utils.js'; | ||
Assumes flag u and doesn't worry about syntax errors that are caught by it. | ||
@param {string} pattern | ||
@param {string} expression | ||
@param {string} flags | ||
@returns {string} | ||
*/ | ||
export function backcompatPostprocessor(pattern, flags) { | ||
export function backcompatPostprocessor(expression, flags) { | ||
const unescapedLiteralHyphenMsg = 'Invalid unescaped "-" in character class'; | ||
@@ -27,3 +27,3 @@ let inCharClass = false; | ||
let result = ''; | ||
for (const {0: m, groups: {dp, vOnlyEscape}} of pattern.matchAll(token)) { | ||
for (const {0: m, groups: {dp, vOnlyEscape}} of expression.matchAll(token)) { | ||
if (m[0] === '[') { | ||
@@ -30,0 +30,0 @@ if (inCharClass) { |
@@ -1,5 +0,5 @@ | ||
import {RegexContext, getEndContextForIncompletePattern, noncapturingStart} from './utils.js'; | ||
import {RegexContext, getEndContextForIncompleteExpression, noncapturingDelim} from './utils.js'; | ||
const token = new RegExp(String.raw` | ||
${noncapturingStart} | ||
${noncapturingDelim} | ||
| \(\?< | ||
@@ -10,10 +10,10 @@ | (?<backrefNum>\\[1-9]\d*) | ||
// Applied to the outer regex and interpolated partials, but not interpolated regexes or strings | ||
// Applied to the outer regex and interpolated patterns, but not interpolated regexes or strings | ||
export function flagNPreprocessor(value, runningContext) { | ||
value = String(value); | ||
let pattern = ''; | ||
let expression = ''; | ||
let transformed = ''; | ||
for (const {0: m, groups: {backrefNum}} of value.matchAll(token)) { | ||
pattern += m; | ||
runningContext = getEndContextForIncompletePattern(pattern, runningContext); | ||
expression += m; | ||
runningContext = getEndContextForIncompleteExpression(expression, runningContext); | ||
const {regexContext} = runningContext; | ||
@@ -20,0 +20,0 @@ if (regexContext === RegexContext.DEFAULT) { |
import {Context, replaceUnescaped} from 'regex-utilities'; | ||
import {CharClassContext, RegexContext, doublePunctuatorChars, getEndContextForIncompletePattern, noncapturingStart, sandboxLoneDoublePunctuatorChar, sandboxUnsafeNulls} from './utils.js'; | ||
import {CharClassContext, RegexContext, doublePunctuatorChars, getEndContextForIncompleteExpression, noncapturingDelim, sandboxLoneDoublePunctuatorChar, sandboxUnsafeNulls} from './utils.js'; | ||
@@ -17,3 +17,3 @@ const ws = /^\s$/; | ||
| \[\^ | ||
| ${noncapturingStart} | ||
| ${noncapturingDelim} | ||
| \(\?< | ||
@@ -25,3 +25,3 @@ | (?<dp>[${doublePunctuatorChars}])\k<dp> | ||
// Applied to the outer regex and interpolated partials, but not interpolated regexes or strings | ||
// Applied to the outer regex and interpolated patterns, but not interpolated regexes or strings | ||
export function flagXPreprocessor(value, runningContext) { | ||
@@ -32,3 +32,3 @@ value = String(value); | ||
let ignoringComment = false; | ||
let pattern = ''; | ||
let expression = ''; | ||
let transformed = ''; | ||
@@ -64,4 +64,4 @@ let lastSignificantToken = ''; | ||
pattern += m; | ||
runningContext = getEndContextForIncompletePattern(pattern, runningContext); | ||
expression += m; | ||
runningContext = getEndContextForIncompleteExpression(expression, runningContext); | ||
const {regexContext, charClassContext} = runningContext; | ||
@@ -139,6 +139,6 @@ if ( | ||
// Remove `(?:)` separators (most likely added by flag x) in cases where it's safe to do so | ||
export function rakePostprocessor(pattern) { | ||
export function rakePostprocessor(expression) { | ||
const sep = String.raw`\(\?:\)`; | ||
// No need for repeated separators | ||
pattern = replaceUnescaped(pattern, `(?:${sep}){2,}`, '(?:)', Context.DEFAULT); | ||
expression = replaceUnescaped(expression, `(?:${sep}){2,}`, '(?:)', Context.DEFAULT); | ||
// No need for separators at: | ||
@@ -149,4 +149,4 @@ // - The beginning, if not followed by a quantifier. | ||
// - After one of `()|>^`, `(?:`, or a lookaround opening. | ||
pattern = replaceUnescaped( | ||
pattern, | ||
expression = replaceUnescaped( | ||
expression, | ||
String.raw`^${sep}(?![?*+{])|${sep}$|${sep}(?=[()|$\\])|(?<=[()|>^]|\(\?(?:[:=!]|<[=!]))${sep}`, | ||
@@ -156,3 +156,3 @@ '', | ||
); | ||
return pattern; | ||
return expression; | ||
} |
108
src/index.js
import {Context, hasUnescaped, replaceUnescaped} from 'regex-utilities'; | ||
import {CharClassContext, RegexContext, adjustNumberedBackrefs, containsCharClassUnion, countCaptures, escapeV, flagVSupported, getBreakoutChar, getEndContextForIncompletePattern, patternModsSupported, preprocess, sandboxLoneCharClassCaret, sandboxLoneDoublePunctuatorChar, sandboxUnsafeNulls} from './utils.js'; | ||
import {CharClassContext, RegexContext, adjustNumberedBackrefs, containsCharClassUnion, countCaptures, escapeV, flagVSupported, getBreakoutChar, getEndContextForIncompleteExpression, patternModsSupported, preprocess, sandboxLoneCharClassCaret, sandboxLoneDoublePunctuatorChar, sandboxUnsafeNulls} from './utils.js'; | ||
import {flagNPreprocessor} from './flag-n.js'; | ||
import {flagXPreprocessor, rakePostprocessor} from './flag-x.js'; | ||
import {PartialPattern, partial} from './partial.js'; | ||
import {Pattern, pattern} from './pattern.js'; | ||
import {atomicGroupsPostprocessor} from './atomic-groups.js'; | ||
@@ -13,3 +13,3 @@ import {subroutinesPostprocessor} from './subroutines.js'; | ||
@prop {string} [flags] | ||
@prop {Array<(pattern: string, flags: string) => string>} [postprocessors] | ||
@prop {Array<(expression: string, flags: string) => string>} [postprocessors] | ||
@prop {boolean} [__flagN] | ||
@@ -22,18 +22,25 @@ @prop {boolean} [__flagV] | ||
/** | ||
Template tag for constructing a UnicodeSets-mode RegExp with advanced features and context-aware | ||
interpolation of regexes, escaped strings, and partial patterns. | ||
Template tag for constructing a regex with advanced features and context-aware interpolation of | ||
regexes, strings, and patterns. | ||
Can be called in multiple ways: | ||
1. `` regex`…` `` - Regex pattern as a raw string. | ||
2. `` regex('gis')`…` `` - To specify flags. | ||
3. `` regex({flags: 'gis'})`…` `` - With options. | ||
2. `` regex('gi')`…` `` - To specify flags. | ||
3. `` regex({flags: 'gi'})`…` `` - With options. | ||
4. `` regex.bind(RegExpSubclass)`…` `` - With a `this` that specifies a different constructor. | ||
@type {{ | ||
(flags?: string) => (TemplateStringsArray, ...values) => RegExp; | ||
(options: RegexTagOptions) => (TemplateStringsArray, ...values) => RegExp; | ||
(template: TemplateStringsArray, ...values) => RegExp; | ||
}} | ||
@overload | ||
@param {TemplateStringsArray} template | ||
@param {...any} substitutions | ||
@returns {RegExp} | ||
@overload | ||
@param {string} [flags] | ||
@returns {(template: TemplateStringsArray, ...substitutions: any[]) => RegExp} | ||
@overload | ||
@param {RegexTagOptions} options | ||
@returns {(template: TemplateStringsArray, ...substitutions: any[]) => RegExp} | ||
*/ | ||
const regex = function(first, ...values) { | ||
function regex(first, ...substitutions) { | ||
// Allow binding to other constructors | ||
@@ -43,22 +50,22 @@ const constructor = this instanceof Function ? this : RegExp; | ||
if (Array.isArray(first?.raw)) { | ||
return fromTemplate(constructor, {flags: ''}, first, ...values); | ||
return fromTemplate(constructor, {flags: ''}, first, ...substitutions); | ||
// Given flags | ||
} else if ((typeof first === 'string' || first === undefined) && !values.length) { | ||
} else if ((typeof first === 'string' || first === undefined) && !substitutions.length) { | ||
return fromTemplate.bind(null, constructor, {flags: first}); | ||
// Given an options object | ||
} else if ({}.toString.call(first) === '[object Object]' && !values.length) { | ||
} else if ({}.toString.call(first) === '[object Object]' && !substitutions.length) { | ||
return fromTemplate.bind(null, constructor, first); | ||
} | ||
throw new Error(`Unexpected arguments: ${JSON.stringify([first, ...values])}`); | ||
}; | ||
throw new Error(`Unexpected arguments: ${JSON.stringify([first, ...substitutions])}`); | ||
} | ||
/** | ||
Makes a UnicodeSets-mode RegExp from a template and values to fill the template holes. | ||
@param {RegExpConstructor | (pattern: string, flags: string) => RegExp} constructor | ||
Returns a UnicodeSets-mode RegExp from a template and substitutions to fill the template holes. | ||
@param {new (expression: string, flags?: string) => RegExp} constructor | ||
@param {RegexTagOptions} options | ||
@param {TemplateStringsArray} template | ||
@param {...any} values | ||
@param {...any} substitutions | ||
@returns {RegExp} | ||
*/ | ||
function fromTemplate(constructor, options, template, ...values) { | ||
function fromTemplate(constructor, options, template, ...substitutions) { | ||
const { | ||
@@ -79,14 +86,14 @@ flags = '', | ||
if (__flagX) { | ||
({template, values} = preprocess(template, values, flagXPreprocessor)); | ||
({template, substitutions} = preprocess(template, substitutions, flagXPreprocessor)); | ||
} | ||
if (__flagN) { | ||
({template, values} = preprocess(template, values, flagNPreprocessor)); | ||
({template, substitutions} = preprocess(template, substitutions, flagNPreprocessor)); | ||
} | ||
let precedingCaptures = 0; | ||
let pattern = ''; | ||
let expression = ''; | ||
let runningContext = {}; | ||
// Intersperse template raw strings and values | ||
// Intersperse template raw strings and substitutions | ||
template.raw.forEach((raw, i) => { | ||
const wrapEscapedStr = template.raw[i] || template.raw[i + 1]; | ||
const wrapEscapedStr = !!(template.raw[i] || template.raw[i + 1]); | ||
// Even with flag n enabled, we might have named captures | ||
@@ -96,10 +103,12 @@ precedingCaptures += countCaptures(raw); | ||
// cases a following interpolated value would always be atomized | ||
pattern += sandboxUnsafeNulls(raw, Context.CHAR_CLASS); | ||
runningContext = getEndContextForIncompletePattern(pattern, runningContext); | ||
expression += sandboxUnsafeNulls(raw, Context.CHAR_CLASS); | ||
runningContext = getEndContextForIncompleteExpression(expression, runningContext); | ||
const {regexContext, charClassContext} = runningContext; | ||
if (i < template.raw.length - 1) { | ||
const value = values[i]; | ||
pattern += interpolate(value, flags, regexContext, charClassContext, wrapEscapedStr, precedingCaptures); | ||
if (value instanceof RegExp || value instanceof PartialPattern) { | ||
precedingCaptures += countCaptures(value.source || String(value)); | ||
const substitution = substitutions[i]; | ||
expression += interpolate(substitution, flags, regexContext, charClassContext, wrapEscapedStr, precedingCaptures); | ||
if (substitution instanceof RegExp) { | ||
precedingCaptures += countCaptures(substitution.source); | ||
} else if (substitution instanceof Pattern) { | ||
precedingCaptures += countCaptures(String(substitution)); | ||
} | ||
@@ -116,6 +125,15 @@ } | ||
} | ||
pp.forEach(pp => pattern = pp(pattern, flags)); | ||
return new constructor(pattern, (__flagV ? 'v' : 'u') + flags); | ||
pp.forEach(pp => expression = pp(expression, flags)); | ||
return new constructor(expression, (__flagV ? 'v' : 'u') + flags); | ||
} | ||
/** | ||
@param {any} value | ||
@param {string} flags | ||
@param {string} regexContext | ||
@param {string} charClassContext | ||
@param {boolean} wrapEscapedStr | ||
@param {number} precedingCaptures | ||
@returns {string} | ||
*/ | ||
function interpolate(value, flags, regexContext, charClassContext, wrapEscapedStr, precedingCaptures) { | ||
@@ -130,7 +148,7 @@ if (value instanceof RegExp && regexContext !== RegexContext.DEFAULT) { | ||
} | ||
const isPartial = value instanceof PartialPattern; | ||
const isPattern = value instanceof Pattern; | ||
let escapedValue; | ||
if (!(value instanceof RegExp)) { | ||
value = String(value); | ||
if (!isPartial) { | ||
if (!isPattern) { | ||
escapedValue = escapeV( | ||
@@ -141,3 +159,3 @@ value, | ||
} | ||
// Check escaped values (not just partials) since possible breakout char `>` isn't escaped | ||
// Check `escapedValue` (not just patterns) since possible breakout char `>` isn't escaped | ||
const breakoutChar = getBreakoutChar(escapedValue || value, regexContext, charClassContext); | ||
@@ -156,9 +174,9 @@ if (breakoutChar) { | ||
) { | ||
return isPartial ? value : escapedValue; | ||
return isPattern ? value : escapedValue; | ||
} else if (regexContext === RegexContext.CHAR_CLASS) { | ||
if (isPartial) { | ||
if (isPattern) { | ||
if (hasUnescaped(value, '^-|^&&|-$|&&$')) { | ||
// Sandboxing so we don't change the chars outside the partial into being part of an | ||
// operation they didn't initiate. Same problem as starting a partial with a quantifier | ||
throw new Error('Cannot use range or set operator at boundary of partial; move the operation into the partial or the operator outside of it'); | ||
// Sandboxing so we don't change the chars outside the pattern into being part of an | ||
// operation they didn't initiate. Same problem as starting a pattern with a quantifier | ||
throw new Error('Cannot use range or set operator at boundary of interpolated pattern; move the operation into the pattern or the operator outside of it'); | ||
} | ||
@@ -180,3 +198,3 @@ const sandboxedValue = sandboxLoneCharClassCaret(sandboxLoneDoublePunctuatorChar(value)); | ||
} | ||
if (isPartial) { | ||
if (isPattern) { | ||
// Sandbox and atomize | ||
@@ -243,2 +261,2 @@ return `(?:${value})`; | ||
export {regex, partial}; | ||
export {regex, pattern}; |
import {Context, execUnescaped, forEachUnescaped, getGroupContents, hasUnescaped} from 'regex-utilities'; | ||
import {countCaptures} from './utils.js'; | ||
import {capturingDelim, countCaptures, namedCapturingDelim} from './utils.js'; | ||
/**
Postprocessor entry point: runs subroutine processing on the expression, then definition-group
processing on that result. Both steps share one map of the expression's named capturing groups,
collected up front from the untransformed input.
@param {string} expression
@returns {string}
*/
export function subroutinesPostprocessor(expression) {
  // `true` asks for each group's contents to be included in the map
  const groupsByName = getNamedCapturingGroups(expression, true);
  const withSubroutinesProcessed = processSubroutines(expression, groupsByName);
  return processDefinitionGroup(withSubroutinesProcessed, groupsByName);
}
// Explicitly exclude `&` from subroutine name chars because it's used by extension | ||
// `regex-recursion` for recursive subroutines via `\g<name&R=N>` | ||
const subroutinePattern = String.raw`\\g<(?<subroutineName>[^>&]+)>`; | ||
const capturingStartPattern = String.raw`\((?:(?!\?)|\?<(?![=!])(?<captureName>[^>]+)>)`; | ||
const token = new RegExp(String.raw` | ||
${subroutinePattern} | ||
| (?<capturingStart>${capturingStartPattern}) | ||
| (?<capturingStart>${capturingDelim}) | ||
| \\(?<backrefNum>[1-9]\d*) | ||
@@ -17,22 +28,30 @@ | \\k<(?<backrefName>[^>]+)> | ||
/** | ||
@param {string} pattern | ||
@typedef { | ||
Map<string, { | ||
isUnique: boolean; | ||
contents?: string; | ||
}>} NamedCapturingGroupsMap | ||
*/ | ||
/** | ||
Transform `\g<name>` | ||
@param {string} expression | ||
@param {NamedCapturingGroupsMap} namedGroups | ||
@returns {string} | ||
*/ | ||
export function subroutinesPostprocessor(pattern) { | ||
if (!hasUnescaped(pattern, '\\\\g<', Context.DEFAULT)) { | ||
return pattern; | ||
function processSubroutines(expression, namedGroups) { | ||
if (!hasUnescaped(expression, '\\\\g<', Context.DEFAULT)) { | ||
return expression; | ||
} | ||
const capturingGroups = getNamedCapturingGroups(pattern); | ||
const backrefIncrements = [0]; | ||
const numCapturesBeforeFirstReferencedBySubroutine = countCapturesBeforeFirstReferencedBySubroutine(pattern); | ||
const openSubroutinesMap = new Map(); | ||
const openSubroutinesStack = []; | ||
let numCapturesPassedOutsideSubroutines = 0; | ||
let numCapturesPassedInsideSubroutines = 0; | ||
let openSubroutinesMap = new Map(); | ||
let openSubroutinesStack = []; | ||
let numCharClassesOpen = 0; | ||
let result = pattern; | ||
let result = expression; | ||
let match; | ||
token.lastIndex = 0; | ||
while (match = token.exec(result)) { | ||
const {0: m, index: pos, groups: {subroutineName, capturingStart, backrefNum, backrefName}} = match; | ||
const {0: m, index, groups: {subroutineName, capturingStart, backrefNum, backrefName}} = match; | ||
if (m === '[') { | ||
@@ -44,3 +63,3 @@ numCharClassesOpen++; | ||
if (subroutineName) { | ||
if (!capturingGroups.has(subroutineName)) { | ||
if (!namedGroups.has(subroutineName)) { | ||
throw new Error(`Invalid named capture referenced by subroutine ${m}`); | ||
@@ -51,3 +70,3 @@ } | ||
} | ||
const contents = capturingGroups.get(subroutineName); | ||
const contents = namedGroups.get(subroutineName).contents; | ||
const numCaptures = countCaptures(contents) + 1; // Plus '(' wrapper | ||
@@ -67,3 +86,3 @@ numCapturesPassedInsideSubroutines += numCaptures; | ||
// Expand the subroutine's contents into the pattern we're looping over | ||
result = spliceStr(result, pos, m, subroutineValue); | ||
result = spliceStr(result, index, m, subroutineValue); | ||
token.lastIndex -= m.length; | ||
@@ -79,3 +98,3 @@ } else if (capturingStart) { | ||
// backrefs and `groups`, this essentially accomplishes not creating a capture | ||
result = spliceStr(result, pos, m, '('); | ||
result = spliceStr(result, index, m, '('); | ||
token.lastIndex -= m.length; | ||
@@ -91,10 +110,11 @@ } | ||
} else if (backrefNum) { | ||
// Beware: backref renumbering with subroutines is complicated | ||
const num = +backrefNum; | ||
let increment; | ||
let increment = 0; | ||
if (openSubroutinesMap.size) { | ||
if (num > numCapturesBeforeFirstReferencedBySubroutine) { | ||
increment = numCapturesPassedOutsideSubroutines + | ||
const numCapturesBeforeReferencedGroup = countCapturesBeforeGroupName(expression, openSubroutinesStack[0]); | ||
if (num > numCapturesBeforeReferencedGroup) { | ||
increment = | ||
numCapturesPassedOutsideSubroutines + | ||
numCapturesPassedInsideSubroutines - | ||
numCapturesBeforeFirstReferencedBySubroutine - | ||
numCapturesBeforeReferencedGroup - | ||
subroutine.numCaptures; | ||
@@ -107,3 +127,3 @@ } | ||
const adjusted = `\\${num + increment}`; | ||
result = spliceStr(result, pos, m, adjusted); | ||
result = spliceStr(result, index, m, adjusted); | ||
token.lastIndex += adjusted.length - m.length; | ||
@@ -113,18 +133,22 @@ } | ||
if (openSubroutinesMap.size) { | ||
// Search for the corresponding group in the contents of the subroutine stack | ||
let found = false; | ||
for (const s of openSubroutinesStack) { | ||
if (hasUnescaped( | ||
openSubroutinesMap.get(s).contents, | ||
String.raw`\(\?<${backrefName}>`, | ||
Context.DEFAULT | ||
)) { | ||
found = true; | ||
break; | ||
let isGroupFromThisSubroutine = false; | ||
if (backrefName === openSubroutinesStack[0]) { | ||
isGroupFromThisSubroutine = true; | ||
} else { | ||
// Search for the group in the contents of the subroutine stack | ||
for (const s of openSubroutinesStack) { | ||
if (hasUnescaped( | ||
openSubroutinesMap.get(s).contents, | ||
String.raw`\(\?<${backrefName}>`, | ||
Context.DEFAULT | ||
)) { | ||
isGroupFromThisSubroutine = true; | ||
break; | ||
} | ||
} | ||
} | ||
if (found) { | ||
if (isGroupFromThisSubroutine) { | ||
// Point to the group, then let normal renumbering work in the next loop iteration | ||
const adjusted = `\\${getCaptureNum(pattern, backrefName)}`; | ||
result = spliceStr(result, pos, m, adjusted); | ||
const adjusted = `\\${getCaptureNum(expression, backrefName)}`; | ||
result = spliceStr(result, index, m, adjusted); | ||
token.lastIndex -= m.length; | ||
@@ -151,16 +175,82 @@ } | ||
/** | ||
@param {string} pattern | ||
Strip `(?(DEFINE)…)` | ||
@param {string} expression | ||
@param {NamedCapturingGroupsMap} namedGroups | ||
@returns {string} | ||
*/ | ||
function processDefinitionGroup(expression, namedGroups) {
  // Validates the `(?(DEFINE)…)` group, if present, and returns `expression` with it removed.
  // Returns `expression` unchanged when there is no DEFINE group. Throws an `Error` when the group
  // isn't the last thing in the expression, is unclosed, contains a group name that `namedGroups`
  // marks as non-unique, or has anything at its top level other than named groups and `(?:)`
  // separators
  const defineStart = execUnescaped(expression, String.raw`\(\?\(DEFINE\)`, 0, Context.DEFAULT);
  if (!defineStart) {
    return expression;
  }
  const defineGroup = getGroup(expression, defineStart);
  if (defineGroup.afterPos < expression.length) {
    // Supporting DEFINE at positions other than the end would significantly complicate edge-case
    // backref handling. Note: Flag x's preprocessing permits trailing whitespace and comments
    throw new Error('DEFINE group allowed only at the end of a regex');
  }
  if (defineGroup.afterPos > expression.length) {
    throw new Error('DEFINE group is unclosed');
  }
  // `(?:)` separators can be added by the flag x preprocessor
  const contentsToken = new RegExp(String.raw`${namedCapturingDelim}|\(\?:\)|(?<unsupported>\\?.)`, 'gsu');
  let match;
  while ((match = contentsToken.exec(defineGroup.contents)) !== null) {
    const {captureName, unsupported} = match.groups;
    if (captureName) {
      const group = getGroup(defineGroup.contents, match);
      // Check the group's own name first, then the names of any groups nested within it
      const duplicateName = namedGroups.get(captureName).isUnique ?
        [...getNamedCapturingGroups(group.contents).keys()].find(name => !namedGroups.get(name).isUnique) :
        captureName;
      if (duplicateName) {
        throw new Error(`Duplicate group name "${duplicateName}" within DEFINE`);
      }
      // Skip past the whole group so that only top-level tokens are inspected by this loop
      contentsToken.lastIndex = group.afterPos;
      continue;
    }
    if (unsupported) {
      // Since a DEFINE group is stripped from its expression, we can't easily check if
      // unreferenced top-level syntax within it is valid. Such syntax serves no purpose, so it's
      // easiest to not allow it
      throw new Error('DEFINE group includes unsupported syntax at top level');
    }
  }
  return expression.slice(0, defineStart.index);
}
/**
Given the match for a group's opening delimiter, returns the group's contents along with the
position in `expression` that follows the group.
@param {string} expression
@param {RegExpExecArray} delimMatch
@returns {{contents: string; afterPos: number}}
*/
function getGroup(expression, delimMatch) {
  const {0: delim, index: delimPos} = delimMatch;
  const start = delimPos + delim.length;
  const inner = getGroupContents(expression, start);
  return {
    contents: inner,
    // The extra 1 presumably accounts for the group's closing `)`; for an unclosed group this lands
    // past the end of `expression`, which callers can use to detect that case
    afterPos: start + inner.length + 1,
  };
}
/** | ||
@param {string} expression | ||
@param {string} groupName | ||
@returns {number} | ||
*/ | ||
function countCapturesBeforeFirstReferencedBySubroutine(pattern) { | ||
const subroutines = new Set(); | ||
forEachUnescaped(pattern, subroutinePattern, ({groups: {subroutineName}}) => { | ||
subroutines.add(subroutineName); | ||
}, Context.DEFAULT); | ||
function countCapturesBeforeGroupName(expression, groupName) { | ||
let num = 0; | ||
let pos = 0; | ||
let match; | ||
while (match = execUnescaped(pattern, capturingStartPattern, pos, Context.DEFAULT)) { | ||
while (match = execUnescaped(expression, capturingDelim, pos, Context.DEFAULT)) { | ||
const {0: m, index, groups: {captureName}} = match; | ||
if (subroutines.has(captureName)) { | ||
if (captureName === groupName) { | ||
break; | ||
@@ -175,11 +265,11 @@ } | ||
/** | ||
@param {string} pattern | ||
@param {string} expression | ||
@param {string} groupName | ||
@returns {number} | ||
*/ | ||
function getCaptureNum(pattern, groupName) { | ||
function getCaptureNum(expression, groupName) { | ||
let num = 0; | ||
let pos = 0; | ||
let match; | ||
while (match = execUnescaped(pattern, capturingStartPattern, pos, Context.DEFAULT)) { | ||
while (match = execUnescaped(expression, capturingDelim, pos, Context.DEFAULT)) { | ||
const {0: m, index, groups: {captureName}} = match; | ||
@@ -207,24 +297,39 @@ num++; | ||
/** | ||
@param {string} pattern | ||
@returns {Map<string, Array<{contents: string, endPos: number}>>} | ||
@param {string} expression | ||
@param {boolean} [includeContents] Leave off if unneeded, for perf | ||
@returns {NamedCapturingGroupsMap} | ||
*/ | ||
function getNamedCapturingGroups(pattern) { | ||
const capturingGroups = new Map(); | ||
forEachUnescaped(pattern, String.raw`\(\?<(?<captureName>[^>]+)>`, ({0: m, index, groups: {captureName}}) => { | ||
// If there are duplicate capture names, subroutines refer to the first instance of the given | ||
// group (matching the behavior of PCRE and Perl) | ||
if (!capturingGroups.has(captureName)) { | ||
capturingGroups.set(captureName, getGroupContents(pattern, index + m.length)); | ||
} | ||
}, Context.DEFAULT); | ||
return capturingGroups; | ||
function getNamedCapturingGroups(expression, includeContents) {
  // Maps each capture name to `{isUnique, contents?}`; `contents` is only computed when
  // `includeContents` is truthy
  const found = new Map();
  const record = ({0: delim, index, groups: {captureName}}) => {
    const existing = found.get(captureName);
    if (existing) {
      // If there are duplicate capture names, subroutines refer to the first instance of the given
      // group (matching the behavior of PCRE and Perl), so keep the first entry and just flag it
      existing.isUnique = false;
      return;
    }
    const entry = {isUnique: true};
    if (includeContents) {
      entry.contents = getGroupContents(expression, index + delim.length);
    }
    found.set(captureName, entry);
  };
  forEachUnescaped(expression, namedCapturingDelim, record, Context.DEFAULT);
  return found;
}
/** | ||
@param {string} pattern | ||
@param {string} expression | ||
@returns {number} | ||
*/ | ||
function countSubgroups(pattern) { | ||
function countSubgroups(expression) { | ||
let num = 0; | ||
forEachUnescaped(pattern, String.raw`\(`, () => num++, Context.DEFAULT); | ||
forEachUnescaped(expression, String.raw`\(`, () => num++, Context.DEFAULT); | ||
return num; | ||
@@ -231,0 +336,0 @@ } |
import {Context, forEachUnescaped, replaceUnescaped} from 'regex-utilities'; | ||
import {PartialPattern, partial} from './partial.js'; | ||
import {Pattern, pattern} from './pattern.js'; | ||
@@ -41,3 +41,5 @@ export const RegexContext = { | ||
export const noncapturingStart = String.raw`\(\?(?:[:=!>A-Za-z\-]|<[=!])`; | ||
export const namedCapturingDelim = String.raw`\(\?<(?![=!])(?<captureName>[^>]+)>`; | ||
export const capturingDelim = String.raw`\((?!\?)|${namedCapturingDelim}`; | ||
export const noncapturingDelim = String.raw`\(\?(?:[:=!>A-Za-z\-]|<[=!]|\(DEFINE\))`; | ||
@@ -87,5 +89,5 @@ /** | ||
export function sandboxUnsafeNulls(str, context) { | ||
// regex`[\0${0}]` and regex`[${partial`\0`}0]` can't be guarded against via nested `[…]` | ||
// regex`[\0${0}]` and regex`[${pattern`\0`}0]` can't be guarded against via nested `[…]` | ||
// sandboxing in character classes if the interpolated value doesn't contain union (since it | ||
// might be placed on a range boundary). So escape \0 in character classes as \u{0} | ||
// might be placed on a range boundary). So escape `\0` in character classes as `\u{0}` | ||
return replaceUnescaped(str, String.raw`\\0(?!\d)`, '\\u{0}', context); | ||
@@ -95,5 +97,5 @@ } | ||
// No special handling for escaped versions of the characters | ||
function getUnbalancedChar(pattern, leftChar, rightChar) { | ||
function getUnbalancedChar(expression, leftChar, rightChar) { | ||
let numOpen = 0; | ||
for (const [m] of pattern.matchAll(new RegExp(`[${escapeV(leftChar + rightChar, Context.CHAR_CLASS)}]`, 'g'))) { | ||
for (const [m] of expression.matchAll(new RegExp(`[${escapeV(leftChar + rightChar, Context.CHAR_CLASS)}]`, 'g'))) { | ||
numOpen += m === leftChar ? 1 : -1; | ||
@@ -111,4 +113,4 @@ if (numOpen < 0) { | ||
// Look for characters that would change the meaning of subsequent tokens outside an interpolated value | ||
export function getBreakoutChar(pattern, regexContext, charClassContext) { | ||
const escapesRemoved = pattern.replace(/\\./gsu, ''); | ||
export function getBreakoutChar(expression, regexContext, charClassContext) { | ||
const escapesRemoved = expression.replace(/\\./gsu, ''); | ||
// Trailing unescaped `\`; checking `.includes('\\')` would also work | ||
@@ -158,5 +160,5 @@ if (escapesRemoved.endsWith('\\')) { | ||
// Accepts and returns its full state so it doesn't have to reprocess pattern parts that it's | ||
// already seen. Assumes flag v and doesn't worry about syntax errors that are caught by it | ||
export function getEndContextForIncompletePattern(partialPattern, { | ||
// Accepts and returns its full state so it doesn't have to reprocess parts that have already been | ||
// seen. Assumes flag v and doesn't worry about syntax errors that are caught by it | ||
export function getEndContextForIncompleteExpression(incompleteExpression, { | ||
regexContext = RegexContext.DEFAULT, | ||
@@ -169,3 +171,3 @@ charClassContext = CharClassContext.DEFAULT, | ||
let match; | ||
while (match = contextToken.exec(partialPattern)) { | ||
while (match = contextToken.exec(incompleteExpression)) { | ||
const {0: m, groups: {groupN, enclosedT, qT, intervalQ, incompleteT}} = match; | ||
@@ -224,15 +226,24 @@ if (m === '[') { | ||
charClassDepth, | ||
lastPos: partialPattern.length, | ||
lastPos: incompleteExpression.length, | ||
}; | ||
} | ||
export function countCaptures(pattern) { | ||
/** | ||
@param {string} expression | ||
@returns {number} | ||
*/ | ||
export function countCaptures(expression) { | ||
let num = 0; | ||
forEachUnescaped(pattern, String.raw`\((?:(?!\?)|\?<[^>]+>)`, () => num++, Context.DEFAULT); | ||
forEachUnescaped(expression, capturingDelim, () => num++, Context.DEFAULT); | ||
return num; | ||
} | ||
export function adjustNumberedBackrefs(pattern, precedingCaptures) { | ||
/** | ||
@param {string} expression | ||
@param {number} precedingCaptures | ||
@returns {string} | ||
*/ | ||
export function adjustNumberedBackrefs(expression, precedingCaptures) { | ||
return replaceUnescaped( | ||
pattern, | ||
expression, | ||
String.raw`\\(?<num>[1-9]\d*)`, | ||
@@ -271,4 +282,4 @@ ({groups: {num}}) => `\\${+num + precedingCaptures}`, | ||
export function containsCharClassUnion(charClassPattern) { | ||
// Return `true` if contains: | ||
// - Lowercase `\p` and name is a property of strings (case sensitive). | ||
// Return `true` if it contains: | ||
// - `\p` (lowercase only) and the name is a property of strings (case sensitive). | ||
// - `\q`. | ||
@@ -305,12 +316,12 @@ // - Two single-char-matching tokens in sequence. | ||
/** | ||
Returns transformed versions of a template and values, using the given preprocessor. Expects the | ||
template to contain a `raw` array, and only processes values that are instanceof `PartialPattern`. | ||
Returns transformed versions of a template and substitutions, using the given preprocessor. Only | ||
processes substitutions that are instanceof `Pattern`. | ||
@param {TemplateStringsArray} template | ||
@param {any[]} values | ||
@param {any[]} substitutions | ||
@param {(value, runningContext) => {transformed: string; runningContext: Object}} preprocessor | ||
@returns {{template: TemplateStringsArray; values: any[]}} | ||
@returns {{template: TemplateStringsArray; substitutions: any[]}} | ||
*/ | ||
export function preprocess(template, values, preprocessor) { | ||
export function preprocess(template, substitutions, preprocessor) { | ||
let newTemplate = {raw: []}; | ||
let newValues = []; | ||
let newSubstitutions = []; | ||
let runningContext = {}; | ||
@@ -322,9 +333,9 @@ template.raw.forEach((raw, i) => { | ||
if (i < template.raw.length - 1) { | ||
const value = values[i]; | ||
if (value instanceof PartialPattern) { | ||
const result = preprocessor(value, {...runningContext, lastPos: 0}); | ||
newValues.push(partial(result.transformed)); | ||
const substitution = substitutions[i]; | ||
if (substitution instanceof Pattern) { | ||
const result = preprocessor(substitution, {...runningContext, lastPos: 0}); | ||
newSubstitutions.push(pattern(result.transformed)); | ||
runningContext = result.runningContext; | ||
} else { | ||
newValues.push(value); | ||
newSubstitutions.push(substitution); | ||
} | ||
@@ -335,4 +346,4 @@ } | ||
template: newTemplate, | ||
values: newValues, | ||
substitutions: newSubstitutions, | ||
}; | ||
} |
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
109079
20
1558
653
4