fuzzaldrin-plus
Advanced tools
Comparing version 0.1.0 to 0.2.0
@@ -25,3 +25,5 @@ (function() { | ||
} | ||
return exports.match(subject.slice(basePos + 1, end + 1), subject_lw.slice(basePos + 1, end + 1), prepQuery, basePos + 1); | ||
basePos++; | ||
end++; | ||
return exports.match(subject.slice(basePos, end), subject_lw.slice(basePos, end), prepQuery, basePos); | ||
}; | ||
@@ -31,3 +33,2 @@ | ||
var ai, bj, i, j, m, n, out; | ||
out = []; | ||
m = a.length; | ||
@@ -43,3 +44,4 @@ n = b.length; | ||
j = 0; | ||
bj = b[0]; | ||
bj = b[j]; | ||
out = []; | ||
while (++i < m) { | ||
@@ -62,3 +64,3 @@ ai = a[i]; | ||
exports.match = function(subject, subject_lw, prepQuery, offset) { | ||
var DIAGONAL, LEFT, STOP, UP, acro_score, align, backtrack, csc_diag, csc_row, csc_score, i, imax, j, jmax, m, matches, move, n, pos, query, query_lw, score, score_diag, score_row, score_up, si_lw, start, trace, vmax; | ||
var DIAGONAL, LEFT, STOP, UP, acro_score, align, backtrack, csc_diag, csc_row, csc_score, i, j, m, matches, move, n, pos, query, query_lw, score, score_diag, score_row, score_up, si_lw, start, trace; | ||
if (offset == null) { | ||
@@ -74,5 +76,2 @@ offset = 0; | ||
csc_row = new Array(n); | ||
vmax = 0; | ||
imax = -1; | ||
jmax = -1; | ||
STOP = 0; | ||
@@ -79,0 +78,0 @@ UP = 1; |
(function() { | ||
var AcronymResult, PathSeparator, Query, basenameScore, coreChars, countDir, doScore, emptyAcronymResult, isMatch, isSeparator, isWordEnd, isWordStart, miss_coeff, opt_char_re, pos_bonus, scoreAcronyms, scoreCharacter, scoreConsecutives, scoreExact, scoreExactMatch, scorePattern, scorePosition, scoreSize, tau_depth, tau_size, wm; | ||
var AcronymResult, PathSeparator, Query, basenameScore, coreChars, countDir, doScore, emptyAcronymResult, file_coeff, isMatch, isSeparator, isWordEnd, isWordStart, miss_coeff, opt_char_re, pos_bonus, scoreAcronyms, scoreCharacter, scoreConsecutives, scoreExact, scoreExactMatch, scorePattern, scorePosition, scoreSize, tau_depth, tau_size, wm; | ||
@@ -10,6 +10,8 @@ PathSeparator = require('path').sep; | ||
tau_size = 50; | ||
tau_depth = 13; | ||
tau_size = 85; | ||
file_coeff = 1.2; | ||
miss_coeff = 0.75; | ||
@@ -156,5 +158,2 @@ | ||
var curr_s, prev_s; | ||
if (pos < 0) { | ||
return false; | ||
} | ||
if (pos === 0) { | ||
@@ -169,11 +168,9 @@ return true; | ||
exports.isWordEnd = isWordEnd = function(pos, subject, subject_lw, len) { | ||
var next_s; | ||
if (pos > len - 1) { | ||
return false; | ||
} | ||
var curr_s, next_s; | ||
if (pos === len - 1) { | ||
return true; | ||
} | ||
curr_s = subject[pos]; | ||
next_s = subject[pos + 1]; | ||
return isSeparator(next_s) || (subject[pos] === subject_lw[pos] && next_s !== subject_lw[pos + 1]); | ||
return isSeparator(curr_s) || isSeparator(next_s) || (curr_s === subject_lw[pos] && next_s !== subject_lw[pos + 1]); | ||
}; | ||
@@ -191,3 +188,3 @@ | ||
} else { | ||
return 100 + pos_bonus - pos; | ||
return Math.max(100 + pos_bonus - pos, 0); | ||
} | ||
@@ -360,3 +357,3 @@ }; | ||
alpha = 0.5 * tau_depth / (tau_depth + countDir(subject, end + 1)); | ||
return alpha * basePathScore + (1 - alpha) * fullPathScore * scoreSize(0, 0.5 * (end - basePos)); | ||
return alpha * basePathScore + (1 - alpha) * fullPathScore * scoreSize(0, file_coeff * (end - basePos)); | ||
}; | ||
@@ -371,5 +368,8 @@ | ||
i = -1; | ||
while (++i < end && path[i] === PathSeparator) { | ||
continue; | ||
} | ||
while (++i < end) { | ||
if (path[i] === PathSeparator) { | ||
++count; | ||
count++; | ||
while (++i < end && path[i] === PathSeparator) { | ||
@@ -376,0 +376,0 @@ continue; |
{ | ||
"name": "fuzzaldrin-plus", | ||
"version": "0.1.0", | ||
"version": "0.2.0", | ||
"description": "Fuzzy filtering and string scoring - compatible with fuzzaldrin", | ||
@@ -5,0 +5,0 @@ "licenses": [ |
471
README.md
@@ -1,13 +0,21 @@ | ||
## What Problem are we trying to solve ? | ||
## What is fuzzaldrin-plus? | ||
### Score how matched characters relates to one another. | ||
- A fuzzy search / highlight that specialize for programmer text editor. It tries to provide intuitive result by recognizing patterns that people use while searching. | ||
- One of the most complained thing is not being able to find exact match. | ||
- A great source of questionable results come from scattered character, spread a bit randomly in the string. | ||
- A rewrite of the fuzzaldrin library. API is backward compatible with some extra options. Tuning has been done from report usage of the Atom text editor. | ||
And we plan to address those issues by scoring run of consecutive characters. | ||
Exact match will be a special case where the run is 100% of the query length. | ||
- At this point in time, it may either be merged back into fuzzaldrin or lives as a forked library, we'll see. | ||
Up to now match length was used as a proxy for quality. This work reasonably well when subject is a single word, but break when subject contain multiple words, for example see: | ||
## What Problem are we trying to solve? | ||
### Score how matched characters relate to one another. | ||
- One of the most often reported issues is not being able to find an exact match. | ||
- A great source of questionable results come from scattered character, spread seemingly randomly in the string. | ||
We plan to address those issues by scoring runs of consecutive characters. In that scheme, an exact match will be a special case where the run is 100% of the query length. | ||
In original fuzzaldrin, candidate length was used as a proxy for match quality. This work reasonably well when subject is a single word, but break when subject contain multiple words, for example, see: | ||
- **Core** | ||
@@ -17,3 +25,3 @@ - **Co**nt**r**oll**e**r | ||
In `Core` vs `Controller` size is a good indicator of quality, but not so much in `Controller` vs `ExtentionCore`. This is because match compactness mater more than haystack size. | ||
In `Core` vs. `Controller` size is a good indicator of quality, but not so much in `Controller` vs. `ExtentionCore`. This situation happens because match compactness matters more than haystack size. Match compactness is the principle behind the scoring of the *Selecta* project. | ||
@@ -23,9 +31,9 @@ | ||
So far the examples can be handled by an `indexOf` call. However there are times where a single query can target multiple part of an candidate. | ||
So far the examples can be handled by an `indexOf` call. However, there are times where a single query can target multiple parts of a candidate. | ||
For example when candidate contain multiple words | ||
- `git push` vs `Git Plus: Push` | ||
- `email handler` vs `email/handler.py` | ||
For example when candidate contains multiple words | ||
- `git push` vs. `Git Plus: Push` | ||
- `email handler` vs. `email/handler.py` | ||
Another example is to jump above common strings. | ||
Another example is to jump over common strings. | ||
- `toLowerCase` | ||
@@ -40,3 +48,3 @@ - `toLocaleString` | ||
The current algorithm always select the first available matching character (leftmost alignment) then try to identify how it should be scored. The problem is that the most interesting instance of a character is not necessarily on the left. | ||
The previous algorithm always selects the first available matching character (leftmost alignment). Only after selection, it will try to identify how to score that character. The problem then is that the most interesting instance of a character is not necessarily on the left. | ||
@@ -46,41 +54,42 @@ For example on query `itc`, we should match | ||
Instead leftmost aligement miss the acronym pattern | ||
Instead leftmost aligement miss the acronym pattern: | ||
- **I**mpor**t**an**c**eTableCtrl. | ||
For query `core` against `controller_core` leftmost alignment miss the consecutive run | ||
- **co**nt**r**oll**e**r_core | ||
For query `core` against `controller_core` leftmost alignment miss the consecutive run: | ||
- **co**nt**r**oll**e**r_core | ||
To handle this we propose to setup the max run-length scoring inside an optimal alignment scheme. (Implemented for example using dynamic programming) | ||
To handle this, we propose to embed the pattern detection (consecutive and more) inside an optimal alignment scheme. Imagine you have an algorithm that allows you to recognize objects in images; it would make little sense to run it exclusively on the top left corner. | ||
### Accidental acronym bonus. | ||
Fuzzladrin handle acronym by giving a large per character bonus. Currently a start of word bonus match almost as much as 3 proper case character (or 7 wrong case ones!) | ||
### Prevent Accidental acronym. | ||
For query install "install" should result be in this order ? | ||
Fuzzladrin handles acronym by giving a large bonus on character matches that start words. Currently, a start-of-word bonus matches almost as much as three proper-case character. | ||
For query `install` should result be in this order ? | ||
- F**in**d & Replace **S**elec**t** **All** | ||
- Application: **Install** | ||
Here we have '**S**elect **A**ll' boost the score of the first candidate because we match two word-start VS only one for 'install'. | ||
In that example, we have '**S**elect **A**ll' boost the score of the first candidate because we score two word-starts while we only score one for 'install'. | ||
For query "git push", should we order result in that order ? | ||
- "**Git** **P**l**u**s: **S**tage **H**unk" | ||
- "**Git** Plus: **Push**" | ||
For query `git push`, should we order result in that order ? | ||
- "**Git** **P**l**u**s: **S**tage **H**unk" | ||
- "**Git** Plus: **Push**" | ||
What about the fact we match 3 start-of-word in `Plus Stage Hunk` ? PSH is very close to '**p**u**sh**'. | ||
What about the fact we match three start-of-words in `Plus Stage Hunk`? PSH is very close to '**p**u**sh**' (And `Plus` contains `u`). | ||
That kind of question arise even more often with optimal selection because the algorithm will lock on those extra acronym points. | ||
That kind of question arises even more often when we use optimal selection because the algorithm will lock on those extra acronym points. | ||
What we propose in this PR is that word start-of-word character only have a moderate advantage by themselves. Instead they form strong score by making an acronym pattern with other start-of-word character. | ||
What we propose in this project is that start-of-words characters should only have a moderate advantage by themselves. Instead, they form strong score by making an acronym pattern with other start-of-words characters. | ||
For example with query `push`: | ||
- match against `Plus: Stage Hunk`, we have P + u + SH so group of 1+1+2 | ||
- match against `push` single group of 4. | ||
- The substring win for having the largest group | ||
- against `Plus: Stage Hunk`: we have `P + u + SH` grouped as 1, 1, 2 | ||
- against `push`: we have a single group of 4. | ||
- The substring wins for having the largest group | ||
For example with query `psh`: | ||
- match against `Plus: Stage Hunk`, we have PSH so group of 3 | ||
- match against `push`, we have p+sh so group of 1+2 | ||
- The acronym win for having the largest group. | ||
- against `Plus: Stage Hunk`: we have `PSH` so a single group of 3 | ||
- against `push`: we have `p + sh` so grouped as 1, 2 | ||
- The acronym wins for having the largest group. | ||
This way we can score both substring and acronym match using the structure of the match. We'll refine the definition of consecutive acronym later. | ||
@@ -91,77 +100,104 @@ | ||
Some people proposed to give perfect score to exact case sensitive match. This can be understood because exact match and consecutive are two area where fuzzaldrin is not great. | ||
Some people proposed to give a perfect score to exact case-sensitive matches. This proposition can be understood because exact matches and case-sensitivity are two area where fuzzaldrin is not great. | ||
However should 'sw**itc**h.css' be an exact match for `itc` ? | ||
Even when we have **I**mportance**T**able**C**trl available ? | ||
However should 'sw**itc**h.css' be an exact match for `itc`? | ||
Even when we have **I**mportance**T**able**C**trl available? | ||
Few people will argue against "diag" preferring "diagnostic" to "Diagnostics". | ||
Few people will argue against `diag` preferring `diagnostic` to `Diagnostics`. | ||
However should `install` prefer "Un**install**" over "**Install**" ? Or should it be the other way around ? This is a case of case-sensitive match vs start-of-word ... | ||
However should `install` prefer "Un**install**" over "**Install**" ? Or should it be the other way around? In this case, we have to consider the relative priority of case-sensitivity and start-of-word. | ||
Exact matches are used with enougth frequency that we should not only ensure they win against approximate matches but also ensure to rank quality properly amongst them. | ||
## Proposed scoring | ||
### Manage the multiples balances of path scoring. | ||
We want to prefer match toward the start of the string. | ||
Except we also want to prefer match in the filename (which happens near the end of the string) | ||
### 1. Characters are chosen by their ability to form pattern with others. | ||
We want to prefer shorter and shallower path. | ||
Except we also want to retrieve some deeper files when filename is clearly better. | ||
Pattern can be made of | ||
- consecutive letter of the subject, | ||
- consecutive letter of the Acronym of subject. | ||
We want to prefer matches in the filename | ||
Except when the query describes a full path much better than an approximate file name. (Let's consider query `model user` vs `models/user.rb` or `moderator_column_users.rb`) | ||
Start-of-word (acronym) character are special in that they can either make pattern with the rest of the word or other acronym character. | ||
- This replace a situation where acronym character had unconditional large bonus. Now the bonus is still large but conditional to being part of some pattern. | ||
## Proposed Scoring Rules | ||
- CamelCase and snake_case acronym are treated exactly the same. They will however get different point for matching uppercase/lowercase query | ||
### 1. Characters are chosen by their ability to form a pattern with others. | ||
Patterns can be composed of | ||
- consecutive letters of the subject, | ||
- sequential letters in the Acronym of the subject. | ||
Start-of-words (acronym) characters are special in that they can either forms pattern with the rest of the word or with other acronym characters. | ||
- Pattern based scoring replaces a situation where acronyms characters have a large bonus by themselves. Now the bonus is still large but conditional to being part of some pattern. | ||
- CamelCase and snake_case acronym are treated exactly the same. They will, however, get different score for matching uppercase/lowercase query | ||
### 2. Primary score attribute is pattern length. | ||
- A pattern that span 100% of the query length is called an exact match. | ||
- There is such a thing as an acronym exact match. | ||
- There is such a thing as an acronym exact match. | ||
- Because every candidate is expected to match all of query larger group should win against multiple smaller one. | ||
- Rank of a candidate is the length of it's largest pattern. | ||
- When every pattern in a candidate are larger than pattern in a second one, highest rank candidate is said to be dominant. | ||
- The rank of a candidate is the length of it's largest pattern. | ||
- When all patterns of a first candidate are larger than patterns in a second one, the candidate with the highest rank is said to be dominant. | ||
- 1 group of 6 > 2 group of 3 > 3 group of 2. | ||
- When some group are larger and some are smaller, highest rank match is said to be semi-dominant. | ||
- group of 4+2 vs 2 groups of 3. Group of 4 wins agains the first group of 3, group of 2 loose against second group of 3. | ||
### 3. Secondary score attribute is match quality. | ||
- Match quality is made of proper-casing and context score | ||
- Main role of match quality is to order candidate of the same rank. | ||
- When match is semi-dominant match quality can overcome a rank-1 difference. | ||
- When some groups are larger, and some are smaller, the highest rank match is said to be semi-dominant. | ||
- Let's consider a first candidate grouped as 4+2 vs. a second candidate grouped as 3+3. | ||
- The first group of 4 wins against the first group of 3. | ||
- However, the group of 2 loose against the second group of 3. | ||
- In this case, we'll consider some extra information. | ||
- Context score consider where does the match occurs in the subject. | ||
- Full-word > Start-of-word > End-of-word > Middle-of-word | ||
- On that list, Acronym pattern score at Start-of-word level. (That is just bellow full-word) | ||
### 3. Secondary score attribute is the quality of matches. | ||
- Proper case is both gradual and absolute. | ||
- Match quality is made of proper casing and context score | ||
- The main role of match quality is to order candidate of the same rank. | ||
- When match is semi-dominant match quality can overcome a small rank difference. | ||
- Context score considers where does the match occurs in the subject. | ||
- Full-word > Start-of-word > End-of-word > Middle-of-word | ||
- On that list, Acronym pattern score at Start-of-word level. (That is just bellow full-word) | ||
- Score for a proper case query has both gradual and absolute components. | ||
- The less error, the better | ||
- 100% Case Error will be called wrong-case, for example matching a CamelCase acronym using lowercase query. | ||
- Exactly 0 error is called CaseSentitive or ExactCase match. | ||
They have a special bonus smaller than start-of-word bonus but greater than end-of-word bonus. | ||
- This allow a start-of-word case sentitive match to overcome a full-word wrong-case match. | ||
- Also allow to select between a lowercase consecutive and CamelCase acronym using case of query. | ||
- Make "Installed" win over "Uninstall" because start-of-word > Exact Case. | ||
- **Q:** Why can't you simply add extra length. For example add an extra virtual character for a start-of-word match. | ||
- **A:** We cannot do that on partial match because then the optimal alignment algorithm will be happy to split word and collect start-of-word bonus like stamp. (See accidental acronym) | ||
- 100% Case Error will be called wrong-case, for example matching a `CamelCase` acronym using lowercase query `cc`. | ||
- Exactly 0 error is called CaseSentitive or ExactCase match. | ||
- CaseSentitive matches have a special bonus that is smaller than start-of-word bonus but greater than the end-of-word bonus. | ||
- This scoring scheme allows a start-of-word case-sensitive match to overcome a full-word wrong-case match. | ||
- It also allows to select between a lowercase consecutive and CamelCase acronym using case of query. | ||
- To answer the question asked in the introduction, "Installed" win over "Uninstall" because start-of-word > Exact Case. | ||
- **Q:** Why do you still do it on exact match ? | ||
- **A:** First once you have matched everything there's no danger of splitting the query, then it's there for exact match to bubble up, despite longer/deeper path. If after more test/tuning we realize it's not needed, we'll be happy to remove it, the less corner case the merrier. | ||
- **Q:** Why can't you simply add extra length for some bonus. For example, score a start-of-word match as if it had an extra character. | ||
- **A:** We cannot do that on partial matches because then the optimal alignment algorithm will be happy to split word and collect start-of-word bonus like stamps. (See accidental acronym) | ||
- **Q:** Why are you using lowecase to detect CamelCase ? | ||
- **A** CamelCase are detected as a switch from lowercase to UPPERCASE. Defining UPPERCASE as not-lowercase, allow case-invariant character to count as lowercase. A lot of case invariant character are also separator, some are not such as number, and symbol such as `:!=<>()` | ||
- **Q:** Why do you add extra length on exact matches? | ||
- **A:** First, once you have matched everything, there's no danger of splitting the query. Then, that bonus exists to ensure exact matches will bubble up in the firsts results, despite longer/deeper path. If, after more test and tuning, we realize it's not needed, we'll be happy to remove it, the fewer corner cases, the merrier. | ||
- **Q:** Why are you using lowercase to detect CamelCase? | ||
- **A** CamelCase are detected as a switch from lowercase to UPPERCASE. Defining UPPERCASE as not-lowercase, allow case-invariants characters to count as lowercase. For example `Git Push` the `P` of push will be recognised as CamelCase because we consider `<space>` as lowercase. | ||
### 4. Tertiary score attributes are subject size, match position and directory depth | ||
- Mostly there to help order match of the same rank and match quality, unless the difference in tertiary attribute is large. | ||
-(Proper definition of large is to be determined using real life example) | ||
- Mostly there to help order match of the same rank and match quality, unless the difference in tertiary attributes is large. | ||
-(Proper definition of large is to be determined using real life example) | ||
- In term of importance of effect it should rank start-of-string > string size > directory depth. | ||
- In term of the relative importance of effects it should rank start-of-string > string size > directory depth. | ||
## Score Range | ||
- **Score is 0 if and only if there is no match.** | ||
- Otherwise, the score is a strictly positive integer. | ||
- The maximum range is `score(query,query)` whatever that number is. A longer query will have a greater maximal score. | ||
- Score exist mainly to for relative order with other scores of the same query and to implements scoring rule described above. | ||
- Score have a high dynamic range and consider a lot of information. Equality is unlikely. For that reason, **multiplicative bonuses should be preferred over additive ones**. | ||
------------- | ||
@@ -173,5 +209,3 @@ | ||
The acronym prefix is a group of character that are consecutive in the query and sequential in the acronym of the subject. | ||
That group start at the first character of the query and end at the first error (character not in acronym). | ||
If there's no error we have an acronym exact match (100% of query is part of acronym) | ||
An acronym prefix is a group of characters that are consecutive in the query and sequential in the acronym of the subject. That group starts at the first character of the query and end at the first character of the query, not in the acronym. If there's no missed character, then we have an acronym exact match (100% of query is sequential in the acronym) | ||
@@ -187,6 +221,6 @@ | ||
- Acronym scored as start-of-word consecutive rank 3 + an isolated letter. | ||
- Acronym scored as three consecutive character at start-of-word + an isolated letter. | ||
- Here we have a wrong-case match. "SSRb" or "SSRB" would have case-sensitive points on the acronym pattern (case of isolated letter is not important) | ||
- Position of the equivalent consecutive match is the average position of acronym characters. | ||
- Size of the candidate does not chance. | ||
- For scoring, we use the size of the original candidate. | ||
@@ -213,27 +247,166 @@ | ||
Legacy fuzzaldrin had some support for optional character (Mostly space, see `SpaceRegEx`). Because the scoring do not support errors, the optional character where simply removed from the query. | ||
Legacy fuzzaldrin had some support for optional characters (Mostly space, see `SpaceRegEx`). Because the scoring does not support errors, the optional character was simply removed from the query. | ||
With this PR, optimal alignment algorithm support an unlimited number of errors. The strict matching requirement is handled by a separate method `isMatch`. The optional character implementation is done by building a subset of query containing only non-optional character (`coreQuery`) and passing that to `isMatch`. | ||
With this PR, optimal alignment algorithm supports an unlimited number of errors. The strict matching requirement is handled by a separate method `isMatch`. The optional character implementation is done by building a subset of the query containing only non-optional characters (`coreQuery`) and passing that to `isMatch`. | ||
This new way of doing thing means that while some characters are optional, candidate that match those characters will have better score. What this allow is to add characters to the optional list without compromising ranking. | ||
This new way of doing thing means that while some characters are optional, candidates that match those characters have a better score. What this allow is to add characters to the optional list without compromising ranking. | ||
Character that has been added include `-` and `_` because multiple specs require that we should threat them as space. Also `\` and `:` to support searching a file by PHP and Ruby name-space. Finally `/` to mirror `\` and support a better work flow in a multi-OS environment. | ||
Optional character contains space, but also `-` and `_` because multiple specs require that we should treat them as space. Also `\` and `:` are also optional to support searching a file using the PHP or Ruby name-space. Finally `/` is optional to mirror `\` and support a better workflow in a multi-OS environment. | ||
Finally option `allowErrors` would make any character optional. Expected effect of that would be spell-checker like, but slower match. | ||
Finally option `allowErrors` would make any character optional. Expected effect of that options would be some forgiveness on the spelling at the price of a slower match. | ||
### Path scoring | ||
### Path Scoring | ||
- Score for a path is made of about half score of full path and half score of basename. | ||
- Score for a given path is computed from the score of the fullpath and score of the filename. For low directory depth, the influence of both is about equal. But, for deeper directory, there is less retrieval effect (importance of basename) | ||
- Exact balance between those two set by the directory dept, there is less retrieval effect (importance of basename) for deeper directory | ||
- The full path is penalized twice for size. Once for its own size, then a second time for the size of the basename. Extra basename penalty is dampened a bit. | ||
- Full path is penalized twice for size. Once for it's own size, then a second time for size of the basename. This address various demand for shorter basename to win. Extra basename penalty is dampened a bit. | ||
- The basename is scored as if `allowErrors` was set to true. (Full-path must still pass `isMatch` test). This choice is made to support query such as `model user` against path `model/user`. Previously, the basename score would be 0 because it would not find `model` inside basename `user`. Variable `queryHasSlashes` partially addressed this issue, but was inconsistent with usage of `<space>` as folder separator | ||
- Basename is scored as if `allowErrors` was set to true. (Fullpath must still pass `isMatch` test) This allow to support query such as `model user` against path `model/user`. Previously, the basename score would be 0 because it would not find `model` inside basename `user`. Variable `queryHasSlashes` sort of addressed this issue, but was inconsistent with usage of `<space>` as folder separator | ||
- When query has slashes (`path.sep`) the last or last few folder from the path are promoted to the basename. (as many folder from the path as folder in the query) | ||
------------- | ||
## Algorithm (Optimal alignment) | ||
### LCS: Dynamic programming Table | ||
Let's compare A:`surgery` and B:`gsurvey`. | ||
To do so we can try to match every letter of A against every letter of B. | ||
This problem can be solved using a score matrix. | ||
- The match starts at [0,0] trying to compare the first letter of each. | ||
- The match end at [m,n] comparing the last letter of each. | ||
At each position [i,j] the best move can be one of the 3 options. | ||
- match `A[i]` with `B[j]` (move diagonal, add 1 to score) | ||
- skip `A[i]` (move left, copy score) | ||
- skip `B[j]` (move down, copy score) | ||
We do not know which one of these 3 is the best move until we reach the end, so we record the score of the best move so far. The last cell contains the score of the best alignment. If we want to output that alignment we need to rebuild it backward from the last cell. | ||
```` | ||
s u r g e r y | ||
g [0,0,0,1,1,1,1] : best move is to align `g` of *g*survey with `g` of sur*g*ery, score 1 | ||
s [1,1,1,1,1,1,1] : we can align `s`, but doing so invalidate `g`. Both score 1, we cannot decide | ||
u [1,2,2,2,2,2,2] : if we align s, we can also align u, we have a winner | ||
r [1,2,3,3,3,3,3] : we can align `r` | ||
v [1,2,3,3,3,3,3] : nothing we can do with that `v`, score stay the same | ||
e [1,2,3,3,4,4,4] : we can align `e` (we skipped `g` the same way we skipped `v`) | ||
y [1,2,3,3,4,4,5] : align y (we skipped `r` ) | ||
```` | ||
**Best alignment is** | ||
```` | ||
gsur-ve-y | ||
-|||--|-| | ||
-surg-ery | ||
```` | ||
For those familiar with code diff, this is essentially the same problem. Except, in this case, we the do the alignment of characters in a word and a diff performs alignment of lines in a file. Characters present in the second word but not in the first counts as additions; characters present only in the first word are deletions and characters present in both are matches - like unchanged lines in a diff. | ||
To get that alignment, we start from the last character and trace back the best option. The pattern to looks for an **alignment** is the corner increase (diagonal+1 is greater than left or up.) | ||
```` | ||
4,4 3,3 2,2 1,1 0,0 | ||
4,5 3,4 2,3 1,2 0,1 | ||
```` | ||
- (There are an implicit row and column of 0 before the matrix) | ||
The pattern to look for to **move left** is: | ||
```` | ||
3,3 | ||
4,4 | ||
```` | ||
The pattern to look for to **move up** is: | ||
```` | ||
3,4 | ||
3,4 | ||
```` | ||
We try to resolve equality the following way: | ||
```` | ||
3,3 | ||
3,3 | ||
```` | ||
1. Prefer moving UP: toward the start of the candidate. This strategy ensures we highlight toward the start of string instead of the end when all else is equal. | ||
2. If not available, prefer moving LEFT (optional character) | ||
3. Only accept alignment DIAG when it is the absolute best option. | ||
### Algorithm Conclusion | ||
The LCS algorithm allows to detect which character of the query are common to both words while being in proper order. (For example g is common to both word but discarded because out of order.) | ||
LCS is not immediately useful for fuzzaldrin needs. Because fuzzaldrin require ALL characters of the query to be in subject to have a score greater than 0, LCS for all positive candidates would be the length of the query. | ||
However, the dynamic programming table used to solve LCS is very useful to our need. The ability to select the best path and skip that `g` even if it is present in both query and candidate is the key to improves over left-most alignment. All we need for this to works is a bit more detail in score than 0 or 1. | ||
### Similarity score | ||
Matching character does not have to be binary. Case sensitive match can still prefer proper case, same goes with accents. A diff tools can decide a line has been modified, instead of registering an addition and a deletion. A handwriting recognition tool can decide `a` and `o` are somewhat more similar to each other than they are to `w`, and so on. | ||
We use character similarity as a way to build and score patterns. That is, we consider that character are similar from their own quality ( such as case) as well of being part of a similar neighborhood (consecutive letters or acronyms) | ||
There are some rules that limit our scoring ability (for example we cannot go back in time and correct the score based on future choice) but overall that scheme is very flexible. | ||
### Where is the matrix? | ||
While the programming table describes computation, we do not need to store the whole matrix when we only output the score. Fundamentally when computing a score, we only need 3other previously computed cell: UP, LEFT and DIAG. | ||
Suppose we process the cell [3,5] | ||
20, 21, 22, 23, 24, 25, 26, *27, 28, 29* | ||
*30, 31, 32, 33, 34,* **35**, 36, 37, 38, 39 | ||
To build that score we only need values 24(DIAG), 25(UP), 34(LEFT). | ||
So instead of a whole matrix we can keep only the two current lines. | ||
Furthermore, anything on the left of 24 on the first line is not needed anymore. Also, anything to the right of 35 on the second line has not yet been computed. So we can build a more compact structure using one composite row + one diagonal. | ||
score_diag = 24 | ||
score_row = 30, 31, 32, 33, 34, 25, 26, 27, 28, 29 | ||
#### Preparing next value | ||
Once we have computed the value of the cell [3,5], we can insert that value into the structure, taking care of saving next diagonal before overwriting it. | ||
diag = 25 | ||
row = 30, 31, 32, 33, 34, **35**, 26, 27, 28, 29 | ||
To compute value of cell [3,6] we take | ||
- UP value (26) from the row. | ||
- DIAG value, from the diag register. | ||
- LEFT value from the previously computed value: 35 | ||
### Initial values | ||
Before entering the matching process, the row is initialized with 0. Before scoring each row, the LEFT and DIAG register are reset to 0. | ||
That strategy has the effect of placing a virtual row and column of 0 before the matrix. Moreover, it allows to deal with boundary condition without any special case. | ||
### Memory management | ||
We set up the row vector with the size of the query. Using a full matrix, scoring a query of size 5 against a path of size 100, would require a 500 cells. Instead, we use a 5 item row + some registers. This should ease memory management pressure. | ||
Each character of the query manages its best score. More precisely, each cell `row[j]` manage the best score so far of matching `query[0..j]` against candidate[0..i]. | ||
### Consecutive Score (Neighbourhood) Matrix. | ||
We cache the consecutive score in a virtual matrix following the same composite row scheme that we do with score values. | ||
In `fuzzaldrin.score` The candidate entirely determines the Neighbourhood quality. It is not affected by which character has been chosen. In highlight, (`fuzzaldrin.match`) we further refine the formula to make the consecutive bonus conditional to not breaking the consecutive chain: | ||
For example query `abcdz` vs. subject `abcdzbcdz`. Between `abcd` and `bcdz`, `abcd` wins for being sooner in the string. Now between the two `z`, the first one is isolated and the second one is part of a rank 4 group. However given that `bcd` are matched sooner, the second `z` is an isolated match, so the first `z` wins. | ||
------------- | ||
@@ -243,2 +416,55 @@ | ||
Let's consider the following autocomplete scenario. | ||
- Symbol bank has 1000 items. | ||
- The user receives about 5 suggestion for its query. | ||
- Of those 5, 1 is a exact case-sensitive match. | ||
- That particular user almost always wants that case sensitive match. | ||
Should we optimize for case sensitive `indexOf` before trying other things? Our answer to that question is no. | ||
Case sensitive exact match are valuable because they are rare. Even if the user tries to get them, for each one of those we have to reject 995 entry and deal with 4 other kinds of matches. | ||
This is our first principle for optimization: **Most of the haystack is not the needle**. Because rejection of candidate happens often, we should be very good at doing that. | ||
Failing a test for case-sensitive `indexOf` tell us exactly nothing for case-insensitive `indexOf`, or acronyms, or even scattered letters. | ||
That test is too specific. To reject match efficiently, we should aim for the lowest common denominator: scattered case-insensitive match. | ||
This is exactly the purpose of `isMatch`. | ||
### Most of the haystack is not the needle | ||
We just have shown how that sentence applies at the candidate level, but it is also at the character level. | ||
**Let's consider this line: `if (subject_lw[i] == query_lw[j])`** | ||
This test is for match points (or hits). It refers to the `diag+1` in the algorithm description, with the `+1` being refined to handle the differents levels of character and neighborhood similarity. | ||
**How often is that condition true ?** | ||
Let's consider an alphabet that contain 26 lowercase letters, 10 numbers, a few symbols ` _!?=<>`. That is a 40+ symbol alphabet. Under a uniform usage model of those symbols, we have the hit condition occurs about 2.5% of the time (1/40). If we suppose only 10-20 of those characters are popular, the hit rate is about 5-10%. | ||
This means we'll try to minimize the number of operation that happens outside of math points. In that context, increasing the cost of a hit, while decreasing the cost of non-hits looks like a possibly worthwhile proposition. | ||
A canonical example of this is that, instead of testing each character against the list of separators, setting a flag for next character being a start-of-word, we first confirm a match then look behind for separator. This characterization work is sometimes repeated more than once, but so far this scheme benchmarked better than alternatives we have tried to avoid doing extra work. | ||
Having work concentrated at hit points is also a natural fit to our logic, the most expensive part being to determine how to score similarity between characters (including context similarity). However, it also means we'll want to have some control over the number of positive hits we'll compute - that is the purpose of missed hit optimisation. | ||
### What about a stack of needles? | ||
To the extent the user is searching for a specific resource, this should be uncommon. | ||
It can still happen in some situation such as: | ||
- Search is carried as user type (the query is not intentional) | ||
- The intentional query is not fully typed, match-all is a temporary step. | ||
One way to deal with that is not to use the full matching algorithm when we can deal with something simpler. This is what we have done while searching for `indexOf` instance. | ||
One special note: Acronym still have to be checked even if we have an exact match: for example query `su` against `StatusUrl`. As an exact match it is poor: 'Statu**sU**rl' is a middle of word match and have the wrong case. However as an acronym it is great: '**S**tatus**U**rl'. That motivated us to create the specialized `scoreAcronyms`. | ||
What is nice is that while `scoreAcronyms` was created to speed up exact matches search, it also provided very valuable information for accuracy. It later became a corner stone in the processing of accidental acronym. | ||
The result is that for exact matches and exact acronym matches we bypass the optimal alignment algorithm, giving very fast results. | ||
We still have to deal with fuzzier stacks of needles and the next two optimization address this. | ||
### Hit Miss Optimization. | ||
@@ -249,9 +475,8 @@ | ||
A missed hit occurs when a hit does not improve score. | ||
A missed hit occurs when a hit does not improve the score. | ||
To guarantee optimal alignment, every hit has to be considered. | ||
However when candidate are long (deep path) & query contains popular character (vowels) , we can spend a huge amount of time scoring accidental hit. | ||
However when candidate are long (deep path) & query contains common use character, for example, vowels , we can spend a huge amount of time scoring accidental hits. | ||
So we use number of missed hit as a heuristic for unlikely to improve. | ||
Let's score `itc` vs `ImportanceTableControl` | ||
So we use the number of missed hit as a heuristic for current score that are unlikely to improve. Let's score `itc` vs `ImportanceTableControl` | ||
@@ -261,19 +486,19 @@ - `I` of `Importance`: First occurrence, improve over none. | ||
- `c` of `Importance`: First occurrence, improve over none. | ||
- `T` of `Table` : Acronym match, improve over isolated middle of word. | ||
- `C` of `Control` : Acronym match, improve over isolated middle of word. | ||
- `T` of `Table` : Acronym match, improve over an isolated middle of word. | ||
- `C` of `Control` : Acronym match, improve over an isolated middle of word. | ||
- `t` of `Control`: no improvement over acronym `T`: first hit miss. | ||
- After a certain threshold of hit miss we can consider it's unlikely the score will get better | ||
- After a certain threshold of missed hit we can consider it is unlikely the score will improve by much. | ||
- Despite above example hit miss optimization do not affect scoring of exact match (sub-string or acronym) | ||
- There are some legitimate use for hit miss, for example while scoring query `Mississippi` each positive match for `s` or `i` may trigger up to 3 hit miss on the other occurrence of that letter in query. | ||
- For that reason we propose counting consecutive hit miss and having maximum one hit miss per character of subject. | ||
- For that reason, we propose counting consecutive hit miss and having a maximum of one hit miss per character of the subject. | ||
**Q:** Does this grantee improvement over leftmost alignment ? | ||
**A:** It'll often be the case but no guarantee on pathological match. | ||
For example, in query `abcde` against candidate '**abc**abcabcabcabcabcabc_de' we may trigger the miss count before matching `de`. It'll still be registered as a match and probably a good one with `abc` at the start, `de` will be scored as optional characters not present. | ||
**Q:** Does this grantee improvement over leftmost alignment? | ||
**A:** It'll often be the case but no guarantee on pathological matches. | ||
For example, in query `abcde` against candidate '**abc**abcabcabcabcabcabczde' we may trigger the miss count before matching `de`. It'll still be registered as a match and probably a good one with `abc` at the start, `de` will be scored as optional characters not present. | ||
Candidate 'abcabcabcabcabcabc**abcde**' will not have any problem because it does not affect exact match. | ||
Candidate 'abcabcabcabcabcabc**abcde**' will not have any problem because it does not affect exact match. | ||
Real example is searching `index` in the benchmark. Where `i`, `n`, `d`, `e` exist scattered in folder name, but x exist in the extension `.txt`. However the whole point of this PR is to prefer structured match to scattered one so this might not be a problem. | ||
A real world example is searching `index` in the benchmark. Where `i`, `n`, `d`, `e` exist scattered in folder name, but x exist in the extension `.txt`. However, the whole point of this project is to prefer structured match to scattered one so this might not be a problem. | ||
@@ -283,3 +508,3 @@ ### High Positive count mitigation | ||
A lot of the speed of this PR come from the idea that rejection happens often and we need to be very efficient on them to offset slower higher quality match. Unfortunately some query will match against almost everything. | ||
A lot of the speed of this PR come from the idea that rejection happens often, and we need to be very efficient on them to offset slower higher quality match. Unfortunately, some query will match against almost everything. | ||
@@ -289,17 +514,20 @@ - Fast short-circuit path for exact substring acronym help a lot. | ||
However we may still be too slow for interactive time query on large data set. This is why `maxInners` option is provided. | ||
However, we may still be too slow for interactive time query on large data set. This is why `maxInners` option is provided. | ||
This is the maximum number of positive candidate we collect before sorting and returning the list. | ||
The realization is that a query that match everything on a 50K item data set is unlikely to show anything useful to the user above the fold (say in the first 15 results). | ||
The realization is that a query that match everything on a 50K item data set is unlikely to show anything useful to the user above the fold (say in the first 15 results). | ||
So then the priority is to detect such case of low quality (low discrimination power) query and report fast to the user so user can refine its query. | ||
A `maxInners` size of about 20% of the list works well. It is not needed on smaller list. | ||
A `maxInners` size of about 20% of the list works well. It is not needed on a smaller list. | ||
### Active Region Optimization | ||
Before the first occurrence of the first char of query in the subject, or after the last occurrence of the last char of query in the subject it is impossible to make a match. So we'll trim the subject to that active region. The search for those boundaries is linear while the optimal alignment algorithm is quadratic, so it is an improvement, however, little or large we move. | ||
### Benchmark | ||
- All test compare this PR to previous version (legacy) | ||
- The first test `index` is a typical use case, 10% positive, 1/3 of positive are exact match. | ||
- The first test `index` is a typical use case, 10% positive, 1/3 of positive are exact matches. | ||
- We are about 2x faster | ||
@@ -315,4 +543,5 @@ | ||
- Test 6 `nodemodules` is special in that it use a string that score on almost every candidate, often multiple time per candidate and individuals characters are popular. It also avoid exact match speed-up. | ||
About 2x slower, but unlikely to happens in real life. `maxInners` mitigation cover that case. | ||
- Test 6 `nodemodules` is special in that it use a string that score on almost every candidate, often multiple time per candidate and individuals characters are popular. It also avoid exact match speed-up. About 2x slower, but unlikely to happens in real life. `maxInners` mitigation cover that case. | ||
```` | ||
@@ -340,3 +569,3 @@ Filtering 66672 entries for 'index' took 62ms for 6168 results (~10% of results are positive, mix exact & fuzzy) | ||
**Q:** My results are not as good. | ||
**A:** Run the benchmark a few time, it looks like some optimisation kick in later. (Or CPU on energy efficient device migth need to warm up before some optimisations are activated) | ||
**A:** Run the benchmark a few time, it looks like some optimization kick in later. (Or CPU on energy efficient device might need to warm up before some optimization are activated) | ||
@@ -347,4 +576,3 @@ | ||
[Chrome FilePathScore]( | ||
https://chromium.googlesource.com/chromium/blink/+/master/Source/devtools/front_end/sources/FilePathScoreFunction.js#70) | ||
[Chrome FilePathScore](https://chromium.googlesource.com/chromium/blink/+/master/Source/devtools/front_end/sources/FilePathScoreFunction.js#70) | ||
@@ -431,3 +659,3 @@ [Textmate ranker](https://github.com/textmate/textmate/blob/master/Frameworks/text/src/ranker.cc#L46) | ||
-> PHP Namespaces, let "\" match "/" | ||
-> (would be nice for config file in mized OS environment too) | ||
-> (would be nice for config file in mixed OS environment too) | ||
@@ -440,3 +668,3 @@ https://github.com/atom/fuzzy-finder/pull/51 | ||
-> SpaceRegex, let " " match "/" | ||
-> was already implemented, posted here to show parallel. | ||
-> was already implemented, posted here to show parallel. | ||
@@ -448,3 +676,2 @@ | ||
-> we implement suggestion of score based on run length | ||
-> todo allow fuzzaldrin to support external knowledge. | ||
-> todo allows fuzzaldrin to support external knowledge. |
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
99183
16
659
747