@orama/orama
Advanced tools
Comparing version 1.0.0-beta.9 to 1.0.0-beta.10
@@ -36,3 +36,3 @@ import { Node as AVLNode } from '../trees/avl.js'; | ||
export declare function search(context: SearchContext, index: Index, prop: string, term: string): Promise<TokenScore[]>; | ||
export declare function searchByWhereClause(index: Index, filters: Record<string, boolean | ComparisonOperator>): Promise<string[]>; | ||
export declare function searchByWhereClause(context: SearchContext, index: Index, filters: Record<string, boolean | ComparisonOperator>): Promise<string[]>; | ||
export declare function getSearchableProperties(index: Index): Promise<string[]>; | ||
@@ -39,0 +39,0 @@ export declare function getSearchablePropertiesWithTypes(index: Index): Promise<Record<string, 'string' | 'number' | 'boolean'>>; |
@@ -113,3 +113,3 @@ import { createError } from "../errors.js"; | ||
} | ||
const tokens = await tokenizer.tokenize(value, language); | ||
const tokens = await tokenizer.tokenize(value, language, prop); | ||
await implementation.insertDocumentScoreParameters(index, prop, id, tokens, docsCount); | ||
@@ -131,3 +131,3 @@ for (const token of tokens){ | ||
} | ||
const tokens = await tokenizer.tokenize(value, language); | ||
const tokens = await tokenizer.tokenize(value, language, prop); | ||
await implementation.removeDocumentScoreParameters(index, prop, id, docsCount); | ||
@@ -160,3 +160,3 @@ for (const token of tokens){ | ||
} | ||
export async function searchByWhereClause(index, filters) { | ||
export async function searchByWhereClause(context, index, filters) { | ||
const filterKeys = Object.keys(filters); | ||
@@ -175,2 +175,16 @@ const filtersMap = filterKeys.reduce((acc, key)=>({ | ||
} | ||
if (typeof operation === 'string' || Array.isArray(operation)) { | ||
const idx = index.indexes[param]; | ||
for (const raw of [ | ||
operation | ||
].flat()){ | ||
const term = await context.tokenizer.tokenize(raw, context.language, param); | ||
const filteredIDsResults = radixFind(idx, { | ||
term: term[0], | ||
exact: true | ||
}); | ||
filtersMap[param].push(...Object.values(filteredIDsResults).flat()); | ||
} | ||
continue; | ||
} | ||
const operationKeys = Object.keys(operation); | ||
@@ -177,0 +191,0 @@ if (operationKeys.length > 1) { |
@@ -5,8 +5,9 @@ import { Stemmer, Tokenizer, TokenizerConfig } from '../../types.js'; | ||
stemmer?: Stemmer; | ||
stemmerSkipProperties: Set<string>; | ||
stopWords?: string[]; | ||
allowDuplicates: boolean; | ||
normalizationCache: Map<string, string>; | ||
normalizeToken(this: DefaultTokenizer, token: string): string; | ||
normalizeToken(this: DefaultTokenizer, token: string, prop: string | undefined): string; | ||
} | ||
export declare function createTokenizer(config?: TokenizerConfig): Promise<DefaultTokenizer>; | ||
export {}; |
import { createError } from "../../errors.js"; | ||
import { replaceDiacritics } from "./diacritics.js"; | ||
import { SPLITTERS, STEMMERS, SUPPORTED_LANGUAGES } from "./languages.js"; | ||
import { SPLITTERS, SUPPORTED_LANGUAGES } from "./languages.js"; | ||
import { stemmers } from "./stemmers.js"; | ||
import { stopWords as defaultStopWords } from "./stop-words/index.js"; | ||
function normalizeToken(token) { | ||
function normalizeToken(prop, token) { | ||
var _this_stopWords; | ||
const key = `${this.language}:${token}`; | ||
const key = `${this.language}:${prop}:${token}`; | ||
if (this.normalizationCache.has(key)) { | ||
@@ -17,3 +18,3 @@ return this.normalizationCache.get(key); | ||
// Apply stemming if enabled | ||
if (this.stemmer) { | ||
if (this.stemmer && !this.stemmerSkipProperties.has(prop)) { | ||
token = this.stemmer(token); | ||
@@ -34,3 +35,3 @@ } | ||
} | ||
function tokenize(input, language) { | ||
function tokenize(input, language, prop) { | ||
if (language && language !== this.language) { | ||
@@ -45,3 +46,3 @@ throw createError('LANGUAGE_NOT_SUPPORTED', language); | ||
const splitRule = SPLITTERS[this.language]; | ||
const tokens = input.toLowerCase().split(splitRule).map(this.normalizeToken).filter(Boolean); | ||
const tokens = input.toLowerCase().split(splitRule).map(this.normalizeToken.bind(this, prop ?? '')).filter(Boolean); | ||
const trimTokens = trim(tokens); | ||
@@ -59,23 +60,12 @@ if (!this.allowDuplicates) { | ||
} | ||
// Handle stemming | ||
// Handle stemming - It is disabled by default | ||
let stemmer; | ||
if (config.stemming !== false) { | ||
if (config.stemmer && typeof config.stemmer !== 'function') { | ||
throw createError('INVALID_STEMMER_FUNCTION_TYPE'); | ||
} | ||
if (config.stemming || config.stemmer && !('stemming' in config)) { | ||
if (config.stemmer) { | ||
if (typeof config.stemmer !== 'function') { | ||
throw createError('INVALID_STEMMER_FUNCTION_TYPE'); | ||
} | ||
stemmer = config.stemmer; | ||
} else { | ||
// Check if we are in a TypeScript or Javascript scenario and determine the stemmers path | ||
// Note that the initial .. is purposely left inside the import in order to be compatible | ||
// with vite. | ||
try { | ||
// eslint-disable-next-line @typescript-eslint/ban-ts-comment | ||
// @ts-ignore This fails when verifying CJS but it's actually correct | ||
const stemmersPath = import.meta.url.endsWith('ts') ? '../../stemmers/lib' : '../stemmers'; | ||
const stemmerImport = await import(`../${stemmersPath}/${STEMMERS[config.language]}.js`); | ||
stemmer = stemmerImport.stemmer; | ||
} catch (e) { | ||
throw createError('BUNDLED_ORAMA', config.language); | ||
} | ||
stemmer = stemmers[config.language]; | ||
} | ||
@@ -109,2 +99,5 @@ } | ||
stemmer, | ||
stemmerSkipProperties: new Set(config.stemmerSkipProperties ? [ | ||
config.stemmerSkipProperties | ||
].flat() : []), | ||
stopWords, | ||
@@ -116,3 +109,3 @@ allowDuplicates: Boolean(config.allowDuplicates), | ||
tokenizer.tokenize = tokenize.bind(tokenizer); | ||
tokenizer.normalizeToken = normalizeToken.bind(tokenizer); | ||
tokenizer.normalizeToken = normalizeToken; | ||
return tokenizer; | ||
@@ -119,0 +112,0 @@ } |
@@ -1,27 +0,27 @@ | ||
import { stemmer as arabic } from '../../stemmers/ar.js'; | ||
import { stemmer as armenian } from '../../stemmers/am.js'; | ||
import { stemmer as bulgarian } from '../../stemmers/bg.js'; | ||
import { stemmer as danish } from '../../stemmers/dk.js'; | ||
import { stemmer as dutch } from '../../stemmers/nl.js'; | ||
import { stemmer as english } from '../../stemmers/en.js'; | ||
import { stemmer as finnish } from '../../stemmers/fi.js'; | ||
import { stemmer as french } from '../../stemmers/fr.js'; | ||
import { stemmer as german } from '../../stemmers/de.js'; | ||
import { stemmer as greek } from '../../stemmers/gr.js'; | ||
import { stemmer as hungarian } from '../../stemmers/hu.js'; | ||
import { stemmer as indian } from '../../stemmers/in.js'; | ||
import { stemmer as indonesian } from '../../stemmers/id.js'; | ||
import { stemmer as irish } from '../../stemmers/ie.js'; | ||
import { stemmer as italian } from '../../stemmers/it.js'; | ||
import { stemmer as lithuanian } from '../../stemmers/lt.js'; | ||
import { stemmer as nepali } from '../../stemmers/np.js'; | ||
import { stemmer as norwegian } from '../../stemmers/no.js'; | ||
import { stemmer as portuguese } from '../../stemmers/pt.js'; | ||
import { stemmer as romanian } from '../../stemmers/ro.js'; | ||
import { stemmer as russian } from '../../stemmers/ru.js'; | ||
import { stemmer as serbian } from '../../stemmers/rs.js'; | ||
import { stemmer as spanish } from '../../stemmers/es.js'; | ||
import { stemmer as swedish } from '../../stemmers/se.js'; | ||
import { stemmer as turkish } from '../../stemmers/tr.js'; | ||
import { stemmer as ukrainian } from '../../stemmers/uk.js'; | ||
import { stemmer as arabic } from '@stemmers/ar.js'; | ||
import { stemmer as armenian } from '@stemmers/am.js'; | ||
import { stemmer as bulgarian } from '@stemmers/bg.js'; | ||
import { stemmer as danish } from '@stemmers/dk.js'; | ||
import { stemmer as dutch } from '@stemmers/nl.js'; | ||
import { stemmer as english } from '@stemmers/en.js'; | ||
import { stemmer as finnish } from '@stemmers/fi.js'; | ||
import { stemmer as french } from '@stemmers/fr.js'; | ||
import { stemmer as german } from '@stemmers/de.js'; | ||
import { stemmer as greek } from '@stemmers/gr.js'; | ||
import { stemmer as hungarian } from '@stemmers/hu.js'; | ||
import { stemmer as indian } from '@stemmers/in.js'; | ||
import { stemmer as indonesian } from '@stemmers/id.js'; | ||
import { stemmer as irish } from '@stemmers/ie.js'; | ||
import { stemmer as italian } from '@stemmers/it.js'; | ||
import { stemmer as lithuanian } from '@stemmers/lt.js'; | ||
import { stemmer as nepali } from '@stemmers/np.js'; | ||
import { stemmer as norwegian } from '@stemmers/no.js'; | ||
import { stemmer as portuguese } from '@stemmers/pt.js'; | ||
import { stemmer as romanian } from '@stemmers/ro.js'; | ||
import { stemmer as russian } from '@stemmers/ru.js'; | ||
import { stemmer as serbian } from '@stemmers/rs.js'; | ||
import { stemmer as spanish } from '@stemmers/es.js'; | ||
import { stemmer as swedish } from '@stemmers/se.js'; | ||
import { stemmer as turkish } from '@stemmers/tr.js'; | ||
import { stemmer as ukrainian } from '@stemmers/uk.js'; | ||
export declare const stemmers: { | ||
@@ -28,0 +28,0 @@ arabic: typeof arabic; |
declare const errors: { | ||
NO_LANGUAGE_WITH_CUSTOM_TOKENIZER: string; | ||
BUNDLED_ORAMA: string; | ||
LANGUAGE_NOT_SUPPORTED: string; | ||
@@ -5,0 +4,0 @@ INVALID_STEMMER_FUNCTION_TYPE: string; |
@@ -6,3 +6,2 @@ import { SUPPORTED_LANGUAGES } from "./components/tokenizer/languages.js"; | ||
NO_LANGUAGE_WITH_CUSTOM_TOKENIZER: 'Do not pass the language option to create when using a custom tokenizer.', | ||
BUNDLED_ORAMA: 'Cannot find the stemmer for the locale "%s". This can happen if you are using Orama within a bundler like webpack. To solve this issue please look at https://docs.oramasearch.com/text-analysis/stemming#using-stemming-with-bundlers.', | ||
LANGUAGE_NOT_SUPPORTED: `Language "%s" is not supported.\nSupported languages are:\n - ${allLanguages}`, | ||
@@ -9,0 +8,0 @@ INVALID_STEMMER_FUNCTION_TYPE: `config.stemmer property must be a function.`, |
@@ -11,3 +11,3 @@ import { prioritizeTokenScores } from "../components/algorithms.js"; | ||
}; | ||
async function createSearchContext(index, properties, tokens, params, docsCount) { | ||
async function createSearchContext(tokenizer, index, documentsStore, language, params, properties, tokens, docsCount) { | ||
// If filters are enabled, we need to get the IDs of the documents that match the filters. | ||
@@ -52,4 +52,7 @@ // const hasFilters = Object.keys(params.where ?? {}).length > 0; | ||
return { | ||
timeStart: await getNanosecondsTime(), | ||
tokenizer, | ||
index, | ||
timeStart: await getNanosecondsTime(), | ||
documentsStore, | ||
language, | ||
params, | ||
@@ -85,3 +88,3 @@ docsCount, | ||
// Create the search context and the results | ||
const context = await createSearchContext(orama.index, propertiesToSearch, tokens, params, await orama.documentsStore.count(docs)); | ||
const context = await createSearchContext(orama.tokenizer, orama.index, orama.documentsStore, language, params, propertiesToSearch, tokens, await orama.documentsStore.count(docs)); | ||
const results = Array.from({ | ||
@@ -94,3 +97,3 @@ length: limit | ||
if (hasFilters) { | ||
whereFiltersIDs = await orama.index.searchByWhereClause(index, params.where); | ||
whereFiltersIDs = await orama.index.searchByWhereClause(context, index, params.where); | ||
} | ||
@@ -97,0 +100,0 @@ // Now it's time to loop over all the indices and get the documents IDs for every single term |
@@ -143,3 +143,3 @@ import { Language } from './components/tokenizer/languages.js'; | ||
*/ | ||
where?: Record<string, boolean | ComparisonOperator>; | ||
where?: Record<string, boolean | string | string[] | ComparisonOperator>; | ||
}; | ||
@@ -169,2 +169,6 @@ export type Result = { | ||
timeStart: bigint; | ||
tokenizer: Tokenizer; | ||
index: IIndex; | ||
documentsStore: IDocumentsStore; | ||
language: string | undefined; | ||
params: SearchParams; | ||
@@ -175,3 +179,2 @@ docsCount: number; | ||
docsIntersection: TokenMap; | ||
index: IIndex; | ||
}; | ||
@@ -219,3 +222,3 @@ export type ElapsedTime = { | ||
search(context: SearchContext, index: I, prop: string, term: string): SyncOrAsyncValue<TokenScore[]>; | ||
searchByWhereClause(index: I, filters: Record<string, boolean | ComparisonOperator>): SyncOrAsyncValue<string[]>; | ||
searchByWhereClause(context: SearchContext, index: I, filters: Record<string, boolean | string | string[] | ComparisonOperator>): SyncOrAsyncValue<string[]>; | ||
getSearchableProperties(index: I): SyncOrAsyncValue<string[]>; | ||
@@ -243,2 +246,3 @@ getSearchablePropertiesWithTypes(index: I): SyncOrAsyncValue<Record<string, SearchableType>>; | ||
stemmer?: Stemmer; | ||
stemmerSkipProperties?: string | string[]; | ||
stopWords?: boolean | string[] | ((stopWords: string[]) => string[] | Promise<string[]>); | ||
@@ -250,3 +254,3 @@ allowDuplicates?: boolean; | ||
normalizationCache: Map<string, string>; | ||
tokenize: (raw: string, language?: string) => SyncOrAsyncValue<string[]>; | ||
tokenize: (raw: string, language?: string, prop?: string) => SyncOrAsyncValue<string[]>; | ||
} | ||
@@ -253,0 +257,0 @@ export interface ObjectComponents { |
@@ -13,2 +13,2 @@ Copyright 2023 OramaSearch Inc | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
limitations under the License. |
{ | ||
"name": "@orama/orama", | ||
"version": "1.0.0-beta.9", | ||
"version": "1.0.0-beta.10", | ||
"type": "module", | ||
@@ -5,0 +5,0 @@ "description": "Next generation full-text search engine, written in TypeScript", |
@@ -209,3 +209,3 @@ ![Orama. Search, everywhere.](https://github.com/oramasearch/orama/blob/main/misc/oramasearch.gif?raw=true) | ||
You can specify a different language by using the `defaultLanguage` property | ||
You can specify a different language by using the `language` property | ||
during Orama initialization. | ||
@@ -212,0 +212,0 @@ |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
1680645
65195