MiniVectorDB
MiniVectorDB is a Python project that provides an extremely simple yet powerful vector database built on top of FAISS. It can also extract embeddings, either with an integrated ONNX model or with the e5 multilingual embedding models. Vectors can be indexed together with metadata (which can be used for querying), and search results can be reranked using a hybrid approach (text hash vectorizer + fuzzy string similarity). Please check out the code snippets below.
Integrated model on Hugging Face: universal-sentence-encoder-multilingual-3-onnx-quantized
Installation
pip install minivectordb
Quantized ONNX Model Supported Languages
["en", "pt", "ar", "zh", "fr", "de", "it", "ja", "ko", "nl", "ps", "es", "th", "tr", "ru"]
Usage
from minivectordb.embedding_model import EmbeddingModel, AlternativeModel
from minivectordb.vector_database import VectorDatabase
# Create the vector database and the embedding model used by every example.
vector_db = VectorDatabase()
model = EmbeddingModel()

# Each entry is (unique id, sentence text, metadata dict).
# The ids are 1-based and contiguous: the examples look a sentence up again
# with `sentences_with_metadata[id - 1]`.
sentences_with_metadata = [
    (1, "I like dogs", {"animal": "dog", "like": True}),
    (2, "I like cats", {"animal": "cat", "like": True}),
    (3, "The king has three kids", {"royalty": "king"}),
    (4, "The queen has one daughter", {"royalty": "queen"}),
    (5, "Programming is cool", {"topic": "programming", "sentiment": "positive"}),
    (6, "Software development is cool", {"topic": "software development", "sentiment": "positive"}),
    (7, "Being a developer is stressful", {"topic": "software development", "sentiment": "negative"}),
    (8, "Being a developer is a job", {"topic": "software development", "sentiment": "neutral"}),
    (9, "I like to ride my bicycle", {"activity": "riding", "object": "bicycle"}),
    (10, "I like to ride my scooter", {"activity": "riding", "object": "scooter"}),
    (11, "The sky is blue", {"color": "blue", "object": "sky"}),
    (12, "The ocean is blue", {"color": "blue", "object": "ocean"}),
]

# Embed every sentence and store it, with its metadata, under its id.
# `sentence_id` is used instead of `id` to avoid shadowing the builtin.
for sentence_id, sentence, metadata in sentences_with_metadata:
    sentence_embedding = model.extract_embeddings(sentence)
    vector_db.store_embedding(sentence_id, sentence_embedding, metadata)
# Basic similarity search: embed the query text and ask for the 2 closest
# stored embeddings.
query = "animals"
query_embedding = model.extract_embeddings(query)
search_results = vector_db.find_most_similar(query_embedding, k=2)

# The result unpacks into three parallel sequences.
ids, distances, metadatas = search_results
for sentence_id, dist, metadata in zip(ids, distances, metadatas):
    # Ids are 1-based, so `sentence_id - 1` is the position in the list.
    print(f"ID: {sentence_id}, Sentence: \"{sentences_with_metadata[sentence_id - 1][1]}\", Distance: {dist}, Metadata: {metadata}")
# Hybrid reranking: fetch a wider candidate set (k=6) with the vector search,
# then rerank the candidate sentences against the query text and keep the
# best 4.
query = "blue is cool"
query_embedding = model.extract_embeddings(query)
search_results = vector_db.find_most_similar(query_embedding, k=6)

# Metadata is not needed for reranking, so the third element is discarded.
ids, distances, _ = search_results

# Map the returned ids (1-based) back to the original sentence texts.
sentences = [sentences_with_metadata[sentence_id - 1][1] for sentence_id in ids]

hybrid_reranked_results = vector_db.hybrid_rerank_results(
    sentences=sentences,
    search_scores=distances,
    query=query,
    k=4,
)

# The reranker returns the selected sentences and their scores in parallel.
hybrid_retrieved_sentences, hybrid_scores = hybrid_reranked_results
for sentence, score in zip(hybrid_retrieved_sentences, hybrid_scores):
    print(f"Sentence: \"{sentence}\", Score: {score}")
# Metadata-filtered search: the query text is "king", but a metadata filter
# for {"royalty": "queen"} is passed along with it.
# NOTE(review): presumably only entries whose metadata matches the filter are
# returned - confirm against the VectorDatabase documentation.
query_embedding = model.extract_embeddings("king")
metadata_filter = {"royalty": "queen"}
search_results = vector_db.find_most_similar(query_embedding, metadata_filter, k=2)

ids, distances, metadatas = search_results
for sentence_id, dist, metadata in zip(ids, distances, metadatas):
    # Ids are 1-based, so `sentence_id - 1` is the position in the list.
    print(f"ID: {sentence_id}, Sentence: \"{sentences_with_metadata[sentence_id - 1][1]}\", Distance: {dist}, Metadata: {metadata}")
# Search combining a metadata filter with a list of `or_filters`.
# NOTE(review): presumably an entry must match `metadata_filter` and at least
# one of the `or_filters` alternatives - confirm against the VectorDatabase
# documentation.
query_embedding = model.extract_embeddings("programming")
metadata_filter = {"topic": "software development"}
or_filters = [
    {"sentiment": "positive"},
    {"sentiment": "negative"},
]
search_results = vector_db.find_most_similar(query_embedding, metadata_filter, k=2, or_filters=or_filters)

ids, distances, metadatas = search_results
for sentence_id, dist, metadata in zip(ids, distances, metadatas):
    # Ids are 1-based, so `sentence_id - 1` is the position in the list.
    print(f"ID: {sentence_id}, Sentence: \"{sentences_with_metadata[sentence_id - 1][1]}\", Distance: {dist}, Metadata: {metadata}")
# Save the database to disk.
# NOTE(review): storage location and format are not shown here - see the
# VectorDatabase documentation.
vector_db.persist_to_disk()
License
This project is licensed under the MIT License.