item-matching
Advanced tools
+1
-1
| Metadata-Version: 2.4 | ||
| Name: item_matching | ||
| - Version: 0.0.99 | ||
| + Version: 0.0.100 | ||
| Summary: A name matching package | ||
@@ -5,0 +5,0 @@ Project-URL: Homepage, https://github.com/kevinkhang2909/item_matching |
+1
-1
@@ -7,3 +7,3 @@ [build-system] | ||
| name = "item_matching" | ||
| - version = "0.0.99" | ||
| + version = "0.0.100" | ||
| authors = [ | ||
@@ -10,0 +10,0 @@ { name="Kevin Khang", email="kevinkhang2909@gmail.com" }, |
@@ -5,3 +5,3 @@ from pathlib import Path | ||
| from autofaiss import build_index | ||
| - from datasets import concatenate_datasets, load_from_disk | ||
| + from datasets import Dataset | ||
| import numpy as np | ||
@@ -80,4 +80,4 @@ from rich import print | ||
| files = sorted(self.dataset_dict[f"{i}_ds_path"].glob("*"), key=self.sort_key_ds) | ||
| - lst_ds = [load_from_disk(str(f)) for f in files] | ||
| - dataset[i] = concatenate_datasets(lst_ds) | ||
| + df = pl.concat([pl.read_parquet(f) for f in files]) | ||
| + dataset[i] = Dataset.from_polars(df) | ||
@@ -84,0 +84,0 @@ # Add index |
@@ -6,3 +6,2 @@ from PIL import Image | ||
| from rich import print | ||
| - from datasets import Dataset, concatenate_datasets | ||
| from numpy.lib.format import open_memmap | ||
@@ -176,3 +175,3 @@ import torch | ||
| # Check if exists: | ||
| - dataset_name = self.path_ds / f"{i}" | ||
| + dataset_name = self.path_ds / f"{i}.parquet" | ||
| array_name = self.path_array / f"{i}.npy" | ||
@@ -187,3 +186,3 @@ if dataset_name.exists(): | ||
| - dataset_chunk = Dataset.from_polars(data[start_idx:end_idx]) | ||
| + dataset_chunk = data[start_idx:end_idx] | ||
| print( | ||
@@ -198,3 +197,3 @@ f"[DataEmbedding] Shard [{i}/{num_chunks - 1}]: start {start_idx:,.0f} end {end_idx:,.0f}" | ||
| save_file_path=array_name, | ||
| - iterable_list=dataset_chunk[self.col_input] | ||
| + iterable_list=dataset_chunk[self.col_input].to_list() | ||
| ) | ||
@@ -205,10 +204,10 @@ else: | ||
| save_file_path=array_name, | ||
| - iterable_list=dataset_chunk[self.col_input] | ||
| + iterable_list=dataset_chunk[self.col_input].to_list() | ||
| ) | ||
| # Concat | ||
| - dset_embed = Dataset.from_dict({self.col_embedding: embeddings}) | ||
| - dataset_chunk = concatenate_datasets([dataset_chunk, dset_embed], axis=1) | ||
| + dset_embed = pl.DataFrame({self.col_embedding: embeddings}) | ||
| + dataset_chunk = pl.concat([dataset_chunk, dset_embed], how="horizontal") | ||
| # Save chunk | ||
| - dataset_chunk.save_to_disk(str(dataset_name)) | ||
| + dataset_chunk.write_parquet(str(dataset_name)) |
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
3609917
01238
-0.08%