quickner
Advanced tools
| import time | ||
| import csv | ||
| from quickner import Entity, Document, Quickner | ||
def get_entities(path="data/entities.csv"):
    """Load the entity list from a CSV file.

    Each non-empty CSV row is unpacked positionally into ``Entity``, so the
    column order of the file must match the ``Entity`` constructor.

    Args:
        path: Location of the CSV file. Defaults to the bundled sample data.

    Returns:
        A list with one ``Entity`` per non-empty row, in file order.
    """
    # newline="" lets the csv module handle line endings itself, which is
    # required for quoted fields that contain embedded newlines.
    with open(path, "r", newline="") as f:
        # Blank lines come back from csv.reader as empty lists; unpacking one
        # would call Entity() with no arguments, so they are skipped.
        return [Entity(*row) for row in csv.reader(f) if row]
def get_documents(path="data/texts.csv"):
    """Load the documents to annotate from a CSV file.

    Only the first column of every row is used as the document text; any
    further columns are ignored.

    Args:
        path: Location of the CSV file. Defaults to the bundled sample data.

    Returns:
        A list with one ``Document`` per non-empty row, in file order.
    """
    # newline="" lets the csv module handle line endings itself, which is
    # required for quoted texts that contain embedded newlines.
    with open(path, "r", newline="") as f:
        # Blank lines come back from csv.reader as empty lists; indexing
        # row[0] on one would raise IndexError, so they are skipped.
        return [Document(row[0]) for row in csv.reader(f) if row]
def main():
    """Annotate the sample corpus and report how long it took.

    The timed section covers loading the documents and entities, building the
    ``Quickner`` instance and running ``process()``. Writing the JSONL output
    and the entity lookup happen after the clock is stopped.
    """
    start = time.perf_counter()
    # Keyword arguments are evaluated left to right, so the documents are
    # still read before the entities.
    quick = Quickner(documents=get_documents(), entities=get_entities())
    quick.process()
    end = time.perf_counter()

    quick.to_jsonl("data/output.jsonl")
    apple_documents = quick.find_documents_by_entity("Apple")
    print(apple_documents)
    print(f"Time elapsed: {end - start} seconds")


if __name__ == "__main__":
    main()
+178
| import unittest | ||
| from quickner import Quickner, Document, Entity | ||
| # TODO(Omar): Significantly improve tests with pytest | ||
| class TestQuickner(unittest.TestCase): | ||
| texts = ( | ||
| "rust is made by Mozilla", | ||
| "Python was created by Guido van Rossum", | ||
| "Java was created by James Gosling at Sun Microsystems", | ||
| "Swift was created by Chris Lattner and Apple", | ||
| "You can find more information about Rust at https://www.rust-lang.org/", | ||
| ) | ||
| # Create entities | ||
| entities = ( | ||
| ("Rust", "PL"), | ||
| ("Python", "PL"), | ||
| ("Java", "PL"), | ||
| ("Swift", "PL"), | ||
| ("Mozilla", "ORG"), | ||
| ("Apple", "ORG"), | ||
| ("Sun Microsystems", "ORG"), | ||
| ("Guido van Rossum", "PERSON"), | ||
| ("James Gosling", "PERSON"), | ||
| ("Chris Lattner", "PERSON"), | ||
| ) | ||
| annotations = ( | ||
| ("rust is made by Mozilla", [(0, 4, "PL"), (16, 23, "ORG")]), | ||
| ("Python was created by Guido van Rossum", [(0, 6, "PL"), (22, 38, "PERSON")]), | ||
| ( | ||
| "Java was created by James Gosling at Sun Microsystems", | ||
| [(0, 4, "PL"), (20, 33, "PERSON"), (37, 53, "ORG")], | ||
| ), | ||
| ( | ||
| "Swift was created by Chris Lattner and Apple", | ||
| [(0, 5, "PL"), (21, 34, "PERSON"), (39, 44, "ORG")], | ||
| ), | ||
| ) | ||
| def _test_correct(self, documents: list): | ||
| for document in documents: | ||
| for annotation in self.annotations: | ||
| if document.text == annotation[0]: | ||
| self.assertEqual(document.label, annotation[1]) | ||
| def test_quickner_from_documents(self): | ||
| documents = [Document(text) for text in self.texts] | ||
| entities = [Entity(*(entity)) for entity in self.entities] | ||
| quick = Quickner(documents=documents) | ||
| self.assertEqual(len(quick.documents), 5) | ||
| self.assertEqual(len(quick.entities), 0) | ||
| quick.entities = entities | ||
| quick.process() | ||
| # Check if all entities are labeled correctly | ||
| labels_count = sum(len(document.label) for document in quick.documents) | ||
| self.assertEqual(labels_count, 11) | ||
| self._test_correct(quick.documents) | ||
| def test_quickner_from_documents_and_entities(self): | ||
| entities = [Entity(*(entity)) for entity in self.entities] | ||
| documents = [Document(text) for text in self.texts] | ||
| quick = Quickner(documents=documents, entities=entities) | ||
| self.assertEqual(len(quick.documents), 5) | ||
| self.assertEqual(len(quick.entities), 10) | ||
| quick.process() | ||
| labels_count = sum(len(document.label) for document in quick.documents) | ||
| self.assertEqual(labels_count, 11) | ||
| self._test_correct(quick.documents) | ||
| def test_find_document_by_label(self): | ||
| entities = [Entity(*(entity)) for entity in self.entities] | ||
| documents = [Document(text) for text in self.texts] | ||
| quick = Quickner(documents=documents, entities=entities) | ||
| quick.process() | ||
| documents = quick.find_documents_by_label("PL") | ||
| self.assertEqual(len(documents), 5) | ||
| documents = quick.find_documents_by_label("ORG") | ||
| self.assertEqual(len(documents), 3) | ||
| documents = quick.find_documents_by_label("PERSON") | ||
| self.assertEqual(len(documents), 3) | ||
| self._test_correct(quick.documents) | ||
| def test_setting_documents(self): | ||
| entities = [Entity(*(entity)) for entity in self.entities] | ||
| documents = [Document(text) for text in self.texts] | ||
| quick = Quickner(documents=documents, entities=entities) | ||
| quick.process() | ||
| quick.documents = [] | ||
| self.assertEqual(len(quick.documents), 0) | ||
| quick.documents = documents | ||
| quick.process() | ||
| self.assertEqual(len(quick.documents), 5) | ||
| docs = quick.find_documents_by_entity("Rust") | ||
| self.assertEqual(len(docs), 2) | ||
| docs = quick.find_documents_by_entity("Python") | ||
| self.assertEqual(len(docs), 1) | ||
| docs = quick.find_documents_by_entity("Java") | ||
| self.assertEqual(len(docs), 1) | ||
| docs = quick.find_documents_by_entity("Swift") | ||
| self.assertEqual(len(docs), 1) | ||
| docs = quick.find_documents_by_entity("Mozilla") | ||
| self.assertEqual(len(docs), 1) | ||
| docs = quick.find_documents_by_entity("Apple") | ||
| self.assertEqual(len(docs), 1) | ||
| docs = quick.find_documents_by_entity("Sun Microsystems") | ||
| self.assertEqual(len(docs), 1) | ||
| docs = quick.find_documents_by_entity("Guido van Rossum") | ||
| self.assertEqual(len(docs), 1) | ||
| docs = quick.find_documents_by_entity("James Gosling") | ||
| self.assertEqual(len(docs), 1) | ||
| docs = quick.find_documents_by_entity("Chris Lattner") | ||
| self.assertEqual(len(docs), 1) | ||
| def test_find_document_by_entity(self): | ||
| entities = [Entity(*(entity)) for entity in self.entities] | ||
| documents = [Document(text) for text in self.texts] | ||
| quick = Quickner(documents=documents, entities=entities) | ||
| quick.process() | ||
| documents = quick.find_documents_by_entity("Rust") | ||
| self.assertEqual(len(documents), 2) | ||
| documents = quick.find_documents_by_entity("Python") | ||
| self.assertEqual(len(documents), 1) | ||
| documents = quick.find_documents_by_entity("Java") | ||
| self.assertEqual(len(documents), 1) | ||
| documents = quick.find_documents_by_entity("Swift") | ||
| self.assertEqual(len(documents), 1) | ||
| documents = quick.find_documents_by_entity("Mozilla") | ||
| self.assertEqual(len(documents), 1) | ||
| documents = quick.find_documents_by_entity("Apple") | ||
| self.assertEqual(len(documents), 1) | ||
| documents = quick.find_documents_by_entity("Sun Microsystems") | ||
| self.assertEqual(len(documents), 1) | ||
| documents = quick.find_documents_by_entity("Guido van Rossum") | ||
| self.assertEqual(len(documents), 1) | ||
| documents = quick.find_documents_by_entity("James Gosling") | ||
| self.assertEqual(len(documents), 1) | ||
| documents = quick.find_documents_by_entity("Chris Lattner") | ||
| self.assertEqual(len(documents), 1) | ||
| def test_get_spacy_generator(self): | ||
| entities = [Entity(*(entity)) for entity in self.entities] | ||
| documents = [Document(text) for text in self.texts] | ||
| quick = Quickner(documents=documents, entities=entities) | ||
| quick.process() | ||
| generator = quick.spacy() | ||
| self.assertEqual(len(list(generator)), 1) | ||
| generator = quick.spacy(chunks=2) | ||
| self.assertEqual(len(list(generator)), 3) | ||
| generator = quick.spacy(chunks=3) | ||
| self.assertEqual(len(list(generator)), 2) | ||
| generator = quick.spacy(chunks=5) | ||
| self.assertEqual(len(list(generator)), 1) | ||
| def test_single_document_annotation(self): | ||
| rust = Document.from_string("rust is made by Mozilla") | ||
| entities = [Entity("Rust", "PL"), Entity("Mozilla", "ORG")] | ||
| rust.annotate(entities, case_sensitive=True) | ||
| self.assertEqual(len(rust.label), 1) | ||
| rust.annotate(entities, case_sensitive=False) | ||
| self.assertEqual(len(rust.label), 2) | ||
| self.assertEqual(rust.label[0][2], "ORG") | ||
| self.assertEqual(rust.label[1][2], "PL") | ||
| def test_character_level_slicing(self): | ||
| entity = Entity("Python", "PL") | ||
| document = Document("Indizes auf Zeichenebene anstelle von Indizes auf Byteebene, um Python-Slicing zu unterstützen") | ||
| document.annotate([entity], case_sensitive=False) | ||
| print(document.label[0][0], document.label[0][1]) | ||
| label = document.text[document.label[0][0]:document.label[0][1]] | ||
| self.assertEqual(len(document.label), 1) | ||
| self.assertEqual(label, "Python") | ||
| if __name__ == "__main__": | ||
| unittest.main() |
+0
-1
@@ -8,3 +8,2 @@ /target | ||
| *.py[cod] | ||
| *.py | ||
| *.lock | ||
@@ -11,0 +10,0 @@ |
+6
-6
@@ -604,3 +604,3 @@ # This file is automatically @generated by Cargo. | ||
| name = "quickner" | ||
| version = "0.0.1-alpha.17" | ||
| version = "0.0.1-alpha.18" | ||
| dependencies = [ | ||
@@ -615,3 +615,3 @@ "numpy", | ||
| name = "quickner-core" | ||
| version = "0.0.1-alpha.17" | ||
| version = "0.0.1-alpha.18" | ||
| dependencies = [ | ||
@@ -728,5 +728,5 @@ "aho-corasick", | ||
| name = "serde" | ||
| version = "1.0.154" | ||
| version = "1.0.155" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "8cdd151213925e7f1ab45a9bbfb129316bd00799784b174b7cc7bcd16961c49e" | ||
| checksum = "71f2b4817415c6d4210bfe1c7bfcf4801b2d904cb4d0e1a8fdb651013c9e86b8" | ||
| dependencies = [ | ||
@@ -738,5 +738,5 @@ "serde_derive", | ||
| name = "serde_derive" | ||
| version = "1.0.154" | ||
| version = "1.0.155" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "4fc80d722935453bcafdc2c9a73cd6fac4dc1938f0346035d84bf99fa9e33217" | ||
| checksum = "d071a94a3fac4aff69d023a7f411e33f40f3483f8c5190b1953822b6b76d7630" | ||
| dependencies = [ | ||
@@ -743,0 +743,0 @@ "proc-macro2", |
+1
-1
| [package] | ||
| name = "quickner" | ||
| version = "0.0.1-alpha.17" | ||
| version = "0.0.1-alpha.18" | ||
| edition = "2021" | ||
@@ -5,0 +5,0 @@ authors = ["Omar MHAIMDAT"] |
| [package] | ||
| name = "quickner-core" | ||
| version = "0.0.1-alpha.17" | ||
| version = "0.0.1-alpha.18" | ||
| edition = "2021" | ||
@@ -5,0 +5,0 @@ authors = ["Omar MHAIMDAT"] |
| use crate::{ | ||
| config::{Config, Filters}, | ||
| models::Text, | ||
| utils::get_progress_bar, | ||
| utils::{char_to_byte, get_progress_bar, is_valid_utf8}, | ||
| SpacyEntity, | ||
@@ -123,2 +123,6 @@ }; | ||
| ) -> Option<Vec<(usize, usize, String)>> { | ||
| if !is_valid_utf8(text.as_str()) { | ||
| warn!("Skipping invalid utf8 text: \"{}\"", text); | ||
| return None; | ||
| } | ||
| let mut annotations = Vec::new(); | ||
@@ -555,4 +559,4 @@ for mat in aho_corasick.find_iter(&text) { | ||
| for label in &annotation.label { | ||
| // Extarct the entity name using indexes | ||
| let name = annotation.text[label.0..label.1].to_string(); | ||
| let indices = char_to_byte(annotation.text.clone(), label.0, label.1); | ||
| let name = annotation.text[indices.0..indices.1].to_string(); | ||
| let entity = Entity { | ||
@@ -680,3 +684,5 @@ name: name.to_string().to_lowercase(), | ||
| for label in &document.label { | ||
| let name = document.text[label.0..label.1].to_string(); | ||
| // Translate the indices to byte indices | ||
| let indices = char_to_byte(document.text.clone(), label.0, label.1); | ||
| let name = document.text[indices.0..indices.1].to_string(); | ||
| let entry = index.entry(name.to_lowercase()).or_insert(Vec::new()); | ||
@@ -701,3 +707,4 @@ entry.push(document.id.clone()); | ||
| for label in &document.label { | ||
| let name = document.text[label.0..label.1].to_string(); | ||
| let indices = char_to_byte(document.text.clone(), label.0, label.1); | ||
| let name = document.text[indices.0..indices.1].to_string(); | ||
| let entry = self | ||
@@ -723,3 +730,4 @@ .documents_entities_index | ||
| for label in &document.label { | ||
| let name = document.text[label.0..label.1].to_string(); | ||
| let indices = char_to_byte(document.text.clone(), label.0, label.1); | ||
| let name = document.text[indices.0..indices.1].to_string(); | ||
| let entry = self | ||
@@ -726,0 +734,0 @@ .documents_entities_index |
@@ -8,3 +8,3 @@ // quickner | ||
| // | ||
| use std::collections::HashSet; | ||
| use std::{collections::HashSet, str}; | ||
@@ -104,1 +104,24 @@ use indicatif::{ProgressBar, ProgressStyle}; | ||
| } | ||
/// Returns `true` when the bytes of `text` form valid UTF-8.
///
/// Note: a `&str` is guaranteed by the language to hold valid UTF-8, so for
/// any safely constructed input this returns `true`. The function is kept so
/// existing callers continue to compile and behave as before; it re-validates
/// the bytes rather than assuming the guarantee.
pub(crate) fn is_valid_utf8(text: &str) -> bool {
    str::from_utf8(text.as_bytes()).is_ok()
}
/// Converts a pair of character indices into byte offsets within `text`.
///
/// `start` and `end` count Unicode scalar values (`char`s) from the beginning
/// of `text`. The returned tuple holds the byte offsets at which those
/// characters begin, suitable for slicing `text` as `&text[start..end]`.
///
/// An index equal to or greater than the number of characters maps to
/// `text.len()`, the one-past-the-end byte offset. This applies to both
/// `start` and `end`, so an out-of-range `start` no longer snaps back to the
/// beginning of the string.
///
/// `text` accepts anything that can be viewed as a `&str` (`String`, `&String`,
/// `&str`), so callers do not need to clone the text just to call this.
pub(crate) fn char_to_byte(text: impl AsRef<str>, start: usize, end: usize) -> (usize, usize) {
    let text = text.as_ref();
    // Byte offset of the `char_index`-th character, or the end of the string
    // when the index is past the last character.
    let byte_offset = |char_index: usize| {
        text.char_indices()
            .nth(char_index)
            .map_or(text.len(), |(offset, _)| offset)
    };
    (byte_offset(start), byte_offset(end))
}
+1
-1
| Metadata-Version: 2.1 | ||
| Name: quickner | ||
| Version: 0.0.1a17 | ||
| Version: 0.0.1a18 | ||
| Classifier: Programming Language :: Rust | ||
@@ -5,0 +5,0 @@ Classifier: Programming Language :: Python :: Implementation :: CPython |
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
39187274
0.02%34
6.25%473
65.38%