Latest Threat Research: SANDWORM_MODE: Shai-Hulud-Style npm Worm Hijacks CI Workflows and Poisons AI Toolchains. Details
Socket
Book a Demo | Install | Sign in
Socket

quickner

Package Overview
Dependencies
Maintainers
1
Versions
20
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

quickner - npm Package Compare versions

Comparing version
0.0.1a17
to
0.0.1a18
+34
tests/performance.py
import time
import csv
from quickner import Entity, Document, Quickner
def get_entities():
    """Load the entities listed in ``data/entities.csv``.

    Every non-empty CSV row is unpacked positionally into ``Entity(*row)``.

    Returns:
        list: One ``Entity`` per non-empty row, in file order.
    """
    # The csv module expects its file to be opened with newline="" so that
    # newlines embedded in quoted fields reach the reader untouched.
    with open("data/entities.csv", "r", newline="") as f:
        # csv.reader yields [] for blank lines; skip them instead of calling
        # Entity() with no arguments.
        return [Entity(*row) for row in csv.reader(f) if row]
def get_documents():
    """Load the documents listed in ``data/texts.csv``.

    Only the first column of each non-empty row is used as the document text.

    Returns:
        list: One ``Document`` per non-empty row, in file order.
    """
    # The csv module expects its file to be opened with newline="" so that
    # newlines embedded in quoted fields reach the reader untouched.
    with open("data/texts.csv", "r", newline="") as f:
        # csv.reader yields [] for blank lines; row[0] would raise IndexError
        # on those, so they are skipped.
        return [Document(row[0]) for row in csv.reader(f) if row]
def main():
    """Run Quickner over the CSV fixtures and report how long it took.

    The timed section covers loading the documents and entities, building
    the ``Quickner`` instance and calling ``process()``. Writing the JSONL
    output and the sample entity lookup run after the clock has stopped.
    """
    started = time.perf_counter()
    docs = get_documents()
    ents = get_entities()
    quick = Quickner(documents=docs, entities=ents)
    quick.process()
    elapsed = time.perf_counter() - started

    # Untimed: export the annotations and show one sample lookup.
    quick.to_jsonl("data/output.jsonl")
    print(quick.find_documents_by_entity("Apple"))
    print(f"Time elapsed: {elapsed} seconds")


if __name__ == "__main__":
    main()
import unittest
from quickner import Quickner, Document, Entity
# TODO(Omar): Significantly improve tests with pytest
class TestQuickner(unittest.TestCase):
    """End-to-end checks for ``Quickner``, ``Document`` and ``Entity``.

    The fixtures below are shared by every test: five short texts, ten
    entities, and the expected ``(start, end, label)`` annotations for the
    first four texts.
    """

    texts = (
        "rust is made by Mozilla",
        "Python was created by Guido van Rossum",
        "Java was created by James Gosling at Sun Microsystems",
        "Swift was created by Chris Lattner and Apple",
        "You can find more information about Rust at https://www.rust-lang.org/",
    )
    # Create entities
    entities = (
        ("Rust", "PL"),
        ("Python", "PL"),
        ("Java", "PL"),
        ("Swift", "PL"),
        ("Mozilla", "ORG"),
        ("Apple", "ORG"),
        ("Sun Microsystems", "ORG"),
        ("Guido van Rossum", "PERSON"),
        ("James Gosling", "PERSON"),
        ("Chris Lattner", "PERSON"),
    )
    # Expected labels per text; the fifth text has no entry here and is
    # therefore not checked by _test_correct.
    annotations = (
        ("rust is made by Mozilla", [(0, 4, "PL"), (16, 23, "ORG")]),
        ("Python was created by Guido van Rossum", [(0, 6, "PL"), (22, 38, "PERSON")]),
        (
            "Java was created by James Gosling at Sun Microsystems",
            [(0, 4, "PL"), (20, 33, "PERSON"), (37, 53, "ORG")],
        ),
        (
            "Swift was created by Chris Lattner and Apple",
            [(0, 5, "PL"), (21, 34, "PERSON"), (39, 44, "ORG")],
        ),
    )
    # Expected number of documents returned by find_documents_by_entity.
    entity_document_counts = (
        ("Rust", 2),
        ("Python", 1),
        ("Java", 1),
        ("Swift", 1),
        ("Mozilla", 1),
        ("Apple", 1),
        ("Sun Microsystems", 1),
        ("Guido van Rossum", 1),
        ("James Gosling", 1),
        ("Chris Lattner", 1),
    )

    def _test_correct(self, documents: list):
        """Assert that each document with a known text carries the expected labels."""
        for document in documents:
            for annotation in self.annotations:
                if document.text == annotation[0]:
                    self.assertEqual(document.label, annotation[1])

    def _assert_entity_document_counts(self, quick):
        """Check find_documents_by_entity against ``entity_document_counts``."""
        for name, expected in self.entity_document_counts:
            # subTest reports every failing entity instead of stopping at the first.
            with self.subTest(entity=name):
                found = quick.find_documents_by_entity(name)
                self.assertEqual(len(found), expected)

    def test_quickner_from_documents(self):
        """Entities may be assigned after construction and before process()."""
        documents = [Document(text) for text in self.texts]
        entities = [Entity(*(entity)) for entity in self.entities]
        quick = Quickner(documents=documents)
        self.assertEqual(len(quick.documents), 5)
        self.assertEqual(len(quick.entities), 0)
        quick.entities = entities
        quick.process()
        # Check if all entities are labeled correctly
        labels_count = sum(len(document.label) for document in quick.documents)
        self.assertEqual(labels_count, 11)
        self._test_correct(quick.documents)

    def test_quickner_from_documents_and_entities(self):
        """Documents and entities may both be passed to the constructor."""
        entities = [Entity(*(entity)) for entity in self.entities]
        documents = [Document(text) for text in self.texts]
        quick = Quickner(documents=documents, entities=entities)
        self.assertEqual(len(quick.documents), 5)
        self.assertEqual(len(quick.entities), 10)
        quick.process()
        labels_count = sum(len(document.label) for document in quick.documents)
        self.assertEqual(labels_count, 11)
        self._test_correct(quick.documents)

    def test_find_document_by_label(self):
        """find_documents_by_label returns the expected number of documents."""
        entities = [Entity(*(entity)) for entity in self.entities]
        documents = [Document(text) for text in self.texts]
        quick = Quickner(documents=documents, entities=entities)
        quick.process()
        for label, expected in (("PL", 5), ("ORG", 3), ("PERSON", 3)):
            with self.subTest(label=label):
                found = quick.find_documents_by_label(label)
                self.assertEqual(len(found), expected)
        self._test_correct(quick.documents)

    def test_setting_documents(self):
        """Documents can be cleared, reassigned and processed again."""
        entities = [Entity(*(entity)) for entity in self.entities]
        documents = [Document(text) for text in self.texts]
        quick = Quickner(documents=documents, entities=entities)
        quick.process()
        quick.documents = []
        self.assertEqual(len(quick.documents), 0)
        quick.documents = documents
        quick.process()
        self.assertEqual(len(quick.documents), 5)
        self._assert_entity_document_counts(quick)

    def test_find_document_by_entity(self):
        """find_documents_by_entity returns the expected number of documents."""
        entities = [Entity(*(entity)) for entity in self.entities]
        documents = [Document(text) for text in self.texts]
        quick = Quickner(documents=documents, entities=entities)
        quick.process()
        self._assert_entity_document_counts(quick)

    def test_get_spacy_generator(self):
        """spacy() yields the expected number of chunks for each chunk size."""
        entities = [Entity(*(entity)) for entity in self.entities]
        documents = [Document(text) for text in self.texts]
        quick = Quickner(documents=documents, entities=entities)
        quick.process()
        # An empty kwargs dict exercises the default call, quick.spacy().
        cases = (
            ({}, 1),
            ({"chunks": 2}, 3),
            ({"chunks": 3}, 2),
            ({"chunks": 5}, 1),
        )
        for kwargs, expected in cases:
            with self.subTest(**kwargs):
                generator = quick.spacy(**kwargs)
                self.assertEqual(len(list(generator)), expected)

    def test_single_document_annotation(self):
        """A single Document can be annotated with or without case sensitivity."""
        rust = Document.from_string("rust is made by Mozilla")
        entities = [Entity("Rust", "PL"), Entity("Mozilla", "ORG")]
        rust.annotate(entities, case_sensitive=True)
        self.assertEqual(len(rust.label), 1)
        rust.annotate(entities, case_sensitive=False)
        self.assertEqual(len(rust.label), 2)
        self.assertEqual(rust.label[0][2], "ORG")
        self.assertEqual(rust.label[1][2], "PL")

    def test_character_level_slicing(self):
        """Label offsets slice the Python string even after non-ASCII text."""
        entity = Entity("Python", "PL")
        document = Document("Indizes auf Zeichenebene anstelle von Indizes auf Byteebene, um Python-Slicing zu unterstützen")
        document.annotate([entity], case_sensitive=False)
        # Assert the count first so a missing label fails as an assertion,
        # not as an IndexError on label[0].
        self.assertEqual(len(document.label), 1)
        start, end = document.label[0][0], document.label[0][1]
        self.assertEqual(document.text[start:end], "Python")


if __name__ == "__main__":
    unittest.main()
+0
-1

@@ -8,3 +8,2 @@ /target

*.py[cod]
*.py
*.lock

@@ -11,0 +10,0 @@

@@ -604,3 +604,3 @@ # This file is automatically @generated by Cargo.

name = "quickner"
version = "0.0.1-alpha.17"
version = "0.0.1-alpha.18"
dependencies = [

@@ -615,3 +615,3 @@ "numpy",

name = "quickner-core"
version = "0.0.1-alpha.17"
version = "0.0.1-alpha.18"
dependencies = [

@@ -728,5 +728,5 @@ "aho-corasick",

name = "serde"
version = "1.0.154"
version = "1.0.155"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8cdd151213925e7f1ab45a9bbfb129316bd00799784b174b7cc7bcd16961c49e"
checksum = "71f2b4817415c6d4210bfe1c7bfcf4801b2d904cb4d0e1a8fdb651013c9e86b8"
dependencies = [

@@ -738,5 +738,5 @@ "serde_derive",

name = "serde_derive"
version = "1.0.154"
version = "1.0.155"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fc80d722935453bcafdc2c9a73cd6fac4dc1938f0346035d84bf99fa9e33217"
checksum = "d071a94a3fac4aff69d023a7f411e33f40f3483f8c5190b1953822b6b76d7630"
dependencies = [

@@ -743,0 +743,0 @@ "proc-macro2",

[package]
name = "quickner"
version = "0.0.1-alpha.17"
version = "0.0.1-alpha.18"
edition = "2021"

@@ -5,0 +5,0 @@ authors = ["Omar MHAIMDAT"]

[package]
name = "quickner-core"
version = "0.0.1-alpha.17"
version = "0.0.1-alpha.18"
edition = "2021"

@@ -5,0 +5,0 @@ authors = ["Omar MHAIMDAT"]

use crate::{
config::{Config, Filters},
models::Text,
utils::get_progress_bar,
utils::{char_to_byte, get_progress_bar, is_valid_utf8},
SpacyEntity,

@@ -123,2 +123,6 @@ };

) -> Option<Vec<(usize, usize, String)>> {
if !is_valid_utf8(text.as_str()) {
warn!("Skipping invalid utf8 text: \"{}\"", text);
return None;
}
let mut annotations = Vec::new();

@@ -555,4 +559,4 @@ for mat in aho_corasick.find_iter(&text) {

for label in &annotation.label {
// Extract the entity name using indexes
let name = annotation.text[label.0..label.1].to_string();
let indices = char_to_byte(annotation.text.clone(), label.0, label.1);
let name = annotation.text[indices.0..indices.1].to_string();
let entity = Entity {

@@ -680,3 +684,5 @@ name: name.to_string().to_lowercase(),

for label in &document.label {
let name = document.text[label.0..label.1].to_string();
// Translate the indices to byte indices
let indices = char_to_byte(document.text.clone(), label.0, label.1);
let name = document.text[indices.0..indices.1].to_string();
let entry = index.entry(name.to_lowercase()).or_insert(Vec::new());

@@ -701,3 +707,4 @@ entry.push(document.id.clone());

for label in &document.label {
let name = document.text[label.0..label.1].to_string();
let indices = char_to_byte(document.text.clone(), label.0, label.1);
let name = document.text[indices.0..indices.1].to_string();
let entry = self

@@ -723,3 +730,4 @@ .documents_entities_index

for label in &document.label {
let name = document.text[label.0..label.1].to_string();
let indices = char_to_byte(document.text.clone(), label.0, label.1);
let name = document.text[indices.0..indices.1].to_string();
let entry = self

@@ -726,0 +734,0 @@ .documents_entities_index

@@ -8,3 +8,3 @@ // quickner

//
use std::collections::HashSet;
use std::{collections::HashSet, str};

@@ -104,1 +104,24 @@ use indicatif::{ProgressBar, ProgressStyle};

}
/// Reports whether the bytes of `text` form valid UTF-8.
///
/// The bytes are re-validated with `str::from_utf8`. Note that a `&str` is
/// already required by the language to hold valid UTF-8, so this returns
/// `true` for every safely constructed argument; it can only return `false`
/// for a string built by bypassing that guarantee with `unsafe` code.
pub(crate) fn is_valid_utf8(text: &str) -> bool {
    str::from_utf8(text.as_bytes()).is_ok()
}
/// Converts a character-indexed span into byte offsets into `text`.
///
/// `start` and `end` count Unicode scalar values (`char`s). The returned pair
/// holds the byte offsets at which those characters begin, so it can be used
/// to slice `text` directly.
///
/// * An index at or beyond the number of characters maps to `text.len()`.
///   This applies to `start` as well as `end`, so an empty span at the end of
///   the text stays empty instead of expanding to the whole text.
/// * The returned start never exceeds the returned end; an inverted span
///   collapses to an empty range at the end offset, so slicing with the
///   result cannot panic.
///
/// `text` is taken by value to keep the existing call sites compiling; it is
/// only read.
pub(crate) fn char_to_byte(text: String, start: usize, end: usize) -> (usize, usize) {
    // Byte offset where the `char_index`-th character starts, or the total
    // byte length when the index is past the last character.
    let byte_offset = |char_index: usize| {
        text.char_indices()
            .nth(char_index)
            .map_or(text.len(), |(offset, _)| offset)
    };
    let end = byte_offset(end);
    let start = byte_offset(start).min(end);
    (start, end)
}
Metadata-Version: 2.1
Name: quickner
Version: 0.0.1a17
Version: 0.0.1a18
Classifier: Programming Language :: Rust

@@ -5,0 +5,0 @@ Classifier: Programming Language :: Python :: Implementation :: CPython