Latest Threat Research:SANDWORM_MODE: Shai-Hulud-Style npm Worm Hijacks CI Workflows and Poisons AI Toolchains.Details
Socket
Book a DemoInstallSign in
Socket

quickner

Package Overview
Dependencies
Maintainers
1
Versions
20
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

quickner - npm Package Compare versions

Comparing version
0.0.1a11
to
0.0.1a12
+2
-2
Cargo.lock

@@ -548,3 +548,3 @@ # This file is automatically @generated by Cargo.

name = "quickner"
version = "0.0.1-alpha.11"
version = "0.0.1-alpha.12"
dependencies = [

@@ -558,3 +558,3 @@ "pyo3",

name = "quickner-core"
version = "0.0.1-alpha.11"
version = "0.0.1-alpha.12"
dependencies = [

@@ -561,0 +561,0 @@ "clap",

[package]
name = "quickner"
version = "0.0.1-alpha.11"
version = "0.0.1-alpha.12"
edition = "2021"

@@ -5,0 +5,0 @@ authors = ["Omar MHAIMDAT"]

[package]
name = "quickner-core"
version = "0.0.1-alpha.11"
version = "0.0.1-alpha.12"
edition = "2021"

@@ -5,0 +5,0 @@ authors = ["Omar MHAIMDAT"]

@@ -31,5 +31,20 @@ use crate::{

pub documents_hash: HashMap<String, Document>,
pub documents_index: HashMap<String, Vec<String>>,
pub documents_label_index: HashMap<String, Vec<String>>,
pub documents_entities_index: HashMap<String, Vec<String>>,
}
impl Default for Quickner {
fn default() -> Self {
Self {
config: Config::default(),
config_file: "./config.toml".to_string(),
documents: Vec::new(),
entities: Vec::new(),
documents_hash: HashMap::new(),
documents_label_index: HashMap::new(),
documents_entities_index: HashMap::new(),
}
}
}
impl Quickner {

@@ -141,21 +156,4 @@ /// Find the index of the entities in the text

.collect();
// Create LABEL dictionary {"label": [doc_id, doc_id, ...]}
self.documents_index = self
.documents
.iter()
.map(|document| {
let mut index: HashMap<String, Vec<String>> = HashMap::new();
for label in &document.label {
let entry = index.entry(label.2.clone()).or_insert(Vec::new());
entry.push(document.id.clone());
}
index
})
.fold(HashMap::new(), |mut acc, x| {
for (key, value) in x {
let entry = acc.entry(key).or_insert(Vec::new());
entry.extend(value);
}
acc
});
self.build_label_index();
self.build_entity_index();
pb.finish();

@@ -193,29 +191,24 @@ }

);
return Quickner {
config: Config::default(),
config_file,
documents: vec![],
entities: vec![],
documents_hash: HashMap::new(),
documents_index: HashMap::new(),
};
return Quickner::default();
}
let config = Config::from_file(config_file.as_str());
Quickner {
config,
config_file,
documents: vec![],
entities: vec![],
documents_hash: HashMap::new(),
documents_index: HashMap::new(),
}
let mut quick = Quickner::default();
quick.config = config;
quick.config_file = config_file;
quick
}
pub fn add_document(&mut self, document: Document) {
let document = self.documents_hash.get(&document.id);
if document.is_some() {
warn!("Document {} already exists", document.unwrap().id);
return;
{
let document = self.documents_hash.get(&document.id);
if document.is_some() {
warn!("Document {} already exists", document.unwrap().id);
return;
}
}
self.documents.push(document.unwrap().clone());
self.documents.push(document.clone());
self.documents_hash
.insert(document.id.clone(), document.clone());
self.add_to_entity_index(&document);
self.add_to_label_index(&document);
}

@@ -225,3 +218,7 @@

let document = Document::from_string(text.to_string());
self.documents.push(document);
self.documents.push(document.clone());
self.documents_hash
.insert(document.id.clone(), document.clone());
self.add_to_entity_index(&document);
self.add_to_label_index(&document);
}

@@ -485,30 +482,5 @@

.collect();
let entities = entities
.into_iter()
.collect::<HashSet<Entity>>()
.into_iter()
.collect::<Vec<Entity>>();
let documents_hash = documents
.iter()
.map(|document| (document.id.clone(), document.clone()))
.collect();
// Create LABEL dictionary {"label": [doc_id, doc_id, ...]}
let documents_index: HashMap<String, Vec<String>> = documents
.iter()
.map(|document| {
let mut index: HashMap<String, Vec<String>> = HashMap::new();
for label in &document.label {
let entry = index.entry(label.2.clone()).or_insert(Vec::new());
entry.push(document.id.clone());
}
index
})
.fold(HashMap::new(), |mut acc, x| {
for (key, value) in x {
let entry = acc.entry(key).or_insert(Vec::new());
entry.extend(value);
}
acc
});
Quickner {
let entities = Quickner::unique_entities(entities);
let documents_hash = Quickner::document_hash(&documents);
let mut quick = Quickner {
config: Config::default(),

@@ -519,4 +491,8 @@ config_file: String::from(""),

documents_hash,
documents_index,
}
documents_label_index: HashMap::new(),
documents_entities_index: HashMap::new(),
};
quick.build_entity_index();
quick.build_label_index();
quick
}

@@ -565,29 +541,5 @@

.collect();
let entities = entities
.into_iter()
.collect::<HashSet<Entity>>()
.into_iter()
.collect::<Vec<Entity>>();
let documents_hash = documents
.iter()
.map(|document| (document.id.clone(), document.clone()))
.collect();
let documents_index: HashMap<String, Vec<String>> = documents
.iter()
.map(|document| {
let mut index: HashMap<String, Vec<String>> = HashMap::new();
for label in &document.label {
let entry = index.entry(label.2.clone()).or_insert(Vec::new());
entry.push(document.id.clone());
}
index
})
.fold(HashMap::new(), |mut acc, x| {
for (key, value) in x {
let entry = acc.entry(key).or_insert(Vec::new());
entry.extend(value);
}
acc
});
Quickner {
let entities = Quickner::unique_entities(entities);
let documents_hash = Quickner::document_hash(&documents);
let mut quick = Quickner {
config: Config::default(),

@@ -598,4 +550,8 @@ config_file: String::from(""),

documents_hash,
documents_index,
}
documents_label_index: HashMap::new(),
documents_entities_index: HashMap::new(),
};
quick.build_entity_index();
quick.build_label_index();
quick
}

@@ -626,1 +582,83 @@

}
/// Maintenance helpers for the lookup tables kept alongside the documents:
/// the label index (`label -> [document id]`) and the entity index
/// (`lowercased entity text -> [document id]`).
impl Quickner {
    /// Returns the lowercased text of the span `start..end` of `text`, which is
    /// the key used by the entity index.
    ///
    /// Returns `None` when the span is not a valid slice of `text` (out of
    /// bounds, reversed, or not on a UTF-8 character boundary) so that a
    /// malformed annotation is skipped instead of panicking.
    fn entity_index_key(text: &str, start: usize, end: usize) -> Option<String> {
        text.get(start..end).map(str::to_lowercase)
    }

    /// Rebuilds `documents_label_index` from scratch out of `self.documents`.
    fn build_label_index(&mut self) {
        let mut index: HashMap<String, Vec<String>> = HashMap::new();
        for document in &self.documents {
            for label in &document.label {
                index
                    .entry(label.2.clone())
                    .or_default()
                    .push(document.id.clone());
            }
        }
        self.documents_label_index = index;
    }

    /// Rebuilds `documents_entities_index` from scratch out of `self.documents`.
    ///
    /// Keys are the lowercased annotated spans; annotations whose span is not a
    /// valid slice of the document text are ignored.
    fn build_entity_index(&mut self) {
        let mut index: HashMap<String, Vec<String>> = HashMap::new();
        for document in &self.documents {
            for label in &document.label {
                if let Some(key) = Self::entity_index_key(&document.text, label.0, label.1) {
                    index.entry(key).or_default().push(document.id.clone());
                }
            }
        }
        self.documents_entities_index = index;
    }

    /// Registers the id of `document` under each of its labels.
    fn add_to_label_index(&mut self, document: &Document) {
        for label in &document.label {
            self.documents_label_index
                .entry(label.2.clone())
                .or_default()
                .push(document.id.clone());
        }
    }

    /// Registers the id of `document` under each of its lowercased entity spans.
    fn add_to_entity_index(&mut self, document: &Document) {
        for label in &document.label {
            if let Some(key) = Self::entity_index_key(&document.text, label.0, label.1) {
                self.documents_entities_index
                    .entry(key)
                    .or_default()
                    .push(document.id.clone());
            }
        }
    }

    /// Removes the id of `document` from the label index.
    ///
    /// Labels that are not present in the index are left alone; a removal
    /// never creates new (empty) entries.
    fn _remove_from_label_index(&mut self, document: &Document) {
        for label in &document.label {
            if let Some(ids) = self.documents_label_index.get_mut(&label.2) {
                ids.retain(|id| id != &document.id);
            }
        }
    }

    /// Removes the id of `document` from the entity index.
    ///
    /// Entities that are not present in the index are left alone; a removal
    /// never creates new (empty) entries.
    fn _remove_from_entity_index(&mut self, document: &Document) {
        for label in &document.label {
            if let Some(key) = Self::entity_index_key(&document.text, label.0, label.1) {
                if let Some(ids) = self.documents_entities_index.get_mut(&key) {
                    ids.retain(|id| id != &document.id);
                }
            }
        }
    }

    /// Deduplicates `entities`.
    ///
    /// The values pass through a `HashSet`, so the order of the returned
    /// vector is unspecified.
    fn unique_entities(entities: Vec<Entity>) -> Vec<Entity> {
        entities
            .into_iter()
            .collect::<HashSet<Entity>>()
            .into_iter()
            .collect()
    }

    /// Builds the `document id -> document` lookup table (documents are cloned).
    fn document_hash(documents: &[Document]) -> HashMap<String, Document> {
        documents
            .iter()
            .map(|document| (document.id.clone(), document.clone()))
            .collect()
    }
}
+63
-61
Metadata-Version: 2.1
Name: quickner
Version: 0.0.1a11
Version: 0.0.1a12
Classifier: Programming Language :: Rust

@@ -17,5 +17,5 @@ Classifier: Programming Language :: Python :: Implementation :: CPython

<div align="center">
<h1>Quickner ⚡ </h1>
<h1 style="font-size:40px;">Quickner ⚡ </h1>
<p>
<strong>A simple, fast, and easy to use NER annotator for Python</strong>
<strong style="font-size:20px;">A simple, fast, and easy to use NER annotator for Python</strong>
</p>

@@ -40,3 +40,2 @@ <p>

Quickner is a new tool to quickly annotate texts for NER (Named Entity Recognition). It is written in Rust and accessible through a Python API.

@@ -80,8 +79,8 @@

# Create documents
doc_1 = Document("rust is made by Mozilla")
doc_2 = Document("Python was created by Guido van Rossum")
doc_3 = Document("Java was created by James Gosling")
rust = Document("rust is made by Mozilla")
python = Document("Python was created by Guido van Rossum")
java = Document("Java was created by James Gosling")
# Documents can be added to a list
documents = [doc_1, doc_2, doc_3]
documents = [rust, python, java]

@@ -91,3 +90,8 @@ # Initialize the annotator

quick = Quickner(documents=documents)
>>> Entities: 0 | Documents: 3 | Annotations:
quick
>>> Entities: 0 | Documents: 3 | Annotations:
>>> quick.documents
[Document(id="87e03d58b1ba4d72", text=rust is made by Mozilla, label=[]), Document(id="f1da5d23ef88f3dc", text=Python was created by Guido van Rossum, label=[]), Document(id="e4324f9818e7e598", text=Java was created by James Gosling, label=[])]
>>> quick.entities
[]
```

@@ -134,32 +138,23 @@

### Find documents by label
### Find documents by label or entity
```python
from quickner import Quickner, Document, Entity
When you have annotated your documents, you can use the `find_documents_by_label` and `find_documents_by_entity` methods to find documents by label or entity.
# Create documents
doc_1 = Document("rust is made by Mozilla")
doc_2 = Document("Python was created by Guido van Rossum")
doc_3 = Document("Java was created by James Gosling")
Both methods return a list of documents, and are not case sensitive.
# Create entities
rust = Entity("Rust", "PL")
mozilla = Entity("Mozilla", "ORG")
python = Entity("Python", "PL")
guido = Entity("Guido van Rossum", "PERSON")
java = Entity("Java", "PL")
james = Entity("James Gosling", "PERSON")
Example:
# Documents and entities can be added to a list
documents = [doc_1, doc_2, doc_3]
entities = [rust, mozilla, python, guido, java, james]
```python
# Initialize the annotator
quick = Quickner(documents=documents, entities=entities)
quick.process()
# Find documents by label
>>> quick.find_documents_by_label("PERSON")
[Document(id=f1da5d23ef88f3dc, text=Python was created by Guido van Rossum, label=[(0, 6, PL), (22, 38, PERSON)]), Document(id=e4324f9818e7e598, text=Java was created by James Gosling, label=[(0, 4, PL), (20, 33, PERSON)])]
>>> quick
Entities: 6 | Documents: 3 | Annotations: PERSON: 2, PL: 3, ORG: 1
>>> quick.find_documents("PERSON")
[Document(id=f1da5d23ef88f3dc, text=Python was created by Guido van Rossum, label=[(0, 6, PL), (22, 38, PERSON)]), Document(id=e4324f9818e7e598, text=Java was created by James Gosling, label=[(0, 4, PL), (20, 33, PERSON)])]
# Find documents by entity
>>> quick.find_documents_by_entity("Guido van Rossum")
[Document(id=f1da5d23ef88f3dc, text=Python was created by Guido van Rossum, label=[(0, 6, PL), (22, 38, PERSON)])]
>>> quick.find_documents_by_entity("rust")
[Document(id=87e03d58b1ba4d72, text=rust is made by Mozilla, label=[(0, 4, PL), (16, 23, ORG)])]
>>> quick.find_documents_by_entity("Chris Lattner")
[Document(id=3b0b3b5b0b5b0b5b, text=Swift was created by Chris Lattner and Apple, label=[(0, 5, PL), (21, 35, PERSON), (40, 45, ORG)])]
```

@@ -169,31 +164,9 @@

```python
# Create documents from texts
texts = (
"rust is made by Mozilla",
"Python was created by Guido van Rossum",
"Java was created by James Gosling at Sun Microsystems",
"Swift was created by Chris Lattner and Apple",
)
documents = [Document(text) for text in texts]
You can use the `spacy` method to get a spacy compatible generator object.
# Create entities
entities = (
("Rust", "PL"),
("Python", "PL"),
("Java", "PL"),
("Swift", "PL"),
("Mozilla", "ORG"),
("Apple", "ORG"),
("Sun Microsystems", "ORG"),
("Guido van Rossum", "PERSON"),
("James Gosling", "PERSON"),
("Chris Lattner", "PERSON"),
)
entities = [Entity(*(entity)) for entity in entities]
The generator object can be used to feed a spacy model with the annotated data, but you still need to convert the data into DocBin format.
# Initialize the annotator
quick = Quickner(documents=documents, entities=entities)
quick.process()
Example:
```python
# Get a spacy compatible generator object

@@ -213,2 +186,8 @@ >>> quick.spacy()

You can also annotate a single document with a list of entities.
This is useful when you want to annotate a document with a list of entities that is not in the list of entities of the Quickner object.
Example:
```python

@@ -218,9 +197,16 @@ from quickner import Document, Entity

# Create a document from a string
# Method 1
rust = Document.from_string("rust is made by Mozilla")
# Method 2
rust = Document("rust is made by Mozilla")
# Create a list of entities
entities = [Entity("Rust", "PL"), Entity("Mozilla", "ORG")]
# Annotate the document with the entities, case_sensitive is set to False by default
rust.annotate(entities, case_sensitive=True)
>>> rust.annotate(entities, case_sensitive=True)
>>> rust
Document(id=87e03d58b1ba4d72, text=rust is made by Mozilla, label=[(16, 23, ORG)])
Document(id="87e03d58b1ba4d72", text=rust is made by Mozilla, label=[(16, 23, ORG)])
>>> rust.annotate(entities, case_sensitive=False)
>>> rust
Document(id="87e03d58b1ba4d72", text=rust is made by Mozilla, label=[(16, 23, ORG), (0, 4, PL)])
```

@@ -301,2 +287,18 @@

## Features Roadmap and TODO
- [x] Add support for spaCy format
- [x] Add support for brat format
- [x] Add support for conll format
- [x] Add support for jsonl format
- [x] Add support for loading annotations from a json spaCy file
- [x] Add support for loading annotations from a jsonl file
- [x] Find documents with a specific entity/entities and return the documents
- [ ] Add support for loading annotations from a brat file
- [ ] Substring search for entities in the text (case sensitive and insensitive)
- [ ] Partial match for entities, e.g. "Rust" will match "Rustlang"
- [ ] Pattern/regex based entities, e.g. "Rustlang" will match "Rustlang 1.0"
- [ ] Fuzzy match for entities with Levenshtein distance, e.g. "Rustlang" will match "Rust"
- [ ] Add support for jupyter notebook
## License

@@ -303,0 +305,0 @@

@@ -336,3 +336,3 @@ from ast import Dict

def find_documents(self, label: str) -> List[Document]:
def find_documents_by_label(self, label: str) -> List[Document]:
"""

@@ -348,1 +348,15 @@ Find documents with a specific label.

...
def find_documents_by_entity(self, name: str) -> List[Document]:
"""
Find documents with a specific entity.
>>> quickner.find_documents_by_entity("John")
[Document(id="f9c68f53ee5319c8", text=John is a person., [[0, 4, "PERSON"]])]
Parameters:
name (str): Name of the entity to find.
Returns:
List[Document]: List of documents with the entity.
"""
...
+62
-60
<div align="center">
<h1>Quickner ⚡ </h1>
<h1 style="font-size:40px;">Quickner ⚡ </h1>
<p>
<strong>A simple, fast, and easy to use NER annotator for Python</strong>
<strong style="font-size:20px;">A simple, fast, and easy to use NER annotator for Python</strong>
</p>

@@ -25,3 +25,2 @@ <p>

Quickner is a new tool to quickly annotate texts for NER (Named Entity Recognition). It is written in Rust and accessible through a Python API.

@@ -65,8 +64,8 @@

# Create documents
doc_1 = Document("rust is made by Mozilla")
doc_2 = Document("Python was created by Guido van Rossum")
doc_3 = Document("Java was created by James Gosling")
rust = Document("rust is made by Mozilla")
python = Document("Python was created by Guido van Rossum")
java = Document("Java was created by James Gosling")
# Documents can be added to a list
documents = [doc_1, doc_2, doc_3]
documents = [rust, python, java]

@@ -76,3 +75,8 @@ # Initialize the annotator

quick = Quickner(documents=documents)
>>> Entities: 0 | Documents: 3 | Annotations:
quick
>>> Entities: 0 | Documents: 3 | Annotations:
>>> quick.documents
[Document(id="87e03d58b1ba4d72", text=rust is made by Mozilla, label=[]), Document(id="f1da5d23ef88f3dc", text=Python was created by Guido van Rossum, label=[]), Document(id="e4324f9818e7e598", text=Java was created by James Gosling, label=[])]
>>> quick.entities
[]
```

@@ -119,32 +123,23 @@

### Find documents by label
### Find documents by label or entity
```python
from quickner import Quickner, Document, Entity
When you have annotated your documents, you can use the `find_documents_by_label` and `find_documents_by_entity` methods to find documents by label or entity.
# Create documents
doc_1 = Document("rust is made by Mozilla")
doc_2 = Document("Python was created by Guido van Rossum")
doc_3 = Document("Java was created by James Gosling")
Both methods return a list of documents, and are not case sensitive.
# Create entities
rust = Entity("Rust", "PL")
mozilla = Entity("Mozilla", "ORG")
python = Entity("Python", "PL")
guido = Entity("Guido van Rossum", "PERSON")
java = Entity("Java", "PL")
james = Entity("James Gosling", "PERSON")
Example:
# Documents and entities can be added to a list
documents = [doc_1, doc_2, doc_3]
entities = [rust, mozilla, python, guido, java, james]
```python
# Initialize the annotator
quick = Quickner(documents=documents, entities=entities)
quick.process()
# Find documents by label
>>> quick.find_documents_by_label("PERSON")
[Document(id=f1da5d23ef88f3dc, text=Python was created by Guido van Rossum, label=[(0, 6, PL), (22, 38, PERSON)]), Document(id=e4324f9818e7e598, text=Java was created by James Gosling, label=[(0, 4, PL), (20, 33, PERSON)])]
>>> quick
Entities: 6 | Documents: 3 | Annotations: PERSON: 2, PL: 3, ORG: 1
>>> quick.find_documents("PERSON")
[Document(id=f1da5d23ef88f3dc, text=Python was created by Guido van Rossum, label=[(0, 6, PL), (22, 38, PERSON)]), Document(id=e4324f9818e7e598, text=Java was created by James Gosling, label=[(0, 4, PL), (20, 33, PERSON)])]
# Find documents by entity
>>> quick.find_documents_by_entity("Guido van Rossum")
[Document(id=f1da5d23ef88f3dc, text=Python was created by Guido van Rossum, label=[(0, 6, PL), (22, 38, PERSON)])]
>>> quick.find_documents_by_entity("rust")
[Document(id=87e03d58b1ba4d72, text=rust is made by Mozilla, label=[(0, 4, PL), (16, 23, ORG)])]
>>> quick.find_documents_by_entity("Chris Lattner")
[Document(id=3b0b3b5b0b5b0b5b, text=Swift was created by Chris Lattner and Apple, label=[(0, 5, PL), (21, 35, PERSON), (40, 45, ORG)])]
```

@@ -154,31 +149,9 @@

```python
# Create documents from texts
texts = (
"rust is made by Mozilla",
"Python was created by Guido van Rossum",
"Java was created by James Gosling at Sun Microsystems",
"Swift was created by Chris Lattner and Apple",
)
documents = [Document(text) for text in texts]
You can use the `spacy` method to get a spacy compatible generator object.
# Create entities
entities = (
("Rust", "PL"),
("Python", "PL"),
("Java", "PL"),
("Swift", "PL"),
("Mozilla", "ORG"),
("Apple", "ORG"),
("Sun Microsystems", "ORG"),
("Guido van Rossum", "PERSON"),
("James Gosling", "PERSON"),
("Chris Lattner", "PERSON"),
)
entities = [Entity(*(entity)) for entity in entities]
The generator object can be used to feed a spacy model with the annotated data, but you still need to convert the data into DocBin format.
# Initialize the annotator
quick = Quickner(documents=documents, entities=entities)
quick.process()
Example:
```python
# Get a spacy compatible generator object

@@ -198,2 +171,8 @@ >>> quick.spacy()

You can also annotate a single document with a list of entities.
This is useful when you want to annotate a document with a list of entities that is not in the list of entities of the Quickner object.
Example:
```python

@@ -203,9 +182,16 @@ from quickner import Document, Entity

# Create a document from a string
# Method 1
rust = Document.from_string("rust is made by Mozilla")
# Method 2
rust = Document("rust is made by Mozilla")
# Create a list of entities
entities = [Entity("Rust", "PL"), Entity("Mozilla", "ORG")]
# Annotate the document with the entities, case_sensitive is set to False by default
rust.annotate(entities, case_sensitive=True)
>>> rust.annotate(entities, case_sensitive=True)
>>> rust
Document(id=87e03d58b1ba4d72, text=rust is made by Mozilla, label=[(16, 23, ORG)])
Document(id="87e03d58b1ba4d72", text=rust is made by Mozilla, label=[(16, 23, ORG)])
>>> rust.annotate(entities, case_sensitive=False)
>>> rust
Document(id="87e03d58b1ba4d72", text=rust is made by Mozilla, label=[(16, 23, ORG), (0, 4, PL)])
```

@@ -286,2 +272,18 @@

## Features Roadmap and TODO
- [x] Add support for spaCy format
- [x] Add support for brat format
- [x] Add support for conll format
- [x] Add support for jsonl format
- [x] Add support for loading annotations from a json spaCy file
- [x] Add support for loading annotations from a jsonl file
- [x] Find documents with a specific entity/entities and return the documents
- [ ] Add support for loading annotations from a brat file
- [ ] Substring search for entities in the text (case sensitive and insensitive)
- [ ] Partial match for entities, e.g. "Rust" will match "Rustlang"
- [ ] Pattern/regex based entities, e.g. "Rustlang" will match "Rustlang 1.0"
- [ ] Fuzzy match for entities with Levenshtein distance, e.g. "Rustlang" will match "Rust"
- [ ] Add support for jupyter notebook
## License

@@ -288,0 +290,0 @@

@@ -296,5 +296,5 @@ use std::collections::HashMap;

#[pyo3(signature = (label))]
pub fn find_documents(&self, label: &str) -> Vec<PyDocument> {
pub fn find_documents_by_label(&self, label: &str) -> Vec<PyDocument> {
let documents_index = match &self.quickner {
quickner => quickner.documents_index.to_owned(),
quickner => quickner.documents_label_index.to_owned(),
};

@@ -320,2 +320,28 @@ let documents_ids = match documents_index.get(label) {

/// Find every document indexed under the entity `name`.
///
/// The lookup is case-insensitive: `name` is lowercased before it is looked
/// up in `documents_entities_index`.
///
/// Returns an empty list when no document is indexed under the entity. Ids
/// found in the index but absent from `documents_hash` are skipped rather
/// than causing a panic.
#[pyo3(signature = (name))]
pub fn find_documents_by_entity(&self, name: &str) -> Vec<PyDocument> {
    let quickner = &self.quickner;
    let key = name.to_lowercase();
    // Borrow the index in place; cloning the whole map for one lookup is wasteful.
    quickner
        .documents_entities_index
        .get(key.as_str())
        .map(|document_ids| {
            document_ids
                .iter()
                .filter_map(|id| quickner.documents_hash.get(id))
                .map(|document| PyDocument::from(document.to_owned()))
                .collect()
        })
        .unwrap_or_default()
}
#[pyo3(signature = (chunks = None))]

@@ -322,0 +348,0 @@ pub fn spacy(&self, chunks: Option<usize>) -> PySpacyGenerator {