quickner
Advanced tools
+2
-2
@@ -548,3 +548,3 @@ # This file is automatically @generated by Cargo. | ||
| name = "quickner" | ||
| version = "0.0.1-alpha.11" | ||
| version = "0.0.1-alpha.12" | ||
| dependencies = [ | ||
@@ -558,3 +558,3 @@ "pyo3", | ||
| name = "quickner-core" | ||
| version = "0.0.1-alpha.11" | ||
| version = "0.0.1-alpha.12" | ||
| dependencies = [ | ||
@@ -561,0 +561,0 @@ "clap", |
+1
-1
| [package] | ||
| name = "quickner" | ||
| version = "0.0.1-alpha.11" | ||
| version = "0.0.1-alpha.12" | ||
| edition = "2021" | ||
@@ -5,0 +5,0 @@ authors = ["Omar MHAIMDAT"] |
| [package] | ||
| name = "quickner-core" | ||
| version = "0.0.1-alpha.11" | ||
| version = "0.0.1-alpha.12" | ||
| edition = "2021" | ||
@@ -5,0 +5,0 @@ authors = ["Omar MHAIMDAT"] |
@@ -31,5 +31,20 @@ use crate::{ | ||
| pub documents_hash: HashMap<String, Document>, | ||
| pub documents_index: HashMap<String, Vec<String>>, | ||
| pub documents_label_index: HashMap<String, Vec<String>>, | ||
| pub documents_entities_index: HashMap<String, Vec<String>>, | ||
| } | ||
| impl Default for Quickner { | ||
| fn default() -> Self { | ||
| Self { | ||
| config: Config::default(), | ||
| config_file: "./config.toml".to_string(), | ||
| documents: Vec::new(), | ||
| entities: Vec::new(), | ||
| documents_hash: HashMap::new(), | ||
| documents_label_index: HashMap::new(), | ||
| documents_entities_index: HashMap::new(), | ||
| } | ||
| } | ||
| } | ||
| impl Quickner { | ||
@@ -141,21 +156,4 @@ /// Find the index of the entities in the text | ||
| .collect(); | ||
| // Create LABEL dictionary {"label": [doc_id, doc_id, ...]} | ||
| self.documents_index = self | ||
| .documents | ||
| .iter() | ||
| .map(|document| { | ||
| let mut index: HashMap<String, Vec<String>> = HashMap::new(); | ||
| for label in &document.label { | ||
| let entry = index.entry(label.2.clone()).or_insert(Vec::new()); | ||
| entry.push(document.id.clone()); | ||
| } | ||
| index | ||
| }) | ||
| .fold(HashMap::new(), |mut acc, x| { | ||
| for (key, value) in x { | ||
| let entry = acc.entry(key).or_insert(Vec::new()); | ||
| entry.extend(value); | ||
| } | ||
| acc | ||
| }); | ||
| self.build_label_index(); | ||
| self.build_entity_index(); | ||
| pb.finish(); | ||
@@ -193,29 +191,24 @@ } | ||
| ); | ||
| return Quickner { | ||
| config: Config::default(), | ||
| config_file, | ||
| documents: vec![], | ||
| entities: vec![], | ||
| documents_hash: HashMap::new(), | ||
| documents_index: HashMap::new(), | ||
| }; | ||
| return Quickner::default(); | ||
| } | ||
| let config = Config::from_file(config_file.as_str()); | ||
| Quickner { | ||
| config, | ||
| config_file, | ||
| documents: vec![], | ||
| entities: vec![], | ||
| documents_hash: HashMap::new(), | ||
| documents_index: HashMap::new(), | ||
| } | ||
| let mut quick = Quickner::default(); | ||
| quick.config = config; | ||
| quick.config_file = config_file; | ||
| quick | ||
| } | ||
| pub fn add_document(&mut self, document: Document) { | ||
| let document = self.documents_hash.get(&document.id); | ||
| if document.is_some() { | ||
| warn!("Document {} already exists", document.unwrap().id); | ||
| return; | ||
| { | ||
| let document = self.documents_hash.get(&document.id); | ||
| if document.is_some() { | ||
| warn!("Document {} already exists", document.unwrap().id); | ||
| return; | ||
| } | ||
| } | ||
| self.documents.push(document.unwrap().clone()); | ||
| self.documents.push(document.clone()); | ||
| self.documents_hash | ||
| .insert(document.id.clone(), document.clone()); | ||
| self.add_to_entity_index(&document); | ||
| self.add_to_label_index(&document); | ||
| } | ||
@@ -225,3 +218,7 @@ | ||
| let document = Document::from_string(text.to_string()); | ||
| self.documents.push(document); | ||
| self.documents.push(document.clone()); | ||
| self.documents_hash | ||
| .insert(document.id.clone(), document.clone()); | ||
| self.add_to_entity_index(&document); | ||
| self.add_to_label_index(&document); | ||
| } | ||
@@ -485,30 +482,5 @@ | ||
| .collect(); | ||
| let entities = entities | ||
| .into_iter() | ||
| .collect::<HashSet<Entity>>() | ||
| .into_iter() | ||
| .collect::<Vec<Entity>>(); | ||
| let documents_hash = documents | ||
| .iter() | ||
| .map(|document| (document.id.clone(), document.clone())) | ||
| .collect(); | ||
| // Create LABEL dictionary {"label": [doc_id, doc_id, ...]} | ||
| let documents_index: HashMap<String, Vec<String>> = documents | ||
| .iter() | ||
| .map(|document| { | ||
| let mut index: HashMap<String, Vec<String>> = HashMap::new(); | ||
| for label in &document.label { | ||
| let entry = index.entry(label.2.clone()).or_insert(Vec::new()); | ||
| entry.push(document.id.clone()); | ||
| } | ||
| index | ||
| }) | ||
| .fold(HashMap::new(), |mut acc, x| { | ||
| for (key, value) in x { | ||
| let entry = acc.entry(key).or_insert(Vec::new()); | ||
| entry.extend(value); | ||
| } | ||
| acc | ||
| }); | ||
| Quickner { | ||
| let entities = Quickner::unique_entities(entities); | ||
| let documents_hash = Quickner::document_hash(&documents); | ||
| let mut quick = Quickner { | ||
| config: Config::default(), | ||
@@ -519,4 +491,8 @@ config_file: String::from(""), | ||
| documents_hash, | ||
| documents_index, | ||
| } | ||
| documents_label_index: HashMap::new(), | ||
| documents_entities_index: HashMap::new(), | ||
| }; | ||
| quick.build_entity_index(); | ||
| quick.build_label_index(); | ||
| quick | ||
| } | ||
@@ -565,29 +541,5 @@ | ||
| .collect(); | ||
| let entities = entities | ||
| .into_iter() | ||
| .collect::<HashSet<Entity>>() | ||
| .into_iter() | ||
| .collect::<Vec<Entity>>(); | ||
| let documents_hash = documents | ||
| .iter() | ||
| .map(|document| (document.id.clone(), document.clone())) | ||
| .collect(); | ||
| let documents_index: HashMap<String, Vec<String>> = documents | ||
| .iter() | ||
| .map(|document| { | ||
| let mut index: HashMap<String, Vec<String>> = HashMap::new(); | ||
| for label in &document.label { | ||
| let entry = index.entry(label.2.clone()).or_insert(Vec::new()); | ||
| entry.push(document.id.clone()); | ||
| } | ||
| index | ||
| }) | ||
| .fold(HashMap::new(), |mut acc, x| { | ||
| for (key, value) in x { | ||
| let entry = acc.entry(key).or_insert(Vec::new()); | ||
| entry.extend(value); | ||
| } | ||
| acc | ||
| }); | ||
| Quickner { | ||
| let entities = Quickner::unique_entities(entities); | ||
| let documents_hash = Quickner::document_hash(&documents); | ||
| let mut quick = Quickner { | ||
| config: Config::default(), | ||
@@ -598,4 +550,8 @@ config_file: String::from(""), | ||
| documents_hash, | ||
| documents_index, | ||
| } | ||
| documents_label_index: HashMap::new(), | ||
| documents_entities_index: HashMap::new(), | ||
| }; | ||
| quick.build_entity_index(); | ||
| quick.build_label_index(); | ||
| quick | ||
| } | ||
@@ -626,1 +582,83 @@ | ||
| } | ||
| impl Quickner { | ||
| fn build_label_index(&mut self) { | ||
| let mut index: HashMap<String, Vec<String>> = HashMap::new(); | ||
| for document in &self.documents { | ||
| for label in &document.label { | ||
| let entry = index.entry(label.2.clone()).or_insert(Vec::new()); | ||
| entry.push(document.id.clone()); | ||
| } | ||
| } | ||
| self.documents_label_index = index; | ||
| } | ||
| fn build_entity_index(&mut self) { | ||
| let mut index: HashMap<String, Vec<String>> = HashMap::new(); | ||
| for document in &self.documents { | ||
| for label in &document.label { | ||
| let name = document.text[label.0..label.1].to_string(); | ||
| let entry = index.entry(name.to_lowercase()).or_insert(Vec::new()); | ||
| entry.push(document.id.clone()); | ||
| } | ||
| } | ||
| self.documents_entities_index = index; | ||
| } | ||
| fn add_to_label_index(&mut self, document: &Document) { | ||
| for label in &document.label { | ||
| let entry = self | ||
| .documents_label_index | ||
| .entry(label.2.clone()) | ||
| .or_insert(Vec::new()); | ||
| entry.push(document.id.clone()); | ||
| } | ||
| } | ||
| fn add_to_entity_index(&mut self, document: &Document) { | ||
| for label in &document.label { | ||
| let name = document.text[label.0..label.1].to_string(); | ||
| let entry = self | ||
| .documents_entities_index | ||
| .entry(name.to_lowercase()) | ||
| .or_insert(Vec::new()); | ||
| entry.push(document.id.clone()); | ||
| } | ||
| } | ||
| fn _remove_from_label_index(&mut self, document: &Document) { | ||
| for label in &document.label { | ||
| let entry = self | ||
| .documents_label_index | ||
| .entry(label.2.clone()) | ||
| .or_insert(Vec::new()); | ||
| entry.retain(|x| x != &document.id); | ||
| } | ||
| } | ||
| fn _remove_from_entity_index(&mut self, document: &Document) { | ||
| for label in &document.label { | ||
| let name = document.text[label.0..label.1].to_string(); | ||
| let entry = self | ||
| .documents_entities_index | ||
| .entry(name.to_lowercase()) | ||
| .or_insert(Vec::new()); | ||
| entry.retain(|x| x != &document.id); | ||
| } | ||
| } | ||
| fn unique_entities(entities: Vec<Entity>) -> Vec<Entity> { | ||
| entities | ||
| .into_iter() | ||
| .collect::<HashSet<Entity>>() | ||
| .into_iter() | ||
| .collect::<Vec<Entity>>() | ||
| } | ||
| fn document_hash(documents: &Vec<Document>) -> HashMap<String, Document> { | ||
| documents | ||
| .into_iter() | ||
| .map(|document| (document.id.clone(), document.clone())) | ||
| .collect::<HashMap<String, Document>>() | ||
| } | ||
| } |
+63
-61
| Metadata-Version: 2.1 | ||
| Name: quickner | ||
| Version: 0.0.1a11 | ||
| Version: 0.0.1a12 | ||
| Classifier: Programming Language :: Rust | ||
@@ -17,5 +17,5 @@ Classifier: Programming Language :: Python :: Implementation :: CPython | ||
| <div align="center"> | ||
| <h1>Quickner ⚡ </h1> | ||
| <h1 style="font-size:40px;">Quickner ⚡ </h1> | ||
| <p> | ||
| <strong>A simple, fast, and easy to use NER annotator for Python</strong> | ||
| <strong style="font-size:20px;">A simple, fast, and easy to use NER annotator for Python</strong> | ||
| </p> | ||
@@ -40,3 +40,2 @@ <p> | ||
| Quickner is a new tool to quickly annotate texts for NER (Named Entity Recognition). It is written in Rust and accessible through a Python API. | ||
@@ -80,8 +79,8 @@ | ||
| # Create documents | ||
| doc_1 = Document("rust is made by Mozilla") | ||
| doc_2 = Document("Python was created by Guido van Rossum") | ||
| doc_3 = Document("Java was created by James Gosling") | ||
| rust = Document("rust is made by Mozilla") | ||
| python = Document("Python was created by Guido van Rossum") | ||
| java = Document("Java was created by James Gosling") | ||
| # Documents can be added to a list | ||
| documents = [doc_1, doc_2, doc_3] | ||
| documents = [rust, python, java] | ||
@@ -91,3 +90,8 @@ # Initialize the annotator | ||
| quick = Quickner(documents=documents) | ||
| >>> Entities: 0 | Documents: 3 | Annotations: | ||
| quick | ||
| >>> Entities: 0 | Documents: 3 | Annotations: | ||
| >>> quick.documents | ||
| [Document(id="87e03d58b1ba4d72", text=rust is made by Mozilla, label=[]), Document(id="f1da5d23ef88f3dc", text=Python was created by Guido van Rossum, label=[]), Document(id="e4324f9818e7e598", text=Java was created by James Gosling, label=[])] | ||
| >>> quick.entities | ||
| [] | ||
| ``` | ||
@@ -134,32 +138,23 @@ | ||
| ### Find documents by label | ||
| ### Find documents by label or entity | ||
| ```python | ||
| from quickner import Quickner, Document, Entity | ||
| When you have annotated your documents, you can use the `find_documents_by_label` and `find_documents_by_entity` methods to find documents by label or entity. | ||
| # Create documents | ||
| doc_1 = Document("rust is made by Mozilla") | ||
| doc_2 = Document("Python was created by Guido van Rossum") | ||
| doc_3 = Document("Java was created by James Gosling") | ||
| Both methods return a list of documents, and are not case sensitive. | ||
| # Create entities | ||
| rust = Entity("Rust", "PL") | ||
| mozilla = Entity("Mozilla", "ORG") | ||
| python = Entity("Python", "PL") | ||
| guido = Entity("Guido van Rossum", "PERSON") | ||
| java = Entity("Java", "PL") | ||
| james = Entity("James Gosling", "PERSON") | ||
| Example: | ||
| # Documents and entities can be added to a list | ||
| documents = [doc_1, doc_2, doc_3] | ||
| entities = [rust, mozilla, python, guido, java, james] | ||
| ```python | ||
| # Initialize the annotator | ||
| quick = Quickner(documents=documents, entities=entities) | ||
| quick.process() | ||
| # Find documents by label | ||
| >>> quick.find_documents_by_label("PERSON") | ||
| [Document(id=f1da5d23ef88f3dc, text=Python was created by Guido van Rossum, label=[(0, 6, PL), (22, 38, PERSON)]), Document(id=e4324f9818e7e598, text=Java was created by James Gosling, label=[(0, 4, PL), (20, 33, PERSON)])] | ||
| >>> quick | ||
| Entities: 6 | Documents: 3 | Annotations: PERSON: 2, PL: 3, ORG: 1 | ||
| >>> quick.find_documents("PERSON") | ||
| [Document(id=f1da5d23ef88f3dc, text=Python was created by Guido van Rossum, label=[(0, 6, PL), (22, 38, PERSON)]), Document(id=e4324f9818e7e598, text=Java was created by James Gosling, label=[(0, 4, PL), (20, 33, PERSON)])] | ||
| # Find documents by entity | ||
| >>> quick.find_documents_by_entity("Guido van Rossum") | ||
| [Document(id=f1da5d23ef88f3dc, text=Python was created by Guido van Rossum, label=[(0, 6, PL), (22, 38, PERSON)])] | ||
| >>> quick.find_documents_by_entity("rust") | ||
| [Document(id=87e03d58b1ba4d72, text=rust is made by Mozilla, label=[(0, 4, PL), (16, 23, ORG)])] | ||
| >>> quick.find_documents_by_entity("Chris Lattner") | ||
| [Document(id=3b0b3b5b0b5b0b5b, text=Swift was created by Chris Lattner and Apple, label=[(0, 5, PL), (21, 35, PERSON), (40, 45, ORG)])] | ||
| ``` | ||
@@ -169,31 +164,9 @@ | ||
| ```python | ||
| # Create documents from texts | ||
| texts = ( | ||
| "rust is made by Mozilla", | ||
| "Python was created by Guido van Rossum", | ||
| "Java was created by James Gosling at Sun Microsystems", | ||
| "Swift was created by Chris Lattner and Apple", | ||
| ) | ||
| documents = [Document(text) for text in texts] | ||
| You can use the `spacy` method to get a spacy compatible generator object. | ||
| # Create entities | ||
| entities = ( | ||
| ("Rust", "PL"), | ||
| ("Python", "PL"), | ||
| ("Java", "PL"), | ||
| ("Swift", "PL"), | ||
| ("Mozilla", "ORG"), | ||
| ("Apple", "ORG"), | ||
| ("Sun Microsystems", "ORG"), | ||
| ("Guido van Rossum", "PERSON"), | ||
| ("James Gosling", "PERSON"), | ||
| ("Chris Lattner", "PERSON"), | ||
| ) | ||
| entities = [Entity(*(entity)) for entity in entities] | ||
| The generator object can be used to feed a spacy model with the annotated data, you still need to convert the data into DocBin format. | ||
| # Initialize the annotator | ||
| quick = Quickner(documents=documents, entities=entities) | ||
| quick.process() | ||
| Example: | ||
| ```python | ||
| # Get a spacy compatible generator object | ||
@@ -213,2 +186,8 @@ >>> quick.spacy() | ||
| You can also annotate a single document with a list of entities. | ||
| This is useful when you want to annotate a document with a list of entities that is not in the list of entities of the Quickner object. | ||
| Example: | ||
| ```python | ||
@@ -218,9 +197,16 @@ from quickner import Document, Entity | ||
| # Create a document from a string | ||
| # Method 1 | ||
| rust = Document.from_string("rust is made by Mozilla") | ||
| # Method 2 | ||
| rust = Document("rust is made by Mozilla") | ||
| # Create a list of entities | ||
| entities = [Entity("Rust", "PL"), Entity("Mozilla", "ORG")] | ||
| # Annotate the document with the entities, case_sensitive is set to False by default | ||
| rust.annotate(entities, case_sensitive=True) | ||
| >>> rust.annotate(entities, case_sensitive=True) | ||
| >>> rust | ||
| Document(id=87e03d58b1ba4d72, text=rust is made by Mozilla, label=[(16, 23, ORG)]) | ||
| Document(id="87e03d58b1ba4d72", text=rust is made by Mozilla, label=[(16, 23, ORG)]) | ||
| >>> rust.annotate(entities, case_sensitive=False) | ||
| >>> rust | ||
| Document(id="87e03d58b1ba4d72", text=rust is made by Mozilla, label=[(16, 23, ORG), (0, 4, PL)]) | ||
| ``` | ||
@@ -301,2 +287,18 @@ | ||
| ## Features Roadmap and TODO | ||
| - [x] Add support for spaCy format | ||
| - [x] Add support for brat format | ||
| - [x] Add support for conll format | ||
| - [x] Add support for jsonl format | ||
| - [x] Add support for loading annotations from a json spaCy file | ||
| - [x] Add support for loading annotations from a jsonl file | ||
| - [x] Find documents with a specific entity/entities and return the documents | ||
| - [ ] Add support for loading annotations from a brat file | ||
| - [ ] Substring search for entities in the text (case sensitive and insensitive) | ||
| - [ ] Partial match for entities, e.g. "Rust" will match "Rustlang" | ||
| - [ ] Pattern/regex based entities, e.g. "Rustlang" will match "Rustlang 1.0" | ||
| - [ ] Fuzzy match for entities with Levenshtein distance, e.g. "Rustlang" will match "Rust" | ||
| - [ ] Add support for jupyter notebook | ||
| ## License | ||
@@ -303,0 +305,0 @@ |
+15
-1
@@ -336,3 +336,3 @@ from ast import Dict | ||
| def find_documents(self, label: str) -> List[Document]: | ||
| def find_documents_by_label(self, label: str) -> List[Document]: | ||
| """ | ||
@@ -348,1 +348,15 @@ Find documents with a specific label. | ||
| ... | ||
def find_documents_by_entity(self, name: str) -> List[Document]:
    """
    Return the documents annotated with the entity called `name`.

    The name is matched case-insensitively against the annotated
    entity text.

    >>> quickner.find_documents_by_entity("John")
    [Document(id="f9c68f53ee5319c8", text=John is a person., label=[(0, 4, PERSON)])]

    Parameters:
        name (str): Entity name to look up.
    Returns:
        List[Document]: Documents containing the entity; empty when
        no document matches.
    """
    ...
+62
-60
| <div align="center"> | ||
| <h1>Quickner ⚡ </h1> | ||
| <h1 style="font-size:40px;">Quickner ⚡ </h1> | ||
| <p> | ||
| <strong>A simple, fast, and easy to use NER annotator for Python</strong> | ||
| <strong style="font-size:20px;">A simple, fast, and easy to use NER annotator for Python</strong> | ||
| </p> | ||
@@ -25,3 +25,2 @@ <p> | ||
| Quickner is a new tool to quickly annotate texts for NER (Named Entity Recognition). It is written in Rust and accessible through a Python API. | ||
@@ -65,8 +64,8 @@ | ||
| # Create documents | ||
| doc_1 = Document("rust is made by Mozilla") | ||
| doc_2 = Document("Python was created by Guido van Rossum") | ||
| doc_3 = Document("Java was created by James Gosling") | ||
| rust = Document("rust is made by Mozilla") | ||
| python = Document("Python was created by Guido van Rossum") | ||
| java = Document("Java was created by James Gosling") | ||
| # Documents can be added to a list | ||
| documents = [doc_1, doc_2, doc_3] | ||
| documents = [rust, python, java] | ||
@@ -76,3 +75,8 @@ # Initialize the annotator | ||
| quick = Quickner(documents=documents) | ||
| >>> Entities: 0 | Documents: 3 | Annotations: | ||
| quick | ||
| >>> Entities: 0 | Documents: 3 | Annotations: | ||
| >>> quick.documents | ||
| [Document(id="87e03d58b1ba4d72", text=rust is made by Mozilla, label=[]), Document(id="f1da5d23ef88f3dc", text=Python was created by Guido van Rossum, label=[]), Document(id="e4324f9818e7e598", text=Java was created by James Gosling, label=[])] | ||
| >>> quick.entities | ||
| [] | ||
| ``` | ||
@@ -119,32 +123,23 @@ | ||
| ### Find documents by label | ||
| ### Find documents by label or entity | ||
| ```python | ||
| from quickner import Quickner, Document, Entity | ||
| When you have annotated your documents, you can use the `find_documents_by_label` and `find_documents_by_entity` methods to find documents by label or entity. | ||
| # Create documents | ||
| doc_1 = Document("rust is made by Mozilla") | ||
| doc_2 = Document("Python was created by Guido van Rossum") | ||
| doc_3 = Document("Java was created by James Gosling") | ||
| Both methods return a list of documents, and are not case sensitive. | ||
| # Create entities | ||
| rust = Entity("Rust", "PL") | ||
| mozilla = Entity("Mozilla", "ORG") | ||
| python = Entity("Python", "PL") | ||
| guido = Entity("Guido van Rossum", "PERSON") | ||
| java = Entity("Java", "PL") | ||
| james = Entity("James Gosling", "PERSON") | ||
| Example: | ||
| # Documents and entities can be added to a list | ||
| documents = [doc_1, doc_2, doc_3] | ||
| entities = [rust, mozilla, python, guido, java, james] | ||
| ```python | ||
| # Initialize the annotator | ||
| quick = Quickner(documents=documents, entities=entities) | ||
| quick.process() | ||
| # Find documents by label | ||
| >>> quick.find_documents_by_label("PERSON") | ||
| [Document(id=f1da5d23ef88f3dc, text=Python was created by Guido van Rossum, label=[(0, 6, PL), (22, 38, PERSON)]), Document(id=e4324f9818e7e598, text=Java was created by James Gosling, label=[(0, 4, PL), (20, 33, PERSON)])] | ||
| >>> quick | ||
| Entities: 6 | Documents: 3 | Annotations: PERSON: 2, PL: 3, ORG: 1 | ||
| >>> quick.find_documents("PERSON") | ||
| [Document(id=f1da5d23ef88f3dc, text=Python was created by Guido van Rossum, label=[(0, 6, PL), (22, 38, PERSON)]), Document(id=e4324f9818e7e598, text=Java was created by James Gosling, label=[(0, 4, PL), (20, 33, PERSON)])] | ||
| # Find documents by entity | ||
| >>> quick.find_documents_by_entity("Guido van Rossum") | ||
| [Document(id=f1da5d23ef88f3dc, text=Python was created by Guido van Rossum, label=[(0, 6, PL), (22, 38, PERSON)])] | ||
| >>> quick.find_documents_by_entity("rust") | ||
| [Document(id=87e03d58b1ba4d72, text=rust is made by Mozilla, label=[(0, 4, PL), (16, 23, ORG)])] | ||
| >>> quick.find_documents_by_entity("Chris Lattner") | ||
| [Document(id=3b0b3b5b0b5b0b5b, text=Swift was created by Chris Lattner and Apple, label=[(0, 5, PL), (21, 35, PERSON), (40, 45, ORG)])] | ||
| ``` | ||
@@ -154,31 +149,9 @@ | ||
| ```python | ||
| # Create documents from texts | ||
| texts = ( | ||
| "rust is made by Mozilla", | ||
| "Python was created by Guido van Rossum", | ||
| "Java was created by James Gosling at Sun Microsystems", | ||
| "Swift was created by Chris Lattner and Apple", | ||
| ) | ||
| documents = [Document(text) for text in texts] | ||
| You can use the `spacy` method to get a spacy compatible generator object. | ||
| # Create entities | ||
| entities = ( | ||
| ("Rust", "PL"), | ||
| ("Python", "PL"), | ||
| ("Java", "PL"), | ||
| ("Swift", "PL"), | ||
| ("Mozilla", "ORG"), | ||
| ("Apple", "ORG"), | ||
| ("Sun Microsystems", "ORG"), | ||
| ("Guido van Rossum", "PERSON"), | ||
| ("James Gosling", "PERSON"), | ||
| ("Chris Lattner", "PERSON"), | ||
| ) | ||
| entities = [Entity(*(entity)) for entity in entities] | ||
| The generator object can be used to feed a spacy model with the annotated data, you still need to convert the data into DocBin format. | ||
| # Initialize the annotator | ||
| quick = Quickner(documents=documents, entities=entities) | ||
| quick.process() | ||
| Example: | ||
| ```python | ||
| # Get a spacy compatible generator object | ||
@@ -198,2 +171,8 @@ >>> quick.spacy() | ||
| You can also annotate a single document with a list of entities. | ||
| This is useful when you want to annotate a document with a list of entities that is not in the list of entities of the Quickner object. | ||
| Example: | ||
| ```python | ||
@@ -203,9 +182,16 @@ from quickner import Document, Entity | ||
| # Create a document from a string | ||
| # Method 1 | ||
| rust = Document.from_string("rust is made by Mozilla") | ||
| # Method 2 | ||
| rust = Document("rust is made by Mozilla") | ||
| # Create a list of entities | ||
| entities = [Entity("Rust", "PL"), Entity("Mozilla", "ORG")] | ||
| # Annotate the document with the entities, case_sensitive is set to False by default | ||
| rust.annotate(entities, case_sensitive=True) | ||
| >>> rust.annotate(entities, case_sensitive=True) | ||
| >>> rust | ||
| Document(id=87e03d58b1ba4d72, text=rust is made by Mozilla, label=[(16, 23, ORG)]) | ||
| Document(id="87e03d58b1ba4d72", text=rust is made by Mozilla, label=[(16, 23, ORG)]) | ||
| >>> rust.annotate(entities, case_sensitive=False) | ||
| >>> rust | ||
| Document(id="87e03d58b1ba4d72", text=rust is made by Mozilla, label=[(16, 23, ORG), (0, 4, PL)]) | ||
| ``` | ||
@@ -286,2 +272,18 @@ | ||
| ## Features Roadmap and TODO | ||
| - [x] Add support for spaCy format | ||
| - [x] Add support for brat format | ||
| - [x] Add support for conll format | ||
| - [x] Add support for jsonl format | ||
| - [x] Add support for loading annotations from a json spaCy file | ||
| - [x] Add support for loading annotations from a jsonl file | ||
| - [x] Find documents with a specific entity/entities and return the documents | ||
| - [ ] Add support for loading annotations from a brat file | ||
| - [ ] Substring search for entities in the text (case sensitive and insensitive) | ||
| - [ ] Partial match for entities, e.g. "Rust" will match "Rustlang" | ||
| - [ ] Pattern/regex based entities, e.g. "Rustlang" will match "Rustlang 1.0" | ||
| - [ ] Fuzzy match for entities with Levenshtein distance, e.g. "Rustlang" will match "Rust" | ||
| - [ ] Add support for jupyter notebook | ||
| ## License | ||
@@ -288,0 +290,0 @@ |
+28
-2
@@ -296,5 +296,5 @@ use std::collections::HashMap; | ||
| #[pyo3(signature = (label))] | ||
| pub fn find_documents(&self, label: &str) -> Vec<PyDocument> { | ||
| pub fn find_documents_by_label(&self, label: &str) -> Vec<PyDocument> { | ||
| let documents_index = match &self.quickner { | ||
| quickner => quickner.documents_index.to_owned(), | ||
| quickner => quickner.documents_label_index.to_owned(), | ||
| }; | ||
@@ -320,2 +320,28 @@ let documents_ids = match documents_index.get(label) { | ||
| #[pyo3(signature = (name))] | ||
| pub fn find_documents_by_entity(&self, name: &str) -> Vec<PyDocument> { | ||
| let documents_entities_index = match &self.quickner { | ||
| quickner => quickner.documents_entities_index.to_owned(), | ||
| }; | ||
| let binding = name.to_lowercase(); | ||
| let name = binding.as_str(); | ||
| let documents_ids = match documents_entities_index.get(name) { | ||
| Some(documents_ids) => documents_ids, | ||
| None => return vec![], | ||
| }; | ||
| let documents = match &self.quickner { | ||
| quickner => { | ||
| let documents = documents_ids | ||
| .into_iter() | ||
| .map(|id| { | ||
| let document = quickner.documents_hash.get(id).unwrap(); | ||
| PyDocument::from(document.to_owned()) | ||
| }) | ||
| .collect(); | ||
| documents | ||
| } | ||
| }; | ||
| documents | ||
| } | ||
| #[pyo3(signature = (chunks = None))] | ||
@@ -322,0 +348,0 @@ pub fn spacy(&self, chunks: Option<usize>) -> PySpacyGenerator { |
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
20734008
0.02%288
3.97%