compling
from typing import *
from tqdm import tqdm
from datetime import datetime
from collections import defaultdict
from nltk.tokenize import sent_tokenize
from compling.lexicalAnalysis.tokenization import tokenization_abc
from collections.abc import Iterable as abciterable
from compling.lexicalAnalysis.tokenization.impl.sentence import Sentence
from compling.lexicalAnalysis.tokenization.impl.paragraph import Paragraph
from compling.lexicalAnalysis.tokenization.impl.document import Document
class Tokenizer(tokenization_abc.Tokenization):
    """
    Tokenization converts input text into a stream of tokens, where each token is a separate word, punctuation
    sign, number/amount, date, etc.

    A Tokenizer object converts the corpus documents into a stream of:

    * _tokens_: tokens occurring in those documents. Each token is characterized by:
        * _token_id_: unique token identifier;
        * _sent_id_: unique sentence identifier. The id of the sentence the token occurs in;
        * _para_id_: unique paragraph identifier. The id of the paragraph the token occurs in;
        * _doc_id_: unique document identifier. The id of the document the token occurs in;
        * _text_: the text of the token;
        * a large variety of _optional meta-information_ (e.g. PoS tag, dep tag, lemma, stem, ...);
    * _sentences_: sentences occurring in those documents. Each sentence is characterized by:
        * _sent_id_: unique sentence identifier;
        * _para_id_: unique paragraph identifier. The id of the paragraph the sentence occurs in;
        * _doc_id_: unique document identifier. The id of the document the sentence occurs in;
        * _text_: the text of the sentence;
        * a large variety of _optional meta-information_ (e.g. lemma, stem, ...);
    * _paragraphs_: paragraphs occurring in those documents. Each paragraph is characterized by:
        * _para_id_: unique paragraph identifier;
        * _doc_id_: unique document identifier. The id of the document the paragraph occurs in;
        * _text_: the text of the paragraph;
        * a large variety of _optional meta-information_ (e.g. lemma, stem, ...);
    * _documents_: each document is characterized by:
        * _doc_id_: unique document identifier;
        * _text_: the text of the document;
        * a large variety of _optional meta-information_ (e.g. lemma, stem, ...).

    A Tokenizer object is also able to retrieve frequent n-grams and treat them as single tokens (see
    ngrams2tokens).

    Some metadata are stored for each record (token, sentence, paragraph, document). You can edit the
    config.ini file to change the ones stored by default.

    To run the tokenization you need to provide an Iterable[dict], where each document is a dict that stores
    the text of the document under a text key. By default the text key is 'text'; you can change it by editing
    the config.ini file.

    For each document in your corpus, all key/value pairs other than the text key (e.g. title, author, ...)
    are added as metadata to the records of that document.

    A minimal usage sketch is shown after this class definition.
    """
    def __init__(self, domain_stopwords: Iterable[str] = None) -> None:
        """
        Args:
            domain_stopwords (Iterable[str], optional, default=None): You can provide a list of arbitrary
                stopwords specific to your corpus domain.
        """

        # super __init__
        super(Tokenizer, self).__init__()

        # add the domain-specific stopwords to the nlp stopword list
        self.nlp.stopwords_list(include=domain_stopwords)

        self.nlp.__ngrams_replaced__ = True
        self.corpus_ngrams = None
    def ngrams2tokens(self,
                      n: Union[int, Iterable[int]],
                      docs_in: Iterable[Dict[str, str]],
                      docs_out: Iterable[Dict[str, str]] = None,
                      pos: List[str] = ("PROPN", "VERB", "NOUN", "ADJ"),
                      corpus_threshold: int = 50,
                      doc_threshold: int = 0,
                      len_gram: int = 3,
                      include: List[str] = None,
                      replace: bool = False) -> Iterable[Dict[str, str]]:
        """
        Retrieves the n-grams that occur frequently in the corpus so that they can be handled as single tokens.

        Frequencies are counted over the documents in docs_in. If docs_out is not None, its documents are
        yielded back with each selected n-gram annotated (or replaced, when replace is True) by a
        '__'-joined version of itself (e.g. "New York" -> "__New_York__").
        """

        # list of sizes of the n-grams to retrieve
        if isinstance(n, abciterable):
            sizes = n
        else:  # isinstance(n, int)
            sizes = [n]

        ngram_frequencies = defaultdict(int)

        # count n-gram frequencies over each corpus doc
        for doc in tqdm(docs_in, desc='N-gram Retrieval in progress...', position=0, leave=True):
            text = doc[self.text_key]
            for n in sizes:
                for ngram, frequency in self.nlp.ngrams(text, n, pos=tuple(), threshold=doc_threshold).items():
                    ngram_frequencies[ngram] += frequency

        # keep only the n-grams that are frequent enough in the whole corpus
        ngram_frequencies = {ngram: frequency for ngram, frequency in ngram_frequencies.items()
                             if frequency >= corpus_threshold}

        # final n-gram list
        result = defaultdict(int)

        # the first and last words must have a PoS tag in pos and be at least len_gram characters long
        for ngram in ngram_frequencies:
            ngram_ = self.nlp.nlp_spacy(" ".join(ngram))
            if ngram_[0].pos_ in pos and ngram_[-1].pos_ in pos and \
                    len(ngram_[0].text) >= len_gram and len(ngram_[-1].text) >= len_gram:
                ngram_ = " ".join([token.text for token in ngram_])
                result[ngram_] = ngram_frequencies[ngram]

        # add arbitrary n-grams
        if include is not None:
            for ngram in include:
                if ngram not in result:
                    result[ngram] = 0

        # sorted by length: longer n-grams are replaced first, then the shorter ones
        self.corpus_ngrams = dict(sorted(result.items(), key=lambda x: len(x[0]), reverse=True))

        # update the docs_out documents: mark the n-grams as single tokens
        if docs_out is None:
            return

        if replace:
            self.nlp.__ngrams_replaced__ = replace

        for doc in docs_out:
            for ngram in self.corpus_ngrams:
                if replace:
                    doc[self.text_key] = doc[self.text_key].replace(ngram, "__" + ngram.replace(' ', '_') + "__")
                else:
                    doc[self.text_key] = doc[self.text_key].replace(ngram,
                                                                    ngram + ' __' + ngram.replace(' ', '_') + "__")
            yield doc
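    # A minimal, illustrative sketch of how ngrams2tokens might be called; the toy corpus and the
    # thresholds below are assumptions for the example, not part of the library:
    #
    #   corpus = [{"text": "New York is a big city. I love New York."}]
    #   tokenizer = Tokenizer()
    #   marked = list(tokenizer.ngrams2tokens(n=2, docs_in=corpus, docs_out=corpus,
    #                                         corpus_threshold=1, doc_threshold=0))
    #   # a frequent n-gram such as "New York" is now also annotated as "__New_York__"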
    def run(self,
            docs: Iterable[Dict[str, str]],
            doc_id: int = 0,
            token_id: int = 0,
            sent_id: int = 0,
            para_id: int = 0,
            para_size: int = 3,
            index_doc: bool = True,
            index_sent: bool = True,
            index_para: bool = True) -> Iterable[Dict[str, dict]]:
        """
        Runs the tokenization of the corpus documents. For each document it yields a dict of record lists:
        'tokens' (always) and, depending on the index_* flags, 'sentences', 'paragraphs' and 'documents'.
        Paragraphs are built by grouping para_size consecutive sentences.
        """

        # list of sentences making up a paragraph
        # list of paragraphs making up a document
        sent_list, para_list = list(), list()

        # SIDE EFFECT: Sentence will increment token_id by one, so that it can start from token_id and not from token_id+1
        token_id = token_id - 1

        # tokenization of each doc
        for doc in tqdm(docs, desc='Tokenization in progress...', position=0, leave=True):
            text = doc[self.text_key]

            # all key/value pairs other than the text key are kept as metadata
            metadata = dict()
            for k, v in doc.items():
                if k == self.text_key:
                    continue
                try:
                    # if it's a date
                    v = datetime.strptime(v, self.date_format)
                except Exception:
                    if isinstance(v, str):
                        v = v.replace('.', '/')
                metadata[k] = v

            # token position inside the paragraphs/documents: -1, Sentence will increment it, so that it can start from 0
            para_pos, doc_pos = -1, -1

            # tokenization of each sentence
            for sent in sent_tokenize(text, language=self.nlp.language):
                sent = Sentence(sent_id, token_id, sent, self.nlp, self.nlp.config, para_id,
                                doc_id, para_pos, doc_pos, metadata)
                para_pos, doc_pos = sent.para_pos, sent.doc_pos
                sent_list.append(sent)

                # new paragraph: every para_size sentences
                if len(sent_list) == para_size:
                    para_list.append(Paragraph(para_id, sent_list, self.nlp.config, metadata))
                    # reset the sentence list for the next paragraph
                    sent_list = list()
                    # next paragraph
                    para_id += 1

                # next token
                token_id = sent.token_id
                # next sentence
                sent_id += 1

            # new paragraph: the last paragraph of a document can be shorter than the others
            if len(sent_list) > 0:
                para_list.append(Paragraph(para_id, sent_list, self.nlp.config, metadata))
                para_id += 1
                sent_list = list()

            # new document
            doc = Document(doc_id, para_list, self.nlp.config, metadata)
            para_list = list()
            # next document
            doc_id += 1

            # build records
            doc_records, para_records, sent_records, token_records = list(), list(), list(), list()
            if index_doc:
                doc_records.append(doc.to_dict())
            if index_para:
                para_records.extend([para.to_dict() for para in doc.para_list])
            for para in doc.para_list:
                if index_sent:
                    sent_records.extend([sent.to_dict() for sent in para.sent_list])
                for sent in para.sent_list:
                    token_records.extend([token.to_dict() for token in sent.token_list])

            # collect the records produced for this document
            tokenization_ = dict()
            tokenization_['tokens'] = token_records
            if index_sent:
                tokenization_['sentences'] = sent_records
            if index_para:
                tokenization_['paragraphs'] = para_records
            if index_doc:
                tokenization_['documents'] = doc_records

            yield tokenization_
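A minimal usage sketch of the Tokenizer above. The import path, the toy corpus and the metadata key are
illustrative assumptions (the default config.ini, with 'text' as text key, is assumed unchanged):

    # hypothetical example, not part of the package
    from compling.lexicalAnalysis.tokenization.impl.tokenizer import Tokenizer

    corpus = [
        {"text": "First document. It has two sentences.", "author": "Alice"},
        {"text": "Second document, with a single sentence.", "author": "Bob"},
    ]

    tokenizer = Tokenizer(domain_stopwords=["document"])

    # run() yields one bundle of records per document
    for tokenization in tokenizer.run(corpus, para_size=2):
        tokens = tokenization["tokens"]          # always present
        sentences = tokenization["sentences"]    # present because index_sent=True
        paragraphs = tokenization["paragraphs"]  # present because index_para=True
        documents = tokenization["documents"]    # present because index_doc=True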
  Metadata-Version: 2.1
  Name: compling
- Version: 0.0.32
+ Version: 0.0.33
  Summary: Computational Linguistic
@@ -5,0 +5,0 @@ Home-page: https://github.com/FrancescoPeriti/compling
@@ -362,2 +362,3 @@ README.md
  compling/lexicalAnalysis/tokenization/impl/paragraph.py
+ compling/lexicalAnalysis/tokenization/impl/parallel_tokenizer.py
  compling/lexicalAnalysis/tokenization/impl/sentence.py
@@ -364,0 +365,0 @@ compling/lexicalAnalysis/tokenization/impl/token.py
@@ -12,3 +12,5 @@ import gensim
  """
- def __init__(self, index: Iterable[dict], id_field:Union[str, List[str]], output:str=None, text_field='text', skipgram_ws: int = None):
+ def __init__(self, index: Iterable[dict]=None, id_field:Union[str, List[str]]=None, output:str=None, text_field='text', skipgram_ws: int = None):
+     if index is None:
+         return
  super().__init__(output=output, index=index, text_field=text_field, skipgram_ws=skipgram_ws, tag=True,
@@ -15,0 +17,0 @@ module=gensim.models.doc2vec.Doc2Vec, id_field=id_field)
@@ -77,6 +77,6 @@ import abc
- def load(self):
+ def load(self, path):
  """Load a previously saved Embeddings model from file."""
- self.model = self.module.load(self.path)
+ self.model = self.module.load(path)
  return self.model
@@ -83,0 +83,0 @@
@@ -12,4 +12,6 @@ import gensim
  """
- def __init__(self, index: Iterable[dict], output:str=None, text_field='text', skipgram_ws: int = None):
+ def __init__(self, index: Iterable[dict]=None, output:str=None, text_field='text', skipgram_ws: int = None):
+     if index is None:
+         return
  super().__init__(output=output, index=index, text_field=text_field, tag=False, skipgram_ws=skipgram_ws,
  module=gensim.models.fasttext.FastText)
@@ -15,4 +15,6 @@ import gensim
  """
- def __init__(self, index: Iterable[dict], output:str=None, text_field='text', skipgram_ws: int = None):
+ def __init__(self, index: Iterable[dict]=None, output:str=None, text_field='text', skipgram_ws: int = None):
+     if index is None:
+         return
  super().__init__(output=output, index=index, text_field=text_field, tag=False,
  skipgram_ws=skipgram_ws, module=gensim.models.Word2Vec)
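Taken together, the hunks above make the index argument optional (the constructor returns early when it is
None, so no training happens) and give load an explicit path parameter. A hedged sketch of the load-only
workflow this enables; the wrapper class name Word2Vec and the model filename are assumptions for
illustration, only the calling pattern follows from the diff:

    embedder = Word2Vec()                 # index=None: __init__ returns early, nothing is trained
    model = embedder.load("saved.model")  # load() now takes the model path explicitly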
@@ -158,2 +158,3 @@ from typing import *
+ text = doc[self.text_key]
@@ -160,0 +161,0 @@
@@ -44,3 +44,3 @@ import subprocess
  name="compling",
- version="0.0.32",
+ version="0.0.33",
  author="Francesco Periti",
@@ -47,0 +47,0 @@ author_email="peritifrancesco@gmail.com",