fse
Advanced tools
| # Auto detect text files and perform LF normalization | ||
| * text=auto |
+74
| # Compiled source # | ||
| ################### | ||
| *.com | ||
| *.class | ||
| *.dll | ||
| *.exe | ||
| *.o | ||
| *.so | ||
| *.pyc | ||
| # Packages # | ||
| ############ | ||
| # it's better to unpack these files and commit the raw source | ||
| # git has its own built in compression methods | ||
| *.7z | ||
| *.dmg | ||
| *.gz | ||
| *.iso | ||
| *.jar | ||
| *.rar | ||
| *.tar | ||
| *.zip | ||
| # Logs and databases # | ||
| ###################### | ||
| *.log | ||
| *.sql | ||
| *.sqlite | ||
| *.pkl | ||
| *.bak | ||
| *.npy | ||
| *.npz | ||
| *.code-workspace | ||
| # OS generated files # | ||
| ###################### | ||
| .DS_Store? | ||
| .DS_Store | ||
| ehthumbs.db | ||
| Icon? | ||
| Thumbs.db | ||
| *.icloud | ||
| # Folders # | ||
| ########### | ||
| legacy | ||
| latex | ||
| draft | ||
| fse.egg-info/ | ||
| # Other # | ||
| ######### | ||
| .ipynb_checkpoints/ | ||
| .settings/ | ||
| .vscode/ | ||
| .eggs | ||
| fse*.egg-info | ||
| *.pptx | ||
| *.doc | ||
| *.docx | ||
| *.dict | ||
| .coverage | ||
| *.bak | ||
| /build/ | ||
| /dist/ | ||
| *.prof | ||
| *.lprof | ||
| *.bin | ||
| *.old | ||
| *.model | ||
| *_out.txt | ||
| *.html | ||
| vectors | ||
| *.vectors |
+198
| #!/usr/bin/env python | ||
| # -*- coding: utf-8 -*- | ||
| # Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| # Copyright (C) 2019 Oliver Borchers | ||
| from typing import NamedTuple, List, MutableSequence | ||
| from gensim.utils import any2unicode, open as s_open | ||
| from pathlib import Path | ||
class IndexedSentence(NamedTuple):
    """A sentence (list of tokens) paired with its positional index in the corpus."""

    words: List[str]
    index: int

    def __str__(self):
        """Return a human readable summary of the object's state, used for debugging.

        Returns
        -------
        str
            Class name followed by the stored words and index.
        """
        name = type(self).__name__
        return f"{name}({self.words}, {self.index})"
class IndexedList(MutableSequence):
    def __init__(self, *args, split=True, split_func=None, pre_splitted=False):
        """ Quasi-list to be used for feeding in-memory stored lists of sentences to
        the training routine as indexed sentence.

        Parameters
        ----------
        args : lists, sets
            Arguments to be merged into a single container. Can be single or multiple list/set objects.
        split : bool, optional
            If true performs a whitespace split on the strings contained in the list.
        split_func : function, optional
            A user definable split function which turns a string into a list of strings.
            Requires split=False.
        pre_splitted : bool, optional
            Determines if the input is already splitted in the format of ["token0", "token1"]

        Raises
        ------
        RuntimeError
            If the combination of split/split_func/pre_splitted is contradictory
            or leaves no way to convert an item into tokens.
        TypeError
            If any positional argument is not a list or a set.
        """
        self.pre_splitted = bool(pre_splitted)
        # Pre-splitted input never needs splitting, regardless of the split flag.
        self.split = bool(split) if not self.pre_splitted else False
        self.split_func = split_func

        self._check_kwargs_sanity()

        self.items = list()
        for arg in args:
            self._check_list_type(arg)
            self.items += arg

        super().__init__()

    def _check_list_type(self, obj):
        """ Checks that obj is a list or set; raises TypeError otherwise """
        if isinstance(obj, (list, set)):
            return 1
        else:
            raise TypeError(f"Arg must be list/set type. Got {type(obj)}")

    def _check_str_type(self, obj):
        """ Checks that obj is a string; raises TypeError otherwise """
        if isinstance(obj, str):
            return 1
        else:
            raise TypeError(f"Arg must be str type. Got {type(obj)}")

    def _check_kwargs_sanity(self):
        """ Checks argument validity """
        if self.split and self.split_func is not None:
            raise RuntimeError("You must provide either split=True or a split_func, not both")
        if (self.split or self.split_func is not None) and self.pre_splitted:
            raise RuntimeError("Split function and pre_splitted are not compatible")
        # Previously, split=False with no split_func and pre_splitted=False passed
        # this check and only crashed later in _convert_item with an opaque
        # "'NoneType' object is not callable". Fail fast with a clear message.
        if not (self.split or self.pre_splitted) and self.split_func is None:
            raise RuntimeError(
                "No conversion mode active: provide split=True, a split_func, "
                "or pre_splitted=True"
            )

    def __len__(self):
        """ List length """
        return len(self.items)

    def __repr__(self):
        return f"{self.__class__.__name__}, {self.items}"

    def __str__(self):
        return str(self.items)

    def _convert_item(self, item):
        """ Convert sentence to list of tokens """
        if self.pre_splitted:
            return item
        elif self.split:
            return any2unicode(item).split()
        else:
            # Guaranteed non-None by _check_kwargs_sanity.
            return self.split_func(any2unicode(item))

    def __getitem__(self, i):
        """ Get a list item as an IndexedSentence """
        item = self.items[i]
        output = self._convert_item(item)
        return IndexedSentence(output, i)

    def __delitem__(self, i):
        """ Delete an item """
        del self.items[i]

    def __setitem__(self, i, item):
        """ Sets an item """
        self._check_str_type(item)
        self.items[i] = item

    def insert(self, i, item):
        """ Inserts an item at a position """
        self._check_str_type(item)
        self.items.insert(i, item)

    def append(self, item):
        """ Appends item at last position """
        self._check_str_type(item)
        self.insert(len(self.items), item)

    def extend(self, *args):
        """ Extends list by the contents of the given lists/sets """
        for arg in args:
            self._check_list_type(arg)
            self.items += arg
class IndexedLineDocument(object):
    def __init__(self, path, get_able=True):
        """ Iterate over a file that contains sentences: one line = :class:`~fse.inputs.IndexedSentence` object.

        Words are expected to be already preprocessed and separated by whitespace. Sentence tags are constructed
        automatically from the sentence line number.

        Parameters
        ----------
        path : str
            The path of the file to read and return lines from
        get_able : bool, optional
            Use to determine if the IndexedLineDocument is indexable.
            This functionality is required if you want to pass an indexable to
            :meth:`~fse.models.sentencevectors.SentenceVectors.most_similar`.
        """
        self.path = Path(path)
        self.line_offset = list()
        self.get_able = bool(get_able)

        if self.get_able:
            self._build_offsets()

    def _build_offsets(self):
        """ Builds an offset table to index the file.

        Stores the byte offset of the start of every line so __getitem__ can
        seek directly to line i without scanning the file.
        """
        with s_open(self.path, "rb") as f:
            offset = f.tell()
            for line in f:
                self.line_offset.append(offset)
                offset += len(line)

    def __getitem__(self, i):
        """ Returns the line indexed by i. Primarily used for
        :meth:`~fse.models.sentencevectors.SentenceVectors.most_similar`

        Parameters
        ----------
        i : int
            The line index used to index the file

        Returns
        -------
        str
            line at the current index
        """
        if not self.get_able:
            raise RuntimeError("To index the lines you must construct with get_able=True")

        # Re-open per access: cheap relative to seek+readline, and keeps the
        # object safe to share across threads/iterations.
        with s_open(self.path, "rb") as f:
            f.seek(self.line_offset[i])
            output = f.readline()
        return any2unicode(output).rstrip()

    def __iter__(self):
        """Iterate through the lines in the source.

        Yields
        ------
        :class:`~fse.inputs.IndexedSentence`
            IndexedSentence from `path` specified in the constructor.
        """
        with s_open(self.path, "rb") as f:
            for i, line in enumerate(f):
                yield IndexedSentence(any2unicode(line).split(), i)
Sorry, the diff of this file is too big to display
| #!/usr/bin/env cython | ||
| # cython: boundscheck=False | ||
| # cython: wraparound=False | ||
| # cython: cdivision=True | ||
| # cython: embedsignature=True | ||
| # coding: utf-8 | ||
| # Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| # Copyright (C) 2019 Oliver Borchers | ||
| """Optimized cython functions for computing sentence embeddings""" | ||
| import cython | ||
| import numpy as np | ||
| cimport numpy as np | ||
| from gensim.models._utils_any2vec import compute_ngrams_bytes, ft_hash_bytes | ||
| from libc.string cimport memset | ||
| import scipy.linalg.blas as fblas | ||
# BLAS level-1 routines bound via their f2py _cpointer so they can be invoked
# without the GIL. Both operate on single-precision floats.
cdef saxpy_ptr saxpy=<saxpy_ptr>PyCObject_AsVoidPtr(fblas.saxpy._cpointer) # y += alpha * x
cdef sscal_ptr sscal=<sscal_ptr>PyCObject_AsVoidPtr(fblas.sscal._cpointer) # x = alpha * x

# Frequently used scalar constants, materialized once for the BLAS calls
# (which take everything by pointer).
cdef int ONE = <int>1
cdef int ZERO = <int>0
cdef REAL_t ONEF = <REAL_t>1.0
cdef REAL_t ZEROF = <REAL_t>0.0

# Compile-time batch limits: at most MAX_WORDS tokens per job and at most
# MAX_NGRAMS ngram hashes per OOV word.
DEF MAX_WORDS = 10000
DEF MAX_NGRAMS = 40
cdef init_base_s2v_config(BaseSentenceVecsConfig *c, model, target):
    """Load BaseAny2Vec parameters into a BaseSentenceVecsConfig struct.

    Parameters
    ----------
    c : BaseSentenceVecsConfig *
        A pointer to the struct to initialize.
    model : fse.models.base_s2v.BaseSentence2VecModel
        The model to load.
    target : np.ndarray
        The target array to write the averages to.
    """
    c[0].workers = model.workers
    c[0].size = model.sv.vector_size

    # Raw float pointers into the underlying numpy buffers — no copies are
    # made, so the arrays must stay alive for the lifetime of the struct.
    c[0].word_vectors = <REAL_t *>(np.PyArray_DATA(model.wv.vectors))
    c[0].word_weights = <REAL_t *>(np.PyArray_DATA(model.word_weights))
    c[0].sentence_vectors = <REAL_t *>(np.PyArray_DATA(target))
cdef init_ft_s2v_config(FTSentenceVecsConfig *c, model, target, memory):
    """Load Fasttext parameters into a FTSentenceVecsConfig struct.

    Parameters
    ----------
    c : FTSentenceVecsConfig *
        A pointer to the struct to initialize.
    model : fse.models.base_s2v.BaseSentence2VecModel
        The model to load.
    target : np.ndarray
        The target array to write the averages to.
    memory : np.ndarray
        Private working memory for each worker.
        Consists of 2 nd.arrays.
    """
    c[0].workers = model.workers
    c[0].size = model.sv.vector_size
    c[0].min_n = model.wv.min_n
    c[0].max_n = model.wv.max_n
    c[0].bucket = model.wv.bucket

    # OOV words are weighted with the largest known word weight.
    c[0].oov_weight = <REAL_t>np.max(model.word_weights)

    # memory[0]: scratch vector for averaging ngram vectors of one OOV word;
    # memory[1]: buffer of precomputed ngram hash indices.
    c[0].mem = <REAL_t *>(np.PyArray_DATA(memory[0]))
    c[0].subwords_idx = <uINT_t *>(np.PyArray_DATA(memory[1]))

    # NOTE: uses vectors_vocab rather than wv.vectors — see the explanation in
    # fse.models.average.train_average_np.
    c[0].word_vectors = <REAL_t *>(np.PyArray_DATA(model.wv.vectors_vocab))
    c[0].ngram_vectors = <REAL_t *>(np.PyArray_DATA(model.wv.vectors_ngrams))
    c[0].word_weights = <REAL_t *>(np.PyArray_DATA(model.word_weights))

    c[0].sentence_vectors = <REAL_t *>(np.PyArray_DATA(target))
cdef object populate_base_s2v_config(BaseSentenceVecsConfig *c, vocab, indexed_sentences):
    """Prepare C structures for BaseAny2VecModel so we can go "full C" and release the Python GIL.

    We create indices over the sentences. We also perform some calculations for
    each token/ngram and store the result up front to save time.

    Parameters
    ----------
    c : BaseSentenceVecsConfig*
        A pointer to the struct that will contain the populated indices.
    vocab : dict
        The vocabulary
    indexed_sentences : iterable of IndexedSentences
        The sentences to read

    Returns
    -------
    eff_sents : int
        The number of non-empty sentences.
    eff_words : int
        The number of in-vocabulary tokens.
    """
    cdef uINT_t eff_words = ZERO    # Effective words encountered in a sentence
    cdef uINT_t eff_sents = ZERO    # Effective sentences encountered

    c.sentence_boundary[0] = ZERO

    for obj in indexed_sentences:
        if not obj.words:
            continue
        for token in obj.words:
            word = vocab[token] if token in vocab else None # Vocab obj
            if word is None:
                # OOV tokens are silently dropped (no subword support here).
                continue
            c.word_indices[eff_words] = <uINT_t>word.index
            # Every token of a sentence carries the sentence's target row.
            c.sent_adresses[eff_words] = <uINT_t>obj.index

            eff_words += ONE
            if eff_words == MAX_WORDS:
                # Batch buffer full: stop collecting tokens for this sentence.
                break

        eff_sents += 1
        c.sentence_boundary[eff_sents] = eff_words

        if eff_words == MAX_WORDS:
            # NOTE(review): when the cap is hit mid-sentence, the truncated
            # sentence is still closed as complete; its remaining tokens are lost.
            break

    return eff_sents, eff_words
cdef object populate_ft_s2v_config(FTSentenceVecsConfig *c, vocab, indexed_sentences):
    """Prepare C structures for FastText so we can go "full C" and release the Python GIL.

    We create indices over the sentences. We also perform some calculations for
    each token/ngram and store the result up front to save time.

    Parameters
    ----------
    c : FTSentenceVecsConfig*
        A pointer to the struct that will contain the populated indices.
    vocab : dict
        The vocabulary
    indexed_sentences : iterable of IndexedSentences
        The sentences to read

    Returns
    -------
    eff_sents : int
        The number of non-empty sentences.
    eff_words : int
        The number of processed tokens (in-vocabulary and OOV alike).
    """
    cdef uINT_t eff_words = ZERO    # Effective words encountered in a sentence
    cdef uINT_t eff_sents = ZERO    # Effective sentences encountered

    c.sentence_boundary[0] = ZERO

    for obj in indexed_sentences:
        if not obj.words:
            continue
        for token in obj.words:
            c.sent_adresses[eff_words] = <uINT_t>obj.index
            if token in vocab:
                # In Vocabulary: use the word vector directly, no ngrams needed.
                word = vocab[token]
                c.word_indices[eff_words] = <uINT_t>word.index
                c.subwords_idx_len[eff_words] = ZERO
            else:
                # OOV words --> write ngram indices to memory
                c.word_indices[eff_words] = ZERO

                encoded_ngrams = compute_ngrams_bytes(token, c.min_n, c.max_n)
                hashes = [ft_hash_bytes(n) % c.bucket for n in encoded_ngrams]

                c.subwords_idx_len[eff_words] = <uINT_t>min(len(encoded_ngrams), MAX_NGRAMS)
                # NOTE(review): the ngram hashes of word eff_words are written to
                # slots [eff_words, eff_words + MAX_NGRAMS), so the next word's
                # writes (starting at eff_words + 1) overwrite most of them before
                # compute_ft_sentence_averages reads them. Upstream fse indexes
                # this buffer as eff_words * MAX_NGRAMS + i — confirm the size of
                # the subwords_idx buffer (memory[1]) before changing.
                for i, h in enumerate(hashes[:MAX_NGRAMS]):
                    c.subwords_idx[eff_words + i] = <uINT_t>h

            eff_words += ONE

            if eff_words == MAX_WORDS:
                # Batch buffer full: stop collecting tokens for this sentence.
                break

        eff_sents += 1
        c.sentence_boundary[eff_sents] = eff_words

        if eff_words == MAX_WORDS:
            break

    return eff_sents, eff_words
cdef void compute_base_sentence_averages(BaseSentenceVecsConfig *c, uINT_t num_sentences) nogil:
    """Perform optimized sentence-level averaging for BaseAny2Vec model.

    Parameters
    ----------
    c : BaseSentenceVecsConfig *
        A pointer to a fully initialized and populated struct.
    num_sentences : uINT_t
        The number of sentences used to train the model.

    Notes
    -----
    This routine does not provide oov support.
    """
    cdef:
        int size = c.size

        uINT_t sent_idx, sent_start, sent_end, sent_row

        uINT_t i, word_idx, word_row

        REAL_t sent_len, inv_count

    for sent_idx in range(num_sentences):
        # Token range [sent_start, sent_end) for this sentence, as recorded
        # by populate_base_s2v_config.
        sent_start = c.sentence_boundary[sent_idx]
        sent_end = c.sentence_boundary[sent_idx + 1]
        sent_len = ZEROF

        for i in range(sent_start, sent_end):
            sent_len += ONEF
            # All tokens of one sentence share the same target row.
            sent_row = c.sent_adresses[i] * size
            word_row = c.word_indices[i] * size
            word_idx = c.word_indices[i]

            # sentence_vectors[sent_row:] += word_weights[word_idx] * word_vectors[word_row:]
            saxpy(&size, &c.word_weights[word_idx], &c.word_vectors[word_row], &ONE, &c.sentence_vectors[sent_row], &ONE)

        if sent_len > ZEROF:
            # Scale the accumulated sum by 1/token-count to form the average.
            inv_count = ONEF / sent_len
            sscal(&size, &inv_count, &c.sentence_vectors[sent_row], &ONE)
cdef void compute_ft_sentence_averages(FTSentenceVecsConfig *c, uINT_t num_sentences) nogil:
    """Perform optimized sentence-level averaging for FastText model.

    Parameters
    ----------
    c : FTSentenceVecsConfig *
        A pointer to a fully initialized and populated struct.
    num_sentences : uINT_t
        The number of sentences used to train the model.

    Notes
    -----
    This routine DOES provide oov support.
    """
    cdef:
        int size = c.size

        uINT_t sent_idx, sent_start, sent_end, sent_row

        uINT_t ngram_row, ngrams

        uINT_t i, j, word_idx, word_row

        REAL_t sent_len
        REAL_t inv_count, inv_ngram
        REAL_t oov_weight = c.oov_weight

    # c.mem is the scratch vector used to average the ngram vectors of a
    # single OOV word; it must start zeroed.
    memset(c.mem, 0, size * cython.sizeof(REAL_t))

    for sent_idx in range(num_sentences):
        # Token range [sent_start, sent_end) recorded by populate_ft_s2v_config.
        sent_start = c.sentence_boundary[sent_idx]
        sent_end = c.sentence_boundary[sent_idx + 1]
        sent_len = ZEROF

        for i in range(sent_start, sent_end):
            sent_len += ONEF
            sent_row = c.sent_adresses[i] * size

            word_idx = c.word_indices[i]
            ngrams = c.subwords_idx_len[i]

            if ngrams == 0:
                # In-vocabulary token: weighted word vector goes straight in.
                word_row = c.word_indices[i] * size
                saxpy(&size, &c.word_weights[word_idx], &c.word_vectors[word_row], &ONE, &c.sentence_vectors[sent_row], &ONE)
            else:
                # OOV token: average its ngram vectors in c.mem, then add the
                # result with the oov weight.
                # NOTE(review): reads slots [i, i + ngrams) — see the matching
                # note in populate_ft_s2v_config; upstream fse indexes this as
                # i * MAX_NGRAMS + j. Confirm the buffer layout before changing.
                for j in range(ngrams):
                    ngram_row = c.subwords_idx[i+j] * size
                    saxpy(&size, &ONEF, &c.ngram_vectors[ngram_row], &ONE, c.mem, &ONE)

                inv_ngram = ONEF / <REAL_t>ngrams
                sscal(&size, &inv_ngram, c.mem, &ONE)
                saxpy(&size, &c.oov_weight, c.mem, &ONE, &c.sentence_vectors[sent_row], &ONE)

                # Re-zero the scratch vector for the next OOV token.
                memset(c.mem, 0, size * cython.sizeof(REAL_t))

        if sent_len > ZEROF:
            # Scale the accumulated sum by 1/token-count to form the average.
            inv_count = ONEF / sent_len
            sscal(&size, &inv_count, &c.sentence_vectors[sent_row], &ONE)
def train_average_cy(model, indexed_sentences, target, memory):
    """Training on a sequence of sentences and update the target ndarray.

    Called internally from :meth:`~fse.models.average.Average._do_train_job`.

    Parameters
    ----------
    model : :class:`~fse.models.base_s2v.BaseSentence2VecModel`
        The BaseSentence2VecModel model instance.
    indexed_sentences : iterable of IndexedSentence
        The sentences used to train the model.
    target : ndarray
        The target ndarray. We use the index from indexed_sentences
        to write into the corresponding row of target.
    memory : ndarray
        Private memory for each working thread.

    Returns
    -------
    int, int
        Number of effective sentences (non-zero) and effective words in the vocabulary used
        during training the sentence embedding.
    """

    cdef uINT_t eff_sentences = 0
    cdef uINT_t eff_words = 0
    cdef BaseSentenceVecsConfig w2v
    cdef FTSentenceVecsConfig ft

    if not model.is_ft:
        init_base_s2v_config(&w2v, model, target)

        # Index building still needs Python objects; only the averaging
        # itself runs without the GIL.
        eff_sentences, eff_words = populate_base_s2v_config(&w2v, model.wv.vocab, indexed_sentences)

        with nogil:
            compute_base_sentence_averages(&w2v, eff_sentences)
    else:
        init_ft_s2v_config(&ft, model, target, memory)

        eff_sentences, eff_words = populate_ft_s2v_config(&ft, model.wv.vocab, indexed_sentences)

        with nogil:
            compute_ft_sentence_averages(&ft, eff_sentences)

    return eff_sentences, eff_words

def init():
    # Returning 1 signals that the fast cython path is available (mirrors
    # gensim's FAST_VERSION convention; the pure-python fallback uses -1).
    return 1

MAX_WORDS_IN_BATCH = MAX_WORDS
MAX_NGRAMS_IN_BATCH = MAX_NGRAMS
FAST_VERSION = init()
| #!/usr/bin/env python | ||
| # -*- coding: utf-8 -*- | ||
| # Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| # Copyright (C) 2019 Oliver Borchers | ||
| """This module implements the base class to compute average representations for sentences, using highly optimized C routines, | ||
| data streaming and Pythonic interfaces. | ||
| The implementation is based on Iyyer et al. (2015): Deep Unordered Composition Rivals Syntactic Methods for Text Classification. | ||
| For more information, see <https://people.cs.umass.edu/~miyyer/pubs/2015_acl_dan.pdf>. | ||
| The training algorithm is based on the Gensim implementation of Word2Vec, FastText, and Doc2Vec. | ||
| For more information, see: :class:`~gensim.models.word2vec.Word2Vec`, :class:`~gensim.models.fasttext.FastText`, or | ||
| :class:`~gensim.models.doc2vec.Doc2Vec`. | ||
| Initialize and train a :class:`~fse.models.sentence2vec.Sentence2Vec` model | ||
| .. sourcecode:: pycon | ||
| >>> from gensim.models.word2vec import Word2Vec | ||
| >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] | ||
| >>> model = Word2Vec(sentences, min_count=1, size=20) | ||
| >>> from fse.models.average import Average | ||
| >>> from fse.inputs import IndexedSentence | ||
| >>> avg = Average(model) | ||
| >>> avg.train([IndexedSentence(s, i) for i, s in enumerate(sentences)]) | ||
| >>> avg.sv.vectors.shape | ||
| (2, 20) | ||
| """ | ||
| from __future__ import division | ||
| from fse.models.base_s2v import BaseSentence2VecModel | ||
| from fse.inputs import IndexedSentence | ||
| from gensim.models.keyedvectors import BaseKeyedVectors | ||
| from gensim.models.utils_any2vec import ft_ngram_hashes | ||
| from numpy import ndarray, float32 as REAL, sum as np_sum, multiply as np_mult, zeros, max as np_max | ||
| from typing import List | ||
| import logging | ||
| logger = logging.getLogger(__name__) | ||
def train_average_np(model:BaseSentence2VecModel, indexed_sentences:List[IndexedSentence], target:ndarray, memory:ndarray) -> tuple:
    """Training on a sequence of sentences and update the target ndarray.

    Called internally from :meth:`~fse.models.average.Average._do_train_job`.

    Warnings
    --------
    This is the non-optimized, pure Python version. If you have a C compiler,
    fse will use an optimized code path from :mod:`fse.models.average_inner` instead.

    Parameters
    ----------
    model : :class:`~fse.models.base_s2v.BaseSentence2VecModel`
        The BaseSentence2VecModel model instance.
    indexed_sentences : iterable of IndexedSentence
        The sentences used to train the model.
    target : ndarray
        The target ndarray. We use the index from indexed_sentences
        to write into the corresponding row of target.
    memory : ndarray
        Private memory for each working thread. Unused in this numpy path;
        kept for signature compatibility with the cython routine.

    Returns
    -------
    int, int
        Number of effective sentences (non-zero) and effective words in the vocabulary used
        during training the sentence embedding.
    """
    size = model.wv.vector_size
    vocab = model.wv.vocab

    w_vectors = model.wv.vectors
    w_weights = model.word_weights

    s_vectors = target

    is_ft = model.is_ft

    if is_ft:
        # NOTE: For Fasttext: Use wv.vectors_vocab
        # Using the wv.vectors from fasttext had horrible effects on the sts results
        # I suspect this is because the wv.vectors are based on the averages of
        # wv.vectors_vocab + wv.vectors_ngrams, which will point all into very
        # similar directions.
        max_ngrams = model.batch_ngrams
        w_vectors = model.wv.vectors_vocab
        ngram_vectors = model.wv.vectors_ngrams
        min_n = model.wv.min_n
        max_n = model.wv.max_n
        bucket = model.wv.bucket
        oov_weight = np_max(w_weights)

    eff_sentences, eff_words = 0, 0

    if not is_ft:
        for obj in indexed_sentences:
            sent_adr = obj.index
            sent = obj.words
            # Skip empty sentences before counting, so eff_sentences matches
            # the cython path and the fasttext branch below (previously, empty
            # sentences were counted as effective here).
            if not len(sent):
                continue
            eff_sentences += 1
            word_indices = [vocab[word].index for word in sent if word in vocab]
            if not len(word_indices):
                # No in-vocabulary tokens: leave the target row untouched.
                continue
            eff_words += len(word_indices)
            # Weighted average over all in-vocabulary word vectors.
            mem = np_sum(np_mult(w_vectors[word_indices], w_weights[word_indices][:,None]), axis=0)
            mem *= 1/len(word_indices)
            s_vectors[sent_adr] = mem.astype(REAL)
    else:
        for obj in indexed_sentences:
            sent_adr = obj.index
            sent = obj.words
            if not len(sent):
                continue
            mem = zeros(size, dtype=REAL)

            eff_sentences += 1
            eff_words += len(sent) # Counts everything in the sentence

            for word in sent:
                if word in vocab:
                    word_index = vocab[word].index
                    mem += w_vectors[word_index] * w_weights[word_index]
                else:
                    # OOV word: average its ngram vectors, weighted with the
                    # maximum known word weight.
                    ngram_hashes = ft_ngram_hashes(word, min_n, max_n, bucket, True)[:max_ngrams]
                    if len(ngram_hashes) == 0:
                        # Implicit addition of zero if oov does not contain any ngrams
                        continue
                    mem += oov_weight * (np_sum(ngram_vectors[ngram_hashes], axis=0) / len(ngram_hashes))
            s_vectors[sent_adr] = mem / len(sent)

    return eff_sentences, eff_words
# Prefer the optimized cython routines when the compiled extension is
# available; otherwise fall back to the (slower) pure numpy implementation.
try:
    from fse.models.average_inner import train_average_cy
    from fse.models.average_inner import FAST_VERSION, MAX_WORDS_IN_BATCH, MAX_NGRAMS_IN_BATCH
    train_average = train_average_cy
except ImportError:
    # FAST_VERSION == -1 signals "no C extension"; the batch limits mirror
    # the DEF constants compiled into average_inner.pyx.
    FAST_VERSION = -1
    MAX_WORDS_IN_BATCH = 10000
    MAX_NGRAMS_IN_BATCH = 40
    train_average = train_average_np
class Average(BaseSentence2VecModel):
    """ Train, use and evaluate averaged sentence vectors.

    The model can be stored/loaded via its :meth:`~fse.models.average.Average.save` and
    :meth:`~fse.models.average.Average.load` methods.

    Some important attributes are the following:

    Attributes
    ----------
    wv : :class:`~gensim.models.keyedvectors.BaseKeyedVectors`
        This object essentially contains the mapping between words and embeddings. After training, it can be used
        directly to query those embeddings in various ways. See the module level docstring for examples.
    sv : :class:`~fse.models.sentencevectors.SentenceVectors`
        This object contains the sentence vectors inferred from the training data. There will be one such vector
        for each unique sentence supplied during training. They may be individually accessed using the index.
    prep : :class:`~fse.models.base_s2v.BaseSentence2VecPreparer`
        The prep object is used to transform and initialize the sv.vectors. Additionally, it can be used
        to move the vectors to disk for training with memmap.
    """

    def __init__(self, model:BaseKeyedVectors, sv_mapfile_path:str=None, wv_mapfile_path:str=None, workers:int=1, lang_freq:str=None, **kwargs):
        """ Average (unweighted) sentence embeddings model. Performs a simple averaging operation over all
        words in a sentences without further transformation.

        The implementation is based on Iyyer et al. (2015): Deep Unordered Composition Rivals Syntactic Methods for Text Classification.
        For more information, see <https://people.cs.umass.edu/~miyyer/pubs/2015_acl_dan.pdf>.

        Parameters
        ----------
        model : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
            This object essentially contains the mapping between words and embeddings. To compute the sentence embeddings
            the wv.vocab and wv.vector elements are required.
        sv_mapfile_path : str, optional
            Optional path to store the sentence-vectors in for very large datasets. Used for memmap.
        wv_mapfile_path : str, optional
            Optional path to store the word-vectors in for very large datasets. Used for memmap.
            Use sv_mapfile_path and wv_mapfile_path to train disk-to-disk without needing much ram.
        workers : int, optional
            Number of working threads, used for multithreading. For most tasks (few words in a sentence)
            a value of 1 should be more than enough.
        lang_freq : str, optional
            Some pre-trained embeddings, i.e. "GoogleNews-vectors-negative300.bin", do not contain information about
            the frequency of a word. As the frequency is required for estimating the word weights, we induce
            frequencies into the wv.vocab.count based on :class:`~wordfreq`
            If no frequency information is available, you can choose the language to estimate the frequency.
            See https://github.com/LuminosoInsight/wordfreq
        """
        super(Average, self).__init__(
            model=model, sv_mapfile_path=sv_mapfile_path, wv_mapfile_path=wv_mapfile_path,
            workers=workers, lang_freq=lang_freq,
            batch_words=MAX_WORDS_IN_BATCH, batch_ngrams=MAX_NGRAMS_IN_BATCH,
            fast_version=FAST_VERSION
        )

    def _do_train_job(self, data_iterable:List[IndexedSentence], target:ndarray, memory:ndarray) -> [int, int]:
        """ Internal routine which is called on training and performs averaging for all entries in the iterable.

        Returns the number of effective sentences and effective words processed.
        """
        eff_sentences, eff_words = train_average(model=self, indexed_sentences=data_iterable, target=target, memory=memory)
        return eff_sentences, eff_words

    def _check_parameter_sanity(self, **kwargs):
        """ Check the sanity of all child parameters.

        Raises ValueError if any word weight differs from one, since plain
        averaging must be unweighted.
        """
        if not all(self.word_weights == 1.):
            raise ValueError("All word weights must equal one for averaging")

    def _pre_train_calls(self, **kwargs):
        """ Function calls to perform before training """
        pass

    def _post_train_calls(self, **kwargs):
        """ Function calls to perform after training, such as computing eigenvectors """
        pass

    def _post_inference_calls(self, **kwargs):
        """ Function calls to perform after training & inference
        Examples include the removal of components
        """
        pass

    # NOTE(review): method name keeps the upstream typo ("santiy") — presumably
    # this is the exact hook name invoked by BaseSentence2VecModel; confirm
    # upstream before renaming.
    def _check_dtype_santiy(self, **kwargs):
        """ Check the dtypes of all child attributes """
        pass
| #!/usr/bin/env python | ||
| # -*- coding: utf-8 -*- | ||
| # Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| # Copyright (C) 2019 Oliver Borchers | ||
| # Licensed under GNU General Public License v3.0 | ||
| """Base class containing common methods for training, using & evaluating sentence embeddings. | ||
| A lot of the code is based on Gensim. I have to thank Radim Rehurek and the whole team | ||
| for the outstanding library which I used for a lot of my research. | ||
| Attributes | ||
| ---------- | ||
| wv : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` | ||
| This object essentially contains the mapping between words and embeddings. After training, it can be used | ||
| directly to query those embeddings in various ways. See the module level docstring for examples. | ||
| sv : :class:`~fse.models.sentencevectors.SentenceVectors` | ||
| This object contains the sentence vectors inferred from the training data. There will be one such vector | ||
| for each unique sentence supplied during training. They may be individually accessed using the index. | ||
| prep : :class:`~fse.models.base_s2v.BaseSentence2VecPreparer` | ||
| The prep object is used to transform and initialize the sv.vectors. Additionally, it can be used | ||
| to move the vectors to disk for training with memmap. | ||
| See Also | ||
| -------- | ||
| :class:`~fse.models.average.Average`. | ||
| Average sentence model. | ||
| :class:`~fse.models.sif.SIF`. | ||
| Smooth inverse frequency weighted model. | ||
| :class:`~fse.models.usif.uSIF`. | ||
| Unsupervised Smooth inverse frequency weighted model. | ||
| """ | ||
| from fse.models.sentencevectors import SentenceVectors | ||
| from fse.inputs import IndexedSentence | ||
| from gensim.models.base_any2vec import BaseWordEmbeddingsModel | ||
| from gensim.models.keyedvectors import BaseKeyedVectors, FastTextKeyedVectors, _l2_norm | ||
| from gensim.utils import SaveLoad | ||
| from gensim.matutils import zeros_aligned | ||
| from numpy import ndarray, memmap as np_memmap, float32 as REAL, uint32 as uINT, empty, zeros, vstack, dtype, ones | ||
| from wordfreq import available_languages, get_frequency_dict | ||
| from typing import List, Dict | ||
| from time import time | ||
| from psutil import virtual_memory | ||
| from pathlib import Path | ||
| import logging | ||
| import warnings | ||
| import threading | ||
| from queue import Queue | ||
| logger = logging.getLogger(__name__) | ||
| class BaseSentence2VecModel(SaveLoad): | ||
    def __init__(self, model:BaseKeyedVectors, sv_mapfile_path:str=None, wv_mapfile_path:str=None, workers:int=1, lang_freq:str=None, fast_version:int=0, batch_words:int=10000, batch_ngrams:int=40, **kwargs):
        """ Base class for all Sentence2Vec Models. Provides core functionality, such as
        save, load, sanity checking, frequency induction, data checking, scanning, etc.

        Parameters
        ----------
        model : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
            This object essentially contains the mapping between words and embeddings. To compute the sentence embeddings
            the wv.vocab and wv.vector elements are required.
        sv_mapfile_path : str, optional
            Optional path to store the sentence-vectors in for very large datasets. Used for memmap.
        wv_mapfile_path : str, optional
            Optional path to store the word-vectors in for very large datasets. Used for memmap.
            Use sv_mapfile_path and wv_mapfile_path to train disk-to-disk without needing much ram.
        workers : int, optional
            Number of working threads, used for multithreading. For most tasks (few words in a sentence)
            a value of 1 should be more than enough.
        lang_freq : str, optional
            Some pre-trained embeddings, i.e. "GoogleNews-vectors-negative300.bin", do not contain information about
            the frequency of a word. As the frequency is required for estimating the word weights, we induce
            frequencies into the wv.vocab.count based on :class:`~wordfreq`
            If no frequency information is available, you can choose the language to estimate the frequency.
            See https://github.com/LuminosoInsight/wordfreq
        fast_version : {-1, 1}, optional
            Whether or not the fast cython implementation of the internal training methods is available. 1 means it is.
        batch_words : int, optional
            Number of words to be processed by a single job.
        batch_ngrams : int, optional
            Number of maximum ngrams for oov words.
        **kwargs : object
            Key word arguments needed to allow children classes to accept more arguments.
        """
        self.workers = int(workers)
        self.batch_words = batch_words
        self.batch_ngrams = batch_ngrams
        self.wv = None  # KeyedVectors instance supplying the word embeddings; set below

        self.is_ft = False  # Flipped by _check_and_include_model for FastText models

        self.wv_mapfile_path = Path(wv_mapfile_path) if wv_mapfile_path is not None else None
        self.wv_mapfile_shapes = {}  # Array shapes needed to re-open the memmaps on load

        if fast_version < 0:
            warnings.warn(
                "C extension not loaded, training/inferring will be slow. "
                "Install a C compiler and reinstall fse."
            )

        self._check_and_include_model(model)

        # Optionally move all word vectors to disk so they do not occupy RAM
        if self.wv_mapfile_path is not None:
            self._map_all_vectors_to_disk(self.wv_mapfile_path)

        # Induce word frequencies from wordfreq when embeddings ship without counts
        if lang_freq is not None:
            self._check_language_settings(lang_freq)
            self._induce_frequencies()

        self.sv = SentenceVectors(vector_size=self.wv.vector_size, mapfile_path=sv_mapfile_path)
        self.prep = BaseSentence2VecPreparer()

        # Uniform weights by default; subclasses (e.g. SIF/uSIF) overwrite these
        self.word_weights = ones(len(self.wv.vocab), REAL)
| def __str__(self) -> str: | ||
| """ Human readable representation of the model's state. | ||
| Returns | ||
| ------- | ||
| str | ||
| Human readable representation of the model's state. | ||
| """ | ||
| return f"{self.__class__.__name__} based on {self.wv.__class__.__name__}, size={len(self.sv)}" | ||
| def _check_and_include_model(self, model:BaseKeyedVectors): | ||
| """ Check if the supplied model is a compatible model. Performs all kinds of checks and small optimizations. | ||
| Parameters | ||
| ---------- | ||
| model : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` | ||
| The model to inject into this class. | ||
| """ | ||
| if isinstance(model, BaseWordEmbeddingsModel): | ||
| self.wv = model.wv | ||
| elif isinstance(model, BaseKeyedVectors): | ||
| self.wv = model | ||
| else: | ||
| raise RuntimeError(f"Model must be child of BaseWordEmbeddingsModel or BaseKeyedVectors. Received {str(model)}") | ||
| self.wv.vectors_norm = None | ||
| if isinstance(self.wv, FastTextKeyedVectors): | ||
| self.wv.vectors_vocab_norm = None # Save some space | ||
| self.wv.vectors_ngrams_norm = None | ||
| self.wv.vectors_vocab_norm = None | ||
| self.is_ft = True | ||
| if not self.wv.compatible_hash: | ||
| raise RuntimeError("FastText model requires compatible hash function") | ||
| if not hasattr(self.wv, 'vectors_vocab') or self.wv.vectors_vocab is None: | ||
| raise RuntimeError("vectors_vocab required for sentence embeddings not found.") | ||
| if not hasattr(self.wv, 'vectors_ngrams') or self.wv.vectors_ngrams is None: | ||
| raise RuntimeError("Ngram vectors required for sentence embeddings not found.") | ||
| if not hasattr(self.wv, 'vectors') or self.wv.vectors is None: | ||
| raise RuntimeError("Word vectors required for sentence embeddings not found.") | ||
| if not hasattr(self.wv, 'vocab'): | ||
| raise RuntimeError("Vocab required for sentence embeddings not found.") | ||
| def _check_language_settings(self, lang_freq:str): | ||
| """ Check if the supplied language is a compatible with the wordfreq package | ||
| Parameters | ||
| ---------- | ||
| lang_freq : str | ||
| The language used to induce the frequencies into the wv.vocab object. | ||
| """ | ||
| if lang_freq in available_languages(wordlist='best'): | ||
| self.lang_freq = str(lang_freq) | ||
| logger.info("no frequency mode: using wordfreq for estimation " | ||
| f"of frequency for language: {self.lang_freq}") | ||
| else: | ||
| raise ValueError(f"Language {lang_freq} is not available in wordfreq") | ||
| def _induce_frequencies(self, domain:int=2**31 - 1): | ||
| """ Induce frequencies for a pretrained model, as not all pretrained models come with frequencies. | ||
| Parameters | ||
| ---------- | ||
| domain : int | ||
| The cumulative count of the vocabulary. | ||
| """ | ||
| freq_dict = get_frequency_dict(self.lang_freq, wordlist='best') | ||
| for word in self.wv.index2word: | ||
| if word in freq_dict: | ||
| self.wv.vocab[word].count = int(freq_dict[word] * domain) | ||
| else: | ||
| self.wv.vocab[word].count = int(1e-8 * domain) | ||
| def _check_input_data_sanity(self, data_iterable:IndexedSentence): | ||
| """ Check if the input data complies with the required formats | ||
| Parameters | ||
| ---------- | ||
| data_iterable : IndexedSentence | ||
| The cumulative count of the vocabulary. | ||
| """ | ||
| if data_iterable is None: | ||
| raise TypeError("You must provide a data iterable to train on") | ||
| elif isinstance(data_iterable, str): | ||
| raise TypeError("Passed string. Input data must be iterable list of list of tokens or IndexedSentence") | ||
| elif not hasattr(data_iterable, "__iter__"): | ||
| raise TypeError("Iterable must provide __iter__ function") | ||
| def _log_train_end(self, eff_sentences:int, eff_words:int, overall_time:float): | ||
| """ Log the end of training. | ||
| Parameters | ||
| ---------- | ||
| eff_sentences : int | ||
| Number of effective (non-zero) sentences encountered in training. | ||
| eff_words : int | ||
| Number of effective words used in training (after ignoring unknown words). | ||
| overall_time : float | ||
| Time in seconds for the task to be completed. | ||
| """ | ||
| logger.info( | ||
| f"training on {eff_sentences} effective sentences with {eff_words} effective words " | ||
| f"took {int(overall_time)}s with {int(eff_sentences / overall_time)} sentences/s" | ||
| ) | ||
| def _check_pre_training_sanity(self, total_sentences:int, total_words:int, average_length:int, **kwargs): | ||
| """ Check if all available objects for training are available and compliant | ||
| Parameters | ||
| ---------- | ||
| total_sentences : int | ||
| Number of sentences encountered while scanning | ||
| total_words : int | ||
| Number of words encountered while scanning | ||
| average_length : int | ||
| Average sentence length | ||
| """ | ||
| if not hasattr(self, "wv") or self.wv is None: | ||
| raise RuntimeError("you must first load a valid BaseKeyedVectors object") | ||
| if not len(self.wv.vectors): | ||
| raise RuntimeError("you must initialize vectors before computing sentence vectors") | ||
| if self.is_ft and not len(self.wv.vectors_ngrams): | ||
| raise RuntimeError("you must initialize ngram vectors before computing sentence vectors") | ||
| if self.is_ft and not len(self.wv.vectors_vocab): | ||
| raise RuntimeError("you must initialize vectors_vocab before computing sentence vectors") | ||
| if sum([self.wv.vocab[w].count for w in self.wv.vocab]) == len(self.wv.vocab): | ||
| logger.warning( | ||
| "The sum of the word counts is equal to its length (all word counts are 1). " | ||
| "Make sure to obtain proper word counts by using lang_freq for pretrained embeddings." | ||
| ) | ||
| if not hasattr(self.sv, "vectors") or self.sv.vectors is None: | ||
| raise RuntimeError("initialization of Sentencevectors failed") | ||
| if not hasattr(self, "word_weights") or self.word_weights is None: | ||
| raise RuntimeError("initialization of word weights failed") | ||
| if not len(self.wv.vectors) == len(self.word_weights): | ||
| raise RuntimeError("Number of word vectors and weights does not match") | ||
| if self.wv.vectors.dtype != REAL: | ||
| raise TypeError(f"type of wv.vectors is wrong: {self.wv.vectors.dtype}") | ||
| if self.is_ft and self.wv.vectors_ngrams.dtype != REAL: | ||
| raise TypeError(f"type of wv.vectors_ngrams is wrong: {self.wv.vectors_ngrams.dtype}") | ||
| if self.is_ft and self.wv.vectors_vocab.dtype != REAL: | ||
| raise TypeError(f"type of wv.vectors_vocab is wrong: {self.wv.vectors_vocab.dtype}") | ||
| if self.sv.vectors.dtype != REAL: | ||
| raise TypeError(f"type of sv.vectors is wrong: {self.sv.vectors.dtype}") | ||
| if self.word_weights.dtype != REAL: | ||
| raise TypeError(f"type of word_weights is wrong: {self.word_weights.dtype}") | ||
| if total_sentences is 0 or total_words is 0 or average_length is 0: | ||
| raise ValueError( | ||
| f"scanning the sentences returned invalid values. Check the input." | ||
| ) | ||
| def _check_post_training_sanity(self, eff_sentences:int, eff_words:int): | ||
| """ Check if the training results make sense | ||
| Parameters | ||
| ---------- | ||
| eff_sentences : int | ||
| Number of effective sentences encountered during training | ||
| eff_words : int | ||
| Number of effective words encountered during training | ||
| """ | ||
| if eff_sentences is 0 or eff_words is 0: | ||
| raise ValueError( | ||
| f"training returned invalid values. Check the input." | ||
| ) | ||
| def _check_indexed_sent_valid(self, iterPos:int, obj:IndexedSentence) -> [int, List[str]]: | ||
| """ Performs a check if the passed object contains valid data | ||
| Parameters | ||
| ---------- | ||
| iterPos : int | ||
| Position in file/iterable | ||
| obj : IndexedSentence | ||
| An IndexedSentence object containing the index and sentence | ||
| Returns | ||
| ------- | ||
| int | ||
| Index of the sentence used to write to (in sv.vectors) | ||
| list | ||
| List of strings containing all words in a sentence | ||
| """ | ||
| if isinstance(obj, IndexedSentence): | ||
| index = obj.index | ||
| sent = obj.words | ||
| else: | ||
| raise TypeError(f"Passed {type(obj)}: {obj}. Iterable must contain IndexedSentence.") | ||
| if not isinstance(sent, list) or not all(isinstance(w, str) for w in sent): | ||
| raise TypeError(f"At {iterPos}: Passed {type(sent)}: {sent}. IndexedSentence.words must contain list of str.") | ||
| if not isinstance(index, int): | ||
| raise TypeError(f"At {iterPos}: Passed {type(index)}: {index}. IndexedSentence.index must contain index") | ||
| if index < 0: | ||
| raise ValueError(f"At {iterPos}: Passed negative {index}") | ||
| return index, sent | ||
| def _map_all_vectors_to_disk(self, mapfile_path:Path): | ||
| """ Maps all vectors to disk | ||
| Parameters | ||
| ---------- | ||
| mapfile_path : Path | ||
| Path where to write the vectors to | ||
| """ | ||
| path = str(mapfile_path.absolute()) | ||
| self.wv_mapfile_shapes["vectors"] = self.wv.vectors.shape | ||
| self.wv.vectors = self._move_ndarray_to_disk(self.wv.vectors, mapfile_path=path, name="wv") | ||
| if self.is_ft: | ||
| self.wv_mapfile_shapes["vectors_vocab"] = self.wv.vectors_vocab.shape | ||
| self.wv_mapfile_shapes["vectors_ngrams"] = self.wv.vectors_ngrams.shape | ||
| self.wv.vectors_vocab = self._move_ndarray_to_disk(self.wv.vectors_vocab, mapfile_path=self.wv_mapfile_path, name="vocab") | ||
| self.wv.vectors_ngrams = self._move_ndarray_to_disk(self.wv.vectors_ngrams, mapfile_path=self.wv_mapfile_path, name="ngrams") | ||
    def _load_all_vectors_from_disk(self, mapfile_path:Path):
        """ Reads all vectors from disk

        Re-attaches the word-vector arrays as read-only memmaps, using the shapes
        recorded in self.wv_mapfile_shapes when the vectors were written.

        Parameters
        ----------
        mapfile_path : Path
            Path where to read the vectors from
        """
        path = str(mapfile_path.absolute())

        # File names mirror the "<path>_<name>.vectors" scheme of _move_ndarray_to_disk
        self.wv.vectors = np_memmap(f"{path}_wv.vectors", dtype=REAL, mode='r', shape=self.wv_mapfile_shapes["vectors"])
        if self.is_ft:
            self.wv.vectors_vocab = np_memmap(
                f"{path}_vocab.vectors", dtype=REAL, mode='r', shape=self.wv_mapfile_shapes["vectors_vocab"])
            self.wv.vectors_ngrams = np_memmap(
                f"{path}_ngrams.vectors", dtype=REAL, mode='r', shape=self.wv_mapfile_shapes["vectors_ngrams"])
| def _move_ndarray_to_disk(self, vector:ndarray, mapfile_path:str, name:str="") -> ndarray: | ||
| """ Moves a numpy ndarray to disk via memmap | ||
| Parameters | ||
| ---------- | ||
| vector : ndarray | ||
| The vector to write to disk | ||
| mapfile_path : Path | ||
| Path where to write the vector to | ||
| name : str | ||
| Suffix which is appended to the path to distinguish multiple files | ||
| Returns | ||
| ------- | ||
| ndarray | ||
| readonly ndarray to be used in further computations | ||
| """ | ||
| shape = vector.shape | ||
| path = Path(f"{mapfile_path}_{name}.vectors") | ||
| if not path.exists(): | ||
| logger.info(f"writing {name} to {path}") | ||
| memvecs = np_memmap( | ||
| path, dtype=REAL, | ||
| mode='w+', shape=shape) | ||
| memvecs[:] = vector[:] | ||
| del memvecs, vector | ||
| else: | ||
| # If multiple instances of this class exist, all can access the same files | ||
| logger.info(f"loading pre-existing {name} from {path}") | ||
| readonly_memvecs = np_memmap(path, dtype=REAL, mode='r', shape=shape) | ||
| return readonly_memvecs | ||
| def _get_thread_working_mem(self) -> [ndarray, ndarray]: | ||
| """Computes the memory used per worker thread. | ||
| Returns | ||
| ------- | ||
| np.ndarray | ||
| Each worker threads private work memory. | ||
| """ | ||
| mem = zeros_aligned(self.sv.vector_size, dtype=REAL) | ||
| oov_mem = zeros_aligned((self.batch_words, self.batch_ngrams), dtype=uINT) | ||
| return (mem, oov_mem) | ||
    def _do_train_job(self, data_iterable:List[IndexedSentence], target:ndarray, memory:ndarray) -> Tuple[int, int]:
        """ Function to be called on a batch of sentences. Returns eff sentences/words.
        Must be implemented by subclasses (e.g. Average, SIF, uSIF).
        """
        raise NotImplementedError()
    def _pre_train_calls(self, **kwargs):
        """ Function calls to perform before training, e.g. computing the word weights.
        Must be implemented by subclasses.
        """
        raise NotImplementedError()
    def _post_train_calls(self, **kwargs):
        """ Function calls to perform after training, such as computing eigenvectors.
        Must be implemented by subclasses.
        """
        raise NotImplementedError()
    def _post_inference_calls(self, **kwargs):
        """ Function calls to perform after training & inference.
        Examples include the removal of components.
        Must be implemented by subclasses.
        """
        raise NotImplementedError()
    def _check_parameter_sanity(self, **kwargs):
        """ Check the sanity of all child parameters. Must be implemented by subclasses. """
        raise NotImplementedError()
    def _check_dtype_santiy(self, **kwargs):
        """ Check the dtypes of all child attributes. Must be implemented by subclasses.
        NOTE: the method name keeps its historical misspelling ("santiy") because
        subclasses and train() call it under this name.
        """
        raise NotImplementedError()
    @classmethod
    def load(cls, *args, **kwargs):
        """ Load a previously saved :class:`~fse.models.base_s2v.BaseSentence2VecModel`.

        Parameters
        ----------
        fname : str
            Path to the saved file.

        Returns
        -------
        :class:`~fse.models.base_s2v.BaseSentence2VecModel`
            Loaded model.
        """
        # This is kind of an ugly hack because I cannot directly modify the save routine of the
        # corresponding KeyedVectors files, as a memmap file makes the npy files irrelevant
        model = super(BaseSentence2VecModel, cls).load(*args, **kwargs)
        # Word vectors were excluded from the pickle (see save); re-attach them from disk
        if model.wv_mapfile_path is not None:
            model._load_all_vectors_from_disk(model.wv_mapfile_path)
        model.wv_mapfile_shapes = None
        return model
    def save(self, *args, **kwargs):
        """ Save the model.
        This saved model can be loaded again using :func:`~fse.models.base_s2v.BaseSentence2VecModel.load`

        Parameters
        ----------
        fname : str
            Path to the file.
        """
        # Manually removes vectors from the wv class because we cannot modify the save method.
        # The arrays live on disk already (memmap), so pickling them would be redundant.
        # NOTE(review): the references are not restored after saving — the in-memory model
        # loses its word vectors once saved with a wv_mapfile_path; confirm this is intended.
        if self.wv_mapfile_path is not None:
            self.wv.vectors = None
            if self.is_ft:
                self.wv.vectors_vocab = None
                self.wv.vectors_ngrams = None
        super(BaseSentence2VecModel, self).save(*args, **kwargs)
| def scan_sentences(self, sentences:List[IndexedSentence]=None, progress_per:int=5) -> Dict[str,int]: | ||
| """ Performs an initial scan of the data and reports all corresponding statistics | ||
| Parameters | ||
| ---------- | ||
| sentences : (list, iterable) | ||
| An iterable consisting of IndexedSentence objects | ||
| progress_per : int | ||
| Number of seconds to pass before reporting the scan progress | ||
| Returns | ||
| ------- | ||
| dict | ||
| Dictionary containing the scan statistics | ||
| """ | ||
| logger.info("scanning all indexed sentences and their word counts") | ||
| current_time = time() | ||
| total_sentences = 0 | ||
| total_words = 0 | ||
| average_length = 0 | ||
| empty_sentences = 0 | ||
| max_index = 0 | ||
| for i, obj in enumerate(sentences): | ||
| index, sent = self._check_indexed_sent_valid(iterPos=i, obj=obj) | ||
| if time() - current_time > progress_per: | ||
| current_time = time() | ||
| logger.info(f"SCANNING : finished {total_sentences} sentences with {total_words} words") | ||
| max_index = max(max_index, index) | ||
| total_sentences += 1 | ||
| total_words += len(sent) | ||
| if not len(sent): | ||
| empty_sentences += 1 | ||
| if empty_sentences: | ||
| logger.warning(f"found {empty_sentences} empty sentences") | ||
| if max_index >= total_sentences: | ||
| raise RuntimeError(f"Index {max_index} is larger than number of sentences {total_sentences}") | ||
| average_length = int(total_words / total_sentences) | ||
| logger.info( | ||
| f"finished scanning {total_sentences} sentences with an average length of {average_length} and {total_words} total words" | ||
| ) | ||
| statistics = { | ||
| "total_sentences" : total_sentences, | ||
| "total_words" : total_words, | ||
| "average_length" : average_length, | ||
| "empty_sentences" : empty_sentences, | ||
| "max_index" : max_index + 1 | ||
| } | ||
| return statistics | ||
| def estimate_memory(self, total_sentences:int, report:dict=None, **kwargs) -> Dict[str, int]: | ||
| """ Estimate the size of the sentence embedding | ||
| Parameters | ||
| ---------- | ||
| total_sentences : int | ||
| Number of sentences found during the initial scan | ||
| report : dict | ||
| Report of subclasses | ||
| Returns | ||
| ------- | ||
| dict | ||
| Dictionary of estimated memory sizes | ||
| """ | ||
| vocab_size = len(self.wv.vectors) | ||
| report = report or {} | ||
| report["Word Weights"] = vocab_size * dtype(REAL).itemsize | ||
| report["Word Vectors"] = vocab_size * self.wv.vector_size * dtype(REAL).itemsize | ||
| report["Sentence Vectors"] = total_sentences * self.wv.vector_size * dtype(REAL).itemsize | ||
| if self.is_ft: | ||
| report["Vocab Vectors"] = vocab_size * self.wv.vector_size * dtype(REAL).itemsize | ||
| report["Ngram Vectors"] = self.wv.vectors_ngrams.shape[0] * self.wv.vector_size * dtype(REAL).itemsize | ||
| report["Total"] = sum(report.values()) | ||
| mb_size = int(report["Total"] / 1024**2) | ||
| logger.info( | ||
| f"estimated memory for {total_sentences} sentences with " | ||
| f"{self.wv.vector_size} dimensions and {vocab_size} vocabulary: " | ||
| f"{mb_size} MB ({int(mb_size / 1024)} GB)" | ||
| ) | ||
| if report["Total"] >= 0.95 * virtual_memory()[1]: | ||
| logger.warning("The embeddings will likely not fit into RAM. Consider to use mapfile_path") | ||
| return report | ||
    def train(self, sentences:List[IndexedSentence]=None, update:bool=False, queue_factor:int=2, report_delay:int=5) -> Tuple[int,int]:
        """ Main routine to train an embedding. This method writes all sentences vectors into sv.vectors and is
        used for computing embeddings for large chunks of data. This method also handles post-training transformations,
        such as computing the SVD of the sentence vectors.

        Parameters
        ----------
        sentences : (list, iterable)
            An iterable consisting of IndexedSentence objects
        update : bool
            If bool is True, the sentence vector matrix will be updated in size (even with memmap)
        queue_factor : int
            Multiplier for size of queue -> size = number of workers * queue_factor.
        report_delay : int
            Number of seconds between two consecutive progress report messages in the logger.

        Returns
        -------
        int, int
            Count of effective sentences and words encountered
        """
        self._check_input_data_sanity(sentences)
        statistics = self.scan_sentences(sentences)
        self._check_pre_training_sanity(**statistics)

        self.estimate_memory(**statistics)
        # "max_index" already equals the number of rows required (largest index + 1)
        self.prep.prepare_vectors(sv=self.sv, total_sentences=statistics["max_index"], update=update)

        # Perform pre-train calls (i.e. weight computation)
        self._pre_train_calls(**statistics)
        self._check_parameter_sanity()
        self._check_dtype_santiy()
        start_time = time()

        logger.info(f"begin training")

        # NOTE(review): "max_index" already includes the +1 from scan_sentences, so the
        # additional +1 here makes the progress denominator one larger than the row
        # count — verify whether this is intentional.
        _, eff_sentences, eff_words = self._train_manager(data_iterable=sentences, total_sentences=statistics["max_index"]+1, queue_factor=queue_factor, report_delay=report_delay)

        overall_time = time() - start_time

        self._check_post_training_sanity(eff_sentences=eff_sentences, eff_words=eff_words)

        # Perform post-train calls (i.e. principal component removal)
        self._post_train_calls()

        self._log_train_end(eff_sentences=eff_sentences, eff_words=eff_words, overall_time=overall_time)

        return eff_sentences, eff_words
    def infer(self, sentences:List[IndexedSentence]=None, use_norm=False) -> ndarray:
        """ Secondary routine to train an embedding. This method is essential for small batches of sentences,
        which require little computation. Note: This method does not apply post-training transformations,
        only post inference calls (such as removing principal components).

        Parameters
        ----------
        sentences : (list, iterable)
            An iterable consisting of IndexedSentence objects
        use_norm : bool
            If bool is True, the sentence vectors will be L2 normalized (unit euclidean length)

        Returns
        -------
        ndarray
            Computed sentence vectors
        """
        self._check_input_data_sanity(sentences)

        statistics = self.scan_sentences(sentences)

        # Results are written into a fresh local array instead of sv.vectors
        output = zeros((statistics["max_index"], self.sv.vector_size), dtype=REAL)
        mem = zeros(self.sv.vector_size, dtype=REAL)

        # Single-threaded: inference batches are assumed to be small
        self._do_train_job(data_iterable=sentences, target=output, memory=mem)

        self._post_inference_calls(output=output)

        if use_norm:
            output = _l2_norm(output)
        return output
    def _train_manager(self, data_iterable:List[IndexedSentence], total_sentences:int=None, queue_factor:int=2, report_delay:int=5):
        """ Manager for the multi-core implementation. Directly adapted from gensim

        Starts one producer thread that chunks the input into jobs and `self.workers`
        consumer threads that train on those jobs, then blocks while logging progress
        until all workers report completion.

        Parameters
        ----------
        data_iterable : (list, iterable)
            An iterable consisting of IndexedSentence objects. This will be split in chunks and these chunks will be pushed to the queue.
        total_sentences : int
            Number of sentences found during the initial scan
        queue_factor : int
            Multiplier for size of queue -> size = number of workers * queue_factor.
        report_delay : int
            Number of seconds between two consecutive progress report messages in the logger.
        """
        # Bounded queues apply back-pressure so the producer cannot outrun the workers
        job_queue = Queue(maxsize=queue_factor * self.workers)
        progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)

        # WORKING Threads
        workers = [
            threading.Thread(
                target=self._worker_loop,
                args=(job_queue, progress_queue))
            for _ in range(self.workers)
        ]
        # JOB PRODUCER
        workers.append(
            threading.Thread(
                target=self._job_producer,
                args=(data_iterable, job_queue))
        )

        for thread in workers:
            thread.daemon = True # make interrupting the process with ctrl+c easier
            thread.start()

        # Blocks until every worker has pushed its final None sentinel
        jobs, eff_sentences, eff_words = self._log_train_progress(
            progress_queue, total_sentences=total_sentences,
            report_delay=report_delay
        )
        return jobs, eff_sentences, eff_words
    def _worker_loop(self, job_queue, progress_queue):
        """ Train the model, lifting batches of data from the queue.

        This function will be called in parallel by multiple workers (threads or processes) to make
        optimal use of multicore machines.

        Parameters
        ----------
        job_queue : Queue of (list of IndexedSentence)
            A queue of jobs still to be processed. The worker will take up jobs from this queue.
            Each job is represented as a batch of IndexedSentence.
        progress_queue : Queue of (int, int, int)
            A queue of progress reports. Each report is represented as a tuple of these 3 elements:
                * Size of job processed
                * Effective sentences encountered in training
                * Effective words encountered in training
        """
        # Per-thread working memory tuple (summation buffer, oov ngram buffer)
        mem = self._get_thread_working_mem()
        jobs_processed = 0
        while True:
            job = job_queue.get()
            if job is None:
                # Sentinel from the producer: forward it so the progress logger can
                # count finished workers, then quit this worker.
                progress_queue.put(None)
                break

            # Writes directly into the shared sv.vectors array; sentence indices are
            # disjoint across jobs, so workers touch disjoint rows.
            eff_sentences, eff_words = self._do_train_job(data_iterable=job, target=self.sv.vectors, memory=mem)

            progress_queue.put((len(job), eff_sentences, eff_words))
            jobs_processed += 1
        logger.debug(f"worker exiting, processed {jobs_processed} jobs")
    def _job_producer(self, data_iterable:List[IndexedSentence], job_queue:Queue):
        """ Fill the jobs queue using the data found in the input stream.

        Each job is represented as a batch of IndexedSentence whose combined word
        count is at most self.batch_words.

        Parameters
        ----------
        data_iterable : (list, iterable)
            An iterable consisting of IndexedSentence objects. This will be split in chunks and these chunks will be pushed to the queue.
        job_queue : Queue of (list of IndexedSentence)
            A queue of jobs still to be processed. The worker will take up jobs from this queue.
            Each job is represented as a batch of IndexedSentence.
        """
        job_batch, batch_size = [], 0
        job_no = 0

        for data_idx, data in enumerate(data_iterable):
            data_length = len(data.words)
            # Greedy batching: add to the current batch while it stays under batch_words
            if batch_size + data_length <= self.batch_words:
                job_batch.append(data)
                batch_size += data_length
            else:
                job_no += 1
                job_queue.put(job_batch)
                # Start a new batch with the sentence that did not fit
                job_batch, batch_size = [data], data_length

        # Flush the trailing partial batch
        if job_batch:
            job_no += 1
            job_queue.put(job_batch)

        # One None sentinel per worker signals end-of-input
        for _ in range(self.workers):
            job_queue.put(None)
        logger.debug(f"job loop exiting, total {job_no} jobs")
    def _log_train_progress(self, progress_queue:Queue, total_sentences:int=None, report_delay:int=5):
        """ Log the training process after a couple of seconds.

        Consumes the progress queue until every worker has reported a None sentinel,
        accumulating job/sentence/word counters along the way.

        Parameters
        ----------
        progress_queue : Queue of (int, int, int)
            A queue of progress reports. Each report is represented as a tuple of these 3 elements:
                * Size of job processed
                * Effective sentences encountered in training
                * Effective words encountered in training
        total_sentences : int
            Number of sentences found during the initial scan
        report_delay : int
            Number of seconds between two consecutive progress report messages in the logger.

        Returns
        -------
        int, int, int
            number of jobs, effective sentences, and effective words in training
        """
        jobs, eff_sentences, eff_words = 0, 0, 0
        unfinished_worker_count = self.workers
        start_time = time()
        sentence_inc = 0
        while unfinished_worker_count > 0:
            report = progress_queue.get()
            if report is None:  # a thread reporting that it finished
                unfinished_worker_count -= 1
                logger.info(f"worker thread finished; awaiting finish of {unfinished_worker_count} more threads")
                continue
            j, s, w = report
            jobs += j
            eff_sentences += s
            eff_words += w
            # Throttled progress logging; rate is computed over the last report window
            if time() - start_time >= report_delay:
                start_time = time()
                logger.info("PROGRESS : finished {:3.2f}% with {} sentences and {} words, {} sentences/s".format(
                    100 * (eff_sentences/total_sentences),
                    eff_sentences, eff_words,
                    int((eff_sentences-sentence_inc) / report_delay)
                ))
                sentence_inc = eff_sentences

        return jobs, eff_sentences, eff_words
class BaseSentence2VecPreparer(SaveLoad):
    """ Contains helper functions to prepare the weights for the training of BaseSentence2VecModel """

    def prepare_vectors(self, sv:SentenceVectors, total_sentences:int, update:bool=False):
        """Build tables and model weights based on final vocabulary settings."""
        if not update:
            self.reset_vectors(sv, total_sentences)
        else:
            self.update_vectors(sv, total_sentences)

    def reset_vectors(self, sv:SentenceVectors, total_sentences:int):
        """Initialize all sentence vectors to zero and overwrite existing files"""
        logger.info(f"initializing sentence vectors for {total_sentences} sentences")
        if sv.mapfile_path:
            sv.vectors = np_memmap(
                str(sv.mapfile_path) + '.vectors', dtype=REAL,
                mode='w+', shape=(total_sentences, sv.vector_size))
            # A mode="w+" memmap is created zero-filled; zero explicitly anyway so an
            # overwritten pre-existing file is also guaranteed to be reset.
            sv.vectors[:] = 0.
        else:
            # IDIOM/PERF: allocate zeroed in one call instead of empty() followed by
            # a Python loop assigning zeros row by row.
            sv.vectors = zeros((total_sentences, sv.vector_size), dtype=REAL)
        sv.vectors_norm = None

    def update_vectors(self, sv:SentenceVectors, total_sentences:int):
        """Given existing sentence vectors, append new ones"""
        logger.info(f"appending sentence vectors for {total_sentences} sentences")
        sentences_before = len(sv.vectors)
        sentences_after = sentences_before + total_sentences
        if sv.mapfile_path:
            # Re-open the memmap with the enlarged shape; existing rows are preserved
            sv.vectors = np_memmap(
                str(sv.mapfile_path) + '.vectors', dtype=REAL,
                mode='r+', shape=(sentences_after, sv.vector_size))
        else:
            # IDIOM/PERF: single zeroed allocation instead of empty() + per-row loop
            newvectors = zeros((total_sentences, sv.vector_size), dtype=REAL)
            sv.vectors = vstack([sv.vectors, newvectors])
        sv.vectors_norm = None
| #!/usr/bin/env python | ||
| # -*- coding: utf-8 -*- | ||
| # Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| # Copyright (C) 2019 Oliver Borchers | ||
| from __future__ import division | ||
| from fse.inputs import IndexedSentence, IndexedList, IndexedLineDocument | ||
| from gensim.models.keyedvectors import BaseKeyedVectors | ||
| from numpy import dot, float32 as REAL, memmap as np_memmap, \ | ||
| double, array, zeros, vstack, sqrt, newaxis, integer, \ | ||
| ndarray, sum as np_sum, prod, argmax | ||
| from gensim import utils, matutils | ||
| from gensim.models.keyedvectors import _l2_norm | ||
| from typing import List, Tuple | ||
| from pathlib import Path | ||
| import logging | ||
| logger = logging.getLogger(__name__) | ||
| class SentenceVectors(utils.SaveLoad): | ||
    def __init__(self, vector_size:int, mapfile_path:str=None):
        """ Container for the trained sentence vectors.

        Parameters
        ----------
        vector_size : int
            Dimensionality of the sentence vectors.
        mapfile_path : str, optional
            Optional path for storing the vectors as a numpy memmap.
        """
        self.vector_size = vector_size                  # Size of vectors
        self.vectors = zeros((0, vector_size), REAL)    # Vectors for sentences
        self.vectors_norm = None

        # File for numpy memmap
        self.mapfile_path = Path(mapfile_path) if mapfile_path is not None else None
        self.mapfile_shape = None
| def __getitem__(self, entities:int) -> ndarray: | ||
| """Get vector representation of `entities`. | ||
| Parameters | ||
| ---------- | ||
| entities : {int, list of int} | ||
| Index or sequence of entities. | ||
| Returns | ||
| ------- | ||
| numpy.ndarray | ||
| Vector representation for `entities` (1D if `entities` is int, otherwise - 2D). | ||
| """ | ||
| if isinstance(entities, (int, integer,)): | ||
| return self.get_vector(entities) | ||
| return vstack([self.get_vector(e) for e in entities]) | ||
| def __contains__(self, index:int) -> bool: | ||
| if isinstance(index, (int, integer,)): | ||
| return index < len(self) | ||
| else: | ||
| raise KeyError(f"index {index} is not a valid index") | ||
| def __len__(self) -> int: | ||
| return len(self.vectors) | ||
| def _load_all_vectors_from_disk(self, mapfile_path:Path): | ||
| """ Reads all vectors from disk """ | ||
| path = str(mapfile_path.absolute()) | ||
| self.vectors = np_memmap(f"{path}.vectors", dtype=REAL, mode='r+', shape=self.mapfile_shape) | ||
| def save(self, *args, **kwargs): | ||
| """Save object. | ||
| Parameters | ||
| ---------- | ||
| fname : str | ||
| Path to the output file. | ||
| See Also | ||
| -------- | ||
| :meth:`~gensim.models.keyedvectors.Doc2VecKeyedVectors.load` | ||
| Load object. | ||
| """ | ||
| self.mapfile_shape = self.vectors.shape | ||
| ignore = ["vectors_norm"] | ||
| # don't bother storing the cached normalized vectors | ||
| if self.mapfile_path is not None: | ||
| ignore.append("vectors") | ||
| kwargs['ignore'] = kwargs.get('ignore', ignore) | ||
| super(SentenceVectors, self).save(*args, **kwargs) | ||
| @classmethod | ||
| def load(cls, fname_or_handle, **kwargs): | ||
| # TODO: Unittests | ||
| sv = super(SentenceVectors, cls).load(fname_or_handle, **kwargs) | ||
| path = sv.mapfile_path | ||
| if path is not None: | ||
| sv._load_all_vectors_from_disk(mapfile_path=path) | ||
| return sv | ||
| def get_vector(self, index:int, use_norm:bool=False) -> ndarray: | ||
| """Get sentence representations in vector space, as a 1D numpy array. | ||
| Parameters | ||
| ---------- | ||
| index : int | ||
| Input index | ||
| use_norm : bool, optional | ||
| If True - resulting vector will be L2-normalized (unit euclidean length). | ||
| Returns | ||
| ------- | ||
| numpy.ndarray | ||
| Vector representation of index. | ||
| Raises | ||
| ------ | ||
| KeyError | ||
| If index out of bounds. | ||
| """ | ||
| if index in self: | ||
| if use_norm: | ||
| result = self.vectors_norm[index] | ||
| else: | ||
| result = self.vectors[index] | ||
| result.setflags(write=False) | ||
| return result | ||
| else: | ||
| raise KeyError("index {index} not found") | ||
| def init_sims(self, replace:bool=False): | ||
| """Precompute L2-normalized vectors. | ||
| Parameters | ||
| ---------- | ||
| replace : bool, optional | ||
| If True - forget the original vectors and only keep the normalized ones = saves lots of memory! | ||
| """ | ||
| if getattr(self, 'vectors_norm', None) is None or replace: | ||
| logger.info("precomputing L2-norms of sentence vectors") | ||
| if not replace and self.mapfile_path is not None: | ||
| self.vectors_norm = np_memmap( | ||
| self.mapfile_path + '.vectors_norm', dtype=REAL, | ||
| mode='w+', shape=self.vectors.shape) | ||
| self.vectors_norm = _l2_norm(self.vectors, replace=replace) | ||
| def similarity(self, d1:int, d2:int) -> float: | ||
| """Compute cosine similarity between two sentences from the training set. | ||
| Parameters | ||
| ---------- | ||
| d1 : int | ||
| index of sentence | ||
| d2 : int | ||
| index of sentence | ||
| Returns | ||
| ------- | ||
| float | ||
| The cosine similarity between the vectors of the two sentences. | ||
| """ | ||
| return dot(matutils.unitvec(self[d1]), matutils.unitvec(self[d2])) | ||
| def distance(self, d1:int, d2:int) -> float: | ||
| """Compute cosine similarity between two sentences from the training set. | ||
| Parameters | ||
| ---------- | ||
| d1 : int | ||
| index of sentence | ||
| d2 : int | ||
| index of sentence | ||
| Returns | ||
| ------- | ||
| float | ||
| The cosine distance between the vectors of the two sentences. | ||
| """ | ||
| return 1 - self.similarity(d1, d2) | ||
| def most_similar(self, positive:[int,ndarray]=None, negative:[int,ndarray]=None, | ||
| indexable:[IndexedList,IndexedLineDocument]=None, topn:int=10, | ||
| restrict_size:[int, Tuple[int, int]]=None) -> List[Tuple[int,float]]: | ||
| """Find the top-N most similar sentences. | ||
| Positive sentences contribute positively towards the similarity, negative sentences negatively. | ||
| This method computes cosine similarity between a simple mean of the projection | ||
| weight vectors of the given sentences and the vectors for each sentence in the model. | ||
| Parameters | ||
| ---------- | ||
| positive : list of int, optional | ||
| List of indices that contribute positively. | ||
| negative : list of int, optional | ||
| List of indices that contribute negatively. | ||
| indexable: list, IndexedList, IndexedLineDocument | ||
| Provides an indexable object from where the most similar sentences are read | ||
| topn : int or None, optional | ||
| Number of top-N similar sentences to return, when `topn` is int. When `topn` is None, | ||
| then similarities for all sentences are returned. | ||
| restrict_size : int or Tuple(int,int), optional | ||
| Optional integer which limits the range of vectors which | ||
| are searched for most-similar values. For example, restrict_vocab=10000 would | ||
| only check the first 10000 sentence vectors. | ||
| restrict_vocab=(500, 1000) would search the sentence vectors with indices between | ||
| 500 and 1000. | ||
| Returns | ||
| ------- | ||
| list of (int, float) or list of (str, int, float) | ||
| A sequence of (index, similarity) is returned. | ||
| When an indexable is provided, returns (str, index, similarity) | ||
| When `topn` is None, then similarities for all words are returned as a | ||
| one-dimensional numpy array with the size of the vocabulary. | ||
| """ | ||
| if indexable is not None and not hasattr(indexable, "__getitem__"): | ||
| raise RuntimeError("Indexable must provide __getitem__") | ||
| if positive is None: | ||
| positive = [] | ||
| if negative is None: | ||
| negative = [] | ||
| self.init_sims() | ||
| if isinstance(positive, (int, integer)) and not negative: | ||
| positive = [positive] | ||
| if isinstance(positive, (ndarray)) and not negative: | ||
| if len(positive.shape) == 1: | ||
| positive = [positive] | ||
| positive = [ | ||
| (sent, 1.0) if isinstance(sent, (int, integer, ndarray)) else sent | ||
| for sent in positive | ||
| ] | ||
| negative = [ | ||
| (sent, -1.0) if isinstance(sent, (int, integer, ndarray)) else sent | ||
| for sent in negative | ||
| ] | ||
| all_sents, mean = set(), [] | ||
| for sent, weight in positive + negative: | ||
| if isinstance(sent, ndarray): | ||
| mean.append(weight * sent) | ||
| else: | ||
| mean.append(weight * self.get_vector(index=sent, use_norm=True)) | ||
| if sent in self: | ||
| all_sents.add(sent) | ||
| if not mean: | ||
| raise ValueError("cannot compute similarity with no input") | ||
| mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL) | ||
| if isinstance(restrict_size, (int, integer)): | ||
| lo, hi = 0, restrict_size | ||
| elif isinstance(restrict_size, Tuple): | ||
| lo, hi = restrict_size | ||
| else: | ||
| lo, hi = 0, None | ||
| limited = self.vectors_norm if restrict_size is None else self.vectors_norm[lo:hi] | ||
| dists = dot(limited, mean) | ||
| if not topn: | ||
| return dists | ||
| best = matutils.argsort(dists, topn=topn + len(all_sents), reverse=True) | ||
| best_off = best + lo | ||
| if indexable is not None: | ||
| result = [(indexable[off_idx], off_idx, float(dists[idx])) for off_idx, idx in zip(best_off, best) if off_idx not in all_sents] | ||
| else: | ||
| result = [(off_idx, float(dists[idx])) for off_idx, idx in zip(best_off, best) if off_idx not in all_sents] | ||
| return result[:topn] | ||
| def similar_by_word(self, word:str, wv:BaseKeyedVectors, indexable:[IndexedList,IndexedLineDocument]=None, topn:int=10, | ||
| restrict_size:[int,Tuple[int, int]]=None) -> List[Tuple[int,float]]: | ||
| """Find the top-N most similar sentences to a given word. | ||
| Parameters | ||
| ---------- | ||
| word : str | ||
| Word | ||
| wv : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` | ||
| This object essentially contains the mapping between words and embeddings. | ||
| indexable: list, IndexedList, IndexedLineDocument | ||
| Provides an indexable object from where the most similar sentences are read | ||
| topn : int or None, optional | ||
| Number of top-N similar sentences to return, when `topn` is int. When `topn` is None, | ||
| then similarities for all sentences are returned. | ||
| restrict_size : int or Tuple(int,int), optional | ||
| Optional integer which limits the range of vectors which | ||
| are searched for most-similar values. For example, restrict_vocab=10000 would | ||
| only check the first 10000 sentence vectors. | ||
| restrict_vocab=(500, 1000) would search the sentence vectors with indices between | ||
| 500 and 1000. | ||
| Returns | ||
| ------- | ||
| list of (int, float) or list of (str, int, float) | ||
| A sequence of (index, similarity) is returned. | ||
| When an indexable is provided, returns (str, index, similarity) | ||
| When `topn` is None, then similarities for all words are returned as a | ||
| one-dimensional numpy array with the size of the vocabulary. | ||
| """ | ||
| return self.most_similar(positive=wv[word], indexable=indexable, topn=topn, restrict_size=restrict_size) | ||
| def similar_by_sentence(self, sentence:List[str], model, indexable:[IndexedList,IndexedLineDocument]=None, topn:int=10, | ||
| restrict_size:[int,Tuple[int, int]]=None) -> List[Tuple[int,float]]: | ||
| """Find the top-N most similar sentences to a given sentence. | ||
| Parameters | ||
| ---------- | ||
| sentence : list of str | ||
| Sentence as list of strings | ||
| model : :class:`~fse.models.base_s2v.BaseSentence2VecModel` | ||
| This object essentially provides the infer method used to transform . | ||
| indexable: list, IndexedList, IndexedLineDocument | ||
| Provides an indexable object from where the most similar sentences are read | ||
| topn : int or None, optional | ||
| Number of top-N similar sentences to return, when `topn` is int. When `topn` is None, | ||
| then similarities for all sentences are returned. | ||
| restrict_size : int or Tuple(int,int), optional | ||
| Optional integer which limits the range of vectors which | ||
| are searched for most-similar values. For example, restrict_vocab=10000 would | ||
| only check the first 10000 sentence vectors. | ||
| restrict_vocab=(500, 1000) would search the sentence vectors with indices between | ||
| 500 and 1000. | ||
| Returns | ||
| ------- | ||
| list of (int, float) or list of (str, int, float) | ||
| A sequence of (index, similarity) is returned. | ||
| When an indexable is provided, returns (str, index, similarity) | ||
| When `topn` is None, then similarities for all words are returned as a | ||
| one-dimensional numpy array with the size of the vocabulary. | ||
| """ | ||
| vector = model.infer([IndexedSentence(sentence, 0)]) | ||
| return self.most_similar(positive=vector, indexable=indexable, topn=topn, restrict_size=restrict_size) | ||
| def similar_by_vector(self, vector:ndarray, indexable:[IndexedList,IndexedLineDocument]=None, topn:int=10, | ||
| restrict_size:[int,Tuple[int, int]]=None) -> List[Tuple[int,float]]: | ||
| """Find the top-N most similar sentences to a given vector. | ||
| Parameters | ||
| ---------- | ||
| vector : ndarray | ||
| Vectors | ||
| indexable: list, IndexedList, IndexedLineDocument | ||
| Provides an indexable object from where the most similar sentences are read | ||
| topn : int or None, optional | ||
| Number of top-N similar sentences to return, when `topn` is int. When `topn` is None, | ||
| then similarities for all sentences are returned. | ||
| restrict_size : int or Tuple(int,int), optional | ||
| Optional integer which limits the range of vectors which | ||
| are searched for most-similar values. For example, restrict_vocab=10000 would | ||
| only check the first 10000 sentence vectors. | ||
| restrict_vocab=(500, 1000) would search the sentence vectors with indices between | ||
| 500 and 1000. | ||
| Returns | ||
| ------- | ||
| list of (int, float) or list of (str, int, float) | ||
| A sequence of (index, similarity) is returned. | ||
| When an indexable is provided, returns (str, index, similarity) | ||
| When `topn` is None, then similarities for all words are returned as a | ||
| one-dimensional numpy array with the size of the vocabulary. | ||
| """ | ||
| return self.most_similar(positive=vector, indexable=indexable, topn=topn, restrict_size=restrict_size) |
| #!/usr/bin/env python | ||
| # -*- coding: utf-8 -*- | ||
| # Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| # Copyright (C) 2019 Oliver Borchers | ||
| from fse.models.average import Average | ||
| from fse.models.utils import compute_principal_components, remove_principal_components | ||
| from gensim.models.keyedvectors import BaseKeyedVectors | ||
| from numpy import ndarray, float32 as REAL, ones, zeros | ||
| import logging | ||
| logger = logging.getLogger(__name__) | ||
class SIF(Average):

    def __init__(self, model:BaseKeyedVectors, alpha:float=1e-3, components:int=1, sv_mapfile_path:str=None, wv_mapfile_path:str=None, workers:int=1, lang_freq:str=None):
        """Smooth-inverse frequency (SIF) weighted sentence embedding model.

        Computes sentence vectors as a weighted average over all words of a
        sentence, then subtracts the leading singular vectors after training.
        Based on Arora et al. (2017): A Simple but Tough-to-Beat Baseline for
        Sentence Embeddings. See <https://openreview.net/pdf?id=SyK00v5xx>
        and <https://github.com/PrincetonML/SIF>.

        Parameters
        ----------
        model : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
            Mapping between words and embeddings; wv.vocab and wv.vector are required.
        alpha : float, optional
            Weighting factor used to downweigh each individual word.
        components : int, optional
            Number of singular vectors removed from the sentence embeddings.
        sv_mapfile_path : str, optional
            Optional memmap path for the sentence vectors (very large datasets).
        wv_mapfile_path : str, optional
            Optional memmap path for the word vectors; combined with
            sv_mapfile_path this enables disk-to-disk training with little RAM.
        workers : int, optional
            Number of worker threads; 1 is usually sufficient for short sentences.
        lang_freq : str, optional
            Language used to induce word frequencies via :class:`~wordfreq`
            into wv.vocab.count for embeddings shipped without frequency
            information (e.g. "GoogleNews-vectors-negative300.bin").
            See https://github.com/LuminosoInsight/wordfreq
        """
        self.alpha = float(alpha)
        self.components = int(components)
        self.svd_res = None
        super(SIF, self).__init__(
            model=model, sv_mapfile_path=sv_mapfile_path, wv_mapfile_path=wv_mapfile_path,
            workers=workers, lang_freq=lang_freq)

    def _check_parameter_sanity(self):
        """Validate SIF hyper-parameters and the precomputed word weights."""
        weights_in_range = all(self.word_weights >= 0.) and all(self.word_weights <= 1.)
        if not weights_in_range:
            raise ValueError("For SIF, all word weights must be 0 <= w_weight <= 1")
        if self.alpha <= 0.:
            raise ValueError("Alpha must be greater than zero.")
        if self.components < 0.:
            raise ValueError("Components must be greater or equal zero")

    def _pre_train_calls(self, **kwargs):
        """Hook executed before training: compute the SIF word weights."""
        self._compute_sif_weights()

    def _post_train_calls(self):
        """Hook executed after training: strip the leading principal components."""
        if self.components <= 0:
            # Sentinel 0 marks "trained, nothing removed" (None means untrained).
            self.svd_res = 0
            logger.info(f"no removal of principal components")
            return
        self.svd_res = compute_principal_components(self.sv.vectors, components=self.components)
        remove_principal_components(self.sv.vectors, svd_res=self.svd_res, inplace=True)

    def _post_inference_calls(self, output:ndarray):
        """Hook executed after inference: apply the removal fitted during training."""
        if self.svd_res is None:
            raise RuntimeError("You must first train the model to obtain SVD components")
        if self.components > 0:
            remove_principal_components(output, svd_res=self.svd_res, inplace=True)
        else:
            logger.info(f"no removal of principal components")

    def _check_dtype_santiy(self):
        """Verify that all numeric attributes are float32 (REAL)."""
        if self.word_weights.dtype != REAL:
            raise TypeError(f"type of word_weights is wrong: {self.word_weights.dtype}")
        if self.svd_res is not None:
            values, comps = self.svd_res[0], self.svd_res[1]
            if values.dtype != REAL:
                raise TypeError(f"type of svd values is wrong: {values.dtype}")
            if comps.dtype != REAL:
                raise TypeError(f"type of svd components is wrong: {comps.dtype}")

    def _compute_sif_weights(self):
        """Precompute the SIF weight alpha / (alpha + p(w)) for every vocabulary word."""
        logger.info(f"pre-computing SIF weights for {len(self.wv.vocab)} words")
        vocab = self.wv.vocab
        probs = zeros(len(vocab), dtype=REAL)
        total_count = 0
        for term in vocab:
            entry = vocab[term]
            probs[entry.index] = entry.count
            total_count += entry.count
        probs /= total_count  # Relative word frequencies.
        self.word_weights = (self.alpha / (self.alpha + probs)).astype(REAL)
| #!/usr/bin/env python | ||
| # -*- coding: utf-8 -*- | ||
| # Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| # Copyright (C) 2019 Oliver Borchers | ||
| from fse.models.average import Average | ||
| from fse.models.utils import compute_principal_components, remove_principal_components | ||
| from gensim.models.keyedvectors import BaseKeyedVectors | ||
| from numpy import ndarray, float32 as REAL, zeros | ||
| import logging | ||
| logger = logging.getLogger(__name__) | ||
class uSIF(Average):

    def __init__(self, model:BaseKeyedVectors, length:int=None, components:int=5, sv_mapfile_path:str=None, wv_mapfile_path:str=None, workers:int=1, lang_freq:str=None):
        """ Unsupervised smooth-inverse frequency (uSIF) weighted sentence embeddings model. Performs a weighted averaging operation over all
        words in a sentences. After training, the model removes a number of weighted singular vectors.

        The implementation is based on Ethayarajh (2018): Unsupervised Random Walk Sentence Embeddings: A Strong but Simple Baseline.
        For more information, see <https://www.aclweb.org/anthology/W18-3012> and <https://github.com/kawine/usif>

        Parameters
        ----------
        model : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
            Mapping between words and embeddings; wv.vocab and wv.vector are required.
        length : int, optional
            Average number of words per sentence in the training corpus
            (n in the paper). If None, taken from
            :meth:`~fse.models.base_s2v.BaseSentence2VecModel.scan_sentences`.
        components : int, optional
            Number of singular vectors removed from the sentence embeddings
            (m in the paper).
        sv_mapfile_path : str, optional
            Optional memmap path for the sentence vectors (very large datasets).
        wv_mapfile_path : str, optional
            Optional memmap path for the word vectors; combined with
            sv_mapfile_path this enables disk-to-disk training with little RAM.
        workers : int, optional
            Number of worker threads; 1 is usually sufficient for short sentences.
        lang_freq : str, optional
            Language used to induce word frequencies via :class:`~wordfreq`
            into wv.vocab.count for embeddings shipped without frequency
            information. See https://github.com/LuminosoInsight/wordfreq
        """
        self.length = length
        self.components = int(components)
        self.svd_res = None
        self.svd_weights = None
        # BUGFIX: was super(Average, self).__init__(...), which skipped
        # Average.__init__ in the MRO; mirror the sibling SIF class.
        super(uSIF, self).__init__(
            model=model, sv_mapfile_path=sv_mapfile_path, wv_mapfile_path=wv_mapfile_path,
            workers=workers, lang_freq=lang_freq)

    def _check_parameter_sanity(self):
        """ Check the sanity of all paramters """
        # BUGFIX: guard against length still being None (would raise a
        # confusing TypeError on the comparison instead of a ValueError).
        if self.length is None or self.length <= 0.:
            raise ValueError("Length must be greater than zero.")
        if self.components < 0.:
            raise ValueError("Components must be greater or equal zero")

    def _pre_train_calls(self, **kwargs):
        """ Function calls to perform before training """
        # Fall back to the scanned corpus average when length was not provided.
        self.length = kwargs["average_length"] if self.length is None else self.length
        self._compute_usif_weights()

    def _post_train_calls(self):
        """ Function calls to perform after training, such as computing eigenvectors """
        if self.components > 0:
            self.svd_res = compute_principal_components(self.sv.vectors, components=self.components)
            # Per-component weights lambda_i = sigma_i^2 / sum(sigma^2).
            self.svd_weights = (self.svd_res[0] ** 2) / (self.svd_res[0] ** 2).sum().astype(REAL)
            remove_principal_components(self.sv.vectors, svd_res=self.svd_res, weights=self.svd_weights, inplace=True)
        else:
            # Sentinel 0 marks "trained, nothing removed" (None means untrained).
            self.svd_res = 0
            logger.info("no removal of principal components")

    def _post_inference_calls(self, output:ndarray):
        """ Function calls to perform after training & inference """
        if self.svd_res is None:
            raise RuntimeError("You must first train the model to obtain SVD components")
        elif self.components > 0:
            remove_principal_components(output, svd_res=self.svd_res, weights=self.svd_weights, inplace=True)
        else:
            logger.info("no removal of principal components")

    def _check_dtype_santiy(self):
        """ Check the dtypes of all attributes """
        if self.word_weights.dtype != REAL:
            raise TypeError(f"type of word_weights is wrong: {self.word_weights.dtype}")
        if self.svd_res is not None:
            if self.svd_res[0].dtype != REAL:
                raise TypeError(f"type of svd values is wrong: {self.svd_res[0].dtype}")
            if self.svd_res[1].dtype != REAL:
                raise TypeError(f"type of svd components is wrong: {self.svd_res[1].dtype}")
            if self.svd_weights.dtype != REAL:
                raise TypeError(f"type of svd weights is wrong: {self.svd_weights.dtype}")

    def _compute_usif_weights(self):
        """ Precomputes the uSIF weights (a / (a/2 + p(w))) per Ethayarajh (2018). """
        logger.info(f"pre-computing uSIF weights for {len(self.wv.vocab)} words")
        v = len(self.wv.vocab)
        corpus_size = 0
        pw = zeros(v, dtype=REAL)
        for word in self.wv.vocab:
            c = self.wv.vocab[word].count
            corpus_size += c
            pw[self.wv.vocab[word].index] = c
        pw /= corpus_size  # Relative word frequencies.
        # Probability that a word occurs at least once in a sentence of
        # `length` words, and the share of such frequent words.
        threshold = 1 - (1 - (1 / v)) ** self.length
        alpha = sum(pw > threshold) / v
        # NOTE(review): alpha == 0 (no word above threshold) would divide by
        # zero below — confirm whether this can occur for real corpora.
        z = v / 2
        a = (1 - alpha) / (alpha * z)
        self.word_weights = (a / ((a / 2) + pw)).astype(REAL)
| #!/usr/bin/env python | ||
| # -*- coding: utf-8 -*- | ||
| # Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| # Copyright (C) 2019 Oliver Borchers | ||
| from sklearn.decomposition import TruncatedSVD | ||
| from numpy import ndarray, float32 as REAL, ones | ||
| from time import time | ||
| import logging | ||
| logger = logging.getLogger(__name__) | ||
def compute_principal_components(vectors:ndarray, components:int=1) -> tuple:
    """ Method used to compute the first singular vectors of a given matrix

    Parameters
    ----------
    vectors : ndarray
        (Sentence) vectors to compute the truncated SVD on
    components : int, optional
        Number of singular values/vectors to compute

    Returns
    -------
    (ndarray, ndarray)
        Singular values and singular vectors (components), both cast to float32.
        (BUGFIX: the annotation previously claimed a single ndarray.)

    """
    start = time()
    # Fixed random_state keeps the randomized solver deterministic across runs.
    svd = TruncatedSVD(n_components=components, n_iter=7, random_state=42, algorithm="randomized")
    svd.fit(vectors)
    elapsed = time()
    logger.info(f"computing {components} principal components took {int(elapsed-start)}s")
    return svd.singular_values_.astype(REAL), svd.components_.astype(REAL)
def remove_principal_components(vectors:ndarray, svd_res:[ndarray, ndarray], weights:ndarray=None, inplace:bool=True) -> ndarray:
    """ Method used to remove the first singular vectors of a given matrix

    Parameters
    ----------
    vectors : ndarray
        (Sentence) vectors to remove components from
    svd_res : (ndarray, ndarray)
        Tuple consisting of the singular values and components to remove from the vectors,
        as returned by :func:`compute_principal_components`
    weights : ndarray, optional
        Weights to be used to weigh the components which are removed from the vectors
        (used by uSIF); if None, all components are weighted equally.
    inplace : bool, optional
        If true, removes the componentens from the vectors inplace (memory efficient)
        and returns None; otherwise a new corrected array is returned.

    Returns
    -------
    ndarray or None
        The vectors with the principal components removed when `inplace` is
        False, otherwise None.

    """
    components = svd_res[1].astype(REAL)
    start = time()
    if weights is None:
        # Equal weighting: multiplying by a column of ones leaves the
        # components unchanged.
        w_comp = components * ones(len(components), dtype=REAL)[:, None]
    else:
        w_comp = components * (weights[:, None].astype(REAL))

    # Projection of the vectors onto the (weighted) components.
    if len(components) == 1:
        projection = vectors.dot(w_comp.transpose()) * w_comp
    else:
        projection = vectors.dot(w_comp.transpose()).dot(w_comp)

    if inplace:
        vectors -= projection
        output = None
    else:
        # BUGFIX: previously the projection itself was returned instead of
        # the corrected vectors, inconsistent with the inplace branch.
        output = vectors - projection

    elapsed = time()
    logger.info(f"removing {len(components)} principal components took {int(elapsed-start)}s")
    if not inplace:
        return output
| #!/usr/bin/env python | ||
| # -*- coding: utf-8 -*- | ||
| # Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| # Copyright (C) 2019 Oliver Borchers | ||
| """ | ||
| Automated tests for checking the average model. | ||
| """ | ||
| import logging | ||
| import unittest | ||
| from pathlib import Path | ||
| import numpy as np | ||
| from fse.models.average import Average | ||
| from fse.models.average import train_average_np | ||
| from fse.models.average_inner import train_average_cy | ||
| from fse.models.average_inner import FAST_VERSION, MAX_WORDS_IN_BATCH, MAX_NGRAMS_IN_BATCH | ||
| from fse.inputs import IndexedSentence | ||
| from gensim.models import Word2Vec, FastText | ||
| logger = logging.getLogger(__name__) | ||
# Path to the newline-delimited test corpus shared by all test cases below.
CORPUS = Path("fse/test/test_data/test_sentences.txt")
# Dimensionality of the toy word vectors.
DIM = 5
W2V = Word2Vec(min_count=1, size=DIM)
# One tokenized sentence per corpus line.
# NOTE(review): the file handle is never closed; acceptable for tests.
SENTENCES = [l.split() for i, l in enumerate(open(CORPUS, "r"))]
W2V.build_vocab(SENTENCES)
# Overwrite every word vector with a constant row equal to its vocabulary
# index, so sentence averages have easily predictable values in the tests.
W2V.wv.vectors[:,] = np.arange(len(W2V.wv.vectors), dtype=np.float32)[:, None]
| class TestAverageFunctions(unittest.TestCase): | ||
    def setUp(self):
        """Build the shared fixture: four indexed sentences and a prepared Average model."""
        # The last two sentences contain tokens unlikely to be in-vocabulary,
        # exercising the OOV paths of the training routines.
        self.sentences = [["They", "admit"], ["So", "Apple", "bought", "buds"], ["go", "12345"], ["pull", "12345678910111213"]]
        self.sentences = [IndexedSentence(s, i) for i,s in enumerate(self.sentences)]
        self.model = Average(W2V)
        # Allocate the output sentence-vector matrix before training.
        self.model.prep.prepare_vectors(sv=self.model.sv, total_sentences=len(self.sentences), update=False)
        self.model._pre_train_calls()
| def test_cython(self): | ||
| self.assertTrue(FAST_VERSION) | ||
| self.assertEqual(10000,MAX_WORDS_IN_BATCH) | ||
| self.assertEqual(40, MAX_NGRAMS_IN_BATCH) | ||
    def test_average_train_np_w2v(self):
        """Numpy training path over the word2vec fixture."""
        # Reset the output matrix so results are independent of other tests.
        self.model.sv.vectors = np.zeros_like(self.model.sv.vectors, dtype=np.float32)
        mem = self.model._get_thread_working_mem()
        output = train_average_np(self.model, self.sentences, self.model.sv.vectors, mem)
        # (effective sentence count, effective word count)
        self.assertEqual((4, 7), output)
        # Expected averages follow from the module setup, where each word
        # vector row is filled with its vocabulary index.
        self.assertTrue((183 == self.model.sv[0]).all())
        self.assertTrue((164.5 == self.model.sv[1]).all())
        self.assertTrue((self.model.wv.vocab["go"].index == self.model.sv[2]).all())
    def test_average_train_cy_w2v(self):
        """Cython training path over the word2vec fixture; mirrors the numpy test."""
        # Reset the output matrix so results are independent of other tests.
        self.model.sv.vectors = np.zeros_like(self.model.sv.vectors, dtype=np.float32)
        mem = self.model._get_thread_working_mem()
        output = train_average_cy(self.model, self.sentences, self.model.sv.vectors, mem)
        # (effective sentence count, effective word count) — same as numpy path.
        self.assertEqual((4, 7), output)
        self.assertTrue((183 == self.model.sv[0]).all())
        self.assertTrue((164.5 == self.model.sv[1]).all())
        self.assertTrue((self.model.wv.vocab["go"].index == self.model.sv[2]).all())
    def test_average_train_np_ft(self):
        """Numpy training path for FastText, covering in-vocab and OOV ngram averaging."""
        ft = FastText(min_count=1, size=DIM)
        ft.build_vocab(SENTENCES)
        m = Average(ft)
        m.prep.prepare_vectors(sv=m.sv, total_sentences=len(self.sentences), update=False)
        m._pre_train_calls()
        # In-vocab vectors are all ones, ngram vectors all twos, so the
        # expected sentence averages below are exact.
        m.wv.vectors = m.wv.vectors_vocab = np.ones_like(m.wv.vectors, dtype=np.float32)
        m.wv.vectors_ngrams = np.full_like(m.wv.vectors_ngrams, 2, dtype=np.float32)
        mem = m._get_thread_working_mem()
        output = train_average_np(m, self.sentences, m.sv.vectors, mem)
        self.assertEqual((4, 10), output)
        self.assertTrue((1. == m.sv[0]).all())
        self.assertTrue((1.5 == m.sv[2]).all())
        self.assertTrue((2 == m.sv[3]).all())
        # "go" -> [1,1...]
        # oov: "12345" -> (14 hashes * 2) / 14 = 2
        # (2 + 1) / 2 = 1.5
    def test_average_train_cy_ft(self):
        """Cython training path for FastText; mirrors the numpy FastText test."""
        ft = FastText(min_count=1, size=DIM)
        ft.build_vocab(SENTENCES)
        m = Average(ft)
        m.prep.prepare_vectors(sv=m.sv, total_sentences=len(self.sentences), update=False)
        m._pre_train_calls()
        # In-vocab vectors are all ones, ngram vectors all twos (see the
        # numpy FastText test for the derivation of the expected values).
        m.wv.vectors = m.wv.vectors_vocab = np.ones_like(m.wv.vectors, dtype=np.float32)
        m.wv.vectors_ngrams = np.full_like(m.wv.vectors_ngrams, 2, dtype=np.float32)
        mem = m._get_thread_working_mem()
        output = train_average_cy(m, self.sentences, m.sv.vectors, mem)
        self.assertEqual((4, 10), output)
        self.assertTrue((1. == m.sv[0]).all())
        self.assertTrue((1.5 == m.sv[2]).all())
        self.assertTrue((2 == m.sv[3]).all())
def test_cy_equal_np_w2v(self):
    """The numpy and Cython training routines must produce identical Word2Vec results."""
    stats = []
    trained_models = []
    # Run both implementations over identically prepared models.
    for routine in (train_average_np, train_average_cy):
        model = Average(W2V)
        model.prep.prepare_vectors(sv=model.sv, total_sentences=len(self.sentences), update=False)
        model._pre_train_calls()
        mem = model._get_thread_working_mem()
        stats.append(routine(model, self.sentences, model.sv.vectors, mem))
        trained_models.append(model)
    self.assertEqual(stats[0], stats[1])
    self.assertTrue((trained_models[0].sv.vectors == trained_models[1].sv.vectors).all())
def test_cy_equal_np_ft(self):
    """Numpy and Cython FastText training must agree (up to float rounding)."""
    ft = FastText(size=20, min_count=1)
    ft.build_vocab(SENTENCES)
    m1 = Average(ft)
    m1.prep.prepare_vectors(sv=m1.sv, total_sentences=len(self.sentences), update=False)
    m1._pre_train_calls()
    # NOTE(review): batch_ngrams is set only on m1, not on m2 — presumably to
    # exercise the ngram-batching path against the default path; confirm intent.
    m1.batch_ngrams = MAX_NGRAMS_IN_BATCH
    mem1 = m1._get_thread_working_mem()
    o1 = train_average_np(m1, self.sentences[:2], m1.sv.vectors, mem1)
    m2 = Average(ft)
    m2.prep.prepare_vectors(sv=m2.sv, total_sentences=len(self.sentences), update=False)
    m2._pre_train_calls()
    mem2 = m2._get_thread_working_mem()
    o2 = train_average_cy(m2, self.sentences[:2], m2.sv.vectors, mem2)
    self.assertEqual(o1, o2)
    # allclose rather than exact equality: the two implementations may round differently.
    self.assertTrue(np.allclose(m1.sv.vectors, m2.sv.vectors))
def test_do_train_job(self):
    """_do_train_job trains all 100 corpus sentences (1450 words) after growing the vectors."""
    self.model.prep.prepare_vectors(sv=self.model.sv, total_sentences=len(SENTENCES), update=True)
    mem = self.model._get_thread_working_mem()
    self.assertEqual((100, 1450), self.model._do_train_job(
        [IndexedSentence(s, i) for i, s in enumerate(SENTENCES)],
        target=self.model.sv.vectors, memory=mem)
    )
    # update=True appends 100 rows to the existing array — presumably 4
    # pre-existing rows from setUp, hence 104; verify against the fixture.
    self.assertEqual((104, DIM), self.model.sv.vectors.shape)
def test_train(self):
    """A full training pass over the corpus reports (100 sentences, 1450 words)."""
    indexed_corpus = [IndexedSentence(words, pos) for pos, words in enumerate(SENTENCES)]
    self.assertEqual((100, 1450), self.model.train(indexed_corpus))
def test_train_single_from_disk(self):
    """Training with disk-mapped vectors must equal purely in-memory training.

    Fix vs. original: cleanup of the generated ``*.vectors`` files now happens
    in a ``finally`` block, so a failing assertion no longer leaves stale map
    files behind that would corrupt subsequent test runs.
    """
    p = Path("fse/test/test_data/test_vecs")
    p_res = Path("fse/test/test_data/test_vecs.vectors")
    p_target = Path("fse/test/test_data/test_vecs_wv.vectors")
    try:
        se1 = Average(W2V)
        se2 = Average(W2V, sv_mapfile_path=str(p.absolute()), wv_mapfile_path=str(p.absolute()))
        se1.train([IndexedSentence(s, i) for i, s in enumerate(SENTENCES)])
        se2.train([IndexedSentence(s, i) for i, s in enumerate(SENTENCES)])
        self.assertTrue(p_target.exists())
        self.assertTrue((se1.wv.vectors == se2.wv.vectors).all())
        # Memory-mapped word vectors are read-only by design.
        self.assertFalse(se2.wv.vectors.flags.writeable)
        self.assertTrue((se1.sv.vectors == se2.sv.vectors).all())
    finally:
        # Remove the map files even when an assertion above fails.
        for tmp in (p_res, p_target):
            if tmp.exists():
                tmp.unlink()
def test_train_multi_from_disk(self):
    """Multi-worker training with disk-mapped vectors must equal in-memory training.

    Fix vs. original: map-file cleanup moved into a ``finally`` block so a
    failing assertion cannot leave stale ``*.vectors`` files on disk.
    """
    p = Path("fse/test/test_data/test_vecs")
    p_res = Path("fse/test/test_data/test_vecs.vectors")
    p_target = Path("fse/test/test_data/test_vecs_wv.vectors")
    try:
        se1 = Average(W2V, workers=2)
        se2 = Average(W2V, workers=2, sv_mapfile_path=str(p.absolute()), wv_mapfile_path=str(p.absolute()))
        se1.train([IndexedSentence(s, i) for i, s in enumerate(SENTENCES)])
        se2.train([IndexedSentence(s, i) for i, s in enumerate(SENTENCES)])
        self.assertTrue(p_target.exists())
        self.assertTrue((se1.wv.vectors == se2.wv.vectors).all())
        # Memory-mapped word vectors are read-only by design.
        self.assertFalse(se2.wv.vectors.flags.writeable)
        self.assertTrue((se1.sv.vectors == se2.sv.vectors).all())
    finally:
        # Remove the map files even when an assertion above fails.
        for tmp in (p_res, p_target):
            if tmp.exists():
                tmp.unlink()
def test_check_parameter_sanity(self):
    """Non-unit word weights must be rejected by Average's parameter sanity check."""
    model = Average(W2V)
    model.word_weights = np.full(20, 2., dtype=np.float32)
    self.assertRaises(ValueError, model._check_parameter_sanity)
# Allow running this test module directly with verbose debug logging.
if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
    unittest.main()
| #!/usr/bin/env python | ||
| # -*- coding: utf-8 -*- | ||
| # Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| # Copyright (C) 2019 Oliver Borchers | ||
| """ | ||
| Automated tests for checking the base_s2v class. | ||
| """ | ||
| import logging | ||
| import unittest | ||
| from pathlib import Path | ||
| import numpy as np | ||
| from fse.models.base_s2v import BaseSentence2VecModel, BaseSentence2VecPreparer | ||
| from fse.inputs import IndexedSentence | ||
| from gensim.models import Word2Vec, FastText | ||
| from gensim.models.keyedvectors import BaseKeyedVectors | ||
| from wordfreq import get_frequency_dict | ||
| logger = logging.getLogger(__name__) | ||
# Shared module-level fixtures: a small test corpus and a Word2Vec model
# with DIM-dimensional vectors whose vocabulary covers the corpus.
CORPUS = Path("fse/test/test_data/test_sentences.txt")
DIM = 5
W2V = Word2Vec(min_count=1, size=DIM)
# Fix vs. original: the corpus file handle was opened and never closed
# (and the enumerate index was unused) — read it via a context manager.
with open(CORPUS, "r") as corpus_file:
    SENTENCES = [line.split() for line in corpus_file]
W2V.build_vocab(SENTENCES)
class TestBaseSentence2VecModelFunctions(unittest.TestCase):
    """Tests for BaseSentence2VecModel: construction and validation,
    (de)serialization with and without memmaps, sentence scanning,
    pre/post-training sanity checks, the training manager and inference.
    """

    def test_init_wo_model(self):
        # The constructor requires a gensim model argument.
        with self.assertRaises(TypeError):
            BaseSentence2VecModel()

    def test_init_w_wrong_model(self):
        # Passing something that is not a gensim model is rejected.
        with self.assertRaises(RuntimeError):
            BaseSentence2VecModel(int)

    def test_init_w_empty_w2v_model(self):
        # A Word2Vec model without trained vectors is rejected.
        with self.assertRaises(RuntimeError):
            w2v = Word2Vec()
            del w2v.wv.vectors
            BaseSentence2VecModel(w2v)

    def test_init_w_empty_vocab_model(self):
        # A Word2Vec model without a vocabulary is rejected.
        with self.assertRaises(RuntimeError):
            w2v = Word2Vec()
            del w2v.wv.vocab
            BaseSentence2VecModel(w2v)

    def test_init_w_ft_model_wo_vecs(self):
        # FastText models must carry both vocab and ngram vectors.
        ft = FastText(SENTENCES, size=5)
        with self.assertRaises(RuntimeError):
            ft.wv.vectors_vocab = None
            BaseSentence2VecModel(ft)
        with self.assertRaises(RuntimeError):
            ft.wv.vectors_ngrams = None
            BaseSentence2VecModel(ft)

    def test_init_w_empty_ft_model(self):
        ft = FastText(min_count=1, size=DIM)
        ft.wv.vectors = np.zeros(10)
        ft.wv.vectors_ngrams = None
        with self.assertRaises(RuntimeError):
            BaseSentence2VecModel(ft)

    def test_init_w_incompatible_ft_model(self):
        # Only the gensim-compatible ngram hash function is supported.
        ft = FastText(min_count=1, size=DIM, compatible_hash=False)
        with self.assertRaises(RuntimeError):
            BaseSentence2VecModel(ft)

    def test_include_model(self):
        se = BaseSentence2VecModel(W2V)
        self.assertTrue(isinstance(se.wv, BaseKeyedVectors))

    def test_model_w_language(self):
        # lang_freq induces word counts from wordfreq's frequency dictionary.
        se = BaseSentence2VecModel(W2V, lang_freq="en")
        freq = int((2**31 - 1) * get_frequency_dict("en", wordlist="best")["help"])
        self.assertEqual(freq, se.wv.vocab["help"].count)
        # 21 is presumably the induced count for an out-of-dictionary token — TODO confirm.
        self.assertEqual(21, se.wv.vocab["79"].count)

    def test_model_w_wrong_language(self):
        with self.assertRaises(ValueError):
            BaseSentence2VecModel(W2V, lang_freq="test")

    def test_save_load(self):
        # Round-trip: save to disk and reload must preserve vectors and settings.
        se = BaseSentence2VecModel(W2V)
        p = Path("fse/test/test_data/test_emb.model")
        se.save(str(p.absolute()))
        self.assertTrue(p.exists())
        se2 = BaseSentence2VecModel.load(str(p.absolute()))
        self.assertTrue((se.wv.vectors == se2.wv.vectors).all())
        self.assertTrue(se.wv.index2word == se2.wv.index2word)
        self.assertEqual(se.workers, se2.workers)
        p.unlink()

    def test_save_load_with_memmap(self):
        # Saving a model whose word vectors live in memmap files must not
        # additionally dump the arrays as .npy next to the model file.
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        shape = (1000, 1000)
        ft.wv.vectors = np.zeros(shape, np.float32)
        p = Path("fse/test/test_data/test_emb")
        p_vecs = Path("fse/test/test_data/test_emb_wv.vectors")
        p_ngrams = Path("fse/test/test_data/test_emb_ngrams.vectors")
        p_vocab = Path("fse/test/test_data/test_emb_vocab.vectors")
        p_not_exists = Path("fse/test/test_data/test_emb.wv.vectors.npy")
        se = BaseSentence2VecModel(ft, wv_mapfile_path=str(p))
        self.assertTrue(p_vecs.exists())
        self.assertTrue(p_ngrams.exists())
        self.assertTrue(p_vocab.exists())
        se.save(str(p.absolute()))
        self.assertTrue(p.exists())
        self.assertFalse(p_not_exists.exists())
        se = BaseSentence2VecModel.load(str(p.absolute()))
        # Reloaded memmapped arrays are read-only.
        self.assertFalse(se.wv.vectors_vocab.flags.writeable)
        self.assertEqual(shape, se.wv.vectors.shape)
        self.assertEqual((2000000, 5), se.wv.vectors_ngrams.shape)
        # NOTE(review): the loop variable shadows the path `p` inside the list —
        # works because the list is built first, but rename for clarity.
        for p in [p, p_vecs, p_ngrams, p_vocab]:
            p.unlink()

    def test_map_all_vectors_to_disk(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        p = Path("fse/test/test_data/test_emb")
        p_vecs = Path("fse/test/test_data/test_emb_wv.vectors")
        p_ngrams = Path("fse/test/test_data/test_emb_ngrams.vectors")
        p_vocab = Path("fse/test/test_data/test_emb_vocab.vectors")
        se = BaseSentence2VecModel(ft, wv_mapfile_path=str(p))
        self.assertTrue(p_vecs.exists())
        self.assertTrue(p_ngrams.exists())
        self.assertTrue(p_vocab.exists())
        for p in [p_vecs, p_ngrams, p_vocab]:
            p.unlink()

    def test_input_check(self):
        # Non-iterable or plainly wrong inputs are rejected up front.
        se = BaseSentence2VecModel(W2V)

        class BadIterator():
            def __init__(self):
                pass

        with self.assertRaises(TypeError):
            se._check_input_data_sanity()
        with self.assertRaises(TypeError):
            se._check_input_data_sanity(data_iterable=None)
        with self.assertRaises(TypeError):
            se._check_input_data_sanity(data_iterable="Hello there!")
        with self.assertRaises(TypeError):
            se._check_input_data_sanity(data_iterable=BadIterator())

    def test_scan_w_list(self):
        # Raw token lists (not IndexedSentence) are rejected.
        se = BaseSentence2VecModel(W2V)
        with self.assertRaises(TypeError):
            se.scan_sentences(SENTENCES)

    def test_str_rep(self):
        output = str(BaseSentence2VecModel(W2V))
        self.assertEqual("BaseSentence2VecModel based on Word2VecKeyedVectors, size=0", output)

    def test_scan_w_IndexedSentence(self):
        # Scanning the full 100-sentence corpus yields the expected statistics.
        se = BaseSentence2VecModel(W2V)
        id_sent = [IndexedSentence(s, i) for i, s in enumerate(SENTENCES)]
        stats = se.scan_sentences(id_sent, progress_per=0)
        self.assertEqual(100, stats["total_sentences"])
        self.assertEqual(1450, stats["total_words"])
        self.assertEqual(14, stats["average_length"])
        self.assertEqual(0, stats["empty_sentences"])
        self.assertEqual(100, stats["max_index"])

    def test_scan_w_wrong_IndexedSentence(self):
        # Non-integer indices are rejected.
        se = BaseSentence2VecModel(W2V)
        id_sent = [IndexedSentence(s, str(i)) for i, s in enumerate(SENTENCES)]
        with self.assertRaises(TypeError):
            se.scan_sentences(id_sent)

    def test_scan_w_empty(self):
        # NOTE(review): mutates the module-level SENTENCES fixture in place —
        # later tests see the emptied entries; consider copying instead.
        se = BaseSentence2VecModel(W2V)
        for i in [5, 10, 15]:
            SENTENCES[i] = []
        self.assertEqual(3, se.scan_sentences([IndexedSentence(s, i) for i, s in enumerate(SENTENCES)])["empty_sentences"])

    def test_scan_w_wrong_input(self):
        se = BaseSentence2VecModel(W2V)
        sentences = ["the dog hit the car", "he was very fast"]
        with self.assertRaises(TypeError):
            se.scan_sentences(sentences)
        with self.assertRaises(TypeError):
            se.scan_sentences([IndexedSentence(s, i) for i, s in enumerate(sentences)])
        with self.assertRaises(TypeError):
            # NOTE(review): this builds a single-element list containing a list of
            # range objects — likely intended [list(range(10)) for _ in range(2)];
            # either way a TypeError is expected, but confirm intent.
            se.scan_sentences([list(range(10) for _ in range(2))])
        with self.assertRaises(RuntimeError):
            # Indices starting above 0.
            se.scan_sentences([IndexedSentence(s, i + 1) for i, s in enumerate(SENTENCES)])
        with self.assertRaises(ValueError):
            # Negative indices.
            se.scan_sentences([IndexedSentence(s, i - 1) for i, s in enumerate(SENTENCES)])

    def test_scan_w_many_to_one_input(self):
        # All sentences mapped to index 0 -> max_index is 1 (exclusive upper bound).
        se = BaseSentence2VecModel(W2V)
        output = se.scan_sentences([IndexedSentence(s, 0) for i, s in enumerate(SENTENCES)])["max_index"]
        self.assertEqual(1, output)

    def test_estimate_memory(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        se = BaseSentence2VecModel(ft)
        # Expected byte total for 1e8 sentences — value derived from the
        # implementation's accounting; TODO confirm if estimate_memory changes.
        self.assertEqual(2040025124, se.estimate_memory(int(1e8))["Total"])

    def test_train(self):
        # The base class leaves train() to subclasses.
        se = BaseSentence2VecModel(W2V)
        with self.assertRaises(NotImplementedError):
            se.train([IndexedSentence(s, i) for i, s in enumerate(SENTENCES)])

    def test_log_end(self):
        # Smoke test: logging the training summary must not raise.
        se = BaseSentence2VecModel(W2V)
        se._log_train_end(eff_sentences=2000, eff_words=4000, overall_time=10)

    def test_child_requirements(self):
        # Every abstract hook must raise NotImplementedError on the base class.
        se = BaseSentence2VecModel(W2V)
        with self.assertRaises(NotImplementedError):
            se._do_train_job(None, None, None)
        with self.assertRaises(NotImplementedError):
            se._pre_train_calls()
        with self.assertRaises(NotImplementedError):
            se._post_train_calls()
        with self.assertRaises(NotImplementedError):
            se._check_parameter_sanity()
        with self.assertRaises(NotImplementedError):
            # NOTE(review): "_check_dtype_santiy" is a typo'd name in the library
            # API itself; keep in sync with base_s2v.
            se._check_dtype_santiy()
        with self.assertRaises(NotImplementedError):
            se._post_inference_calls()

    def test_check_pre_train_san_no_wv(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        se = BaseSentence2VecModel(ft)
        se.wv = None
        with self.assertRaises(RuntimeError):
            se._check_pre_training_sanity(1, 1, 1)

    def test_check_pre_train_san_no_wv_len(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        se = BaseSentence2VecModel(ft)
        se.wv.vectors = []
        with self.assertRaises(RuntimeError):
            se._check_pre_training_sanity(1, 1, 1)

    def test_check_pre_train_san_no_ngrams_vectors(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        se = BaseSentence2VecModel(ft)
        se.wv.vectors_ngrams = []
        with self.assertRaises(RuntimeError):
            se._check_pre_training_sanity(1, 1, 1)
        se.wv.vectors_ngrams = [1]
        se.wv.vectors_vocab = []
        with self.assertRaises(RuntimeError):
            se._check_pre_training_sanity(1, 1, 1)

    def test_check_pre_train_san_no_sv_vecs(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        se = BaseSentence2VecModel(ft)
        se.sv.vectors = None
        with self.assertRaises(RuntimeError):
            se._check_pre_training_sanity(1, 1, 1)

    def test_check_pre_train_san_no_word_weights(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        se = BaseSentence2VecModel(ft)
        se.word_weights = None
        with self.assertRaises(RuntimeError):
            se._check_pre_training_sanity(1, 1, 1)

    def test_check_pre_train_san_incos_len(self):
        # Word-weight length must match the vocabulary size.
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        se = BaseSentence2VecModel(ft)
        se.word_weights = np.ones(20)
        with self.assertRaises(RuntimeError):
            se._check_pre_training_sanity(1, 1, 1)

    def test_check_pre_train_dtypes(self):
        # Every array involved in training must be float32 (word weights included).
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        se = BaseSentence2VecModel(ft)
        se.wv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=np.float64)
        with self.assertRaises(TypeError):
            se._check_pre_training_sanity(1, 1, 1)
        se.wv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=np.float32)
        se.wv.vectors_ngrams = np.ones(len(se.wv.vocab), dtype=np.float16)
        with self.assertRaises(TypeError):
            se._check_pre_training_sanity(1, 1, 1)
        se.wv.vectors_ngrams = np.ones(len(se.wv.vocab), dtype=np.float32)
        se.wv.vectors_vocab = np.ones(len(se.wv.vocab), dtype=np.float16)
        with self.assertRaises(TypeError):
            se._check_pre_training_sanity(1, 1, 1)
        se.wv.vectors_vocab = np.ones(len(se.wv.vocab), dtype=np.float32)
        se.sv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=int)
        with self.assertRaises(TypeError):
            se._check_pre_training_sanity(1, 1, 1)
        se.sv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=np.float32)
        se.word_weights = np.ones(len(se.wv.vocab), dtype=bool)
        with self.assertRaises(TypeError):
            se._check_pre_training_sanity(1, 1, 1)
        se.word_weights = np.ones(len(se.wv.vocab), dtype=np.float32)

    def test_check_pre_train_statistics(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        se = BaseSentence2VecModel(ft)
        for v in se.wv.vocab:
            se.wv.vocab[v].count = 1
        # Just throws multiple warnings warning
        se._check_pre_training_sanity(1, 1, 1)
        # Zero sentences / words / vector size are each fatal.
        with self.assertRaises(ValueError):
            se._check_pre_training_sanity(0, 1, 1)
        with self.assertRaises(ValueError):
            se._check_pre_training_sanity(1, 0, 1)
        with self.assertRaises(ValueError):
            se._check_pre_training_sanity(1, 1, 0)

    def test_post_training_sanity(self):
        w2v = Word2Vec()
        w2v.build_vocab(SENTENCES)
        se = BaseSentence2VecModel(w2v)
        se.prep.prepare_vectors(se.sv, 20)
        with self.assertRaises(ValueError):
            se._check_post_training_sanity(0, 1)
        with self.assertRaises(ValueError):
            se._check_post_training_sanity(1, 0)

    def test_move_ndarray_to_disk_w2v(self):
        # Moving an array to a memmap must preserve its contents and make it read-only.
        se = BaseSentence2VecModel(W2V)
        p = Path("fse/test/test_data/test_vecs")
        p_target = Path("fse/test/test_data/test_vecs_wv.vectors")
        se.wv.vectors[0, 1] = 10
        vecs = se.wv.vectors.copy()
        output = se._move_ndarray_to_disk(se.wv.vectors, name="wv", mapfile_path=str(p.absolute()))
        self.assertTrue(p_target.exists())
        self.assertFalse(output.flags.writeable)
        self.assertTrue((vecs == output).all())
        p_target.unlink()

    def test_move_ndarray_to_disk_wo_file(self):
        se = BaseSentence2VecModel(W2V)
        with self.assertRaises(TypeError):
            output = se._move_ndarray_to_disk(se.wv.vectors)

    def test_move_w2v_vectors_to_disk_from_init(self):
        p = Path("fse/test/test_data/test_vecs")
        se = BaseSentence2VecModel(W2V, wv_mapfile_path=str(p.absolute()))
        p_target = Path("fse/test/test_data/test_vecs_wv.vectors")
        self.assertTrue(p_target.exists())
        self.assertFalse(se.wv.vectors.flags.writeable)
        p_target.unlink()

    def test_move_ft_vectors_to_disk_from_init(self):
        # FastText maps all three arrays (wv, ngrams, vocab) to disk.
        ft = FastText(min_count=1, size=DIM)
        ft.build_vocab(SENTENCES)
        p = Path("fse/test/test_data/test_vecs")
        p_target_wv = Path("fse/test/test_data/test_vecs_wv.vectors")
        p_target_ngram = Path("fse/test/test_data/test_vecs_ngrams.vectors")
        p_target_vocab = Path("fse/test/test_data/test_vecs_vocab.vectors")
        se = BaseSentence2VecModel(ft, wv_mapfile_path=str(p.absolute()))
        self.assertTrue(p_target_wv.exists())
        self.assertFalse(se.wv.vectors.flags.writeable)
        self.assertTrue(p_target_ngram.exists())
        self.assertFalse(se.wv.vectors_ngrams.flags.writeable)
        p_target_wv.unlink()
        p_target_ngram.unlink()
        p_target_vocab.unlink()

    def test_train_manager(self):
        # The manager sums per-job results across the worker threads:
        # 100 sentences -> job sums (100, 200, 300) given the stubbed job below.
        se = BaseSentence2VecModel(W2V, workers=2)

        def temp_train_job(data_iterable, target, memory):
            v1 = v2 = sum(1 for _ in data_iterable)
            return v1 * 2, v2 * 3
        se._do_train_job = temp_train_job
        job_output = se._train_manager(data_iterable=[IndexedSentence(s, i) for i, s in enumerate(SENTENCES)], total_sentences=len(SENTENCES), report_delay=0.01)
        self.assertEqual((100, 200, 300), job_output)

    def test_infer_method(self):
        # With a stub job that adds 1 per sentence, every output element is 100.
        se = BaseSentence2VecModel(W2V)

        def temp_train_job(data_iterable, target, memory):
            for i in data_iterable:
                target += 1
            return target

        def pass_method(**kwargs): pass
        se._post_inference_calls = pass_method
        se._do_train_job = temp_train_job
        output = se.infer([IndexedSentence(s, i) for i, s in enumerate(SENTENCES)])
        self.assertTrue((100 == output).all())

    def test_infer_many_to_one(self):
        # All sentences share index 0 -> a single output row of shape (1, DIM).
        se = BaseSentence2VecModel(W2V)

        def temp_train_job(data_iterable, target, memory):
            for i in data_iterable:
                target += 1
            return target

        def pass_method(**kwargs): pass
        se._post_inference_calls = pass_method
        se._do_train_job = temp_train_job
        output = se.infer([IndexedSentence(s, 0) for i, s in enumerate(SENTENCES)])
        self.assertTrue((100 == output).all())
        self.assertEqual((1, 5), output.shape)

    def test_infer_use_norm(self):
        # use_norm=True returns unit-length vectors.
        se = BaseSentence2VecModel(W2V)

        def temp_train_job(data_iterable, target, memory):
            for i in data_iterable:
                target += 1
            return target

        def pass_method(**kwargs): pass
        se._post_inference_calls = pass_method
        se._do_train_job = temp_train_job
        output = se.infer([IndexedSentence(s, i) for i, s in enumerate(SENTENCES)], use_norm=True)
        self.assertTrue(np.allclose(1., np.sqrt(np.sum(output[0]**2))))
class TestBaseSentence2VecPreparerFunctions(unittest.TestCase):
    """Tests for BaseSentence2VecPreparer: allocation, growth and memmapping
    of the sentence-vector array."""

    def test_reset_vectors(self):
        # Fresh allocation: zeroed float32 array of the requested length.
        se = BaseSentence2VecModel(W2V)
        trainables = BaseSentence2VecPreparer()
        trainables.reset_vectors(se.sv, 20)
        self.assertEqual((20, DIM), se.sv.vectors.shape)
        self.assertEqual(np.float32, se.sv.vectors.dtype)
        self.assertTrue((np.zeros((20, DIM)) == se.sv.vectors).all())
        self.assertTrue(se.sv.vectors_norm is None)

    def test_reset_vectors_memmap(self):
        # Same as above, but backed by an on-disk memmap file.
        p = Path("fse/test/test_data/test_vectors")
        p_target = Path("fse/test/test_data/test_vectors.vectors")
        se = BaseSentence2VecModel(W2V, sv_mapfile_path=str(p.absolute()))
        trainables = BaseSentence2VecPreparer()
        trainables.reset_vectors(se.sv, 20)
        self.assertTrue(p_target.exists())
        self.assertEqual((20, DIM), se.sv.vectors.shape)
        self.assertEqual(np.float32, se.sv.vectors.dtype)
        self.assertTrue((np.zeros((20, DIM)) == se.sv.vectors).all())
        self.assertTrue(se.sv.vectors_norm is None)
        p_target.unlink()

    def test_update_vectors(self):
        # Growing keeps existing rows and appends zeroed rows.
        se = BaseSentence2VecModel(W2V)
        trainables = BaseSentence2VecPreparer()
        trainables.reset_vectors(se.sv, 20)
        se.sv.vectors[:] = 1.
        trainables.update_vectors(se.sv, 10)
        self.assertEqual((30, DIM), se.sv.vectors.shape)
        self.assertEqual(np.float32, se.sv.vectors.dtype)
        self.assertTrue((np.ones((20, DIM)) == se.sv.vectors[:20]).all())
        self.assertTrue((np.zeros((10, DIM)) == se.sv.vectors[20:]).all())
        self.assertTrue(se.sv.vectors_norm is None)

    def test_update_vectors_memmap(self):
        # Growing a memmapped array preserves the backing file and old data.
        p = Path("fse/test/test_data/test_vectors")
        p_target = Path("fse/test/test_data/test_vectors.vectors")
        se = BaseSentence2VecModel(W2V, sv_mapfile_path=str(p.absolute()))
        trainables = BaseSentence2VecPreparer()
        trainables.reset_vectors(se.sv, 20)
        se.sv.vectors[:] = 1.
        trainables.update_vectors(se.sv, 10)
        self.assertTrue(p_target.exists())
        self.assertEqual((30, DIM), se.sv.vectors.shape)
        self.assertEqual(np.float32, se.sv.vectors.dtype)
        self.assertTrue((np.ones((20, DIM)) == se.sv.vectors[:20]).all())
        self.assertTrue((np.zeros((10, DIM)) == se.sv.vectors[20:]).all())
        self.assertTrue(se.sv.vectors_norm is None)
        p_target.unlink()

    def test_prepare_vectors(self):
        # prepare_vectors dispatches to reset (update=False) or update (update=True).
        se = BaseSentence2VecModel(W2V)
        trainables = BaseSentence2VecPreparer()
        trainables.prepare_vectors(se.sv, 20, update=False)
        self.assertEqual((20, DIM), se.sv.vectors.shape)
        trainables.prepare_vectors(se.sv, 40, update=True)
        self.assertEqual((60, DIM), se.sv.vectors.shape)
# Allow running this test module directly with verbose debug logging.
if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
    unittest.main()
| #!/usr/bin/env python | ||
| # -*- coding: utf-8 -*- | ||
| # Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| # Copyright (C) 2019 Oliver Borchers | ||
| """ | ||
| Automated tests for checking the input methods. | ||
| """ | ||
| import logging | ||
| import unittest | ||
| from fse.inputs import IndexedSentence, IndexedList, IndexedLineDocument | ||
| logger = logging.getLogger(__name__) | ||
class TestIndexedSentenceFunctions(unittest.TestCase):
    """Tests for the IndexedSentence named tuple (words + index fields)."""

    def test__str(self):
        sentences = [
            IndexedSentence(["Hello", "there"], 0),
            IndexedSentence(["Hello", "again"], 1),
        ]
        # Each tuple must retain its index and its word list unchanged.
        for expected_index, sent in enumerate(sentences):
            self.assertEqual(expected_index, sent.index)
        self.assertEqual(["Hello", "there"], sentences[0].words)
        self.assertEqual(["Hello", "again"], sentences[1].words)
class TestIndexedListFuncs(unittest.TestCase):
    """Tests for IndexedList: construction from lists/sets, splitting options,
    and the MutableSequence protocol (get/set/del/append/extend).

    Fix vs. original: test_getitem_presplitted asserted on ``self.il`` instead
    of the freshly built pre-split list ``l``, so the pre_splitted path was
    never actually exercised.
    """

    def setUp(self):
        # Two raw string lists, one pre-split list, and a set of strings.
        self.list_a = ["the dog is good", "it's nice and comfy"]
        self.list_b = ["lorem ipsum dolor", "si amet"]
        self.list_c = [s.split() for s in self.list_a]
        self.set_a = set(["hello there", "its a set"])
        self.il = IndexedList(self.list_a, self.list_b, self.set_a, split=True)

    def test_init_list(self):
        # Smoke test: construction from a single list must not raise.
        l = IndexedList(self.list_a)

    def test_init_multiple_list(self):
        l = IndexedList(self.list_a, self.list_b)
        self.assertEqual(4, len(l))

    def test_init_set(self):
        # Smoke test: construction from a set must not raise.
        l = IndexedList(self.set_a)

    def test_init_dict(self):
        tmp = {0: "hello there"}
        with self.assertRaises(TypeError):
            IndexedList(tmp)

    def test_init_multiple_args(self):
        # split and split_func are mutually exclusive.
        with self.assertRaises(RuntimeError):
            IndexedList(self.list_a, split=True, split_func=self.list_a)

    def test_init_multiple_splits(self):
        # split_func and pre_splitted are mutually exclusive.
        with self.assertRaises(RuntimeError):
            IndexedList(self.list_a, split_func=self.list_a, pre_splitted=True)

    def test__len(self):
        l = IndexedList(self.list_a)
        self.assertEqual(2, len(l))

    def test__str(self):
        target = "[\'the dog is good\', \"it's nice and comfy\"]"
        self.assertEqual(target, str(IndexedList(self.list_a)))

    def test_getitem(self):
        self.assertEqual(["the", "dog", "is", "good"], self.il.__getitem__(0).words)
        self.assertEqual(0, self.il.__getitem__(0).index)

    def test_getitem_presplitted(self):
        l = IndexedList(self.list_c, pre_splitted=True)
        # Fixed: assert on the pre-split list, not on self.il.
        self.assertEqual(["the", "dog", "is", "good"], l.__getitem__(0).words)

    def test_delitem(self):
        self.il.__delitem__(0)
        self.assertEqual(5, len(self.il))

    def test_setitem(self):
        self.il.__setitem__(0, "is it me?")
        self.assertEqual(["is", "it", "me?"], self.il[0].words)

    def test_setitem_wrong_dtype(self):
        # Only raw strings may be assigned; token lists are rejected.
        with self.assertRaises(TypeError):
            self.il.__setitem__(0, ["is it me?"])

    def test_append(self):
        self.il.append("is it me?")
        self.assertEqual(["is", "it", "me?"], self.il[-1].words)

    def test_extend(self):
        self.il.extend(self.list_a, self.list_b)
        self.assertEqual(10, len(self.il))
class TestIndexedLineDocument(unittest.TestCase):
    """Tests for IndexedLineDocument: random access by line and iteration."""

    def setUp(self):
        self.p = "fse/test/test_data/test_sentences.txt"
        self.doc = IndexedLineDocument(self.p)

    def test_getitem(self):
        # Line-number indexing, including negative indices for the last line.
        expected_lines = {
            0: "Good stuff i just wish it lasted longer",
            19: "Save yourself money and buy it direct from lg",
            -1: "I am not sure if it is a tracfone problem or the battery",
        }
        for line_no, text in expected_lines.items():
            self.assertEqual(text, self.doc[line_no])

    def test_yield(self):
        # Iteration yields IndexedSentence objects in file order.
        expected = {
            0: IndexedSentence("Good stuff i just wish it lasted longer".split(), 0),
            99: IndexedSentence("I am not sure if it is a tracfone problem or the battery".split(), 99),
        }
        for pos, obj in enumerate(self.doc):
            if pos in expected:
                self.assertEqual(expected[pos], obj)
# Allow running this test module directly with verbose debug logging.
if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
    unittest.main()
| #!/usr/bin/env python | ||
| # -*- coding: utf-8 -*- | ||
| # Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| # Copyright (C) 2019 Oliver Borchers | ||
| """ | ||
| Automated tests for checking the sentence vectors. | ||
| """ | ||
| import logging | ||
| import unittest | ||
| from pathlib import Path | ||
| import numpy as np | ||
| from fse.models.sentencevectors import SentenceVectors | ||
| from fse.models.average import Average | ||
| from fse.inputs import IndexedSentence, IndexedList, IndexedLineDocument | ||
| from gensim.models import Word2Vec | ||
| logger = logging.getLogger(__name__) | ||
# Shared module-level fixtures: seeded Word2Vec model over the test corpus
# with deterministic random word vectors (seeds fixed for reproducibility).
CORPUS = Path("fse/test/test_data/test_sentences.txt")
DIM = 5
W2V = Word2Vec(min_count=1, size=DIM, seed=42)
# Fix vs. original: the corpus file handle was opened and never closed —
# read it via a context manager.
with open(CORPUS, "r") as corpus_file:
    SENTENCES = [line.split() for line in corpus_file]
W2V.build_vocab(SENTENCES)
np.random.seed(42)
W2V.wv.vectors = np.random.uniform(size=W2V.wv.vectors.shape).astype(np.float32)
| class TestSentenceVectorsFunctions(unittest.TestCase): | ||
def setUp(self):
    # Tiny deterministic fixture: 5 sentence vectors of dimensionality 2;
    # rows are [0,1], [2,3], ..., [8,9].
    self.sv = SentenceVectors(2)
    self.sv.vectors = np.arange(10).reshape(5, 2)
def test_getitem(self):
    """Integer and list indexing return the corresponding vector rows."""
    single = self.sv[0]
    self.assertTrue(([0, 1] == single).all())
    fancy = self.sv[[0, 2]]
    self.assertTrue(([[0, 1], [4, 5]] == fancy).all())
def test_isin(self):
    """Membership is defined over valid sentence indices (0..4 here)."""
    self.assertIn(0, self.sv)
    self.assertNotIn(5, self.sv)
def test_init_sims_wo_replace(self):
    """init_sims() fills vectors_norm with unit-length copies, leaving vectors untouched."""
    self.sv.init_sims()
    self.assertIsNotNone(self.sv.vectors_norm)
    self.assertFalse((self.sv.vectors == self.sv.vectors_norm).all())
    # Each normed row equals the raw row divided by its Euclidean length.
    for row in (0, 1):
        raw = self.sv.vectors[row]
        unit = raw / np.sqrt(np.sum(raw ** 2))
        self.assertTrue(np.allclose(unit, self.sv.vectors_norm[row]))
    second = self.sv.vectors[1]
    second = second / np.sqrt(np.sum(second ** 2))
    self.assertTrue(np.allclose(second, self.sv.get_vector(1, True)))
def test_get_vector(self):
    """get_vector returns the raw (unnormalized) row for a given index."""
    expected_rows = {0: [0, 1], 1: [2, 3]}
    for idx, row in expected_rows.items():
        self.assertTrue((row == self.sv.get_vector(idx)).all())
def test_init_sims_w_replace(self):
    """With replace=True the raw vectors themselves are normalized in place."""
    self.sv.init_sims(True)
    self.assertTrue(np.array_equal(self.sv.vectors[0], self.sv.vectors_norm[0]))
def test_init_sims_w_mapfile(self):
    """With a mapfile path set, init_sims persists the norms to disk.

    Fix vs. original: the variable ``p`` was rebound to a second, different
    path mid-test, and the norm file leaked when the existence assertion
    failed — distinct names and a ``finally`` cleanup are used instead.
    """
    p_map = Path("fse/test/test_data/test_vectors")
    p_norm = Path("fse/test/test_data/test_vectors.vectors_norm")
    self.sv.mapfile_path = str(p_map.absolute())
    try:
        self.sv.init_sims()
        self.assertTrue(p_norm.exists())
    finally:
        if p_norm.exists():
            p_norm.unlink()
def test_save_load(self):
    """Saving and reloading SentenceVectors preserves the vector array.

    Fix vs. original: the saved file is now removed in a ``finally`` block,
    so a failing assertion no longer leaves a stale file on disk.
    """
    p = Path("fse/test/test_data/test_vectors.vectors")
    try:
        self.sv.save(str(p.absolute()))
        self.assertTrue(p.exists())
        sv2 = SentenceVectors.load(str(p.absolute()))
        self.assertTrue((self.sv.vectors == sv2.vectors).all())
    finally:
        if p.exists():
            p.unlink()
def test_save_load_with_memmap(self):
    """Saving memmapped sentence vectors must not dump an extra .npy copy.

    A memmap file is written manually to mirror what the mapfile path implies,
    then save/load must keep using it instead of serializing the array.
    """
    p = Path("fse/test/test_data/test_vectors")
    p_target = Path("fse/test/test_data/test_vectors.vectors")
    p_not_exists = Path("fse/test/test_data/test_vectors.vectors.npy")
    sv = SentenceVectors(2, mapfile_path=str(p))
    shape = (1000, 1000)
    sv.vectors = np.ones(shape, dtype=np.float32)
    # Write the array into the memmap backing file by hand.
    memvecs = np.memmap(
        p_target, dtype=np.float32,
        mode='w+', shape=shape)
    memvecs[:] = sv.vectors[:]
    del memvecs  # flush and close the memmap handle
    self.assertTrue(p_target.exists())
    sv.save(str(p.absolute()))
    self.assertTrue(p.exists())
    self.assertFalse(p_not_exists.exists())
    sv = SentenceVectors.load(str(p.absolute()))
    self.assertEqual(shape, sv.vectors.shape)
    for t in [p, p_target]:
        t.unlink()
def test_len(self):
    """__len__ reports the number of stored sentence vectors."""
    expected_count = 5
    self.assertEqual(expected_count, len(self.sv))
def test_similarity(self):
    """similarity is the cosine of the two rows; distance is 1 - similarity."""
    def unit(vec):
        # Normalize to unit Euclidean length.
        return vec / np.sqrt(np.sum(vec ** 2))

    cosine = unit(self.sv.vectors[0]).dot(unit(self.sv.vectors[1]))
    self.assertEqual(cosine, self.sv.similarity(0, 1))
    self.assertEqual(1 - (cosine), self.sv.distance(0, 1))
def test_most_similar(self):
    """most_similar returns nearest sentence indices; an indexable maps them back to text.

    Expected neighbor indices (45, 35) are fixed by the seeded W2V fixture.
    """
    sent_ind = IndexedList(SENTENCES, pre_splitted=True)
    sentences = IndexedLineDocument(CORPUS)
    m = Average(W2V)
    m.train(sentences)
    o = m.sv.most_similar(positive=0)
    self.assertEqual(45, o[0][0])
    self.assertEqual(35, o[1][0])
    # With an indexable, results are resolved to the original sentences.
    o = m.sv.most_similar(positive=0, indexable=sentences)
    self.assertEqual("Looks good and fits snug", o[0][0])
    o = m.sv.most_similar(positive=0, indexable=sent_ind)
    self.assertEqual("Looks good and fits snug".split(), o[0][0].words)
| def test_most_similar_vec(self): | ||
| sentences = IndexedLineDocument(CORPUS) | ||
| m = Average(W2V) | ||
| m.train(sentences) | ||
| m.sv.init_sims() | ||
| v = m.sv.get_vector(0, use_norm=True) | ||
| o = m.sv.most_similar(positive=v) | ||
| # Includes 0 obviously | ||
| self.assertEqual(45, o[1][0]) | ||
| self.assertEqual(35, o[2][0]) | ||
| def test_most_similar_vecs(self): | ||
| sentences = IndexedLineDocument(CORPUS) | ||
| m = Average(W2V) | ||
| m.train(sentences) | ||
| m.sv.init_sims() | ||
| v = m.sv[[0,1]] | ||
| o = m.sv.most_similar(positive=v) | ||
| self.assertEqual(1, o[0][0]) | ||
| self.assertEqual(0, o[1][0]) | ||
| def test_most_similar_wrong_indexable(self): | ||
| def indexable(self): | ||
| pass | ||
| sentences = IndexedLineDocument(CORPUS) | ||
| m = Average(W2V) | ||
| m.train(sentences) | ||
| with self.assertRaises(RuntimeError): | ||
| m.sv.most_similar(positive=0, indexable=indexable) | ||
| def test_most_similar_topn(self): | ||
| sentences = IndexedLineDocument(CORPUS) | ||
| m = Average(W2V) | ||
| m.train(sentences) | ||
| o = m.sv.most_similar(positive=0, topn=20) | ||
| self.assertEqual(20, len(o)) | ||
| def test_most_similar_restrict_size(self): | ||
| sentences = IndexedLineDocument(CORPUS) | ||
| m = Average(W2V) | ||
| m.train(sentences) | ||
| o = m.sv.most_similar(positive=20, topn=20, restrict_size=5) | ||
| self.assertEqual(5, len(o)) | ||
| def test_most_similar_restrict_size_tuple(self): | ||
| sentences = IndexedLineDocument(CORPUS) | ||
| m = Average(W2V) | ||
| m.train(sentences) | ||
| o = m.sv.most_similar(positive=20, topn=20, restrict_size=(5, 25)) | ||
| self.assertEqual(19, len(o)) | ||
| self.assertEqual(22, o[0][0]) | ||
| o = m.sv.most_similar(positive=1, topn=20, restrict_size=(5, 25)) | ||
| self.assertEqual(20, len(o)) | ||
| self.assertEqual(9, o[0][0]) | ||
| o = m.sv.most_similar(positive=1, topn=20, restrict_size=(5, 25), indexable=sentences) | ||
| self.assertEqual(20, len(o)) | ||
| self.assertEqual(9, o[0][1]) | ||
| def test_similar_by_word(self): | ||
| sentences = IndexedLineDocument(CORPUS) | ||
| m = Average(W2V) | ||
| m.train(sentences) | ||
| o = m.sv.similar_by_word(word="the", wv=m.wv) | ||
| self.assertEqual(96, o[0][0]) | ||
| o = m.sv.similar_by_word(word="the", wv=m.wv, indexable=sentences) | ||
| self.assertEqual(96, o[0][1]) | ||
| def test_similar_by_vector(self): | ||
| sentences = IndexedLineDocument(CORPUS) | ||
| m = Average(W2V) | ||
| m.train(sentences) | ||
| o = m.sv.similar_by_vector(m.wv["the"]) | ||
| self.assertEqual(96, o[0][0]) | ||
| def test_similar_by_sentence(self): | ||
| sentences = IndexedLineDocument(CORPUS) | ||
| m = Average(W2V) | ||
| m.train(sentences) | ||
| o = m.sv.similar_by_sentence(sentence=["the", "product", "is", "good"], model=m) | ||
| self.assertEqual(4, o[0][0]) | ||
if __name__ == '__main__':
    # Verbose logging aids debugging when the suite is run directly.
    logging.basicConfig(level=logging.DEBUG, format='%(asctime)s : %(levelname)s : %(message)s')
    unittest.main()
| import logging | ||
| import unittest | ||
| from pathlib import Path | ||
| import numpy as np | ||
| from fse.models.sif import SIF, compute_principal_components, remove_principal_components | ||
| from fse.inputs import IndexedLineDocument | ||
| from gensim.models import Word2Vec | ||
# Module-level logger for test diagnostics.
logger = logging.getLogger(__name__)

# Shared fixtures: a small corpus and an untrained Word2Vec vocabulary.
CORPUS = Path("fse/test/test_data/test_sentences.txt")
DIM = 50
W2V = Word2Vec(min_count=1, size=DIM)

# Read the corpus with a context manager so the file handle is closed
# deterministically (the original bare `open` in the comprehension leaked it).
with open(CORPUS, "r") as corpus_file:
    SENTENCES = [line.split() for line in corpus_file]
W2V.build_vocab(SENTENCES)
class TestSIFFunctions(unittest.TestCase):
    """Unit tests for the SIF sentence-embedding model.

    Fix over the original: the unused local ``cs = 1095661426`` in
    ``test_compute_sif_weights`` has been removed.
    """

    def setUp(self):
        self.sentences = IndexedLineDocument(CORPUS)
        self.model = SIF(W2V, lang_freq="en")

    def test_parameter_sanity(self):
        """Invalid alpha, components, or out-of-range word weights raise ValueError."""
        with self.assertRaises(ValueError):
            m = SIF(W2V, alpha=-1)
            m._check_parameter_sanity()
        with self.assertRaises(ValueError):
            m = SIF(W2V, components=-1)
            m._check_parameter_sanity()
        with self.assertRaises(ValueError):
            m = SIF(W2V)
            # Push the weights out of the accepted range.
            m.word_weights = np.ones_like(m.word_weights) + 2
            m._check_parameter_sanity()

    def test_pre_train_calls(self):
        """Pre-train hook runs without error on a fresh model."""
        self.model._pre_train_calls()

    def test_post_train_calls(self):
        """Principal-component removal zeroes out a constant vector matrix."""
        self.model.sv.vectors = np.ones((200, 10), dtype=np.float32)
        self.model._post_train_calls()
        self.assertTrue(np.allclose(self.model.sv.vectors, 0, atol=1e-5))

    def test_post_train_calls_no_removal(self):
        """With components=0 the vectors pass through unchanged."""
        self.model.components = 0
        self.model.sv.vectors = np.ones((200, 10), dtype=np.float32)
        self.model._post_train_calls()
        self.assertTrue(np.allclose(self.model.sv.vectors, 1, atol=1e-5))

    def test_post_inference_calls(self):
        """Inference output is cleaned with the components fitted at train time."""
        self.model.sv.vectors = np.ones((200, 10), dtype=np.float32)
        self.model._post_train_calls()
        output = np.ones((200, 10), dtype=np.float32)
        self.model._post_inference_calls(output=output)
        self.assertTrue(np.allclose(output, 0, atol=1e-5))

    def test_post_inference_calls_no_svd(self):
        """Inference without a fitted SVD result raises RuntimeError."""
        self.model.sv.vectors = np.ones((200, 10), dtype=np.float32)
        self.model.svd_res = None
        with self.assertRaises(RuntimeError):
            self.model._post_inference_calls(output=None)

    def test_post_inference_calls_no_removal(self):
        """With components=0 inference must not alter the vectors."""
        self.model.components = 0
        self.model.sv.vectors = np.ones((200, 10), dtype=np.float32)
        self.model._post_train_calls()
        self.model._post_inference_calls(output=None)
        self.assertTrue(np.allclose(self.model.sv.vectors, 1, atol=1e-5))

    def test_dtype_sanity_word_weights(self):
        """Non-float32 word weights fail the dtype check."""
        self.model.word_weights = np.ones_like(self.model.word_weights, dtype=int)
        with self.assertRaises(TypeError):
            self.model._check_dtype_santiy()

    def test_dtype_sanity_svd_vals(self):
        """A non-float32 first SVD tuple element fails the dtype check."""
        self.model.svd_res = (
            np.ones_like(self.model.word_weights, dtype=int),
            np.array(0, dtype=np.float32),
        )
        with self.assertRaises(TypeError):
            self.model._check_dtype_santiy()

    def test_dtype_sanity_svd_vecs(self):
        """A non-float32 second SVD tuple element fails the dtype check."""
        self.model.svd_res = (
            np.array(0, dtype=np.float32),
            np.ones_like(self.model.word_weights, dtype=int),
        )
        with self.assertRaises(TypeError):
            self.model._check_dtype_santiy()

    def test_compute_sif_weights(self):
        """The SIF weight of a word equals alpha / (alpha + p(w))."""
        w = "Good"
        pw = 1.916650481770269e-08  # precomputed probability of `w` — TODO confirm source
        alpha = self.model.alpha
        sif = alpha / (alpha + pw)
        idx = self.model.wv.vocab[w].index
        self.model._compute_sif_weights()
        self.assertTrue(np.allclose(self.model.word_weights[idx], sif))

    def test_train(self):
        """Training reports (effective sentences, words) and yields finite vectors."""
        output = self.model.train(self.sentences)
        self.assertEqual((100, 1450), output)
        self.assertTrue(np.isfinite(self.model.sv.vectors).all())
if __name__ == '__main__':
    # Verbose logging aids debugging when the suite is run directly.
    logging.basicConfig(level=logging.DEBUG, format='%(asctime)s : %(levelname)s : %(message)s')
    unittest.main()
| import logging | ||
| import unittest | ||
| from pathlib import Path | ||
| import numpy as np | ||
| from fse.models.usif import uSIF | ||
| from fse.inputs import IndexedLineDocument | ||
| from gensim.models import Word2Vec | ||
# Module-level logger for test diagnostics.
logger = logging.getLogger(__name__)

# Shared fixtures: a small corpus and an untrained Word2Vec vocabulary.
CORPUS = Path("fse/test/test_data/test_sentences.txt")
DIM = 50
W2V = Word2Vec(min_count=1, size=DIM)

# Read the corpus with a context manager so the file handle is closed
# deterministically (the original bare `open` in the comprehension leaked it).
with open(CORPUS, "r") as corpus_file:
    SENTENCES = [line.split() for line in corpus_file]
W2V.build_vocab(SENTENCES)
class TestuSIFFunctions(unittest.TestCase):
    """Unit tests for the uSIF sentence-embedding model."""

    def setUp(self):
        self.sentences = IndexedLineDocument(CORPUS)
        self.model = uSIF(W2V, lang_freq="en")

    def test_parameter_sanity(self):
        """Zero length or negative component count must raise ValueError."""
        with self.assertRaises(ValueError):
            bad = uSIF(W2V, length=0)
            bad._check_parameter_sanity()
        with self.assertRaises(ValueError):
            bad = uSIF(W2V, components=-1, length=11)
            bad._check_parameter_sanity()

    def test_pre_train_calls(self):
        """The pre-train hook stores the average sentence length on the model."""
        self.model._pre_train_calls(**{"average_length": 10})
        self.assertEqual(10, self.model.length)

    def test_post_train_calls(self):
        """Principal-component removal zeroes out a constant vector matrix."""
        self.model.sv.vectors = np.ones((200, 10), dtype=np.float32)
        self.model._post_train_calls()
        self.assertTrue(np.allclose(self.model.sv.vectors, 0, atol=1e-5))

    def test_post_train_calls_no_removal(self):
        """With components=0 the vectors pass through unchanged."""
        self.model.components = 0
        self.model.sv.vectors = np.ones((200, 10), dtype=np.float32)
        self.model._post_train_calls()
        self.assertTrue(np.allclose(self.model.sv.vectors, 1, atol=1e-5))

    def test_post_inference_calls(self):
        """Inference output is cleaned with the components fitted at train time."""
        self.model.sv.vectors = np.ones((200, 10), dtype=np.float32)
        self.model._post_train_calls()
        output = np.ones((200, 10), dtype=np.float32)
        self.model._post_inference_calls(output=output)
        self.assertTrue(np.allclose(output, 0, atol=1e-5))

    def test_post_inference_calls_no_svd(self):
        """Inference without a fitted SVD result raises RuntimeError."""
        self.model.sv.vectors = np.ones((200, 10), dtype=np.float32)
        self.model.svd_res = None
        with self.assertRaises(RuntimeError):
            self.model._post_inference_calls(output=None)

    def test_post_inference_calls_no_removal(self):
        """With components=0 inference must not alter the vectors."""
        self.model.components = 0
        self.model.sv.vectors = np.ones((200, 10), dtype=np.float32)
        self.model._post_train_calls()
        self.model._post_inference_calls(output=None)
        self.assertTrue(np.allclose(self.model.sv.vectors, 1, atol=1e-5))

    def test_dtype_sanity_word_weights(self):
        """Non-float32 word weights fail the dtype check."""
        self.model.word_weights = np.ones_like(self.model.word_weights, dtype=int)
        with self.assertRaises(TypeError):
            self.model._check_dtype_santiy()

    def test_dtype_sanity_svd_vals(self):
        """A non-float32 first SVD tuple element fails the dtype check."""
        self.model.svd_res = (
            np.ones_like(self.model.word_weights, dtype=int),
            np.array(0, dtype=np.float32),
        )
        with self.assertRaises(TypeError):
            self.model._check_dtype_santiy()

    def test_dtype_sanity_svd_vecs(self):
        """A non-float32 second SVD tuple element fails the dtype check."""
        self.model.svd_res = (
            np.array(0, dtype=np.float32),
            np.ones_like(self.model.word_weights, dtype=int),
        )
        with self.assertRaises(TypeError):
            self.model._check_dtype_santiy()

    def test_compute_usif_weights(self):
        """The uSIF weight of a word equals a / (a/2 + p(w))."""
        w = "Good"
        pw = 1.916650481770269e-08  # precomputed probability of `w` — TODO confirm source
        idx = self.model.wv.vocab[w].index
        self.model.length = 11
        a = 0.17831555484795414  # expected `a` parameter for length 11
        expected = a / ((a / 2) + pw)
        self.model._compute_usif_weights()
        self.assertTrue(np.allclose(self.model.word_weights[idx], expected))

    def test_train(self):
        """Training reports (effective sentences, words) and yields finite vectors."""
        output = self.model.train(self.sentences)
        self.assertEqual((100, 1450), output)
        self.assertTrue(np.isfinite(self.model.sv.vectors).all())
if __name__ == '__main__':
    # Verbose logging aids debugging when the suite is run directly.
    logging.basicConfig(level=logging.DEBUG, format='%(asctime)s : %(levelname)s : %(message)s')
    unittest.main()
| import logging | ||
| import unittest | ||
| import numpy as np | ||
| from fse.models.utils import compute_principal_components, remove_principal_components | ||
# Module-level logger for test diagnostics.
logger = logging.getLogger(__name__)
class TestUtils(unittest.TestCase):
    """Tests for the principal-component computation and removal helpers."""

    def test_compute_components(self):
        """The helper returns a 2-tuple whose second element holds float32 singular data."""
        data = np.random.uniform(size=(500, 10)).astype(np.float32)
        result = compute_principal_components(vectors=data)
        self.assertEqual(2, len(result))
        self.assertEqual(1, len(result[1]))
        self.assertEqual(np.float32, result[1].dtype)

        # Requesting more components grows the second element accordingly.
        data = np.random.uniform(size=(500, 10))
        result = compute_principal_components(vectors=data, components=5)
        self.assertEqual(2, len(result))
        self.assertEqual(5, len(result[1]))

    def test_remove_components_inplace(self):
        """In-place removal of the sole component zeroes a constant matrix."""
        data = np.ones((500, 10), dtype=np.float32)
        svd = compute_principal_components(vectors=data)
        remove_principal_components(data, svd_res=svd)
        self.assertTrue(np.allclose(0.0, data, atol=1e-5))

    def test_remove_components(self):
        """Exercise the inplace=False path on a constant matrix."""
        data = np.ones((500, 10), dtype=np.float32)
        svd = compute_principal_components(vectors=data)
        result = remove_principal_components(data, svd_res=svd, inplace=False)
        self.assertTrue(np.allclose(1.0, result, atol=1e-5))

    def test_remove_weighted_components(self):
        """Weighted removal subtracts only part of the component (0.75 remains here)."""
        data = np.ones((500, 10), dtype=np.float32)
        svd = compute_principal_components(vectors=data)
        remove_principal_components(data, svd_res=svd, weights=np.array([0.5]))
        self.assertTrue(np.allclose(0.75, data))
if __name__ == '__main__':
    # Verbose logging aids debugging when the suite is run directly.
    logging.basicConfig(level=logging.DEBUG, format='%(asctime)s : %(levelname)s : %(message)s')
    unittest.main()
+674
| GNU GENERAL PUBLIC LICENSE | ||
| Version 3, 29 June 2007 | ||
| Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/> | ||
| Everyone is permitted to copy and distribute verbatim copies | ||
| of this license document, but changing it is not allowed. | ||
| Preamble | ||
| The GNU General Public License is a free, copyleft license for | ||
| software and other kinds of works. | ||
| The licenses for most software and other practical works are designed | ||
| to take away your freedom to share and change the works. By contrast, | ||
| the GNU General Public License is intended to guarantee your freedom to | ||
| share and change all versions of a program--to make sure it remains free | ||
| software for all its users. We, the Free Software Foundation, use the | ||
| GNU General Public License for most of our software; it applies also to | ||
| any other work released this way by its authors. You can apply it to | ||
| your programs, too. | ||
| When we speak of free software, we are referring to freedom, not | ||
| price. Our General Public Licenses are designed to make sure that you | ||
| have the freedom to distribute copies of free software (and charge for | ||
| them if you wish), that you receive source code or can get it if you | ||
| want it, that you can change the software or use pieces of it in new | ||
| free programs, and that you know you can do these things. | ||
| To protect your rights, we need to prevent others from denying you | ||
| these rights or asking you to surrender the rights. Therefore, you have | ||
| certain responsibilities if you distribute copies of the software, or if | ||
| you modify it: responsibilities to respect the freedom of others. | ||
| For example, if you distribute copies of such a program, whether | ||
| gratis or for a fee, you must pass on to the recipients the same | ||
| freedoms that you received. You must make sure that they, too, receive | ||
| or can get the source code. And you must show them these terms so they | ||
| know their rights. | ||
| Developers that use the GNU GPL protect your rights with two steps: | ||
| (1) assert copyright on the software, and (2) offer you this License | ||
| giving you legal permission to copy, distribute and/or modify it. | ||
| For the developers' and authors' protection, the GPL clearly explains | ||
| that there is no warranty for this free software. For both users' and | ||
| authors' sake, the GPL requires that modified versions be marked as | ||
| changed, so that their problems will not be attributed erroneously to | ||
| authors of previous versions. | ||
| Some devices are designed to deny users access to install or run | ||
| modified versions of the software inside them, although the manufacturer | ||
| can do so. This is fundamentally incompatible with the aim of | ||
| protecting users' freedom to change the software. The systematic | ||
| pattern of such abuse occurs in the area of products for individuals to | ||
| use, which is precisely where it is most unacceptable. Therefore, we | ||
| have designed this version of the GPL to prohibit the practice for those | ||
| products. If such problems arise substantially in other domains, we | ||
| stand ready to extend this provision to those domains in future versions | ||
| of the GPL, as needed to protect the freedom of users. | ||
| Finally, every program is threatened constantly by software patents. | ||
| States should not allow patents to restrict development and use of | ||
| software on general-purpose computers, but in those that do, we wish to | ||
| avoid the special danger that patents applied to a free program could | ||
| make it effectively proprietary. To prevent this, the GPL assures that | ||
| patents cannot be used to render the program non-free. | ||
| The precise terms and conditions for copying, distribution and | ||
| modification follow. | ||
| TERMS AND CONDITIONS | ||
| 0. Definitions. | ||
| "This License" refers to version 3 of the GNU General Public License. | ||
| "Copyright" also means copyright-like laws that apply to other kinds of | ||
| works, such as semiconductor masks. | ||
| "The Program" refers to any copyrightable work licensed under this | ||
| License. Each licensee is addressed as "you". "Licensees" and | ||
| "recipients" may be individuals or organizations. | ||
| To "modify" a work means to copy from or adapt all or part of the work | ||
| in a fashion requiring copyright permission, other than the making of an | ||
| exact copy. The resulting work is called a "modified version" of the | ||
| earlier work or a work "based on" the earlier work. | ||
| A "covered work" means either the unmodified Program or a work based | ||
| on the Program. | ||
| To "propagate" a work means to do anything with it that, without | ||
| permission, would make you directly or secondarily liable for | ||
| infringement under applicable copyright law, except executing it on a | ||
| computer or modifying a private copy. Propagation includes copying, | ||
| distribution (with or without modification), making available to the | ||
| public, and in some countries other activities as well. | ||
| To "convey" a work means any kind of propagation that enables other | ||
| parties to make or receive copies. Mere interaction with a user through | ||
| a computer network, with no transfer of a copy, is not conveying. | ||
| An interactive user interface displays "Appropriate Legal Notices" | ||
| to the extent that it includes a convenient and prominently visible | ||
| feature that (1) displays an appropriate copyright notice, and (2) | ||
| tells the user that there is no warranty for the work (except to the | ||
| extent that warranties are provided), that licensees may convey the | ||
| work under this License, and how to view a copy of this License. If | ||
| the interface presents a list of user commands or options, such as a | ||
| menu, a prominent item in the list meets this criterion. | ||
| 1. Source Code. | ||
| The "source code" for a work means the preferred form of the work | ||
| for making modifications to it. "Object code" means any non-source | ||
| form of a work. | ||
| A "Standard Interface" means an interface that either is an official | ||
| standard defined by a recognized standards body, or, in the case of | ||
| interfaces specified for a particular programming language, one that | ||
| is widely used among developers working in that language. | ||
| The "System Libraries" of an executable work include anything, other | ||
| than the work as a whole, that (a) is included in the normal form of | ||
| packaging a Major Component, but which is not part of that Major | ||
| Component, and (b) serves only to enable use of the work with that | ||
| Major Component, or to implement a Standard Interface for which an | ||
| implementation is available to the public in source code form. A | ||
| "Major Component", in this context, means a major essential component | ||
| (kernel, window system, and so on) of the specific operating system | ||
| (if any) on which the executable work runs, or a compiler used to | ||
| produce the work, or an object code interpreter used to run it. | ||
| The "Corresponding Source" for a work in object code form means all | ||
| the source code needed to generate, install, and (for an executable | ||
| work) run the object code and to modify the work, including scripts to | ||
| control those activities. However, it does not include the work's | ||
| System Libraries, or general-purpose tools or generally available free | ||
| programs which are used unmodified in performing those activities but | ||
| which are not part of the work. For example, Corresponding Source | ||
| includes interface definition files associated with source files for | ||
| the work, and the source code for shared libraries and dynamically | ||
| linked subprograms that the work is specifically designed to require, | ||
| such as by intimate data communication or control flow between those | ||
| subprograms and other parts of the work. | ||
| The Corresponding Source need not include anything that users | ||
| can regenerate automatically from other parts of the Corresponding | ||
| Source. | ||
| The Corresponding Source for a work in source code form is that | ||
| same work. | ||
| 2. Basic Permissions. | ||
| All rights granted under this License are granted for the term of | ||
| copyright on the Program, and are irrevocable provided the stated | ||
| conditions are met. This License explicitly affirms your unlimited | ||
| permission to run the unmodified Program. The output from running a | ||
| covered work is covered by this License only if the output, given its | ||
| content, constitutes a covered work. This License acknowledges your | ||
| rights of fair use or other equivalent, as provided by copyright law. | ||
| You may make, run and propagate covered works that you do not | ||
| convey, without conditions so long as your license otherwise remains | ||
| in force. You may convey covered works to others for the sole purpose | ||
| of having them make modifications exclusively for you, or provide you | ||
| with facilities for running those works, provided that you comply with | ||
| the terms of this License in conveying all material for which you do | ||
| not control copyright. Those thus making or running the covered works | ||
| for you must do so exclusively on your behalf, under your direction | ||
| and control, on terms that prohibit them from making any copies of | ||
| your copyrighted material outside their relationship with you. | ||
| Conveying under any other circumstances is permitted solely under | ||
| the conditions stated below. Sublicensing is not allowed; section 10 | ||
| makes it unnecessary. | ||
| 3. Protecting Users' Legal Rights From Anti-Circumvention Law. | ||
| No covered work shall be deemed part of an effective technological | ||
| measure under any applicable law fulfilling obligations under article | ||
| 11 of the WIPO copyright treaty adopted on 20 December 1996, or | ||
| similar laws prohibiting or restricting circumvention of such | ||
| measures. | ||
| When you convey a covered work, you waive any legal power to forbid | ||
| circumvention of technological measures to the extent such circumvention | ||
| is effected by exercising rights under this License with respect to | ||
| the covered work, and you disclaim any intention to limit operation or | ||
| modification of the work as a means of enforcing, against the work's | ||
| users, your or third parties' legal rights to forbid circumvention of | ||
| technological measures. | ||
| 4. Conveying Verbatim Copies. | ||
| You may convey verbatim copies of the Program's source code as you | ||
| receive it, in any medium, provided that you conspicuously and | ||
| appropriately publish on each copy an appropriate copyright notice; | ||
| keep intact all notices stating that this License and any | ||
| non-permissive terms added in accord with section 7 apply to the code; | ||
| keep intact all notices of the absence of any warranty; and give all | ||
| recipients a copy of this License along with the Program. | ||
| You may charge any price or no price for each copy that you convey, | ||
| and you may offer support or warranty protection for a fee. | ||
| 5. Conveying Modified Source Versions. | ||
| You may convey a work based on the Program, or the modifications to | ||
| produce it from the Program, in the form of source code under the | ||
| terms of section 4, provided that you also meet all of these conditions: | ||
| a) The work must carry prominent notices stating that you modified | ||
| it, and giving a relevant date. | ||
| b) The work must carry prominent notices stating that it is | ||
| released under this License and any conditions added under section | ||
| 7. This requirement modifies the requirement in section 4 to | ||
| "keep intact all notices". | ||
| c) You must license the entire work, as a whole, under this | ||
| License to anyone who comes into possession of a copy. This | ||
| License will therefore apply, along with any applicable section 7 | ||
| additional terms, to the whole of the work, and all its parts, | ||
| regardless of how they are packaged. This License gives no | ||
| permission to license the work in any other way, but it does not | ||
| invalidate such permission if you have separately received it. | ||
| d) If the work has interactive user interfaces, each must display | ||
| Appropriate Legal Notices; however, if the Program has interactive | ||
| interfaces that do not display Appropriate Legal Notices, your | ||
| work need not make them do so. | ||
| A compilation of a covered work with other separate and independent | ||
| works, which are not by their nature extensions of the covered work, | ||
| and which are not combined with it such as to form a larger program, | ||
| in or on a volume of a storage or distribution medium, is called an | ||
| "aggregate" if the compilation and its resulting copyright are not | ||
| used to limit the access or legal rights of the compilation's users | ||
| beyond what the individual works permit. Inclusion of a covered work | ||
| in an aggregate does not cause this License to apply to the other | ||
| parts of the aggregate. | ||
| 6. Conveying Non-Source Forms. | ||
| You may convey a covered work in object code form under the terms | ||
| of sections 4 and 5, provided that you also convey the | ||
| machine-readable Corresponding Source under the terms of this License, | ||
| in one of these ways: | ||
| a) Convey the object code in, or embodied in, a physical product | ||
| (including a physical distribution medium), accompanied by the | ||
| Corresponding Source fixed on a durable physical medium | ||
| customarily used for software interchange. | ||
| b) Convey the object code in, or embodied in, a physical product | ||
| (including a physical distribution medium), accompanied by a | ||
| written offer, valid for at least three years and valid for as | ||
| long as you offer spare parts or customer support for that product | ||
| model, to give anyone who possesses the object code either (1) a | ||
| copy of the Corresponding Source for all the software in the | ||
| product that is covered by this License, on a durable physical | ||
| medium customarily used for software interchange, for a price no | ||
| more than your reasonable cost of physically performing this | ||
| conveying of source, or (2) access to copy the | ||
| Corresponding Source from a network server at no charge. | ||
| c) Convey individual copies of the object code with a copy of the | ||
| written offer to provide the Corresponding Source. This | ||
| alternative is allowed only occasionally and noncommercially, and | ||
| only if you received the object code with such an offer, in accord | ||
| with subsection 6b. | ||
| d) Convey the object code by offering access from a designated | ||
| place (gratis or for a charge), and offer equivalent access to the | ||
| Corresponding Source in the same way through the same place at no | ||
| further charge. You need not require recipients to copy the | ||
| Corresponding Source along with the object code. If the place to | ||
| copy the object code is a network server, the Corresponding Source | ||
| may be on a different server (operated by you or a third party) | ||
| that supports equivalent copying facilities, provided you maintain | ||
| clear directions next to the object code saying where to find the | ||
| Corresponding Source. Regardless of what server hosts the | ||
| Corresponding Source, you remain obligated to ensure that it is | ||
| available for as long as needed to satisfy these requirements. | ||
| e) Convey the object code using peer-to-peer transmission, provided | ||
| you inform other peers where the object code and Corresponding | ||
| Source of the work are being offered to the general public at no | ||
| charge under subsection 6d. | ||
| A separable portion of the object code, whose source code is excluded | ||
| from the Corresponding Source as a System Library, need not be | ||
| included in conveying the object code work. | ||
| A "User Product" is either (1) a "consumer product", which means any | ||
| tangible personal property which is normally used for personal, family, | ||
| or household purposes, or (2) anything designed or sold for incorporation | ||
| into a dwelling. In determining whether a product is a consumer product, | ||
| doubtful cases shall be resolved in favor of coverage. For a particular | ||
| product received by a particular user, "normally used" refers to a | ||
| typical or common use of that class of product, regardless of the status | ||
| of the particular user or of the way in which the particular user | ||
| actually uses, or expects or is expected to use, the product. A product | ||
| is a consumer product regardless of whether the product has substantial | ||
| commercial, industrial or non-consumer uses, unless such uses represent | ||
| the only significant mode of use of the product. | ||
| "Installation Information" for a User Product means any methods, | ||
| procedures, authorization keys, or other information required to install | ||
| and execute modified versions of a covered work in that User Product from | ||
| a modified version of its Corresponding Source. The information must | ||
| suffice to ensure that the continued functioning of the modified object | ||
| code is in no case prevented or interfered with solely because | ||
| modification has been made. | ||
| If you convey an object code work under this section in, or with, or | ||
| specifically for use in, a User Product, and the conveying occurs as | ||
| part of a transaction in which the right of possession and use of the | ||
| User Product is transferred to the recipient in perpetuity or for a | ||
| fixed term (regardless of how the transaction is characterized), the | ||
| Corresponding Source conveyed under this section must be accompanied | ||
| by the Installation Information. But this requirement does not apply | ||
| if neither you nor any third party retains the ability to install | ||
| modified object code on the User Product (for example, the work has | ||
| been installed in ROM). | ||
| The requirement to provide Installation Information does not include a | ||
| requirement to continue to provide support service, warranty, or updates | ||
| for a work that has been modified or installed by the recipient, or for | ||
| the User Product in which it has been modified or installed. Access to a | ||
| network may be denied when the modification itself materially and | ||
| adversely affects the operation of the network or violates the rules and | ||
| protocols for communication across the network. | ||
| Corresponding Source conveyed, and Installation Information provided, | ||
| in accord with this section must be in a format that is publicly | ||
| documented (and with an implementation available to the public in | ||
| source code form), and must require no special password or key for | ||
| unpacking, reading or copying. | ||
| 7. Additional Terms. | ||
| "Additional permissions" are terms that supplement the terms of this | ||
| License by making exceptions from one or more of its conditions. | ||
| Additional permissions that are applicable to the entire Program shall | ||
| be treated as though they were included in this License, to the extent | ||
| that they are valid under applicable law. If additional permissions | ||
| apply only to part of the Program, that part may be used separately | ||
| under those permissions, but the entire Program remains governed by | ||
| this License without regard to the additional permissions. | ||
| When you convey a copy of a covered work, you may at your option | ||
| remove any additional permissions from that copy, or from any part of | ||
| it. (Additional permissions may be written to require their own | ||
| removal in certain cases when you modify the work.) You may place | ||
| additional permissions on material, added by you to a covered work, | ||
| for which you have or can give appropriate copyright permission. | ||
| Notwithstanding any other provision of this License, for material you | ||
| add to a covered work, you may (if authorized by the copyright holders of | ||
| that material) supplement the terms of this License with terms: | ||
| a) Disclaiming warranty or limiting liability differently from the | ||
| terms of sections 15 and 16 of this License; or | ||
| b) Requiring preservation of specified reasonable legal notices or | ||
| author attributions in that material or in the Appropriate Legal | ||
| Notices displayed by works containing it; or | ||
| c) Prohibiting misrepresentation of the origin of that material, or | ||
| requiring that modified versions of such material be marked in | ||
| reasonable ways as different from the original version; or | ||
| d) Limiting the use for publicity purposes of names of licensors or | ||
| authors of the material; or | ||
| e) Declining to grant rights under trademark law for use of some | ||
| trade names, trademarks, or service marks; or | ||
| f) Requiring indemnification of licensors and authors of that | ||
| material by anyone who conveys the material (or modified versions of | ||
| it) with contractual assumptions of liability to the recipient, for | ||
| any liability that these contractual assumptions directly impose on | ||
| those licensors and authors. | ||
| All other non-permissive additional terms are considered "further | ||
| restrictions" within the meaning of section 10. If the Program as you | ||
| received it, or any part of it, contains a notice stating that it is | ||
| governed by this License along with a term that is a further | ||
| restriction, you may remove that term. If a license document contains | ||
| a further restriction but permits relicensing or conveying under this | ||
| License, you may add to a covered work material governed by the terms | ||
| of that license document, provided that the further restriction does | ||
| not survive such relicensing or conveying. | ||
| If you add terms to a covered work in accord with this section, you | ||
| must place, in the relevant source files, a statement of the | ||
| additional terms that apply to those files, or a notice indicating | ||
| where to find the applicable terms. | ||
| Additional terms, permissive or non-permissive, may be stated in the | ||
| form of a separately written license, or stated as exceptions; | ||
| the above requirements apply either way. | ||
| 8. Termination. | ||
| You may not propagate or modify a covered work except as expressly | ||
| provided under this License. Any attempt otherwise to propagate or | ||
| modify it is void, and will automatically terminate your rights under | ||
| this License (including any patent licenses granted under the third | ||
| paragraph of section 11). | ||
| However, if you cease all violation of this License, then your | ||
| license from a particular copyright holder is reinstated (a) | ||
| provisionally, unless and until the copyright holder explicitly and | ||
| finally terminates your license, and (b) permanently, if the copyright | ||
| holder fails to notify you of the violation by some reasonable means | ||
| prior to 60 days after the cessation. | ||
| Moreover, your license from a particular copyright holder is | ||
| reinstated permanently if the copyright holder notifies you of the | ||
| violation by some reasonable means, this is the first time you have | ||
| received notice of violation of this License (for any work) from that | ||
| copyright holder, and you cure the violation prior to 30 days after | ||
| your receipt of the notice. | ||
| Termination of your rights under this section does not terminate the | ||
| licenses of parties who have received copies or rights from you under | ||
| this License. If your rights have been terminated and not permanently | ||
| reinstated, you do not qualify to receive new licenses for the same | ||
| material under section 10. | ||
| 9. Acceptance Not Required for Having Copies. | ||
| You are not required to accept this License in order to receive or | ||
| run a copy of the Program. Ancillary propagation of a covered work | ||
| occurring solely as a consequence of using peer-to-peer transmission | ||
| to receive a copy likewise does not require acceptance. However, | ||
| nothing other than this License grants you permission to propagate or | ||
| modify any covered work. These actions infringe copyright if you do | ||
| not accept this License. Therefore, by modifying or propagating a | ||
| covered work, you indicate your acceptance of this License to do so. | ||
| 10. Automatic Licensing of Downstream Recipients. | ||
| Each time you convey a covered work, the recipient automatically | ||
| receives a license from the original licensors, to run, modify and | ||
| propagate that work, subject to this License. You are not responsible | ||
| for enforcing compliance by third parties with this License. | ||
| An "entity transaction" is a transaction transferring control of an | ||
| organization, or substantially all assets of one, or subdividing an | ||
| organization, or merging organizations. If propagation of a covered | ||
| work results from an entity transaction, each party to that | ||
| transaction who receives a copy of the work also receives whatever | ||
| licenses to the work the party's predecessor in interest had or could | ||
| give under the previous paragraph, plus a right to possession of the | ||
| Corresponding Source of the work from the predecessor in interest, if | ||
| the predecessor has it or can get it with reasonable efforts. | ||
| You may not impose any further restrictions on the exercise of the | ||
| rights granted or affirmed under this License. For example, you may | ||
| not impose a license fee, royalty, or other charge for exercise of | ||
| rights granted under this License, and you may not initiate litigation | ||
| (including a cross-claim or counterclaim in a lawsuit) alleging that | ||
| any patent claim is infringed by making, using, selling, offering for | ||
| sale, or importing the Program or any portion of it. | ||
| 11. Patents. | ||
| A "contributor" is a copyright holder who authorizes use under this | ||
| License of the Program or a work on which the Program is based. The | ||
| work thus licensed is called the contributor's "contributor version". | ||
| A contributor's "essential patent claims" are all patent claims | ||
| owned or controlled by the contributor, whether already acquired or | ||
| hereafter acquired, that would be infringed by some manner, permitted | ||
| by this License, of making, using, or selling its contributor version, | ||
| but do not include claims that would be infringed only as a | ||
| consequence of further modification of the contributor version. For | ||
| purposes of this definition, "control" includes the right to grant | ||
| patent sublicenses in a manner consistent with the requirements of | ||
| this License. | ||
| Each contributor grants you a non-exclusive, worldwide, royalty-free | ||
| patent license under the contributor's essential patent claims, to | ||
| make, use, sell, offer for sale, import and otherwise run, modify and | ||
| propagate the contents of its contributor version. | ||
| In the following three paragraphs, a "patent license" is any express | ||
| agreement or commitment, however denominated, not to enforce a patent | ||
| (such as an express permission to practice a patent or covenant not to | ||
| sue for patent infringement). To "grant" such a patent license to a | ||
| party means to make such an agreement or commitment not to enforce a | ||
| patent against the party. | ||
| If you convey a covered work, knowingly relying on a patent license, | ||
| and the Corresponding Source of the work is not available for anyone | ||
| to copy, free of charge and under the terms of this License, through a | ||
| publicly available network server or other readily accessible means, | ||
| then you must either (1) cause the Corresponding Source to be so | ||
| available, or (2) arrange to deprive yourself of the benefit of the | ||
| patent license for this particular work, or (3) arrange, in a manner | ||
| consistent with the requirements of this License, to extend the patent | ||
| license to downstream recipients. "Knowingly relying" means you have | ||
| actual knowledge that, but for the patent license, your conveying the | ||
| covered work in a country, or your recipient's use of the covered work | ||
| in a country, would infringe one or more identifiable patents in that | ||
| country that you have reason to believe are valid. | ||
| If, pursuant to or in connection with a single transaction or | ||
| arrangement, you convey, or propagate by procuring conveyance of, a | ||
| covered work, and grant a patent license to some of the parties | ||
| receiving the covered work authorizing them to use, propagate, modify | ||
| or convey a specific copy of the covered work, then the patent license | ||
| you grant is automatically extended to all recipients of the covered | ||
| work and works based on it. | ||
| A patent license is "discriminatory" if it does not include within | ||
| the scope of its coverage, prohibits the exercise of, or is | ||
| conditioned on the non-exercise of one or more of the rights that are | ||
| specifically granted under this License. You may not convey a covered | ||
| work if you are a party to an arrangement with a third party that is | ||
| in the business of distributing software, under which you make payment | ||
| to the third party based on the extent of your activity of conveying | ||
| the work, and under which the third party grants, to any of the | ||
| parties who would receive the covered work from you, a discriminatory | ||
| patent license (a) in connection with copies of the covered work | ||
| conveyed by you (or copies made from those copies), or (b) primarily | ||
| for and in connection with specific products or compilations that | ||
| contain the covered work, unless you entered into that arrangement, | ||
| or that patent license was granted, prior to 28 March 2007. | ||
| Nothing in this License shall be construed as excluding or limiting | ||
| any implied license or other defenses to infringement that may | ||
| otherwise be available to you under applicable patent law. | ||
| 12. No Surrender of Others' Freedom. | ||
| If conditions are imposed on you (whether by court order, agreement or | ||
| otherwise) that contradict the conditions of this License, they do not | ||
| excuse you from the conditions of this License. If you cannot convey a | ||
| covered work so as to satisfy simultaneously your obligations under this | ||
| License and any other pertinent obligations, then as a consequence you may | ||
| not convey it at all. For example, if you agree to terms that obligate you | ||
| to collect a royalty for further conveying from those to whom you convey | ||
| the Program, the only way you could satisfy both those terms and this | ||
| License would be to refrain entirely from conveying the Program. | ||
| 13. Use with the GNU Affero General Public License. | ||
| Notwithstanding any other provision of this License, you have | ||
| permission to link or combine any covered work with a work licensed | ||
| under version 3 of the GNU Affero General Public License into a single | ||
| combined work, and to convey the resulting work. The terms of this | ||
| License will continue to apply to the part which is the covered work, | ||
| but the special requirements of the GNU Affero General Public License, | ||
| section 13, concerning interaction through a network will apply to the | ||
| combination as such. | ||
| 14. Revised Versions of this License. | ||
| The Free Software Foundation may publish revised and/or new versions of | ||
| the GNU General Public License from time to time. Such new versions will | ||
| be similar in spirit to the present version, but may differ in detail to | ||
| address new problems or concerns. | ||
| Each version is given a distinguishing version number. If the | ||
| Program specifies that a certain numbered version of the GNU General | ||
| Public License "or any later version" applies to it, you have the | ||
| option of following the terms and conditions either of that numbered | ||
| version or of any later version published by the Free Software | ||
| Foundation. If the Program does not specify a version number of the | ||
| GNU General Public License, you may choose any version ever published | ||
| by the Free Software Foundation. | ||
| If the Program specifies that a proxy can decide which future | ||
| versions of the GNU General Public License can be used, that proxy's | ||
| public statement of acceptance of a version permanently authorizes you | ||
| to choose that version for the Program. | ||
| Later license versions may give you additional or different | ||
| permissions. However, no additional obligations are imposed on any | ||
| author or copyright holder as a result of your choosing to follow a | ||
| later version. | ||
| 15. Disclaimer of Warranty. | ||
| THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY | ||
| APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT | ||
| HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY | ||
| OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, | ||
| THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM | ||
| IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF | ||
| ALL NECESSARY SERVICING, REPAIR OR CORRECTION. | ||
| 16. Limitation of Liability. | ||
| IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING | ||
| WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS | ||
| THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY | ||
| GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE | ||
| USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF | ||
| DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD | ||
| PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), | ||
| EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF | ||
| SUCH DAMAGES. | ||
| 17. Interpretation of Sections 15 and 16. | ||
| If the disclaimer of warranty and limitation of liability provided | ||
| above cannot be given local legal effect according to their terms, | ||
| reviewing courts shall apply local law that most closely approximates | ||
| an absolute waiver of all civil liability in connection with the | ||
| Program, unless a warranty or assumption of liability accompanies a | ||
| copy of the Program in return for a fee. | ||
| END OF TERMS AND CONDITIONS | ||
| How to Apply These Terms to Your New Programs | ||
| If you develop a new program, and you want it to be of the greatest | ||
| possible use to the public, the best way to achieve this is to make it | ||
| free software which everyone can redistribute and change under these terms. | ||
| To do so, attach the following notices to the program. It is safest | ||
| to attach them to the start of each source file to most effectively | ||
| state the exclusion of warranty; and each file should have at least | ||
| the "copyright" line and a pointer to where the full notice is found. | ||
| <one line to give the program's name and a brief idea of what it does.> | ||
| Copyright (C) <year> <name of author> | ||
| This program is free software: you can redistribute it and/or modify | ||
| it under the terms of the GNU General Public License as published by | ||
| the Free Software Foundation, either version 3 of the License, or | ||
| (at your option) any later version. | ||
| This program is distributed in the hope that it will be useful, | ||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| GNU General Public License for more details. | ||
| You should have received a copy of the GNU General Public License | ||
| along with this program. If not, see <https://www.gnu.org/licenses/>. | ||
| Also add information on how to contact you by electronic and paper mail. | ||
| If the program does terminal interaction, make it output a short | ||
| notice like this when it starts in an interactive mode: | ||
| <program> Copyright (C) <year> <name of author> | ||
| This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. | ||
| This is free software, and you are welcome to redistribute it | ||
| under certain conditions; type `show c' for details. | ||
| The hypothetical commands `show w' and `show c' should show the appropriate | ||
| parts of the General Public License. Of course, your program's commands | ||
| might be different; for a GUI interface, you would use an "about box". | ||
| You should also get your employer (if you work as a programmer) or school, | ||
| if any, to sign a "copyright disclaimer" for the program, if necessary. | ||
| For more information on this, and how to apply and follow the GNU GPL, see | ||
| <https://www.gnu.org/licenses/>. | ||
| The GNU General Public License does not permit incorporating your program | ||
| into proprietary programs. If your program is a subroutine library, you | ||
| may consider it more useful to permit linking proprietary applications with | ||
| the library. If this is what you want to do, use the GNU Lesser General | ||
| Public License instead of this License. But first, please read | ||
| <https://www.gnu.org/licenses/why-not-lgpl.html>. |
+131
| Fast Sentence Embeddings (fse) | ||
| ================================== | ||
| Fast Sentence Embeddings is a Python library that serves as an addition to Gensim. This library is intended to compute *sentence vectors* for large collections of sentences or documents. | ||
| Features | ||
| ------------ | ||
| Find the corresponding blog post(s) here: https://medium.com/@oliverbor/fse-2b1ffa791cf9 (will be updated soon) | ||
| **fse** implements three algorithms for sentence embeddings. You can choose | ||
| between *unweighted sentence averages*, *smooth inverse frequency averages*, and *unsupervised smooth inverse frequency averages*. | ||
| Key features of **fse** are: | ||
| **[X]** Supports Average, SIF, and uSIF Embeddings | ||
| **[X]** Full support for Gensim's Word2Vec and all other compatible classes | ||
| **[X]** Full support for Gensim's FastText with out-of-vocabulary words | ||
| **[X]** Induction of word frequencies for pre-trained embeddings | ||
| **[X]** Incredibly fast Cython core routines | ||
| **[X]** Dedicated input file formats for easy usage (including disk streaming) | ||
| **[X]** Ram-to-disk training for large corpora | ||
| **[X]** Disk-to-disk training for even larger corpora | ||
| **[X]** Many fail-safe checks for easy usage | ||
| **[X]** Simple interface for developing your own models | ||
| **[X]** Extensive documentation of all functions | ||
| **[X]** 98% unittest coverage | ||
| Usage | ||
| ------------- | ||
| Within the folder notebooks you can find the following guides: | ||
| **Tutorial.ipynb** offers a detailed walk-through of some of the most important functions fse has to offer. | ||
| **STS-Benchmarks.ipynb** contains an example of how to use the library with pre-trained models to | ||
| replicate the STS Benchmark results [4] reported in the papers. | ||
| In order to use the **fse** model, you first need some pre-trained gensim | ||
| word embedding model, which is then used by **fse** to compute the sentence embeddings. | ||
| After computing sentence embeddings, you can use them in supervised or | ||
| unsupervised NLP applications, as they serve as a formidable baseline. | ||
| The models presented are based on | ||
| - Deep-averaging embeddings [1] | ||
| - Smooth inverse frequency embeddings [2] | ||
| - Unsupervised smooth inverse frequency embeddings [3] | ||
| Credits to Radim Řehůřek and all contributors for the **awesome** library | ||
| and code that Gensim provides. A whole lot of the code found in this lib is based on Gensim. | ||
| In order to use **fse** you must first estimate a Gensim model which contains a | ||
| gensim.models.keyedvectors.BaseKeyedVectors class, for example | ||
| *Word2Vec* or *Fasttext*. Then you can proceed to compute sentence embeddings | ||
| for a corpus. | ||
| from gensim.models import FastText | ||
| sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] | ||
| ft = FastText(sentences, min_count=1, size=10) | ||
| from fse.models import Average | ||
| from fse import IndexedSentence | ||
| model = Average(ft) | ||
| model.train([IndexedSentence(s, i) for i, s in enumerate(sentences)]) | ||
| model.sv.similarity(0,1) | ||
| The current version does offer multi-core support out of the box. However, for most | ||
| applications a single core will most likely suffice. | ||
| To install **fse** on Colab, check out: https://colab.research.google.com/drive/1qq9GBgEosG7YSRn7r6e02T9snJb04OEi | ||
| Installation | ||
| ------------ | ||
| This software depends on [NumPy, Scipy, Scikit-learn, Gensim, and Wordfreq]. | ||
| You must have them installed prior to installing fse. | ||
| As with gensim, it is also recommended you install a fast BLAS library | ||
| before installing fse. | ||
| The simple way to install **fse** is: | ||
| pip install --upgrade fse | ||
| In case you want to build from the source, just run: | ||
| python setup.py install | ||
| If building the Cython extension fails (you will be notified), try: | ||
| pip install git+https://github.com/oborchers/Fast_Sentence_Embeddings | ||
| Literature | ||
| ------------- | ||
| 1. Iyyer M, Manjunatha V, Boyd-Graber J, Daumé III H (2015) Deep Unordered | ||
| Composition Rivals Syntactic Methods for Text Classification. Proc. 53rd Annu. | ||
| Meet. Assoc. Comput. Linguist. 7th Int. Jt. Conf. Nat. Lang. Process., 1681–1691. | ||
| 2. Arora S, Liang Y, Ma T (2017) A Simple but Tough-to-Beat Baseline for Sentence | ||
| Embeddings. Int. Conf. Learn. Represent. (Toulon, France), 1–16. | ||
| 3. Ethayarajh K (2018) Unsupervised Random Walk Sentence Embeddings: A Strong but Simple Baseline. | ||
| Proceedings of the 3rd Workshop on Representation Learning for NLP. (Toulon, France), 91–100. | ||
| 4. Eneko Agirre, Daniel Cer, Mona Diab, Iñigo Lopez-Gazpio, Lucia Specia. Semeval-2017 Task 1: Semantic Textual Similarity Multilingual and Crosslingual Focused Evaluation. Proceedings of SemEval 2017. | ||
| Copyright | ||
| ------------- | ||
| Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| Copyright (C) 2019 Oliver Borchers |
| Metadata-Version: 1.0 | ||
| Name: fse | ||
| Version: 0.0.4 | ||
| Version: 0.1.0 | ||
| Summary: Fast Sentence Embeddings for Gensim | ||
@@ -8,4 +8,4 @@ Home-page: https://github.com/oborchers/Fast_Sentence_Embeddings | ||
| Author-email: borchers@bwl.uni-mannheim.de | ||
| License: UNKNOWN | ||
| License: GPL-3.0 | ||
| Description: UNKNOWN | ||
| Platform: UNKNOWN |
@@ -1,7 +0,6 @@ | ||
| numpy >= 1.11.3 | ||
| scipy >= 0.18.1 | ||
| six >= 1.5.0 | ||
| smart_open >= 1.5.0 | ||
| scikit-learn >= 0.19.1 | ||
| gensim >= 3.4.0 | ||
| wordfreq >= 2.2.1 | ||
| numpy>=1.11.3 | ||
| scipy>=0.18.1 | ||
| smart_open>=1.5.0 | ||
| scikit-learn>=0.19.1 | ||
| gensim>=3.4.0 | ||
| wordfreq>=2.2.1 |
@@ -0,7 +1,10 @@ | ||
| .gitattributes | ||
| .gitignore | ||
| LICENSE | ||
| README.md | ||
| setup.py | ||
| ./fse/exp/sif_variants_cy.c | ||
| ./fse/models/sentence2vec_inner.c | ||
| ./fse/models/average_inner.c | ||
| ./fse/models/average_inner.pyx | ||
| fse/__init__.py | ||
| fse/benchmark_speed.py | ||
| fse/compute_sif.py | ||
| fse/inputs.py | ||
| fse.egg-info/PKG-INFO | ||
@@ -13,5 +16,16 @@ fse.egg-info/SOURCES.txt | ||
| fse.egg-info/top_level.txt | ||
| fse/exp/__init__.py | ||
| fse/exp/sif_variants.py | ||
| fse/models/__init__.py | ||
| fse/models/sentence2vec.py | ||
| fse/models/average.py | ||
| fse/models/base_s2v.py | ||
| fse/models/sentencevectors.py | ||
| fse/models/sif.py | ||
| fse/models/usif.py | ||
| fse/models/utils.py | ||
| fse/test/__init__.py | ||
| fse/test/test_average.py | ||
| fse/test/test_base_s2v.py | ||
| fse/test/test_inputs.py | ||
| fse/test/test_sentencevectors.py | ||
| fse/test/test_sif.py | ||
| fse/test/test_usif.py | ||
| fse/test/test_utils.py |
+2
-1
@@ -1,2 +0,3 @@ | ||
| from fse import models, exp | ||
| from fse import models | ||
| from .inputs import IndexedSentence, IndexedList, IndexedLineDocument | ||
| import logging | ||
@@ -3,0 +4,0 @@ |
@@ -1,1 +0,4 @@ | ||
| from .sentence2vec import Sentence2Vec | ||
| from .average import Average | ||
| from .sif import SIF | ||
| from .usif import uSIF | ||
| from .sentencevectors import SentenceVectors |
+2
-2
| Metadata-Version: 1.0 | ||
| Name: fse | ||
| Version: 0.0.4 | ||
| Version: 0.1.0 | ||
| Summary: Fast Sentence Embeddings for Gensim | ||
@@ -8,4 +8,4 @@ Home-page: https://github.com/oborchers/Fast_Sentence_Embeddings | ||
| Author-email: borchers@bwl.uni-mannheim.de | ||
| License: UNKNOWN | ||
| License: GPL-3.0 | ||
| Description: UNKNOWN | ||
| Platform: UNKNOWN |
+0
-1
| [egg_info] | ||
| tag_build = | ||
| tag_date = 0 | ||
| tag_svn_revision = 0 | ||
+5
-9
@@ -80,3 +80,2 @@ #!/usr/bin/env python | ||
| mod_dir = os.path.join(os.path.dirname(__file__), 'fse', 'models') | ||
| dev_dir = os.path.join(os.path.dirname(__file__), 'fse', 'exp') | ||
| fse_dir = os.path.join(os.path.dirname(__file__), 'fse') | ||
@@ -88,3 +87,3 @@ | ||
| name='fse', | ||
| version='0.0.4', | ||
| version='0.1.0', | ||
| description='Fast Sentence Embeddings for Gensim', | ||
@@ -97,9 +96,8 @@ | ||
| license='GPL-3.0', | ||
| ext_modules=[ | ||
| Extension('fse.models.sentence2vec_inner', | ||
| sources=['./fse/models/sentence2vec_inner.c'], | ||
| Extension('fse.models.average_inner', | ||
| sources=['./fse/models/average_inner.pyx'], | ||
| include_dirs=[mod_dir]), | ||
| Extension('fse.exp.sif_variants_cy', | ||
| sources=['./fse/exp/sif_variants_cy.c'], | ||
| include_dirs=[dev_dir]), | ||
| ], | ||
@@ -115,3 +113,2 @@ | ||
| 'scipy >= 0.18.1', | ||
| 'six >= 1.5.0', | ||
| 'smart_open >= 1.5.0', | ||
@@ -122,4 +119,3 @@ 'scikit-learn >= 0.19.1', | ||
| ], | ||
| include_package_data=True, | ||
| ) |
| import numpy as np | ||
| from re import sub | ||
| import logging | ||
| import sys | ||
| import pathlib | ||
| np.random.seed(42) | ||
| logger = logging.getLogger(__name__) | ||
| # Import Sentence2Vec model and check if cython compiliation worked | ||
| from fse.models import Sentence2Vec | ||
| from fse.models.sentence2vec import CY_ROUTINES as CY_ROUTINES_TRAIN | ||
| # Import python sif-implementations | ||
| from fse.exp.sif_variants import sif_embeddings, \ | ||
| sif_embeddings_1, sif_embeddings_2, sif_embeddings_3, \ | ||
| sif_embeddings_4, sif_embeddings_5 | ||
| # Import cython sif-implementations | ||
| try: | ||
| from fse.exp.sif_variants_cy import sif_embeddings_6, \ | ||
| sif_embeddings_7, sif_embeddings_8 | ||
| CY_ROUTINES = 1 | ||
| except ImportError as e: | ||
| CY_ROUTINES = 0 | ||
| logger.warning("ImportError of Cython functions: %s", e) | ||
# Simple in-place normalization
def normalize_text(sentences):
    """Lowercase every token and strip non-alphabetic characters, in place."""
    for pos, sentence in enumerate(sentences):
        cleaned = [sub("[^a-zA-Z]", "", token.lower()) for token in sentence]
        sentences[pos] = cleaned
if __name__ == "__main__":
    # Benchmark script: times the progressively optimized sif_embeddings
    # variants against each other on the Brown corpus and checks that every
    # variant reproduces the verbose baseline implementation.
    import pandas as pd
    import functools
    import psutil
    import timeit
    import argparse
    from gensim.models import Word2Vec
    from nltk.corpus import brown
    from collections import OrderedDict
    from datetime import datetime
    startTime = datetime.now()
    logging.basicConfig(
        format='%(asctime)s : %(levelname)s : %(message)s',
        level=logging.WARNING
    )
    logger.warning("running %s", " ".join(sys.argv))
    logger.warning("using cython routines %s", (CY_ROUTINES & CY_ROUTINES_TRAIN))
    parser = argparse.ArgumentParser()
    parser.add_argument("-len", help="Determine the length of the set to benchmark on", type=int, default=400)
    parser.add_argument("-runs", help="Determine the number of runs used to benchmark", type=int, default=1000)
    parser.add_argument("-size", help="Set the size of the embedding", type=int, default=100)
    parser.add_argument("-excel", help="Output results as excel file", type=int, default=1)
    args = parser.parse_args()
    # Prepare the brown corpus for the benchmark (~50k sentences)
    logger.warning("loading and preparing brown corpus")
    sentences = [s for s in brown.sents()]
    normalize_text(sentences)
    # Use a simple word2vec model for estimation.
    # Training the model is not necessary for the comparison:
    # the randomly initialized wv.vectors are sufficient for timing.
    logger.warning("train word2vec model on corpus")
    model = Word2Vec(size=args.size, iter=1, workers=psutil.cpu_count(), sg=1, window=5, negative=5, min_count=5)
    model.build_vocab(sentences)
    # Precompute sif weights for the final model
    se_model = Sentence2Vec(model, alpha=1e-3, components=0)
    # Precompute the sif weights and sif-weighted vectors for the benchmark of some python functions
    model.wv.sif = se_model._precompute_sif_weights(model.wv, alpha=1e-3)
    model.wv.sif_vectors = (model.wv.vectors * model.wv.sif[:, None]).astype(np.float32)
    # Precompute the word-indices list for the sentences (only for comparison)
    sentences_idx = [np.asarray([int(model.wv.vocab[w].index) for w in s if w in model.wv.vocab], dtype=np.intc) for s in sentences]
    # Use reduced size dataset
    data = sentences[:args.len]
    data_idx = sentences_idx[:args.len]
    results = OrderedDict()
    # The first verbose implementation is our reference implementation.
    # All subsequent computations must be allclose to the baseline.
    baseline = None
    emb_dta = [(sif_embeddings, data),
               (sif_embeddings_1, data),
               (sif_embeddings_2, data),
               (sif_embeddings_3, data),
               (sif_embeddings_4, data),
               (sif_embeddings_5, data_idx)]
    if CY_ROUTINES:
        emb_dta = emb_dta + [
            (sif_embeddings_6, data_idx),
            (sif_embeddings_7, data_idx),
            (sif_embeddings_8, data_idx),
            (se_model.train, data)
        ]
    # NOTE(review): the timing of the `train` routine is not directly
    # comparable to the raw functions, as it additionally performs memory
    # estimation and logging inside the timed call.
    for i, tup in enumerate(emb_dta):
        func = tup[0]  # Function to evaluate
        dta = tup[1]   # Data to perform evaluation on
        # BUGFIX: was `i is not 0` -- identity comparison against an int
        # literal (implementation-dependent, SyntaxWarning on CPython >= 3.8);
        # use value inequality instead.
        n = args.runs if i != 0 else 1  # Limit the first loop. Otherwise it takes ages.
        t = timeit.Timer(functools.partial(func, sentences=dta, model=model))
        time = np.min(t.repeat(number=n))
        logger.warning("compute embeddings with function: %s takes %2.6f sec", func.__name__, time / n)
        results[str(func.__name__)] = float(time) / n
        if i == 0:
            # Set the baseline up
            baseline = func(dta, model)
        else:
            # Test that all implementations are close to the baseline
            assert np.allclose(baseline, func(sentences=dta, model=model), atol=1e-6)
    # Compute result & store
    df = pd.DataFrame(results, columns=results.keys(), index=["Time(s)"]).T
    values = df["Time(s)"].values
    df["Gain"] = [1] + [values[i-1]/values[i] for i in range(1, len(values))]
    print("--- Results ---")
    print(df)
    now = datetime.now()
    date_time = now.strftime("%m-%d-%Y_%H-%M-%S")
    if args.excel:
        p = pathlib.Path("excel")
        p.mkdir(exist_ok=True)
        df.to_excel("excel/results_"+date_time+".xlsx")
    logger.info("TOTAL RUNTIME: %s", str(now - startTime))
| import numpy as np | ||
| from re import sub | ||
| import logging | ||
| import os | ||
| import sys | ||
| import pathlib | ||
| from fse.models import Sentence2Vec | ||
| from fse.models.sentence2vec import CY_ROUTINES | ||
| from gensim.models.word2vec import Word2Vec, LineSentence | ||
| np.random.seed(42) | ||
| logger = logging.getLogger(__name__) | ||
# Simple in-place normalization
def normalize_text(sentences):
    """Reduce each token to its lowercase alphabetic characters, mutating `sentences`."""
    pattern = "[^a-zA-Z]"
    for idx in range(len(sentences)):
        sentences[idx] = [sub(pattern, "", tok.lower()) for tok in sentences[idx]]
if __name__ == "__main__":
    # Training script: fits a Word2Vec model (on a file or the Brown corpus),
    # computes SIF sentence embeddings, and optionally saves both to disk.
    import timeit
    import psutil
    import argparse
    from gensim.models import Word2Vec
    from nltk.corpus import brown
    from datetime import datetime
    startTime = datetime.now()
    logging.basicConfig(
        format='%(asctime)s : %(levelname)s : %(message)s',
        level=logging.INFO
    )
    logger.info("running %s", " ".join(sys.argv))
    logger.info("using cython routines %s", CY_ROUTINES)

    def _str2bool(value):
        # BUGFIX: argparse's `type=bool` treats ANY non-empty string as True,
        # so `-save False` still saved the model. Parse common falsy spellings.
        return str(value).strip().lower() not in ("false", "0", "no", "")

    parser = argparse.ArgumentParser()
    # Training Parameters
    parser.add_argument("-train", help="Use text data from file TRAIN to train the model")
    parser.add_argument("-save", help="Set to True to save model", type=_str2bool, default=True)
    parser.add_argument("-window", help="Set max skip length WINDOW between words (default: 5)", type=int, default=5)
    parser.add_argument("-size", help="Set size of word vectors (default: 100)", type=int, default=100)
    parser.add_argument("-sample", help="Set subsampling threshold (default: 1e-4)", type=float, default=1e-4)
    parser.add_argument("-negative", help="Number of negative samples (default: 5)", type=int, default=5)
    parser.add_argument("-threads", help="Use THREADS threads (default: 4)", type=int, default=4)
    parser.add_argument("-iter", help="Run more training iterations (default: 5)", type=int, default=5)
    parser.add_argument("-min_count", help="This will discard words that appear less than MIN_COUNT times (default: 5)", type=int, default=5)
    parser.add_argument("-alpha", help="Set SIF alpha for weighted sum to ALPHA (default: 1e-3)", type=float, default=1e-3)
    parser.add_argument("-pc", help="Set number of removed componented to PC (default: 1)", type=int, default=1)
    args = parser.parse_args()

    if args.train:
        # Stream sentences from the provided file instead of loading Brown.
        sentences = LineSentence(args.train)
    else:
        logger.info("loading and preparing brown corpus")
        sentences = [s for s in brown.sents()]
        normalize_text(sentences)

    logger.info("train word2vec model on corpus")
    model = Word2Vec(
        sentences, size=args.size, min_count=args.min_count, workers=args.threads,
        window=args.window, sample=args.sample, sg=1, hs=0,
        negative=args.negative, cbow_mean=0, iter=args.iter
    )

    sif_model = Sentence2Vec(model, alpha=args.alpha, components=args.pc)
    sif_emb = sif_model.train(sentences)
    sif_model.normalize(sif_emb)

    # BUGFIX: `now` was previously only assigned inside the `if args.save:`
    # branch but used in the final runtime log, raising NameError whenever
    # saving was disabled. Compute it unconditionally.
    now = datetime.now()
    if args.save:
        date_time = now.strftime("%m-%d-%Y_%H-%M-%S")
        p = pathlib.Path("model_data")
        p.mkdir(exist_ok=True)
        out_model = "model_data/model_"+date_time
        model.save(out_model + '.model')
        out_emb = "model_data/sif_"+date_time
        np.save(out_emb, sif_emb)
        logger.info("saved %s", out_emb)
    logger.info("TOTAL RUNTIME: %s", str(now - startTime))
Sorry, the diff of this file is too big to display
| import numpy as np | ||
| # Define data types for use in cython | ||
| REAL = np.float32 | ||
| INT = np.intc | ||
def sif_embeddings(sentences, model, alpha=1e-3):
    """Compute the SIF embeddings for a list of sentences.

    Verbose reference implementation: it deliberately loops over every vector
    dimension so that the optimized variants below have a baseline to be
    compared (and numerically checked) against.

    Parameters
    ----------
    sentences : list
        The sentences to compute the embeddings for
    model : `~gensim.models.base_any2vec.BaseAny2VecModel`
        A gensim model that contains the word vectors and the vocabulary
    alpha : float, optional
        Parameter which is used to weigh each individual word based on its probability p(w).

    Returns
    -------
    numpy.ndarray
        SIF sentence embedding matrix of dim len(sentences) * dimension
    """
    vocab = model.wv.vocab        # Access to word index and count
    word_vectors = model.wv       # Access to word vectors
    dim = model.vector_size       # Embedding size

    # Normalization constant: total corpus frequency mass.
    norm_const = 0
    for word in vocab:
        norm_const += vocab[word].count

    rows = []
    for sentence in sentences:
        in_vocab = 0
        summed = np.zeros(dim, dtype=REAL)   # Summary vector
        for word in sentence:
            # Only words present in the vocabulary contribute.
            if word not in vocab:
                continue
            weight = alpha / (alpha + (vocab[word].count / norm_const))
            for d in range(dim):
                summed[d] += weight * word_vectors[word][d]
            in_vocab += 1
        if in_vocab > 0:
            for d in range(dim):
                summed[d] *= 1 / in_vocab
        rows.append(summed)
    return np.vstack(rows).astype(REAL)
def sif_embeddings_1(sentences, model, alpha=1e-3):
    """Variant 1: drop the unnecessary per-dimension loop in the vector summation."""
    vocab = model.wv.vocab
    word_vectors = model.wv
    dim = model.vector_size

    # Normalization constant Z: total corpus frequency mass.
    norm_const = sum(entry.count for entry in vocab.values())

    rows = []
    for sentence in sentences:
        summed = np.zeros(dim, dtype=REAL)
        in_vocab = 0
        for word in sentence:
            if word in vocab:
                # One vectorized elementwise add replaces the dimension loop.
                summed += (alpha / (alpha + (vocab[word].count / norm_const))) * word_vectors[word]
                in_vocab += 1
        if in_vocab > 0:
            summed *= 1 / in_vocab
        rows.append(summed)
    return np.vstack(rows).astype(REAL)
def sif_embeddings_2(sentences, model, alpha=1e-3):
    """Variant 2: look up the pre-computed SIF weights (model.wv.sif) instead of recomputing them."""
    vocab = model.wv.vocab
    word_vectors = model.wv
    dim = model.vector_size
    rows = []
    for sentence in sentences:
        summed = np.zeros(dim, dtype=REAL)
        in_vocab = 0
        for word in sentence:
            if word in vocab:
                summed += word_vectors.sif[vocab[word].index] * word_vectors[word]
                in_vocab += 1
        if in_vocab > 0:
            summed *= 1 / in_vocab
        rows.append(summed)
    return np.vstack(rows).astype(REAL)
def sif_embeddings_3(sentences, model, alpha=1e-3):
    """Variant 3: gather the word indices per sentence first, then use numpy
    fancy indexing for the weighted sum."""
    vocab = model.wv.vocab
    word_vectors = model.wv
    rows = []
    for sentence in sentences:
        indices = [vocab[word].index for word in sentence if word in vocab]
        summed = np.sum(word_vectors.vectors[indices] * word_vectors.sif[indices][:, None], axis=0)
        if indices:
            summed *= 1 / len(indices)
        rows.append(summed)
    return np.vstack(rows).astype(REAL)
def sif_embeddings_4(sentences, model):
    """Variant 4: read from the pre-computed sif_vectors matrix, where the
    SIF weights are already folded into the word vectors."""
    vocab = model.wv.vocab
    weighted = model.wv.sif_vectors
    rows = []
    for sentence in sentences:
        indices = [vocab[word].index for word in sentence if word in vocab]
        summed = np.sum(weighted[indices], axis=0)
        if indices:
            summed *= 1 / len(indices)
        rows.append(summed)
    return np.vstack(rows).astype(REAL)
def sif_embeddings_5(sentences, model):
    """Variant 5: sentences arrive as pre-computed index arrays, skipping
    string lookups altogether."""
    weighted = model.wv.sif_vectors
    out = np.zeros(shape=(len(sentences), model.vector_size), dtype=REAL)
    for row, indices in enumerate(sentences):
        scale = (1 / len(indices)) if len(indices) > 0 else 1
        out[row] = np.sum(weighted[indices], axis=0) * scale
    return out.astype(REAL)
Sorry, the diff of this file is too big to display
| #!/usr/bin/env python | ||
| # -*- coding: utf-8 -*- | ||
| # Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| # Copyright (C) 2019 Oliver Borchers | ||
| from gensim.models.base_any2vec import BaseWordEmbeddingsModel | ||
| from gensim.models.keyedvectors import BaseKeyedVectors | ||
| from gensim.matutils import unitvec | ||
| from sklearn.decomposition import TruncatedSVD | ||
| from wordfreq import get_frequency_dict | ||
| from six.moves import xrange | ||
| import logging | ||
| import warnings | ||
| import psutil | ||
| logger = logging.getLogger(__name__) | ||
| from numpy import float32 as REAL, sum as np_sum, vstack, zeros, ones,\ | ||
| dtype, sqrt, newaxis, empty, full | ||
| EPS = 1e-8 | ||
| try: | ||
| # Import cython functions | ||
| CY_ROUTINES = 1 | ||
| from fse.models.sentence2vec_inner import s2v_train | ||
| except ImportError as e: | ||
| CY_ROUTINES = 0 | ||
| warnings.warn("ImportError of Cython functions: Training will be slow. Install C-Compiler and re-compile.") | ||
| def s2v_train(sentences, len_sentences, wv, weights): | ||
| """Train sentence embedding on a list of sentences | ||
| Called internally from :meth:`~fse.models.sentence2vec.Sentence2Vec.train`. | ||
| Parameters | ||
| ---------- | ||
| sentences : iterable of list of str | ||
| The corpus used to train the model. | ||
| len_sentences : int | ||
| Length of the sentence iterable | ||
| wv : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` | ||
| The BaseKeyedVectors instance containing the vectors used for training | ||
| weights : np.ndarray | ||
| Weights used in the summation of the vectors | ||
| Returns | ||
| ------- | ||
| np.ndarray | ||
| The sentence embedding matrix of dim len(sentences) * vector_size | ||
| int | ||
| Number of words in the vocabulary actually used for training. | ||
| int | ||
| Number of sentences used for training. | ||
| """ | ||
| size = wv.vector_size | ||
| vlookup = wv.vocab | ||
| vectors = wv.vectors | ||
| w_trans = weights[:, None] | ||
| output = empty((len_sentences, size), dtype=REAL) | ||
| for i in range(len_sentences): | ||
| output[i] = full(size, EPS, dtype=REAL) | ||
| effective_words = 0 | ||
| effective_sentences = 0 | ||
| for i, s in enumerate(sentences): | ||
| sentence_idx = [vlookup[w].index for w in s if w in vlookup] | ||
| if len(sentence_idx): | ||
| v = np_sum(vectors[sentence_idx] * w_trans[sentence_idx], axis=0) | ||
| effective_words += len(sentence_idx) | ||
| effective_sentences += 1 | ||
| v *= 1/len(sentence_idx) | ||
| output[i] = v | ||
| return output.astype(REAL), effective_words, effective_sentences | ||
class Sentence2Vec():
    """Compute smooth inverse frequency weighted or averaged sentence embeddings.

    This implementation is based on the 2017 ICLR paper (https://openreview.net/pdf?id=SyK00v5xx):
    Arora S, Liang Y, Ma T (2017) A Simple but Tough-to-Beat Baseline for Sentence Embeddings.
    Int. Conf. Learn. Represent. (Toulon, France), 1-16.
    All core routines are optimized based on the Gensim routines (https://github.com/RaRe-Technologies/gensim)

    Attributes
    ----------
    model : :class:`~gensim.models.keyedvectors.BaseKeyedVectors`
        This object essentially contains the mapping between words and embeddings. To compute
        the sentence embeddings the wv.vocab and wv.vectors elements are required.
    sif_weights : numpy.ndarray
        Contains the pre-computed SIF weights, one entry per vocabulary word.
    """

    def __init__(self, model, alpha=1e-3, components=1, no_frequency=False, lang="en"):
        """
        Parameters
        ----------
        model : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
            This object essentially contains the mapping between words and embeddings. To compute
            the sentence embeddings the wv.vocab and wv.vectors elements are required.
        alpha : float, optional
            Parameter which is used to weigh each individual word based on its probability p(w).
            If alpha = 1, train simply computes the averaged sentence representation.
        components : int, optional
            Number of principal components to remove from the sentence embeddings. Independent of alpha.
        no_frequency : bool, optional
            Some pre-trained embeddings, i.e. "GoogleNews-vectors-negative300.bin", do not contain
            information about the frequency of a word. As the frequency is required for estimating
            the weights, no_frequency induces the frequency into the wv.vocab.count class based
            on :class:`~wordfreq`
        lang : str, optional
            If no frequency information is available, you can choose the language to estimate
            the frequency. See https://github.com/LuminosoInsight/wordfreq

        Examples
        --------
        Initialize and train a :class:`~fse.models.sentence2vec.Sentence2Vec` model

        >>> from gensim.models import Word2Vec
        >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
        >>> model = Word2Vec(sentences, min_count=1)
        >>> from fse.models import Sentence2Vec
        >>> se = Sentence2Vec(model)
        >>> sentences_emb = se.train(sentences)
        """
        if isinstance(model, BaseWordEmbeddingsModel):
            self.model = model.wv
        elif isinstance(model, BaseKeyedVectors):
            self.model = model
        else:
            raise RuntimeError("Model must be child of BaseWordEmbeddingsModel or BaseKeyedVectors.")

        if not hasattr(self.model, 'vectors'):
            raise RuntimeError("Parameters required for predicting sentence embeddings not found.")

        # BUGFIX: this check previously read `alpha >= 0 & components >= 0`.
        # Since `&` binds tighter than `>=`, that parsed as the chained
        # comparison `alpha >= (0 & components) >= 0` and never actually
        # validated `components`. (NOTE(review): a ValueError would survive
        # `python -O`; the assert is kept to preserve the exception type.)
        assert alpha >= 0 and components >= 0

        self.alpha = float(alpha)
        self.components = int(components)
        self.no_frequency = bool(no_frequency)
        self.lang = str(lang)

        self.sif_weights = self._precompute_sif_weights(self.model, self.alpha, no_frequency, lang)

    def _compute_principal_component(self, vectors, npc=1):
        """Compute the n principal components for the sentence embeddings

        Notes
        -----
        Adapted from https://github.com/PrincetonML/SIF/blob/master/src/SIF_embedding.py

        Parameters
        ----------
        vectors : numpy.ndarray
            The sentence embedding matrix of dim len(sentences) * vector_size.
        npc : int, optional
            The number of principal components to be computed. Default : 1.

        Returns
        -------
        numpy.ndarray
            The principal components as computed by the TruncatedSVD
        """
        logger.info("computing %d principal components", npc)
        svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0, algorithm="randomized")
        svd.fit(vectors)
        return svd.components_

    def _remove_principal_component(self, vectors, npc=1):
        """Remove the projection onto the principal components from the sentence embeddings

        Notes
        -----
        Adapted from https://github.com/PrincetonML/SIF/blob/master/src/SIF_embedding.py

        Parameters
        ----------
        vectors : numpy.ndarray
            The sentence embedding matrix of dim len(sentences) * vector_size.
        npc : int, optional
            The number of principal components to be removed. Default : 1.

        Returns
        -------
        numpy.ndarray
            The sentence embedding matrix of dim len(sentences) * vector_size
            after removing the projection
        """
        pc = self._compute_principal_component(vectors, npc)
        logger.info("removing %d principal components", npc)
        if npc == 1:
            # Single component: broadcastable elementwise form is cheaper
            # than the full matrix product.
            vectors_rpc = vectors - vectors.dot(pc.transpose()) * pc
        else:
            vectors_rpc = vectors - vectors.dot(pc.transpose()).dot(pc)
        return vectors_rpc

    def _precompute_sif_weights(self, wv, alpha=1e-3, no_frequency=False, lang="en"):
        """Precompute the weights used in the vector summation

        Parameters
        ----------
        wv : `~gensim.models.keyedvectors.BaseKeyedVectors`
            A gensim keyedvectors child that contains the word vectors and the vocabulary
        alpha : float, optional
            Parameter which is used to weigh each individual word based on its probability p(w).
            If alpha = 0, the model computes the average sentence embedding. Common values range
            from 1e-5 to 1e-1. For more information, see the original paper.
        no_frequency : bool, optional
            Use the commonly available frequency table if the Gensim model does not contain
            information about the frequency of the words (see model.wv.vocab.count).
        lang : str, optional
            Determines the language of the frequency table used to compute the weights.

        Returns
        -------
        numpy.ndarray
            The vector of weights for all words in the model vocabulary
        """
        logger.info("pre-computing SIF weights")

        if no_frequency:
            logger.info("no frequency mode: using wordfreq for estimation (lang=%s)", lang)
            freq_dict = get_frequency_dict(str(lang), wordlist='best')
            for w in wv.index2word:
                if w in freq_dict:
                    # Scale the relative frequency to a pseudo-count.
                    wv.vocab[w].count = int(freq_dict[w] * (2**31 - 1))
                else:
                    wv.vocab[w].count = 1

        if alpha > 0:
            corpus_size = 0
            # REAL (float32) dtype is required by the cython estimation routine.
            sif = zeros(shape=len(wv.vocab), dtype=REAL)
            for k in wv.index2word:
                # Compute normalization constant
                corpus_size += wv.vocab[k].count
            for idx, k in enumerate(wv.index2word):
                pw = wv.vocab[k].count / corpus_size
                sif[idx] = alpha / (alpha + pw)
        else:
            # alpha == 0 degenerates to the plain average: all weights are 1.
            sif = ones(shape=len(wv.vocab), dtype=REAL)
        return sif

    def _estimate_memory(self, len_sentences, vocab_size, vector_size):
        """Estimate the size of the embedding in memory

        Notes
        -----
        Directly adapted from gensim

        Parameters
        ----------
        len_sentences : int
            Length of the sentences iterable
        vocab_size : int
            Size of the vocabulary
        vector_size : int
            Vector size of the sentence embedding

        Returns
        -------
        dict
            Dictionary of estimated sizes in bytes
        """
        report = {}
        report["sif_weights"] = vocab_size * dtype(REAL).itemsize
        report["sentence_vectors"] = len_sentences * vector_size * dtype(REAL).itemsize
        report["total"] = sum(report.values())
        mb_size = int(report["sentence_vectors"] / 1024**2)
        logger.info(
            "estimated required memory for %i sentences and %i dimensions: %i MB (%i GB)",
            len_sentences,
            vector_size,
            mb_size,
            int(mb_size / 1024)
        )
        # psutil.virtual_memory()[1] is the 'available' field.
        if report["total"] >= 0.95 * psutil.virtual_memory()[1]:
            warnings.warn("Sentence2Vec: The sentence embeddings will likely not fit into RAM.")
        return report

    def normalize(self, sentence_matrix, inplace=True):
        """Normalize the sentence_matrix rows to unit length

        Notes
        -----
        Directly adapted from gensim

        Parameters
        ----------
        sentence_matrix : numpy.ndarray
            The sentence embedding matrix of dim len(sentences) * vector_size
        inplace : bool, optional
            If True, mutate `sentence_matrix` directly and return None.
            If False, leave the input untouched and return a normalized copy.

        Returns
        -------
        numpy.ndarray or None
            The normalized matrix when inplace=False; None otherwise.
        """
        logger.info("computing L2-norms of sentence embeddings")
        # NOTE(review): rows with an exactly zero norm would produce nan here;
        # the python fallback s2v_train seeds rows with EPS, which avoids
        # that for its output -- confirm for other inputs.
        if inplace:
            for i in xrange(len(sentence_matrix)):
                sentence_matrix[i, :] /= sqrt((sentence_matrix[i, :] ** 2).sum(-1))
        else:
            output = (sentence_matrix / sqrt((sentence_matrix ** 2).sum(-1))[..., newaxis]).astype(REAL)
            return output

    def train(self, sentences, **kwargs):
        """Train the model on sentences

        Parameters
        ----------
        sentences : iterable of list of str
            The `sentences` iterable can be simply a list of lists of tokens, but for larger
            corpora, consider an iterable that streams the sentences directly from disk/network.

        Returns
        -------
        numpy.ndarray
            The sentence embedding matrix of dim len(sentences) * vector_size
        """
        if sentences is None:
            raise RuntimeError("Provide sentences object")

        len_sentences = 0
        if not hasattr(sentences, '__len__'):
            # NOTE(review): counting consumes one-shot generators, so s2v_train
            # below would then iterate an exhausted iterator. This only works
            # for restartable iterables (e.g. LineSentence) -- confirm callers.
            len_sentences = sum(1 for _ in sentences)
        else:
            len_sentences = len(sentences)
        if len_sentences == 0:
            raise RuntimeError("Sentences must be non-empty")

        self._estimate_memory(len_sentences, len(self.model.vocab), self.model.vector_size)

        output, no_words, no_sents = s2v_train(sentences, len_sentences, self.model, self.sif_weights)
        logger.info("finished computing sentence embeddings of %i effective sentences with %i effective words", no_sents, no_words)

        if self.components > 0:
            output = self._remove_principal_component(output, self.components)
        return output
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
32
77.78%2763
302.18%630938
-51.36%