fse
Advanced tools
| Metadata-Version: 1.0 | ||
| Name: fse | ||
| Version: 0.1.15 | ||
| Version: 0.1.17 | ||
| Summary: Fast Sentence Embeddings for Gensim | ||
| Home-page: https://github.com/oborchers/Fast_Sentence_Embeddings | ||
| Home-page: UNKNOWN | ||
| Author: Oliver Borchers | ||
| Author-email: borchers@bwl.uni-mannheim.de | ||
| License: GPL-3.0 | ||
| Author-email: o.borchers@oxolo.com | ||
| License: UNKNOWN | ||
| Description: UNKNOWN | ||
| Platform: UNKNOWN |
@@ -5,4 +5,4 @@ numpy>=1.11.3 | ||
| scikit-learn>=0.19.1 | ||
| gensim>=3.8.0 | ||
| gensim<4 | ||
| wordfreq>=2.2.1 | ||
| psutil |
@@ -1,9 +0,4 @@ | ||
| .gitattributes | ||
| .gitignore | ||
| LICENSE | ||
| MANIFEST.in | ||
| README.md | ||
| setup.py | ||
| ./fse/models/average_inner.c | ||
| ./fse/models/average_inner.pyx | ||
| fse/__init__.py | ||
@@ -36,3 +31,2 @@ fse/inputs.py | ||
| fse/test/test_utils.py | ||
| fse/test/test_data/.DS_Store | ||
| fse/test/test_data/test_sentences.txt |
+14
-10
@@ -0,13 +1,16 @@ | ||
| import logging | ||
| from fse import models | ||
| from .inputs import BaseIndexedList | ||
| from .inputs import IndexedList | ||
| from .inputs import CIndexedList | ||
| from .inputs import SplitIndexedList | ||
| from .inputs import SplitCIndexedList | ||
| from .inputs import CSplitIndexedList | ||
| from .inputs import CSplitCIndexedList | ||
| from .inputs import IndexedLineDocument | ||
| from .inputs import ( | ||
| BaseIndexedList, | ||
| CIndexedList, | ||
| CSplitCIndexedList, | ||
| CSplitIndexedList, | ||
| IndexedLineDocument, | ||
| IndexedList, | ||
| SplitCIndexedList, | ||
| SplitIndexedList, | ||
| ) | ||
| import logging | ||
@@ -18,4 +21,5 @@ class NullHandler(logging.Handler): | ||
| logger = logging.getLogger('fse') | ||
| logger = logging.getLogger("fse") | ||
| if len(logger.handlers) == 0: # To ensure reload() doesn't add another one | ||
| logger.addHandler(NullHandler()) |
+119
-118
| #!/usr/bin/env python | ||
| # -*- coding: utf-8 -*- | ||
| # Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| # Copyright (C) 2019 Oliver Borchers | ||
| # Author: Oliver Borchers | ||
| # Copyright (C) Oliver Borchers Oliver Borchers | ||
| from typing import NamedTuple, List, MutableSequence | ||
| from pathlib import Path | ||
| from typing import List, MutableSequence, Union | ||
| from gensim.utils import any2unicode | ||
| from numpy import concatenate, ndarray | ||
| from smart_open import open | ||
| from pathlib import Path | ||
| from numpy import ndarray, concatenate | ||
| class BaseIndexedList(MutableSequence): | ||
| def __init__(self, *args: List[Union[list, set, ndarray]]): | ||
| """Base object to be used for feeding in-memory stored lists of sentences to the | ||
| training routine. | ||
| def __init__(self, *args:[list, set, ndarray]): | ||
| """ Base object to be used for feeding in-memory stored lists of sentences to | ||
| the training routine. | ||
| Parameters | ||
@@ -26,7 +24,6 @@ ---------- | ||
| Arguments to be merged into a single contianer. Can be single or multiple list/set/ndarray objects. | ||
| """ | ||
| self.items = list() | ||
| if len(args) == 1: | ||
@@ -41,4 +38,4 @@ self._check_list_type(args[0]) | ||
| def _check_list_type(self, obj:object): | ||
| """ Checks input validity """ | ||
| def _check_list_type(self, obj: object): | ||
| """Checks input validity.""" | ||
| if isinstance(obj, (list, set, ndarray)): | ||
@@ -49,4 +46,4 @@ return 1 | ||
| def _check_str_type(self, obj:object): | ||
| """ Checks input validity """ | ||
| def _check_str_type(self, obj: object): | ||
| """Checks input validity.""" | ||
| if isinstance(obj, str): | ||
@@ -58,4 +55,4 @@ return 1 | ||
| def __len__(self): | ||
| """ List length | ||
| """List length. | ||
| Returns | ||
@@ -75,9 +72,8 @@ ------- | ||
| Human readable representation of the object's state (words and tags). | ||
| """ | ||
| return str(self.items) | ||
| def __getitem__(self, i:int) -> tuple: | ||
| """ Getitem method | ||
| def __getitem__(self, i: int) -> tuple: | ||
| """Getitem method. | ||
| Returns | ||
@@ -90,23 +86,23 @@ ------- | ||
| def __delitem__(self, i:int): | ||
| """ Delete an item """ | ||
| def __delitem__(self, i: int): | ||
| """Delete an item.""" | ||
| del self.items[i] | ||
| def __setitem__(self, i:int, item:str): | ||
| """ Sets an item """ | ||
| def __setitem__(self, i: int, item: str): | ||
| """Sets an item.""" | ||
| self._check_str_type(item) | ||
| self.items[i] = item | ||
| def insert(self, i:int, item:str): | ||
| """ Inserts an item at a position """ | ||
| def insert(self, i: int, item: str): | ||
| """Inserts an item at a position.""" | ||
| self._check_str_type(item) | ||
| self.items.insert(i, item) | ||
| def append(self, item:str): | ||
| """ Appends item at last position""" | ||
| def append(self, item: str): | ||
| """Appends item at last position.""" | ||
| self._check_str_type(item) | ||
| self.insert(len(self.items), item) | ||
| def extend(self, arg:[list, set, ndarray]): | ||
| """ Extens list """ | ||
| def extend(self, arg: [list, set, ndarray]): | ||
| """Extens list.""" | ||
| self._check_list_type(arg) | ||
@@ -119,8 +115,8 @@ | ||
| class IndexedList(BaseIndexedList): | ||
| def __init__(self, *args: [list, set, ndarray]): | ||
| """Quasi-list to be used for feeding in-memory stored lists of sentences to the | ||
| training routine. | ||
| def __init__(self, *args:[list, set, ndarray]): | ||
| """ Quasi-list to be used for feeding in-memory stored lists of sentences to | ||
| the training routine. | ||
| Parameters | ||
@@ -130,9 +126,8 @@ ---------- | ||
| Arguments to be merged into a single contianer. Can be single or multiple list/set objects. | ||
| """ | ||
| super(IndexedList, self).__init__(*args) | ||
| def __getitem__(self, i:int) -> tuple: | ||
| """ Getitem method | ||
| def __getitem__(self, i: int) -> tuple: | ||
| """Getitem method. | ||
| Returns | ||
@@ -145,8 +140,8 @@ ------- | ||
| class CIndexedList(BaseIndexedList): | ||
| def __init__(self, *args: [list, set, ndarray], custom_index: [list, ndarray]): | ||
| """Quasi-list with custom indices to be used for feeding in-memory stored lists | ||
| of sentences to the training routine. | ||
| def __init__(self, *args:[list, set, ndarray], custom_index:[list, ndarray]): | ||
| """ Quasi-list with custom indices to be used for feeding in-memory stored lists of sentences to | ||
| the training routine. | ||
| Parameters | ||
@@ -158,3 +153,2 @@ ---------- | ||
| Custom index to support many to one mappings. | ||
| """ | ||
@@ -166,7 +160,9 @@ self.custom_index = custom_index | ||
| if len(self.items) != len(self.custom_index): | ||
| raise RuntimeError(f"Size of custom_index {len(custom_index)} does not match items {len(self.items)}") | ||
| raise RuntimeError( | ||
| f"Size of custom_index {len(custom_index)} does not match items {len(self.items)}" | ||
| ) | ||
| def __getitem__(self, i:int) -> tuple: | ||
| """ Getitem method | ||
| def __getitem__(self, i: int) -> tuple: | ||
| """Getitem method. | ||
| Returns | ||
@@ -179,23 +175,23 @@ ------- | ||
| def __delitem__(self, i:int): | ||
| def __delitem__(self, i: int): | ||
| raise NotImplementedError("Method currently not supported") | ||
| def __setitem__(self, i:int, item:str): | ||
| def __setitem__(self, i: int, item: str): | ||
| raise NotImplementedError("Method currently not supported") | ||
| def insert(self, i:int, item:str): | ||
| def insert(self, i: int, item: str): | ||
| raise NotImplementedError("Method currently not supported") | ||
| def append(self, item:str): | ||
| def append(self, item: str): | ||
| raise NotImplementedError("Method currently not supported") | ||
| def extend(self, arg:[list, set, ndarray]): | ||
| def extend(self, arg: [list, set, ndarray]): | ||
| raise NotImplementedError("Method currently not supported") | ||
| class SplitIndexedList(BaseIndexedList): | ||
| def __init__(self, *args: [list, set, ndarray]): | ||
| """Quasi-list with string splitting to be used for feeding in-memory stored | ||
| lists of sentences to the training routine. | ||
| def __init__(self, *args:[list, set, ndarray]): | ||
| """ Quasi-list with string splitting to be used for feeding in-memory stored lists of sentences to | ||
| the training routine. | ||
| Parameters | ||
@@ -205,9 +201,8 @@ ---------- | ||
| Arguments to be merged into a single contianer. Can be single or multiple list/set objects. | ||
| """ | ||
| super(SplitIndexedList, self).__init__(*args) | ||
| def __getitem__(self, i:int) -> tuple: | ||
| """ Getitem method | ||
| def __getitem__(self, i: int) -> tuple: | ||
| """Getitem method. | ||
| Returns | ||
@@ -220,8 +215,8 @@ ------- | ||
| class SplitCIndexedList(BaseIndexedList): | ||
| def __init__(self, *args: [list, set, ndarray], custom_index: [list, ndarray]): | ||
| """Quasi-list with custom indices and string splitting to be used for feeding | ||
| in-memory stored lists of sentences to the training routine. | ||
| def __init__(self, *args:[list, set, ndarray], custom_index:[list, ndarray]): | ||
| """ Quasi-list with custom indices and string splitting to be used for feeding in-memory stored lists of sentences to | ||
| the training routine. | ||
| Parameters | ||
@@ -233,3 +228,2 @@ ---------- | ||
| Custom index to support many to one mappings. | ||
| """ | ||
@@ -241,8 +235,9 @@ self.custom_index = custom_index | ||
| if len(self.items) != len(self.custom_index): | ||
| raise RuntimeError(f"Size of custom_index {len(custom_index)} does not match items {len(self.items)}") | ||
| raise RuntimeError( | ||
| f"Size of custom_index {len(custom_index)} does not match items {len(self.items)}" | ||
| ) | ||
| def __getitem__(self, i: int) -> tuple: | ||
| """Getitem method. | ||
| def __getitem__(self, i:int) -> tuple: | ||
| """ Getitem method | ||
| Returns | ||
@@ -254,24 +249,24 @@ ------- | ||
| return (self.items.__getitem__(i).split(), self.custom_index[i]) | ||
| def __delitem__(self, i:int): | ||
| def __delitem__(self, i: int): | ||
| raise NotImplementedError("Method currently not supported") | ||
| def __setitem__(self, i:int, item:str): | ||
| def __setitem__(self, i: int, item: str): | ||
| raise NotImplementedError("Method currently not supported") | ||
| def insert(self, i:int, item:str): | ||
| def insert(self, i: int, item: str): | ||
| raise NotImplementedError("Method currently not supported") | ||
| def append(self, item:str): | ||
| def append(self, item: str): | ||
| raise NotImplementedError("Method currently not supported") | ||
| def extend(self, arg:[list, set, ndarray]): | ||
| def extend(self, arg: [list, set, ndarray]): | ||
| raise NotImplementedError("Method currently not supported") | ||
| class CSplitIndexedList(BaseIndexedList): | ||
| def __init__(self, *args: [list, set, ndarray], custom_split: callable): | ||
| """Quasi-list with custom string splitting to be used for feeding in-memory | ||
| stored lists of sentences to the training routine. | ||
| def __init__(self, *args:[list, set, ndarray], custom_split:callable): | ||
| """ Quasi-list with custom string splitting to be used for feeding in-memory stored lists of sentences to | ||
| the training routine. | ||
| Parameters | ||
@@ -283,3 +278,2 @@ ---------- | ||
| Split function to be used to convert strings into list of str. | ||
| """ | ||
@@ -289,5 +283,5 @@ self.custom_split = custom_split | ||
| def __getitem__(self, i:int) -> tuple: | ||
| """ Getitem method | ||
| def __getitem__(self, i: int) -> tuple: | ||
| """Getitem method. | ||
| Returns | ||
@@ -300,8 +294,13 @@ ------- | ||
| class CSplitCIndexedList(BaseIndexedList): | ||
| def __init__( | ||
| self, | ||
| *args: [list, set, ndarray], | ||
| custom_split: callable, | ||
| custom_index: [list, ndarray], | ||
| ): | ||
| """Quasi-list with custom indices and ustom string splitting to be used for | ||
| feeding in-memory stored lists of sentences to the training routine. | ||
| def __init__(self, *args:[list, set, ndarray], custom_split:callable, custom_index:[list, ndarray]): | ||
| """ Quasi-list with custom indices and ustom string splitting to be used for feeding in-memory stored lists of sentences to | ||
| the training routine. | ||
| Parameters | ||
@@ -315,15 +314,16 @@ ---------- | ||
| Custom index to support many to one mappings. | ||
| """ | ||
| self.custom_split = custom_split | ||
| self.custom_index = custom_index | ||
| super(CSplitCIndexedList, self).__init__(*args) | ||
| if len(self.items) != len(self.custom_index): | ||
| raise RuntimeError(f"Size of custom_index {len(custom_index)} does not match items {len(self.items)}") | ||
| raise RuntimeError( | ||
| f"Size of custom_index {len(custom_index)} does not match items {len(self.items)}" | ||
| ) | ||
| def __getitem__(self, i:int) -> tuple: | ||
| """ Getitem method | ||
| def __getitem__(self, i: int) -> tuple: | ||
| """Getitem method. | ||
| Returns | ||
@@ -336,21 +336,21 @@ ------- | ||
| def __delitem__(self, i:int): | ||
| def __delitem__(self, i: int): | ||
| raise NotImplementedError("Method currently not supported") | ||
| def __setitem__(self, i:int, item:str): | ||
| def __setitem__(self, i: int, item: str): | ||
| raise NotImplementedError("Method currently not supported") | ||
| def insert(self, i:int, item:str): | ||
| def insert(self, i: int, item: str): | ||
| raise NotImplementedError("Method currently not supported") | ||
| def append(self, item:str): | ||
| def append(self, item: str): | ||
| raise NotImplementedError("Method currently not supported") | ||
| def extend(self, arg:[list, set, ndarray]): | ||
| def extend(self, arg: [list, set, ndarray]): | ||
| raise NotImplementedError("Method currently not supported") | ||
| class IndexedLineDocument(object): | ||
| def __init__(self, path, get_able=True): | ||
| """ Iterate over a file that contains sentences: one line = tuple([str], int). | ||
| """Iterate over a file that contains sentences: one line = tuple([str], int). | ||
@@ -376,5 +376,5 @@ Words are expected to be already preprocessed and separated by whitespace. Sentence tags are constructed | ||
| self._build_offsets() | ||
| def _build_offsets(self): | ||
| """ Builds an offset table to index the file """ | ||
| """Builds an offset table to index the file.""" | ||
| with open(self.path, "rb") as f: | ||
@@ -385,7 +385,7 @@ offset = f.tell() | ||
| offset += len(line) | ||
| def __getitem__(self, i): | ||
| """ Returns the line indexed by i. Primarily used for | ||
| """Returns the line indexed by i. Primarily used for | ||
| :meth:`~fse.models.sentencevectors.SentenceVectors.most_similar` | ||
| Parameters | ||
@@ -403,3 +403,5 @@ ---------- | ||
| if not self.get_able: | ||
| raise RuntimeError("To index the lines, you must contruct with get_able=True") | ||
| raise RuntimeError( | ||
| "To index the lines, you must contruct with get_able=True" | ||
| ) | ||
@@ -419,6 +421,5 @@ with open(self.path, "rb") as f: | ||
| Tuple of list of string and index | ||
| """ | ||
| with open(self.path, "rb") as f: | ||
| for i, line in enumerate(f): | ||
| yield (any2unicode(line).split(), i) | ||
| yield (any2unicode(line).split(), i) |
| from .average import Average | ||
| from .sif import SIF | ||
| from .sif import SIF | ||
| from .usif import uSIF | ||
| from .sentencevectors import SentenceVectors | ||
| from .sentencevectors import SentenceVectors |
@@ -7,4 +7,4 @@ # cython: boundscheck=False | ||
| # Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| # Copyright (C) 2019 Oliver Borchers | ||
| # Author: Oliver Borchers | ||
| # Copyright (C) Oliver Borchers Oliver Borchers | ||
@@ -11,0 +11,0 @@ cimport numpy as np |
@@ -8,4 +8,4 @@ #!/usr/bin/env cython | ||
| # Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| # Copyright (C) 2019 Oliver Borchers | ||
| # Author: Oliver Borchers | ||
| # Copyright (C) Oliver Borchers Oliver Borchers | ||
@@ -12,0 +12,0 @@ """Optimized cython functions for computing sentence embeddings""" |
+77
-38
| #!/usr/bin/env python | ||
| # -*- coding: utf-8 -*- | ||
| # Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| # Copyright (C) 2019 Oliver Borchers | ||
| # Author: Oliver Borchers | ||
| # Copyright (C) Oliver Borchers Oliver Borchers | ||
@@ -33,3 +33,3 @@ """This module implements the base class to compute average representations for sentences, using highly optimized C routines, | ||
| from __future__ import division | ||
| from __future__ import division | ||
@@ -41,4 +41,10 @@ from fse.models.base_s2v import BaseSentence2VecModel | ||
| from numpy import ndarray, float32 as REAL, sum as np_sum, multiply as np_mult,\ | ||
| zeros, max as np_max | ||
| from numpy import ( | ||
| ndarray, | ||
| float32 as REAL, | ||
| sum as np_sum, | ||
| multiply as np_mult, | ||
| zeros, | ||
| max as np_max, | ||
| ) | ||
@@ -51,3 +57,9 @@ from typing import List | ||
| def train_average_np(model:BaseSentence2VecModel, indexed_sentences:List[tuple], target:ndarray, memory:ndarray) -> [int,int]: | ||
| def train_average_np( | ||
| model: BaseSentence2VecModel, | ||
| indexed_sentences: List[tuple], | ||
| target: ndarray, | ||
| memory: ndarray, | ||
| ) -> [int, int]: | ||
| """Training on a sequence of sentences and update the target ndarray. | ||
@@ -77,3 +89,3 @@ | ||
| int, int | ||
| Number of effective sentences (non-zero) and effective words in the vocabulary used | ||
| Number of effective sentences (non-zero) and effective words in the vocabulary used | ||
| during training the sentence embedding. | ||
@@ -93,3 +105,2 @@ | ||
| mem = memory[0] | ||
| subwords_idx = memory[1] | ||
@@ -114,6 +125,6 @@ if is_ft: | ||
| for obj in indexed_sentences: | ||
| mem.fill(0.) | ||
| mem.fill(0.0) | ||
| sent = obj[0] | ||
| sent_adr = obj[1] | ||
| word_indices = [vocab[word].index for word in sent if word in vocab] | ||
@@ -125,11 +136,14 @@ eff_sentences += 1 | ||
| mem += np_sum(np_mult(w_vectors[word_indices],w_weights[word_indices][:,None]) , axis=0) | ||
| mem *= 1/len(word_indices) | ||
| mem += np_sum( | ||
| np_mult(w_vectors[word_indices], w_weights[word_indices][:, None]), | ||
| axis=0, | ||
| ) | ||
| mem *= 1 / len(word_indices) | ||
| s_vectors[sent_adr] = mem.astype(REAL) | ||
| else: | ||
| for obj in indexed_sentences: | ||
| mem.fill(0.) | ||
| mem.fill(0.0) | ||
| sent = obj[0] | ||
| sent_adr = obj[1] | ||
| if not len(sent): | ||
@@ -140,3 +154,3 @@ continue | ||
| eff_sentences += 1 | ||
| eff_words += len(sent) # Counts everything in the sentence | ||
| eff_words += len(sent) # Counts everything in the sentence | ||
@@ -148,6 +162,10 @@ for word in sent: | ||
| else: | ||
| ngram_hashes = ft_ngram_hashes(word, min_n, max_n, bucket, True)[:max_ngrams] | ||
| ngram_hashes = ft_ngram_hashes(word, min_n, max_n, bucket, True)[ | ||
| :max_ngrams | ||
| ] | ||
| if len(ngram_hashes) == 0: | ||
| continue | ||
| mem += oov_weight * (np_sum(ngram_vectors[ngram_hashes], axis=0) / len(ngram_hashes)) | ||
| mem += oov_weight * ( | ||
| np_sum(ngram_vectors[ngram_hashes], axis=0) / len(ngram_hashes) | ||
| ) | ||
| # Implicit addition of zero if oov does not contain any ngrams | ||
@@ -158,5 +176,11 @@ s_vectors[sent_adr] = mem / len(sent) | ||
| try: | ||
| from fse.models.average_inner import train_average_cy | ||
| from fse.models.average_inner import FAST_VERSION, MAX_WORDS_IN_BATCH, MAX_NGRAMS_IN_BATCH | ||
| from fse.models.average_inner import ( | ||
| FAST_VERSION, | ||
| MAX_WORDS_IN_BATCH, | ||
| MAX_NGRAMS_IN_BATCH, | ||
| ) | ||
| train_average = train_average_cy | ||
@@ -169,4 +193,5 @@ except ImportError: | ||
| class Average(BaseSentence2VecModel): | ||
| """ Train, use and evaluate averaged sentence vectors. | ||
| """Train, use and evaluate averaged sentence vectors. | ||
@@ -183,15 +208,23 @@ The model can be stored/loaded via its :meth:`~fse.models.average.Average.save` and | ||
| directly to query those embeddings in various ways. See the module level docstring for examples. | ||
| sv : :class:`~fse.models.sentencevectors.SentenceVectors` | ||
| This object contains the sentence vectors inferred from the training data. There will be one such vector | ||
| for each unique docusentence supplied during training. They may be individually accessed using the index. | ||
| prep : :class:`~fse.models.base_s2v.BaseSentence2VecPreparer` | ||
| The prep object is used to transform and initialize the sv.vectors. Aditionally, it can be used | ||
| to move the vectors to disk for training with memmap. | ||
| """ | ||
| def __init__(self, model:BaseKeyedVectors, sv_mapfile_path:str=None, wv_mapfile_path:str=None, workers:int=1, lang_freq:str=None, **kwargs): | ||
| """ Average (unweighted) sentence embeddings model. Performs a simple averaging operation over all | ||
| def __init__( | ||
| self, | ||
| model: BaseKeyedVectors, | ||
| sv_mapfile_path: str = None, | ||
| wv_mapfile_path: str = None, | ||
| workers: int = 1, | ||
| lang_freq: str = None, | ||
| **kwargs | ||
| ): | ||
| """Average (unweighted) sentence embeddings model. Performs a simple averaging operation over all | ||
| words in a sentences without further transformation. | ||
@@ -221,15 +254,23 @@ | ||
| See https://github.com/LuminosoInsight/wordfreq | ||
| """ | ||
| super(Average, self).__init__( | ||
| model=model, sv_mapfile_path=sv_mapfile_path, wv_mapfile_path=wv_mapfile_path, | ||
| workers=workers, lang_freq=lang_freq, | ||
| batch_words=MAX_WORDS_IN_BATCH, batch_ngrams=MAX_NGRAMS_IN_BATCH, | ||
| fast_version=FAST_VERSION | ||
| ) | ||
| model=model, | ||
| sv_mapfile_path=sv_mapfile_path, | ||
| wv_mapfile_path=wv_mapfile_path, | ||
| workers=workers, | ||
| lang_freq=lang_freq, | ||
| batch_words=MAX_WORDS_IN_BATCH, | ||
| batch_ngrams=MAX_NGRAMS_IN_BATCH, | ||
| fast_version=FAST_VERSION, | ||
| ) | ||
| def _do_train_job(self, data_iterable:List[tuple], target:ndarray, memory:ndarray) -> [int, int]: | ||
| def _do_train_job( | ||
| self, data_iterable: List[tuple], target: ndarray, memory: ndarray | ||
| ) -> [int, int]: | ||
| """ Internal routine which is called on training and performs averaging for all entries in the iterable """ | ||
| eff_sentences, eff_words = train_average(model=self, indexed_sentences=data_iterable, target=target, memory=memory) | ||
| eff_sentences, eff_words = train_average( | ||
| model=self, indexed_sentences=data_iterable, target=target, memory=memory | ||
| ) | ||
| return eff_sentences, eff_words | ||
@@ -239,3 +280,3 @@ | ||
| """ Check the sanity of all child paramters """ | ||
| if not all(self.word_weights == 1.): | ||
| if not all(self.word_weights == 1.0): | ||
| raise ValueError("All word weights must equal one for averaging") | ||
@@ -250,13 +291,11 @@ | ||
| pass | ||
| def _post_inference_calls(self, **kwargs): | ||
| """ Function calls to perform after training & inference | ||
| """Function calls to perform after training & inference | ||
| Examples include the removal of components | ||
| """ | ||
| pass | ||
| def _check_dtype_santiy(self, **kwargs): | ||
| """ Check the dtypes of all child attributes""" | ||
| pass | ||
+331
-162
| #!/usr/bin/env python | ||
| # -*- coding: utf-8 -*- | ||
| # Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| # Copyright (C) 2019 Oliver Borchers | ||
| # Author: Oliver Borchers | ||
| # Copyright (C) Oliver Borchers Oliver Borchers | ||
| # Licensed under GNU General Public License v3.0 | ||
@@ -39,2 +39,4 @@ | ||
| from fse.models.utils import set_madvise_for_mmap | ||
| from gensim.models.base_any2vec import BaseWordEmbeddingsModel | ||
@@ -45,4 +47,15 @@ from gensim.models.keyedvectors import BaseKeyedVectors, FastTextKeyedVectors, _l2_norm | ||
| from numpy import ndarray, memmap as np_memmap, float32 as REAL, uint32 as uINT, \ | ||
| empty, zeros, vstack, dtype, ones, finfo, full | ||
| from numpy import ( | ||
| ndarray, | ||
| memmap as np_memmap, | ||
| float32 as REAL, | ||
| uint32 as uINT, | ||
| empty, | ||
| zeros, | ||
| vstack, | ||
| dtype, | ||
| ones, | ||
| finfo, | ||
| full, | ||
| ) | ||
@@ -69,6 +82,17 @@ from wordfreq import available_languages, get_frequency_dict | ||
| class BaseSentence2VecModel(SaveLoad): | ||
| def __init__(self, model:BaseKeyedVectors, sv_mapfile_path:str=None, wv_mapfile_path:str=None, workers:int=1, lang_freq:str=None, fast_version:int=0, batch_words:int=10000, batch_ngrams:int=40, **kwargs): | ||
| """ Base class for all Sentence2Vec Models. Provides core functionality, such as | ||
| def __init__( | ||
| self, | ||
| model: BaseKeyedVectors, | ||
| sv_mapfile_path: str = None, | ||
| wv_mapfile_path: str = None, | ||
| workers: int = 1, | ||
| lang_freq: str = None, | ||
| fast_version: int = 0, | ||
| batch_words: int = 10000, | ||
| batch_ngrams: int = 40, | ||
| **kwargs, | ||
| ): | ||
| """Base class for all Sentence2Vec Models. Provides core functionality, such as | ||
| save, load, sanity checking, frequency induction, data checking, scanning, etc. | ||
@@ -106,10 +130,43 @@ | ||
| """ | ||
| TODO: | ||
| [ ] global: | ||
| [ ] windows support | ||
| [ ] documentation | ||
| [ ] more benchmarks | ||
| [ ] remove wv_mapfile_path? | ||
| [ ] modifiable sv_mapfile_path? | ||
| [ ] models: | ||
| [ ] check feasibility first | ||
| [ ] max-pooling -> easy | ||
| [ ] hierarchical pooling -> easy | ||
| [ ] discrete cosine transform -> somewhat easy, questionable | ||
| [ ] valve -> unclear, not cited enough | ||
| [ ] power-means embedding -> very large dimensionalty | ||
| [ ] z-score transformation is quite nice | ||
| [ ] sentencevectors: | ||
| [X] similar_by_sentence model type check | ||
| [ ] approximate NN search for large files | ||
| [ ] compare ANN libraries | ||
| [ ] ease-of-use | ||
| [ ] dependencies | ||
| [ ] compatibility | ||
| [ ] memory-usage | ||
| """ | ||
| set_madvise_for_mmap() | ||
| self.workers = int(workers) | ||
| self.batch_words = batch_words | ||
| self.batch_ngrams = batch_ngrams | ||
| self.wv = None | ||
| self.is_ft = False | ||
| self.wv = None | ||
| self.wv_mapfile_path = Path(wv_mapfile_path) if wv_mapfile_path is not None else None | ||
| self.is_ft = False | ||
| self.wv_mapfile_path = ( | ||
| Path(wv_mapfile_path) if wv_mapfile_path is not None else None | ||
| ) | ||
| self.wv_mapfile_shapes = {} | ||
@@ -132,3 +189,5 @@ | ||
| self.sv = SentenceVectors(vector_size=self.wv.vector_size, mapfile_path=sv_mapfile_path) | ||
| self.sv = SentenceVectors( | ||
| vector_size=self.wv.vector_size, mapfile_path=sv_mapfile_path | ||
| ) | ||
| self.prep = BaseSentence2VecPreparer() | ||
@@ -139,3 +198,3 @@ | ||
| def __str__(self) -> str: | ||
| """ Human readable representation of the model's state. | ||
| """Human readable representation of the model's state. | ||
@@ -150,5 +209,5 @@ Returns | ||
| def _check_and_include_model(self, model:BaseKeyedVectors): | ||
| """ Check if the supplied model is a compatible model. Performs all kinds of checks and small optimizations. | ||
| def _check_and_include_model(self, model: BaseKeyedVectors): | ||
| """Check if the supplied model is a compatible model. Performs all kinds of checks and small optimizations. | ||
| Parameters | ||
@@ -165,7 +224,9 @@ ---------- | ||
| else: | ||
| raise RuntimeError(f"Model must be child of BaseWordEmbeddingsModel or BaseKeyedVectors. Received {str(model)}") | ||
| raise RuntimeError( | ||
| f"Model must be child of BaseWordEmbeddingsModel or BaseKeyedVectors. Received {str(model)}" | ||
| ) | ||
| self.wv.vectors_norm = None | ||
| if isinstance(self.wv, FastTextKeyedVectors): | ||
| self.wv.vectors_vocab_norm = None # Save some space | ||
| self.wv.vectors_vocab_norm = None # Save some space | ||
| self.wv.vectors_ngrams_norm = None | ||
@@ -177,15 +238,21 @@ self.wv.vectors_vocab_norm = None | ||
| raise RuntimeError("FastText model requires compatible hash function") | ||
| if not hasattr(self.wv, 'vectors_vocab') or self.wv.vectors_vocab is None: | ||
| raise RuntimeError("vectors_vocab required for sentence embeddings not found.") | ||
| if not hasattr(self.wv, 'vectors_ngrams') or self.wv.vectors_ngrams is None: | ||
| raise RuntimeError("Ngram vectors required for sentence embeddings not found.") | ||
| if not hasattr(self.wv, 'vectors') or self.wv.vectors is None: | ||
| raise RuntimeError("Word vectors required for sentence embeddings not found.") | ||
| if not hasattr(self.wv, 'vocab'): | ||
| if not hasattr(self.wv, "vectors_vocab") or self.wv.vectors_vocab is None: | ||
| raise RuntimeError( | ||
| "vectors_vocab required for sentence embeddings not found." | ||
| ) | ||
| if not hasattr(self.wv, "vectors_ngrams") or self.wv.vectors_ngrams is None: | ||
| raise RuntimeError( | ||
| "Ngram vectors required for sentence embeddings not found." | ||
| ) | ||
| if not hasattr(self.wv, "vectors") or self.wv.vectors is None: | ||
| raise RuntimeError( | ||
| "Word vectors required for sentence embeddings not found." | ||
| ) | ||
| if not hasattr(self.wv, "vocab"): | ||
| raise RuntimeError("Vocab required for sentence embeddings not found.") | ||
| def _check_language_settings(self, lang_freq:str): | ||
| """ Check if the supplied language is a compatible with the wordfreq package | ||
| def _check_language_settings(self, lang_freq: str): | ||
| """Check if the supplied language is a compatible with the wordfreq package | ||
| Parameters | ||
@@ -197,12 +264,14 @@ ---------- | ||
| """ | ||
| if lang_freq in available_languages(wordlist='best'): | ||
| if lang_freq in available_languages(wordlist="best"): | ||
| self.lang_freq = str(lang_freq) | ||
| logger.info("no frequency mode: using wordfreq for estimation " | ||
| f"of frequency for language: {self.lang_freq}") | ||
| logger.info( | ||
| "no frequency mode: using wordfreq for estimation " | ||
| f"of frequency for language: {self.lang_freq}" | ||
| ) | ||
| else: | ||
| raise ValueError(f"Language {lang_freq} is not available in wordfreq") | ||
| def _induce_frequencies(self, domain:int=2**31 - 1): | ||
| """ Induce frequencies for a pretrained model, as not all pretrained models come with frequencies. | ||
| def _induce_frequencies(self, domain: int = 2 ** 31 - 1): | ||
| """Induce frequencies for a pretrained model, as not all pretrained models come with frequencies. | ||
| Parameters | ||
@@ -214,3 +283,3 @@ ---------- | ||
| """ | ||
| freq_dict = get_frequency_dict(self.lang_freq, wordlist='best') | ||
| freq_dict = get_frequency_dict(self.lang_freq, wordlist="best") | ||
| for word in self.wv.index2word: | ||
@@ -222,5 +291,5 @@ if word in freq_dict: | ||
| def _check_input_data_sanity(self, data_iterable:tuple): | ||
| """ Check if the input data complies with the required formats | ||
| def _check_input_data_sanity(self, data_iterable: tuple): | ||
| """Check if the input data complies with the required formats | ||
| Parameters | ||
@@ -235,8 +304,10 @@ ---------- | ||
| elif isinstance(data_iterable, str): | ||
| raise TypeError("Passed string. Input data must be iterable list of list of tokens or tuple") | ||
| raise TypeError( | ||
| "Passed string. Input data must be iterable list of list of tokens or tuple" | ||
| ) | ||
| elif not hasattr(data_iterable, "__iter__"): | ||
| raise TypeError("Iterable must provide __iter__ function") | ||
| def _log_train_end(self, eff_sentences:int, eff_words:int, overall_time:float): | ||
| """ Log the end of training. | ||
| def _log_train_end(self, eff_sentences: int, eff_words: int, overall_time: float): | ||
| """Log the end of training. | ||
@@ -258,4 +329,6 @@ Parameters | ||
| def _check_pre_training_sanity(self, total_sentences:int, total_words:int, average_length:int, **kwargs): | ||
| """ Check if all available objects for training are available and compliant | ||
| def _check_pre_training_sanity( | ||
| self, total_sentences: int, total_words: int, average_length: int, **kwargs | ||
| ): | ||
| """Check if all available objects for training are available and compliant | ||
@@ -272,11 +345,17 @@ Parameters | ||
| """ | ||
| if not hasattr(self, "wv") or self.wv is None: | ||
| if not hasattr(self, "wv") or self.wv is None: | ||
| raise RuntimeError("you must first load a valid BaseKeyedVectors object") | ||
| if not len(self.wv.vectors): | ||
| raise RuntimeError("you must initialize vectors before computing sentence vectors") | ||
| raise RuntimeError( | ||
| "you must initialize vectors before computing sentence vectors" | ||
| ) | ||
| if self.is_ft and not len(self.wv.vectors_ngrams): | ||
| raise RuntimeError("you must initialize ngram vectors before computing sentence vectors") | ||
| raise RuntimeError( | ||
| "you must initialize ngram vectors before computing sentence vectors" | ||
| ) | ||
| if self.is_ft and not len(self.wv.vectors_vocab): | ||
| raise RuntimeError("you must initialize vectors_vocab before computing sentence vectors") | ||
| raise RuntimeError( | ||
| "you must initialize vectors_vocab before computing sentence vectors" | ||
| ) | ||
@@ -289,7 +368,7 @@ if sum([self.wv.vocab[w].count for w in self.wv.vocab]) == len(self.wv.vocab): | ||
| if not hasattr(self.sv, "vectors") or self.sv.vectors is None: | ||
| if not hasattr(self.sv, "vectors") or self.sv.vectors is None: | ||
| raise RuntimeError("initialization of Sentencevectors failed") | ||
| if not hasattr(self, "word_weights") or self.word_weights is None: | ||
| if not hasattr(self, "word_weights") or self.word_weights is None: | ||
| raise RuntimeError("initialization of word weights failed") | ||
| if not len(self.wv.vectors) == len(self.word_weights): | ||
@@ -301,5 +380,9 @@ raise RuntimeError("Number of word vectors and weights does not match") | ||
| if self.is_ft and self.wv.vectors_ngrams.dtype != REAL: | ||
| raise TypeError(f"type of wv.vectors_ngrams is wrong: {self.wv.vectors_ngrams.dtype}") | ||
| raise TypeError( | ||
| f"type of wv.vectors_ngrams is wrong: {self.wv.vectors_ngrams.dtype}" | ||
| ) | ||
| if self.is_ft and self.wv.vectors_vocab.dtype != REAL: | ||
| raise TypeError(f"type of wv.vectors_vocab is wrong: {self.wv.vectors_vocab.dtype}") | ||
| raise TypeError( | ||
| f"type of wv.vectors_vocab is wrong: {self.wv.vectors_vocab.dtype}" | ||
| ) | ||
| if self.sv.vectors.dtype != REAL: | ||
@@ -310,3 +393,3 @@ raise TypeError(f"type of sv.vectors is wrong: {self.sv.vectors.dtype}") | ||
| if total_sentences is 0 or total_words is 0 or average_length is 0: | ||
| if total_sentences == 0 or total_words == 0 or average_length == 0: | ||
| raise ValueError( | ||
@@ -316,4 +399,4 @@ f"scanning the sentences returned invalid values. Check the input." | ||
| def _check_post_training_sanity(self, eff_sentences:int, eff_words:int): | ||
| """ Check if the training results make sense | ||
| def _check_post_training_sanity(self, eff_sentences: int, eff_words: int): | ||
| """Check if the training results make sense | ||
@@ -326,12 +409,12 @@ Parameters | ||
| Number of effective words encountered during training | ||
| """ | ||
| if eff_sentences is 0 or eff_words is 0: | ||
| raise ValueError( | ||
| f"training returned invalid values. Check the input." | ||
| ) | ||
| def _check_indexed_sent_valid(self, iterPos:int, obj:tuple, checked:int=False) -> [int, List[str]]: | ||
| """ Performs a check if the passed object contains valid data | ||
| if eff_sentences == 0 or eff_words == 0: | ||
| raise ValueError(f"training returned invalid values. Check the input.") | ||
| def _check_indexed_sent_valid( | ||
| self, iterPos: int, obj: tuple, checked: int = False | ||
| ) -> [int, List[str]]: | ||
| """Performs a check if the passed object contains valid data | ||
| Parameters | ||
@@ -343,3 +426,3 @@ ---------- | ||
| An tuple object containing the index and sentence | ||
| Returns | ||
@@ -355,3 +438,3 @@ ------- | ||
| if isinstance(obj, tuple): | ||
| sent = obj[0] #Faster than obj.words | ||
| sent = obj[0] # Faster than obj.words | ||
| index = obj[1] | ||
@@ -363,5 +446,9 @@ else: | ||
| if not isinstance(sent, list) or not all(isinstance(w, str) for w in sent): | ||
| raise TypeError(f"At {iterPos}: Passed {type(sent)}: {sent}. tuple.words must contain list of str.") | ||
| raise TypeError( | ||
| f"At {iterPos}: Passed {type(sent)}: {sent}. tuple.words must contain list of str." | ||
| ) | ||
| if not isinstance(index, int): | ||
| raise TypeError(f"At {iterPos}: Passed {type(index)}: {index}. tuple.index must contain index") | ||
| raise TypeError( | ||
| f"At {iterPos}: Passed {type(index)}: {index}. tuple.index must contain index" | ||
| ) | ||
| if index < 0: | ||
@@ -371,4 +458,4 @@ raise ValueError(f"At {iterPos}: Passed negative {index}") | ||
| def _map_all_vectors_to_disk(self, mapfile_path:Path): | ||
| """ Maps all vectors to disk | ||
| def _map_all_vectors_to_disk(self, mapfile_path: Path): | ||
| """Maps all vectors to disk | ||
@@ -384,11 +471,17 @@ Parameters | ||
| self.wv_mapfile_shapes["vectors"] = self.wv.vectors.shape | ||
| self.wv.vectors = self._move_ndarray_to_disk(self.wv.vectors, mapfile_path=path, name="wv") | ||
| self.wv.vectors = self._move_ndarray_to_disk( | ||
| self.wv.vectors, mapfile_path=path, name="wv" | ||
| ) | ||
| if self.is_ft: | ||
| self.wv_mapfile_shapes["vectors_vocab"] = self.wv.vectors_vocab.shape | ||
| self.wv_mapfile_shapes["vectors_ngrams"] = self.wv.vectors_ngrams.shape | ||
| self.wv.vectors_vocab = self._move_ndarray_to_disk(self.wv.vectors_vocab, mapfile_path=self.wv_mapfile_path, name="vocab") | ||
| self.wv.vectors_ngrams = self._move_ndarray_to_disk(self.wv.vectors_ngrams, mapfile_path=self.wv_mapfile_path, name="ngrams") | ||
| self.wv.vectors_vocab = self._move_ndarray_to_disk( | ||
| self.wv.vectors_vocab, mapfile_path=self.wv_mapfile_path, name="vocab" | ||
| ) | ||
| self.wv.vectors_ngrams = self._move_ndarray_to_disk( | ||
| self.wv.vectors_ngrams, mapfile_path=self.wv_mapfile_path, name="ngrams" | ||
| ) | ||
| def _load_all_vectors_from_disk(self, mapfile_path:Path): | ||
| """ Reads all vectors from disk | ||
| def _load_all_vectors_from_disk(self, mapfile_path: Path): | ||
| """Reads all vectors from disk | ||
@@ -403,12 +496,27 @@ Parameters | ||
| self.wv.vectors = np_memmap(f"{path}_wv.vectors", dtype=REAL, mode='r', shape=self.wv_mapfile_shapes["vectors"]) | ||
| self.wv.vectors = np_memmap( | ||
| f"{path}_wv.vectors", | ||
| dtype=REAL, | ||
| mode="r", | ||
| shape=self.wv_mapfile_shapes["vectors"], | ||
| ) | ||
| if self.is_ft: | ||
| self.wv.vectors_vocab = np_memmap( | ||
| f"{path}_vocab.vectors", dtype=REAL, mode='r', shape=self.wv_mapfile_shapes["vectors_vocab"]) | ||
| f"{path}_vocab.vectors", | ||
| dtype=REAL, | ||
| mode="r", | ||
| shape=self.wv_mapfile_shapes["vectors_vocab"], | ||
| ) | ||
| self.wv.vectors_ngrams = np_memmap( | ||
| f"{path}_ngrams.vectors", dtype=REAL, mode='r', shape=self.wv_mapfile_shapes["vectors_ngrams"]) | ||
| def _move_ndarray_to_disk(self, vector:ndarray, mapfile_path:str, name:str="") -> ndarray: | ||
| """ Moves a numpy ndarray to disk via memmap | ||
| f"{path}_ngrams.vectors", | ||
| dtype=REAL, | ||
| mode="r", | ||
| shape=self.wv_mapfile_shapes["vectors_ngrams"], | ||
| ) | ||
| def _move_ndarray_to_disk( | ||
| self, vector: ndarray, mapfile_path: str, name: str = "" | ||
| ) -> ndarray: | ||
| """Moves a numpy ndarray to disk via memmap | ||
| Parameters | ||
@@ -434,5 +542,3 @@ ---------- | ||
| logger.info(f"writing {name} to {path}") | ||
| memvecs = np_memmap( | ||
| path, dtype=REAL, | ||
| mode='w+', shape=shape) | ||
| memvecs = np_memmap(path, dtype=REAL, mode="w+", shape=shape) | ||
| memvecs[:] = vector[:] | ||
@@ -444,3 +550,3 @@ del memvecs, vector | ||
| readonly_memvecs = np_memmap(path, dtype=REAL, mode='r', shape=shape) | ||
| readonly_memvecs = np_memmap(path, dtype=REAL, mode="r", shape=shape) | ||
| return readonly_memvecs | ||
@@ -461,3 +567,5 @@ | ||
| def _do_train_job(self, data_iterable:List[tuple], target:ndarray, memory:ndarray) -> [int, int]: | ||
| def _do_train_job( | ||
| self, data_iterable: List[tuple], target: ndarray, memory: ndarray | ||
| ) -> [int, int]: | ||
| """ Function to be called on a batch of sentences. Returns eff sentences/words """ | ||
@@ -473,5 +581,5 @@ raise NotImplementedError() | ||
| raise NotImplementedError() | ||
| def _post_inference_calls(self, **kwargs): | ||
| """ Function calls to perform after training & inference | ||
| """Function calls to perform after training & inference | ||
| Examples include the removal of components | ||
@@ -491,3 +599,3 @@ """ | ||
| def load(cls, *args, **kwargs): | ||
| """ Load a previously saved :class:`~fse.models.base_s2v.BaseSentence2VecModel`. | ||
| """Load a previously saved :class:`~fse.models.base_s2v.BaseSentence2VecModel`. | ||
@@ -512,6 +620,9 @@ Parameters | ||
| model.wv_mapfile_shapes = None | ||
| set_madvise_for_mmap() | ||
| return model | ||
| def save(self, *args, **kwargs): | ||
| """ Save the model. | ||
| """Save the model. | ||
| This saved model can be loaded again using :func:`~fse.models.base_s2v.BaseSentence2VecModel.load` | ||
@@ -533,4 +644,6 @@ | ||
| def scan_sentences(self, sentences:List[tuple]=None, progress_per:int=5) -> Dict[str,int]: | ||
| """ Performs an initial scan of the data and reports all corresponding statistics | ||
| def scan_sentences( | ||
| self, sentences: List[tuple] = None, progress_per: int = 5 | ||
| ) -> Dict[str, int]: | ||
| """Performs an initial scan of the data and reports all corresponding statistics | ||
@@ -548,3 +661,3 @@ Parameters | ||
| Dictionary containing the scan statistics | ||
| """ | ||
@@ -559,10 +672,16 @@ logger.info("scanning all indexed sentences and their word counts") | ||
| max_index = 0 | ||
| checked_sentences = 0 # We only check the first item to not constrain runtime so much | ||
| checked_sentences = ( | ||
| 0 # We only check the first item to not constrain runtime so much | ||
| ) | ||
| for i, obj in enumerate(sentences): | ||
| index, sent = self._check_indexed_sent_valid(iterPos=i, obj=obj, checked=checked_sentences) | ||
| index, sent = self._check_indexed_sent_valid( | ||
| iterPos=i, obj=obj, checked=checked_sentences | ||
| ) | ||
| checked_sentences += 1 | ||
| if time() - current_time > progress_per: | ||
| current_time = time() | ||
| logger.info(f"SCANNING : finished {total_sentences} sentences with {total_words} words") | ||
| logger.info( | ||
| f"SCANNING : finished {total_sentences} sentences with {total_words} words" | ||
| ) | ||
@@ -575,3 +694,3 @@ max_index = max(max_index, index) | ||
| empty_sentences += 1 | ||
| if empty_sentences: | ||
@@ -581,3 +700,5 @@ logger.warning(f"found {empty_sentences} empty sentences") | ||
| if max_index >= total_sentences: | ||
| raise RuntimeError(f"Index {max_index} is larger than number of sentences {total_sentences}") | ||
| raise RuntimeError( | ||
| f"Index {max_index} is larger than number of sentences {total_sentences}" | ||
| ) | ||
@@ -590,13 +711,15 @@ average_length = int(total_words / total_sentences) | ||
| statistics = { | ||
| "total_sentences" : total_sentences, | ||
| "total_words" : total_words, | ||
| "average_length" : average_length, | ||
| "empty_sentences" : empty_sentences, | ||
| "max_index" : max_index + 1 | ||
| "total_sentences": total_sentences, | ||
| "total_words": total_words, | ||
| "average_length": average_length, | ||
| "empty_sentences": empty_sentences, | ||
| "max_index": max_index + 1, | ||
| } | ||
| return statistics | ||
| def estimate_memory(self, max_index:int, report:dict=None, **kwargs) -> Dict[str, int]: | ||
| """ Estimate the size of the sentence embedding | ||
| def estimate_memory( | ||
| self, max_index: int, report: dict = None, **kwargs | ||
| ) -> Dict[str, int]: | ||
| """Estimate the size of the sentence embedding | ||
| Parameters | ||
@@ -620,8 +743,16 @@ ---------- | ||
| report["Word Vectors"] = vocab_size * self.wv.vector_size * dtype(REAL).itemsize | ||
| report["Sentence Vectors"] = max_index * self.wv.vector_size * dtype(REAL).itemsize | ||
| report["Sentence Vectors"] = ( | ||
| max_index * self.wv.vector_size * dtype(REAL).itemsize | ||
| ) | ||
| if self.is_ft: | ||
| report["Vocab Vectors"] = vocab_size * self.wv.vector_size * dtype(REAL).itemsize | ||
| report["Ngram Vectors"] = self.wv.vectors_ngrams.shape[0] * self.wv.vector_size * dtype(REAL).itemsize | ||
| report["Vocab Vectors"] = ( | ||
| vocab_size * self.wv.vector_size * dtype(REAL).itemsize | ||
| ) | ||
| report["Ngram Vectors"] = ( | ||
| self.wv.vectors_ngrams.shape[0] | ||
| * self.wv.vector_size | ||
| * dtype(REAL).itemsize | ||
| ) | ||
| report["Total"] = sum(report.values()) | ||
| mb_size = int(report["Total"] / 1024**2) | ||
| mb_size = int(report["Total"] / 1024 ** 2) | ||
| logger.info( | ||
@@ -633,7 +764,15 @@ f"estimated memory for {max_index} sentences with " | ||
| if report["Total"] >= 0.95 * virtual_memory()[1]: | ||
| logger.warning("The embeddings will likely not fit into RAM. Consider to use mapfile_path") | ||
| logger.warning( | ||
| "The embeddings will likely not fit into RAM. Consider to use mapfile_path" | ||
| ) | ||
| return report | ||
| def train(self, sentences:List[tuple]=None, update:bool=False, queue_factor:int=2, report_delay:int=5) -> [int,int]: | ||
| """ Main routine to train an embedding. This method writes all sentences vectors into sv.vectors and is | ||
| def train( | ||
| self, | ||
| sentences: List[tuple] = None, | ||
| update: bool = False, | ||
| queue_factor: int = 2, | ||
| report_delay: int = 5, | ||
| ) -> [int, int]: | ||
| """Main routine to train an embedding. This method writes all sentences vectors into sv.vectors and is | ||
| used for computing embeddings for large chunks of data. This method also handles post-training transformations, | ||
@@ -665,4 +804,6 @@ such as computing the SVD of the sentence vectors. | ||
| self.estimate_memory(**statistics) | ||
| self.prep.prepare_vectors(sv=self.sv, total_sentences=statistics["max_index"], update=update) | ||
| self.prep.prepare_vectors( | ||
| sv=self.sv, total_sentences=statistics["max_index"], update=update | ||
| ) | ||
| # Preform post-tain calls (i.e weight computation) | ||
@@ -676,7 +817,14 @@ self._pre_train_calls(**statistics) | ||
| _, eff_sentences, eff_words = self._train_manager(data_iterable=sentences, total_sentences=statistics["total_sentences"], queue_factor=queue_factor, report_delay=report_delay) | ||
| _, eff_sentences, eff_words = self._train_manager( | ||
| data_iterable=sentences, | ||
| total_sentences=statistics["total_sentences"], | ||
| queue_factor=queue_factor, | ||
| report_delay=report_delay, | ||
| ) | ||
| overall_time = time() - start_time | ||
| self._check_post_training_sanity(eff_sentences=eff_sentences, eff_words=eff_words) | ||
| self._check_post_training_sanity( | ||
| eff_sentences=eff_sentences, eff_words=eff_words | ||
| ) | ||
@@ -686,8 +834,10 @@ # Preform post-tain calls (i.e principal component removal) | ||
| self._log_train_end(eff_sentences=eff_sentences, eff_words=eff_words, overall_time=overall_time) | ||
| self._log_train_end( | ||
| eff_sentences=eff_sentences, eff_words=eff_words, overall_time=overall_time | ||
| ) | ||
| return eff_sentences, eff_words | ||
| return eff_sentences, eff_words | ||
| def infer(self, sentences:List[tuple]=None, use_norm=False) -> ndarray: | ||
| """ Secondary routine to train an embedding. This method is essential for small batches of sentences, | ||
| def infer(self, sentences: List[tuple] = None, use_norm=False) -> ndarray: | ||
| """Secondary routine to train an embedding. This method is essential for small batches of sentences, | ||
| which require little computation. Note: This method does not apply post-training transformations, | ||
@@ -715,3 +865,3 @@ only post inference calls (such as removing principal components). | ||
| mem = self._get_thread_working_mem() | ||
| job_batch, batch_size = [], 0 | ||
@@ -735,5 +885,11 @@ for data_idx, data in enumerate(sentences): | ||
| def _train_manager(self, data_iterable:List[tuple], total_sentences:int=None, queue_factor:int=2, report_delay:int=5): | ||
| """ Manager for the multi-core implementation. Directly adapted from gensim | ||
| def _train_manager( | ||
| self, | ||
| data_iterable: List[tuple], | ||
| total_sentences: int = None, | ||
| queue_factor: int = 2, | ||
| report_delay: int = 5, | ||
| ): | ||
| """Manager for the multi-core implementation. Directly adapted from gensim | ||
| Parameters | ||
@@ -756,5 +912,3 @@ ---------- | ||
| workers = [ | ||
| threading.Thread( | ||
| target=self._worker_loop, | ||
| args=(job_queue, progress_queue)) | ||
| threading.Thread(target=self._worker_loop, args=(job_queue, progress_queue)) | ||
| for _ in range(self.workers) | ||
@@ -764,5 +918,3 @@ ] | ||
| workers.append( | ||
| threading.Thread( | ||
| target=self._job_producer, | ||
| args=(data_iterable, job_queue)) | ||
| threading.Thread(target=self._job_producer, args=(data_iterable, job_queue)) | ||
| ) | ||
@@ -775,4 +927,3 @@ | ||
| jobs, eff_sentences, eff_words = self._log_train_progress( | ||
| progress_queue, total_sentences=total_sentences, | ||
| report_delay=report_delay | ||
| progress_queue, total_sentences=total_sentences, report_delay=report_delay | ||
| ) | ||
@@ -782,3 +933,3 @@ return jobs, eff_sentences, eff_words | ||
| def _worker_loop(self, job_queue, progress_queue): | ||
| """ Train the model, lifting batches of data from the queue. | ||
| """Train the model, lifting batches of data from the queue. | ||
@@ -807,11 +958,13 @@ This function will be called in parallel by multiple workers (threads or processes) to make | ||
| # no more jobs => quit this worker | ||
| break | ||
| eff_sentences, eff_words = self._do_train_job(data_iterable=job, target=self.sv.vectors, memory=mem) | ||
| break | ||
| eff_sentences, eff_words = self._do_train_job( | ||
| data_iterable=job, target=self.sv.vectors, memory=mem | ||
| ) | ||
| progress_queue.put((len(job), eff_sentences, eff_words)) | ||
| jobs_processed += 1 | ||
| logger.debug(f"worker exiting, processed {jobs_processed} jobs") | ||
| def _job_producer(self, data_iterable:List[tuple], job_queue:Queue): | ||
| """ Fill the jobs queue using the data found in the input stream. | ||
| def _job_producer(self, data_iterable: List[tuple], job_queue: Queue): | ||
| """Fill the jobs queue using the data found in the input stream. | ||
| Each job is represented as a batch of tuple | ||
@@ -841,3 +994,3 @@ | ||
| job_batch, batch_size = [data], data_length | ||
| if job_batch: | ||
@@ -850,6 +1003,8 @@ job_no += 1 | ||
| logger.debug(f"job loop exiting, total {job_no} jobs") | ||
| def _log_train_progress(self, progress_queue:Queue, total_sentences:int=None, report_delay:int=5): | ||
| """ Log the training process after a couple of seconds. | ||
| def _log_train_progress( | ||
| self, progress_queue: Queue, total_sentences: int = None, report_delay: int = 5 | ||
| ): | ||
| """Log the training process after a couple of seconds. | ||
| Parameters | ||
@@ -881,3 +1036,5 @@ ---------- | ||
| unfinished_worker_count -= 1 | ||
| logger.info(f"worker thread finished; awaiting finish of {unfinished_worker_count} more threads") | ||
| logger.info( | ||
| f"worker thread finished; awaiting finish of {unfinished_worker_count} more threads" | ||
| ) | ||
| continue | ||
@@ -892,15 +1049,21 @@ | ||
| logger.info("PROGRESS : finished {:3.2f}% with {} sentences and {} words, {} sentences/s".format( | ||
| 100 * (eff_sentences/total_sentences), | ||
| eff_sentences, eff_words, | ||
| int((eff_sentences-sentence_inc) / report_delay) | ||
| )) | ||
| logger.info( | ||
| "PROGRESS : finished {:3.2f}% with {} sentences and {} words, {} sentences/s".format( | ||
| 100 * (eff_sentences / total_sentences), | ||
| eff_sentences, | ||
| eff_words, | ||
| int((eff_sentences - sentence_inc) / report_delay), | ||
| ) | ||
| ) | ||
| sentence_inc = eff_sentences | ||
| return jobs, eff_sentences, eff_words | ||
| class BaseSentence2VecPreparer(SaveLoad): | ||
| """ Contains helper functions to perpare the weights for the training of BaseSentence2VecModel """ | ||
| def prepare_vectors(self, sv:SentenceVectors, total_sentences:int, update:bool=False): | ||
| def prepare_vectors( | ||
| self, sv: SentenceVectors, total_sentences: int, update: bool = False | ||
| ): | ||
| """Build tables and model weights based on final vocabulary settings.""" | ||
@@ -912,3 +1075,3 @@ if not update: | ||
| def reset_vectors(self, sv:SentenceVectors, total_sentences:int): | ||
| def reset_vectors(self, sv: SentenceVectors, total_sentences: int): | ||
| """Initialize all sentence vectors to zero and overwrite existing files""" | ||
@@ -918,7 +1081,10 @@ logger.info(f"initializing sentence vectors for {total_sentences} sentences") | ||
| sv.vectors = np_memmap( | ||
| str(sv.mapfile_path) + '.vectors', dtype=REAL, | ||
| mode='w+', shape=(total_sentences, sv.vector_size)) | ||
| str(sv.mapfile_path) + ".vectors", | ||
| dtype=REAL, | ||
| mode="w+", | ||
| shape=(total_sentences, sv.vector_size), | ||
| ) | ||
| else: | ||
| sv.vectors = empty((total_sentences, sv.vector_size), dtype=REAL) | ||
| for i in range(total_sentences): | ||
@@ -928,3 +1094,3 @@ sv.vectors[i] = full(shape=sv.vector_size, fill_value=EPS, dtype=REAL) | ||
| def update_vectors(self, sv:SentenceVectors, total_sentences:int): | ||
| def update_vectors(self, sv: SentenceVectors, total_sentences: int): | ||
| """Given existing sentence vectors, append new ones""" | ||
@@ -937,4 +1103,7 @@ logger.info(f"appending sentence vectors for {total_sentences} sentences") | ||
| sv.vectors = np_memmap( | ||
| str(sv.mapfile_path) + '.vectors', dtype=REAL, | ||
| mode='r+', shape=(sentences_after, sv.vector_size)) | ||
| str(sv.mapfile_path) + ".vectors", | ||
| dtype=REAL, | ||
| mode="r+", | ||
| shape=(sentences_after, sv.vector_size), | ||
| ) | ||
| for i in range(sentences_before, sentences_after): | ||
@@ -947,2 +1116,2 @@ sv.vectors[i] = full(shape=sv.vector_size, fill_value=EPS, dtype=REAL) | ||
| sv.vectors = vstack([sv.vectors, newvectors]) | ||
| sv.vectors_norm = None | ||
| sv.vectors_norm = None |
| #!/usr/bin/env python | ||
| # -*- coding: utf-8 -*- | ||
| # Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| # Copyright (C) 2019 Oliver Borchers | ||
| # Author: Oliver Borchers | ||
| # Copyright (C) Oliver Borchers Oliver Borchers | ||
@@ -12,7 +12,18 @@ | ||
| from fse.models.utils import set_madvise_for_mmap | ||
| from gensim.models.keyedvectors import BaseKeyedVectors | ||
| from numpy import dot, float32 as REAL, memmap as np_memmap, \ | ||
| double, array, zeros, vstack, sqrt, newaxis, integer, \ | ||
| ndarray, sum as np_sum, prod, argmax | ||
| from numpy import ( | ||
| dot, | ||
| float32 as REAL, | ||
| memmap as np_memmap, | ||
| array, | ||
| zeros, | ||
| vstack, | ||
| sqrt, | ||
| newaxis, | ||
| integer, | ||
| ndarray, | ||
| ) | ||
@@ -29,14 +40,17 @@ from gensim import utils, matutils | ||
| class SentenceVectors(utils.SaveLoad): | ||
| def __init__(self, vector_size: int, mapfile_path: str = None): | ||
| def __init__(self, vector_size:int, mapfile_path:str=None): | ||
| self.vector_size = vector_size # Size of vectors | ||
| self.vectors = zeros((0, vector_size), REAL) # Vectors for sentences | ||
| set_madvise_for_mmap() | ||
| self.vector_size = vector_size # Size of vectors | ||
| self.vectors = zeros((0, vector_size), REAL) # Vectors for sentences | ||
| self.vectors_norm = None | ||
| # File for numpy memmap | ||
| self.mapfile_path = Path(mapfile_path) if mapfile_path is not None else None | ||
| self.mapfile_path = Path(mapfile_path) if mapfile_path is not None else None | ||
| self.mapfile_shape = None | ||
| def __getitem__(self, entities:int) -> ndarray: | ||
| def __getitem__(self, entities: int) -> ndarray: | ||
| """Get vector representation of `entities`. | ||
@@ -56,3 +70,9 @@ | ||
| if isinstance(entities, (int, integer,)): | ||
| if isinstance( | ||
| entities, | ||
| ( | ||
| int, | ||
| integer, | ||
| ), | ||
| ): | ||
| return self.get_vector(entities) | ||
@@ -62,4 +82,10 @@ | ||
| def __contains__(self, index:int) -> bool: | ||
| if isinstance(index, (int, integer,)): | ||
| def __contains__(self, index: int) -> bool: | ||
| if isinstance( | ||
| index, | ||
| ( | ||
| int, | ||
| integer, | ||
| ), | ||
| ): | ||
| return index < len(self) | ||
@@ -72,6 +98,8 @@ else: | ||
| def _load_all_vectors_from_disk(self, mapfile_path:Path): | ||
| def _load_all_vectors_from_disk(self, mapfile_path: Path): | ||
| """ Reads all vectors from disk """ | ||
| path = str(mapfile_path.absolute()) | ||
| self.vectors = np_memmap(f"{path}.vectors", dtype=REAL, mode='r+', shape=self.mapfile_shape) | ||
| self.vectors = np_memmap( | ||
| f"{path}.vectors", dtype=REAL, mode="r+", shape=self.mapfile_shape | ||
| ) | ||
@@ -97,3 +125,3 @@ def save(self, *args, **kwargs): | ||
| ignore.append("vectors") | ||
| kwargs['ignore'] = kwargs.get('ignore', ignore) | ||
| kwargs["ignore"] = kwargs.get("ignore", ignore) | ||
| super(SentenceVectors, self).save(*args, **kwargs) | ||
@@ -108,5 +136,6 @@ | ||
| sv._load_all_vectors_from_disk(mapfile_path=path) | ||
| set_madvise_for_mmap() | ||
| return sv | ||
| def get_vector(self, index:int, use_norm:bool=False) -> ndarray: | ||
| def get_vector(self, index: int, use_norm: bool = False) -> ndarray: | ||
| """Get sentence representations in vector space, as a 1D numpy array. | ||
@@ -143,3 +172,3 @@ | ||
| def init_sims(self, replace:bool=False): | ||
| def init_sims(self, replace: bool = False): | ||
| """Precompute L2-normalized vectors. | ||
@@ -152,11 +181,14 @@ | ||
| """ | ||
| if getattr(self, 'vectors_norm', None) is None or replace: | ||
| if getattr(self, "vectors_norm", None) is None or replace: | ||
| logger.info("precomputing L2-norms of sentence vectors") | ||
| if not replace and self.mapfile_path is not None: | ||
| self.vectors_norm = np_memmap( | ||
| self.mapfile_path + '.vectors_norm', dtype=REAL, | ||
| mode='w+', shape=self.vectors.shape) | ||
| self.mapfile_path + ".vectors_norm", | ||
| dtype=REAL, | ||
| mode="w+", | ||
| shape=self.vectors.shape, | ||
| ) | ||
| self.vectors_norm = _l2_norm(self.vectors, replace=replace) | ||
| def similarity(self, d1:int, d2:int) -> float: | ||
| def similarity(self, d1: int, d2: int) -> float: | ||
| """Compute cosine similarity between two sentences from the training set. | ||
@@ -167,5 +199,5 @@ | ||
| d1 : int | ||
| index of sentence | ||
| index of sentence | ||
| d2 : int | ||
| index of sentence | ||
| index of sentence | ||
@@ -180,3 +212,3 @@ Returns | ||
| def distance(self, d1:int, d2:int) -> float: | ||
| def distance(self, d1: int, d2: int) -> float: | ||
| """Compute cosine similarity between two sentences from the training set. | ||
@@ -187,5 +219,5 @@ | ||
| d1 : int | ||
| index of sentence | ||
| index of sentence | ||
| d2 : int | ||
| index of sentence | ||
| index of sentence | ||
@@ -200,5 +232,10 @@ Returns | ||
| def most_similar(self, positive:[int,ndarray]=None, negative:[int,ndarray]=None, | ||
| indexable:[IndexedList,IndexedLineDocument]=None, topn:int=10, | ||
| restrict_size:[int, Tuple[int, int]]=None) -> List[Tuple[int,float]]: | ||
| def most_similar( | ||
| self, | ||
| positive: [int, ndarray] = None, | ||
| negative: [int, ndarray] = None, | ||
| indexable: [IndexedList, IndexedLineDocument] = None, | ||
| topn: int = 10, | ||
| restrict_size: [int, Tuple[int, int]] = None, | ||
| ) -> List[Tuple[int, float]]: | ||
@@ -258,3 +295,3 @@ """Find the top-N most similar sentences. | ||
| negative = [ | ||
| (sent, -1.0) if isinstance(sent, (int, integer, ndarray)) else sent | ||
| (sent, -1.0) if isinstance(sent, (int, integer, ndarray)) else sent | ||
| for sent in negative | ||
@@ -282,3 +319,5 @@ ] | ||
| limited = self.vectors_norm if restrict_size is None else self.vectors_norm[lo:hi] | ||
| limited = ( | ||
| self.vectors_norm if restrict_size is None else self.vectors_norm[lo:hi] | ||
| ) | ||
| dists = dot(limited, mean) | ||
@@ -289,11 +328,25 @@ if not topn: | ||
| best_off = best + lo | ||
| if indexable is not None: | ||
| result = [(indexable[off_idx], off_idx, float(dists[idx])) for off_idx, idx in zip(best_off, best) if off_idx not in all_sents] | ||
| result = [ | ||
| (indexable[off_idx], off_idx, float(dists[idx])) | ||
| for off_idx, idx in zip(best_off, best) | ||
| if off_idx not in all_sents | ||
| ] | ||
| else: | ||
| result = [(off_idx, float(dists[idx])) for off_idx, idx in zip(best_off, best) if off_idx not in all_sents] | ||
| result = [ | ||
| (off_idx, float(dists[idx])) | ||
| for off_idx, idx in zip(best_off, best) | ||
| if off_idx not in all_sents | ||
| ] | ||
| return result[:topn] | ||
| def similar_by_word(self, word:str, wv:BaseKeyedVectors, indexable:[IndexedList,IndexedLineDocument]=None, topn:int=10, | ||
| restrict_size:[int,Tuple[int, int]]=None) -> List[Tuple[int,float]]: | ||
| def similar_by_word( | ||
| self, | ||
| word: str, | ||
| wv: BaseKeyedVectors, | ||
| indexable: [IndexedList, IndexedLineDocument] = None, | ||
| topn: int = 10, | ||
| restrict_size: [int, Tuple[int, int]] = None, | ||
| ) -> List[Tuple[int, float]]: | ||
@@ -329,7 +382,18 @@ """Find the top-N most similar sentences to a given word. | ||
| """ | ||
| return self.most_similar(positive=wv[word], indexable=indexable, topn=topn, restrict_size=restrict_size) | ||
| return self.most_similar( | ||
| positive=wv[word], | ||
| indexable=indexable, | ||
| topn=topn, | ||
| restrict_size=restrict_size, | ||
| ) | ||
| def similar_by_sentence(self, sentence:List[str], model, indexable:[IndexedList,IndexedLineDocument]=None, topn:int=10, | ||
| restrict_size:[int,Tuple[int, int]]=None) -> List[Tuple[int,float]]: | ||
| def similar_by_sentence( | ||
| self, | ||
| sentence: List[str], | ||
| model, | ||
| indexable: [IndexedList, IndexedLineDocument] = None, | ||
| topn: int = 10, | ||
| restrict_size: [int, Tuple[int, int]] = None, | ||
| ) -> List[Tuple[int, float]]: | ||
| """Find the top-N most similar sentences to a given sentence. | ||
@@ -364,8 +428,21 @@ | ||
| """ | ||
| infer_op = getattr(model, "infer", None) | ||
| if not callable(infer_op): | ||
| raise RuntimeError( | ||
| "Model does not have infer method. Make sure to pass a BaseSentence2VecModel" | ||
| ) | ||
| vector = model.infer([(sentence, 0)]) | ||
| return self.most_similar(positive=vector, indexable=indexable, topn=topn, restrict_size=restrict_size) | ||
| def similar_by_vector(self, vector:ndarray, indexable:[IndexedList,IndexedLineDocument]=None, topn:int=10, | ||
| restrict_size:[int,Tuple[int, int]]=None) -> List[Tuple[int,float]]: | ||
| return self.most_similar( | ||
| positive=vector, indexable=indexable, topn=topn, restrict_size=restrict_size | ||
| ) | ||
| def similar_by_vector( | ||
| self, | ||
| vector: ndarray, | ||
| indexable: [IndexedList, IndexedLineDocument] = None, | ||
| topn: int = 10, | ||
| restrict_size: [int, Tuple[int, int]] = None, | ||
| ) -> List[Tuple[int, float]]: | ||
| """Find the top-N most similar sentences to a given vector. | ||
@@ -398,4 +475,7 @@ | ||
| """ | ||
| return self.most_similar(positive=vector, indexable=indexable, topn=topn, restrict_size=restrict_size) | ||
| return self.most_similar( | ||
| positive=vector, indexable=indexable, topn=topn, restrict_size=restrict_size | ||
| ) | ||
| def _l2_norm(m, replace=False): | ||
@@ -423,2 +503,2 @@ """Return an L2-normalized version of a matrix. | ||
| else: | ||
| return (m / dist).astype(REAL) | ||
| return (m / dist).astype(REAL) |
+58
-19
| #!/usr/bin/env python | ||
| # -*- coding: utf-8 -*- | ||
| # Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| # Copyright (C) 2019 Oliver Borchers | ||
| # Author: Oliver Borchers | ||
| # Copyright (C) Oliver Borchers Oliver Borchers | ||
@@ -12,3 +12,3 @@ from fse.models.average import Average | ||
| from numpy import ndarray, float32 as REAL, ones, zeros | ||
| from numpy import ndarray, float32 as REAL, zeros, isfinite | ||
@@ -19,6 +19,16 @@ import logging | ||
| class SIF(Average): | ||
| def __init__(self, model:BaseKeyedVectors, alpha:float=1e-3, components:int=1, sv_mapfile_path:str=None, wv_mapfile_path:str=None, workers:int=1, lang_freq:str=None): | ||
| """ Smooth-inverse frequency (SIF) weighted sentence embeddings model. Performs a weighted averaging operation over all | ||
| def __init__( | ||
| self, | ||
| model: BaseKeyedVectors, | ||
| alpha: float = 1e-3, | ||
| components: int = 1, | ||
| cache_size_gb: float = 1.0, | ||
| sv_mapfile_path: str = None, | ||
| wv_mapfile_path: str = None, | ||
| workers: int = 1, | ||
| lang_freq: str = None, | ||
| ): | ||
| """Smooth-inverse frequency (SIF) weighted sentence embeddings model. Performs a weighted averaging operation over all | ||
| words in a sentences. After training, the model removes a number of singular vectors. | ||
@@ -38,2 +48,4 @@ | ||
| Corresponds to the number of singular vectors to remove from the sentence embeddings. | ||
| cache_size_gb : float, optional | ||
| Cache size for computing the singular vectors in GB. | ||
| sv_mapfile_path : str, optional | ||
@@ -53,3 +65,3 @@ Optional path to store the sentence-vectors in for very large datasets. Used for memmap. | ||
| See https://github.com/LuminosoInsight/wordfreq | ||
| """ | ||
@@ -59,15 +71,25 @@ | ||
| self.components = int(components) | ||
| self.cache_size_gb = float(cache_size_gb) | ||
| self.svd_res = None | ||
| if lang_freq is None: | ||
| logger.info( | ||
| "make sure you are using a model with valid word-frequency information. Otherwise use lang_freq argument." | ||
| ) | ||
| super(SIF, self).__init__( | ||
| model=model, sv_mapfile_path=sv_mapfile_path, wv_mapfile_path=wv_mapfile_path, | ||
| workers=workers, lang_freq=lang_freq) | ||
| model=model, | ||
| sv_mapfile_path=sv_mapfile_path, | ||
| wv_mapfile_path=wv_mapfile_path, | ||
| workers=workers, | ||
| lang_freq=lang_freq, | ||
| ) | ||
| def _check_parameter_sanity(self): | ||
| """ Check the sanity of all paramters """ | ||
| if not all(self.word_weights <= 1.) or not all(self.word_weights >= 0.): | ||
| if not all(self.word_weights <= 1.0) or not all(self.word_weights >= 0.0): | ||
| raise ValueError("For SIF, all word weights must be 0 <= w_weight <= 1") | ||
| if self.alpha <= 0.: | ||
| if self.alpha <= 0.0: | ||
| raise ValueError("Alpha must be greater than zero.") | ||
| if self.components < 0.: | ||
| if self.components < 0.0: | ||
| raise ValueError("Components must be greater or equal zero") | ||
@@ -78,8 +100,14 @@ | ||
| self._compute_sif_weights() | ||
| def _post_train_calls(self): | ||
| """ Function calls to perform after training, such as computing eigenvectors """ | ||
| if self.components > 0: | ||
| self.svd_res = compute_principal_components(self.sv.vectors, components=self.components) | ||
| remove_principal_components(self.sv.vectors, svd_res=self.svd_res, inplace=True) | ||
| self.svd_res = compute_principal_components( | ||
| self.sv.vectors, | ||
| components=self.components, | ||
| cache_size_gb=self.cache_size_gb, | ||
| ) | ||
| remove_principal_components( | ||
| self.sv.vectors, svd_res=self.svd_res, inplace=True | ||
| ) | ||
| else: | ||
@@ -89,6 +117,8 @@ self.svd_res = 0 | ||
| def _post_inference_calls(self, output:ndarray): | ||
| def _post_inference_calls(self, output: ndarray): | ||
| """ Function calls to perform after training & inference """ | ||
| if self.svd_res is None: | ||
| raise RuntimeError("You must first train the model to obtain SVD components") | ||
| raise RuntimeError( | ||
| "You must first train the model to obtain SVD components" | ||
| ) | ||
| elif self.components > 0: | ||
@@ -107,3 +137,5 @@ remove_principal_components(output, svd_res=self.svd_res, inplace=True) | ||
| if self.svd_res[1].dtype != REAL: | ||
| raise TypeError(f"type of svd components is wrong: {self.svd_res[1].dtype}") | ||
| raise TypeError( | ||
| f"type of svd components is wrong: {self.svd_res[1].dtype}" | ||
| ) | ||
@@ -123,2 +155,9 @@ def _compute_sif_weights(self): | ||
| self.word_weights = (self.alpha / (self.alpha + pw)).astype(REAL) | ||
| self.word_weights = (self.alpha / (self.alpha + pw)).astype(REAL) | ||
| if not all(isfinite(self.word_weights)) or any(self.word_weights < 0): | ||
| raise RuntimeError( | ||
| "Encountered nan values. " | ||
| "This likely happens because the word frequency information is wrong/missing. " | ||
| "Consider restarting using lang_freq argument to infer frequency. " | ||
| ) |
+73
-25
| #!/usr/bin/env python | ||
| # -*- coding: utf-8 -*- | ||
| # Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| # Copyright (C) 2019 Oliver Borchers | ||
| # Author: Oliver Borchers | ||
| # Copyright (C) Oliver Borchers Oliver Borchers | ||
@@ -12,3 +12,3 @@ from fse.models.average import Average | ||
| from numpy import ndarray, float32 as REAL, zeros | ||
| from numpy import ndarray, float32 as REAL, zeros, isfinite | ||
@@ -19,6 +19,16 @@ import logging | ||
| class uSIF(Average): | ||
| def __init__(self, model:BaseKeyedVectors, length:int=None, components:int=5, sv_mapfile_path:str=None, wv_mapfile_path:str=None, workers:int=1, lang_freq:str=None): | ||
| """ Unsupervised smooth-inverse frequency (uSIF) weighted sentence embeddings model. Performs a weighted averaging operation over all | ||
| def __init__( | ||
| self, | ||
| model: BaseKeyedVectors, | ||
| length: int = None, | ||
| components: int = 5, | ||
| cache_size_gb: float = 1.0, | ||
| sv_mapfile_path: str = None, | ||
| wv_mapfile_path: str = None, | ||
| workers: int = 1, | ||
| lang_freq: str = None, | ||
| ): | ||
| """Unsupervised smooth-inverse frequency (uSIF) weighted sentence embeddings model. Performs a weighted averaging operation over all | ||
| words in a sentences. After training, the model removes a number of weighted singular vectors. | ||
@@ -42,2 +52,4 @@ | ||
| Is equivalent to m in the paper. | ||
| cache_size_gb : float, optional | ||
| Cache size for computing the singular vectors in GB. | ||
| sv_mapfile_path : str, optional | ||
@@ -57,3 +69,3 @@ Optional path to store the sentence-vectors in for very large datasets. Used for memmap. | ||
| See https://github.com/LuminosoInsight/wordfreq | ||
| """ | ||
@@ -63,14 +75,24 @@ | ||
| self.components = int(components) | ||
| self.cache_size_gb = float(cache_size_gb) | ||
| self.svd_res = None | ||
| self.svd_weights = None | ||
| super(Average, self).__init__( | ||
| model=model, sv_mapfile_path=sv_mapfile_path, wv_mapfile_path=wv_mapfile_path, | ||
| workers=workers, lang_freq=lang_freq) | ||
| if lang_freq is None: | ||
| logger.info( | ||
| "make sure you are using a model with valid word-frequency information. Otherwise use lang_freq argument." | ||
| ) | ||
| super(uSIF, self).__init__( | ||
| model=model, | ||
| sv_mapfile_path=sv_mapfile_path, | ||
| wv_mapfile_path=wv_mapfile_path, | ||
| workers=workers, | ||
| lang_freq=lang_freq, | ||
| ) | ||
| def _check_parameter_sanity(self): | ||
| """ Check the sanity of all paramters """ | ||
| if self.length <= 0.: | ||
| if self.length <= 0.0: | ||
| raise ValueError("Length must be greater than zero.") | ||
| if self.components < 0.: | ||
| if self.components < 0.0: | ||
| raise ValueError("Components must be greater or equal zero") | ||
@@ -86,5 +108,16 @@ | ||
| if self.components > 0: | ||
| self.svd_res = compute_principal_components(self.sv.vectors, components=self.components) | ||
| self.svd_weights = (self.svd_res[0] ** 2) / (self.svd_res[0] ** 2).sum().astype(REAL) | ||
| remove_principal_components(self.sv.vectors, svd_res=self.svd_res, weights=self.svd_weights, inplace=True) | ||
| self.svd_res = compute_principal_components( | ||
| self.sv.vectors, | ||
| components=self.components, | ||
| cache_size_gb=self.cache_size_gb, | ||
| ) | ||
| self.svd_weights = (self.svd_res[0] ** 2) / ( | ||
| self.svd_res[0] ** 2 | ||
| ).sum().astype(REAL) | ||
| remove_principal_components( | ||
| self.sv.vectors, | ||
| svd_res=self.svd_res, | ||
| weights=self.svd_weights, | ||
| inplace=True, | ||
| ) | ||
| else: | ||
@@ -94,11 +127,15 @@ self.svd_res = 0 | ||
| def _post_inference_calls(self, output:ndarray): | ||
| def _post_inference_calls(self, output: ndarray): | ||
| """ Function calls to perform after training & inference """ | ||
| if self.svd_res is None: | ||
| raise RuntimeError("You must first train the model to obtain SVD components") | ||
| raise RuntimeError( | ||
| "You must first train the model to obtain SVD components" | ||
| ) | ||
| elif self.components > 0: | ||
| remove_principal_components(output, svd_res=self.svd_res, weights=self.svd_weights, inplace=True) | ||
| remove_principal_components( | ||
| output, svd_res=self.svd_res, weights=self.svd_weights, inplace=True | ||
| ) | ||
| else: | ||
| logger.info(f"no removal of principal components") | ||
| def _check_dtype_santiy(self): | ||
@@ -112,5 +149,9 @@ """ Check the dtypes of all attributes """ | ||
| if self.svd_res[1].dtype != REAL: | ||
| raise TypeError(f"type of svd components is wrong: {self.svd_res[1].dtype}") | ||
| raise TypeError( | ||
| f"type of svd components is wrong: {self.svd_res[1].dtype}" | ||
| ) | ||
| if self.svd_weights.dtype != REAL: | ||
| raise TypeError(f"type of svd weights is wrong: {self.svd_weights.dtype}") | ||
| raise TypeError( | ||
| f"type of svd weights is wrong: {self.svd_weights.dtype}" | ||
| ) | ||
@@ -130,7 +171,14 @@ def _compute_usif_weights(self): | ||
| threshold = 1 - (1-(1/v)) ** self.length | ||
| threshold = 1 - (1 - (1 / v)) ** self.length | ||
| alpha = sum(pw > threshold) / v | ||
| z = v/2 | ||
| a = (1 - alpha)/(alpha * z) | ||
| z = v / 2 | ||
| a = (1 - alpha) / (alpha * z) | ||
| self.word_weights = (a / ((a/2) + pw)).astype(REAL) | ||
| self.word_weights = (a / ((a / 2) + pw)).astype(REAL) | ||
| if not all(isfinite(self.word_weights)): | ||
| raise RuntimeError( | ||
| "Encountered nan values. " | ||
| "This likely happens because the word frequency information is wrong/missing. " | ||
| "Consider restarting using lang_freq argument to infer frequency. " | ||
| ) |
+81
-17
| #!/usr/bin/env python | ||
| # -*- coding: utf-8 -*- | ||
| # Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| # Copyright (C) 2019 Oliver Borchers | ||
| # Author: Oliver Borchers | ||
| # Copyright (C) Oliver Borchers Oliver Borchers | ||
| from sklearn.decomposition import TruncatedSVD | ||
| from numpy import ndarray, float32 as REAL, ones | ||
| from numpy import ndarray, float32 as REAL, ones, vstack, dtype | ||
| from numpy.random import choice | ||
@@ -15,9 +16,46 @@ from time import time | ||
| from sys import platform | ||
| import ctypes | ||
| logger = logging.getLogger(__name__) | ||
| def compute_principal_components(vectors:ndarray, components:int=1) -> ndarray: | ||
| """ Method used to compute the first singular vectors of a given matrix | ||
| def set_madvise_for_mmap(return_madvise: bool = False) -> object: | ||
| """Method used to set madvise parameters. | ||
| This problem adresses the memmap issue raised in https://github.com/numpy/numpy/issues/13172 | ||
| The issue is not applicable for windows | ||
| Parameters | ||
| ---------- | ||
| return_madvise : bool | ||
| Returns the madvise object for unittests, se test_utils.py | ||
| Returns | ||
| ------- | ||
| object | ||
| madvise object | ||
| """ | ||
| if platform in ["linux", "linux2", "darwin", "aix"]: | ||
| if platform == "darwin": | ||
| # Path different for Macos | ||
| madvise = ctypes.CDLL("libc.dylib").madvise | ||
| if platform in ["linux", "linux2", "aix"]: | ||
| madvise = ctypes.CDLL("libc.so.6").madvise | ||
| madvise.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int] | ||
| madvise.restype = ctypes.c_int | ||
| if return_madvise: | ||
| return madvise | ||
| def compute_principal_components( | ||
| vectors: ndarray, components: int = 1, cache_size_gb: float = 1.0 | ||
| ) -> [ndarray, ndarray]: | ||
| """Method used to compute the first singular vectors of a given (sub)matrix | ||
| Parameters | ||
| ---------- | ||
| vectors : ndarray | ||
@@ -27,2 +65,4 @@ (Sentence) vectors to compute the truncated SVD on | ||
| Number of singular values/vectors to compute | ||
| cache_size_gb : float, optional | ||
| Cache size for computing the principal components in GB | ||
@@ -35,11 +75,33 @@ Returns | ||
| start = time() | ||
| svd = TruncatedSVD(n_components=components, n_iter=7, random_state=42, algorithm="randomized") | ||
| svd.fit(vectors) | ||
| num_vectors = vectors.shape[0] | ||
| svd = TruncatedSVD( | ||
| n_components=components, n_iter=7, random_state=42, algorithm="randomized" | ||
| ) | ||
| sample_size = int( | ||
| 1024 ** 3 * cache_size_gb / (vectors.shape[1] * dtype(REAL).itemsize) | ||
| ) | ||
| if sample_size > num_vectors: | ||
| svd.fit(vectors) | ||
| else: | ||
| logger.info(f"sampling {sample_size} vectors to compute principal components") | ||
| sample_indices = choice(range(num_vectors), replace=False, size=int(1e6)) | ||
| svd.fit(vectors[sample_indices, :]) | ||
| elapsed = time() | ||
| logger.info(f"computing {components} principal components took {int(elapsed-start)}s") | ||
| logger.info( | ||
| f"computing {components} principal components took {int(elapsed-start)}s" | ||
| ) | ||
| return svd.singular_values_.astype(REAL), svd.components_.astype(REAL) | ||
| def remove_principal_components(vectors:ndarray, svd_res:[ndarray, ndarray], weights:ndarray=None, inplace:bool=True) -> ndarray: | ||
| """ Method used to remove the first singular vectors of a given matrix | ||
| def remove_principal_components( | ||
| vectors: ndarray, | ||
| svd_res: [ndarray, ndarray], | ||
| weights: ndarray = None, | ||
| inplace: bool = True, | ||
| ) -> ndarray: | ||
| """Method used to remove the first singular vectors of a given matrix | ||
| Parameters | ||
@@ -54,3 +116,3 @@ ---------- | ||
| inplace : bool, optional | ||
| If true, removes the componentens from the vectors inplace (memory efficient) | ||
| If true, removes the components from the vectors inplace (memory efficient) | ||
@@ -62,3 +124,2 @@ Returns | ||
| """ | ||
| singular_values = svd_res[0].astype(REAL) | ||
| components = svd_res[1].astype(REAL) | ||
@@ -73,5 +134,5 @@ | ||
| output = None | ||
| if len(components)==1: | ||
| if len(components) == 1: | ||
| if not inplace: | ||
| output = vectors.dot(w_comp.transpose()) * w_comp | ||
| output = vectors - vectors.dot(w_comp.transpose()) * w_comp | ||
| else: | ||
@@ -81,8 +142,11 @@ vectors -= vectors.dot(w_comp.transpose()) * w_comp | ||
| if not inplace: | ||
| output = vectors.dot(w_comp.transpose()).dot(w_comp) | ||
| output = vectors - vectors.dot(w_comp.transpose()).dot(w_comp) | ||
| else: | ||
| vectors -= vectors.dot(w_comp.transpose()).dot(w_comp) | ||
| elapsed = time() | ||
| logger.info(f"removing {len(components)} principal components took {int(elapsed-start)}s") | ||
| logger.info( | ||
| f"removing {len(components)} principal components took {int(elapsed-start)}s" | ||
| ) | ||
| if not inplace: | ||
| return output | ||
| return output |
+97
-38
| #!/usr/bin/env python | ||
| # -*- coding: utf-8 -*- | ||
| # Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| # Copyright (C) 2019 Oliver Borchers | ||
| # Author: Oliver Borchers | ||
| # Copyright (C) Oliver Borchers Oliver Borchers | ||
@@ -30,16 +30,31 @@ """ | ||
| W2V.build_vocab(SENTENCES) | ||
| W2V.wv.vectors[:,] = np.arange(len(W2V.wv.vectors), dtype=np.float32)[:, None] | ||
| W2V.wv.vectors[:,] = np.arange( | ||
| len(W2V.wv.vectors), dtype=np.float32 | ||
| )[:, None] | ||
| class TestAverageFunctions(unittest.TestCase): | ||
| def setUp(self): | ||
| self.sentences = [["They", "admit"], ["So", "Apple", "bought", "buds"], ["go", "12345"], ["pull", "12345678910111213"]] | ||
| self.sentences = [(s, i) for i,s in enumerate(self.sentences)] | ||
| self.sentences = [ | ||
| ["They", "admit"], | ||
| ["So", "Apple", "bought", "buds"], | ||
| ["go", "12345"], | ||
| ["pull", "12345678910111213"], | ||
| ] | ||
| self.sentences = [(s, i) for i, s in enumerate(self.sentences)] | ||
| self.model = Average(W2V) | ||
| self.model.prep.prepare_vectors(sv=self.model.sv, total_sentences=len(self.sentences), update=False) | ||
| self.model.prep.prepare_vectors( | ||
| sv=self.model.sv, total_sentences=len(self.sentences), update=False | ||
| ) | ||
| self.model._pre_train_calls() | ||
| def test_cython(self): | ||
| from fse.models.average_inner import FAST_VERSION, MAX_WORDS_IN_BATCH, MAX_NGRAMS_IN_BATCH | ||
| from fse.models.average_inner import ( | ||
| FAST_VERSION, | ||
| MAX_WORDS_IN_BATCH, | ||
| MAX_NGRAMS_IN_BATCH, | ||
| ) | ||
| self.assertTrue(FAST_VERSION) | ||
| self.assertEqual(10000,MAX_WORDS_IN_BATCH) | ||
| self.assertEqual(10000, MAX_WORDS_IN_BATCH) | ||
| self.assertEqual(40, MAX_NGRAMS_IN_BATCH) | ||
@@ -50,3 +65,5 @@ | ||
| mem = self.model._get_thread_working_mem() | ||
| output = train_average_np(self.model, self.sentences, self.model.sv.vectors, mem) | ||
| output = train_average_np( | ||
| self.model, self.sentences, self.model.sv.vectors, mem | ||
| ) | ||
| self.assertEqual((4, 7), output) | ||
@@ -56,3 +73,3 @@ self.assertTrue((183 == self.model.sv[0]).all()) | ||
| self.assertTrue((self.model.wv.vocab["go"].index == self.model.sv[2]).all()) | ||
| def test_average_train_cy_w2v(self): | ||
@@ -63,3 +80,6 @@ self.model.sv.vectors = np.zeros_like(self.model.sv.vectors, dtype=np.float32) | ||
| from fse.models.average_inner import train_average_cy | ||
| output = train_average_cy(self.model, self.sentences, self.model.sv.vectors, mem) | ||
| output = train_average_cy( | ||
| self.model, self.sentences, self.model.sv.vectors, mem | ||
| ) | ||
| self.assertEqual((4, 7), output) | ||
@@ -74,3 +94,5 @@ self.assertTrue((183 == self.model.sv[0]).all()) | ||
| m = Average(ft) | ||
| m.prep.prepare_vectors(sv=m.sv, total_sentences=len(self.sentences), update=False) | ||
| m.prep.prepare_vectors( | ||
| sv=m.sv, total_sentences=len(self.sentences), update=False | ||
| ) | ||
| m._pre_train_calls() | ||
@@ -82,3 +104,3 @@ m.wv.vectors = m.wv.vectors_vocab = np.ones_like(m.wv.vectors, dtype=np.float32) | ||
| self.assertEqual((4, 10), output) | ||
| self.assertTrue((1. == m.sv[0]).all()) | ||
| self.assertTrue((1.0 == m.sv[0]).all()) | ||
| self.assertTrue((1.5 == m.sv[2]).all()) | ||
@@ -94,3 +116,5 @@ self.assertTrue((2 == m.sv[3]).all()) | ||
| m = Average(ft) | ||
| m.prep.prepare_vectors(sv=m.sv, total_sentences=len(self.sentences), update=False) | ||
| m.prep.prepare_vectors( | ||
| sv=m.sv, total_sentences=len(self.sentences), update=False | ||
| ) | ||
| m._pre_train_calls() | ||
@@ -102,5 +126,6 @@ m.wv.vectors = m.wv.vectors_vocab = np.ones_like(m.wv.vectors, dtype=np.float32) | ||
| from fse.models.average_inner import train_average_cy | ||
| output = train_average_cy(m, self.sentences, m.sv.vectors, mem) | ||
| self.assertEqual((4, 10), output) | ||
| self.assertTrue((1.+EPS == m.sv[0]).all()) | ||
| self.assertTrue((1.0 + EPS == m.sv[0]).all()) | ||
| self.assertTrue(np.allclose(1.5, m.sv[2])) | ||
@@ -111,3 +136,5 @@ self.assertTrue(np.allclose(2, m.sv[3])) | ||
| m1 = Average(W2V) | ||
| m1.prep.prepare_vectors(sv=m1.sv, total_sentences=len(self.sentences), update=False) | ||
| m1.prep.prepare_vectors( | ||
| sv=m1.sv, total_sentences=len(self.sentences), update=False | ||
| ) | ||
| m1._pre_train_calls() | ||
@@ -118,3 +145,5 @@ mem1 = m1._get_thread_working_mem() | ||
| m2 = Average(W2V) | ||
| m2.prep.prepare_vectors(sv=m2.sv, total_sentences=len(self.sentences), update=False) | ||
| m2.prep.prepare_vectors( | ||
| sv=m2.sv, total_sentences=len(self.sentences), update=False | ||
| ) | ||
| m2._pre_train_calls() | ||
@@ -124,2 +153,3 @@ mem2 = m2._get_thread_working_mem() | ||
| from fse.models.average_inner import train_average_cy | ||
| o2 = train_average_cy(m2, self.sentences, m2.sv.vectors, mem2) | ||
@@ -136,3 +166,5 @@ | ||
| m1 = Average(w2v) | ||
| m1.prep.prepare_vectors(sv=m1.sv, total_sentences=len(self.sentences), update=False) | ||
| m1.prep.prepare_vectors( | ||
| sv=m1.sv, total_sentences=len(self.sentences), update=False | ||
| ) | ||
| m1._pre_train_calls() | ||
@@ -143,3 +175,5 @@ mem1 = m1._get_thread_working_mem() | ||
| m2 = Average(w2v) | ||
| m2.prep.prepare_vectors(sv=m2.sv, total_sentences=len(self.sentences), update=False) | ||
| m2.prep.prepare_vectors( | ||
| sv=m2.sv, total_sentences=len(self.sentences), update=False | ||
| ) | ||
| m2._pre_train_calls() | ||
@@ -149,2 +183,3 @@ mem2 = m2._get_thread_working_mem() | ||
| from fse.models.average_inner import train_average_cy | ||
| o2 = train_average_cy(m2, self.sentences, m2.sv.vectors, mem2) | ||
@@ -159,6 +194,9 @@ | ||
| m1 = Average(ft) | ||
| m1.prep.prepare_vectors(sv=m1.sv, total_sentences=len(self.sentences), update=False) | ||
| m1.prep.prepare_vectors( | ||
| sv=m1.sv, total_sentences=len(self.sentences), update=False | ||
| ) | ||
| m1._pre_train_calls() | ||
| from fse.models.average_inner import MAX_NGRAMS_IN_BATCH | ||
| m1.batch_ngrams = MAX_NGRAMS_IN_BATCH | ||
@@ -169,3 +207,5 @@ mem1 = m1._get_thread_working_mem() | ||
| m2 = Average(ft) | ||
| m2.prep.prepare_vectors(sv=m2.sv, total_sentences=len(self.sentences), update=False) | ||
| m2.prep.prepare_vectors( | ||
| sv=m2.sv, total_sentences=len(self.sentences), update=False | ||
| ) | ||
| m2._pre_train_calls() | ||
@@ -175,2 +215,3 @@ mem2 = m2._get_thread_working_mem() | ||
| from fse.models.average_inner import train_average_cy | ||
| o2 = train_average_cy(m2, self.sentences[:2], m2.sv.vectors, mem2) | ||
@@ -182,13 +223,21 @@ | ||
| def test_do_train_job(self): | ||
| self.model.prep.prepare_vectors(sv=self.model.sv, total_sentences=len(SENTENCES), update=True) | ||
| self.model.prep.prepare_vectors( | ||
| sv=self.model.sv, total_sentences=len(SENTENCES), update=True | ||
| ) | ||
| mem = self.model._get_thread_working_mem() | ||
| self.assertEqual((100,1450), self.model._do_train_job( | ||
| [(s, i) for i,s in enumerate(SENTENCES)], | ||
| target=self.model.sv.vectors, memory=mem) | ||
| self.assertEqual( | ||
| (100, 1450), | ||
| self.model._do_train_job( | ||
| [(s, i) for i, s in enumerate(SENTENCES)], | ||
| target=self.model.sv.vectors, | ||
| memory=mem, | ||
| ), | ||
| ) | ||
| self.assertEqual((104,DIM), self.model.sv.vectors.shape) | ||
| self.assertEqual((104, DIM), self.model.sv.vectors.shape) | ||
| def test_train(self): | ||
| self.assertEqual((100,1450), self.model.train([(s, i) for i,s in enumerate(SENTENCES)])) | ||
| self.assertEqual( | ||
| (100, 1450), self.model.train([(s, i) for i, s in enumerate(SENTENCES)]) | ||
| ) | ||
| def test_train_single_from_disk(self): | ||
@@ -200,5 +249,7 @@ p = Path("fse/test/test_data/test_vecs") | ||
| se1 = Average(W2V) | ||
| se2 = Average(W2V, sv_mapfile_path=str(p.absolute()) ,wv_mapfile_path=str(p.absolute())) | ||
| se1.train([(s, i) for i,s in enumerate(SENTENCES)]) | ||
| se2.train([(s, i) for i,s in enumerate(SENTENCES)]) | ||
| se2 = Average( | ||
| W2V, sv_mapfile_path=str(p.absolute()), wv_mapfile_path=str(p.absolute()) | ||
| ) | ||
| se1.train([(s, i) for i, s in enumerate(SENTENCES)]) | ||
| se2.train([(s, i) for i, s in enumerate(SENTENCES)]) | ||
@@ -219,5 +270,10 @@ self.assertTrue(p_target.exists()) | ||
| se1 = Average(W2V, workers=2) | ||
| se2 = Average(W2V, workers=2, sv_mapfile_path=str(p.absolute()) ,wv_mapfile_path=str(p.absolute())) | ||
| se1.train([(s, i) for i,s in enumerate(SENTENCES)]) | ||
| se2.train([(s, i) for i,s in enumerate(SENTENCES)]) | ||
| se2 = Average( | ||
| W2V, | ||
| workers=2, | ||
| sv_mapfile_path=str(p.absolute()), | ||
| wv_mapfile_path=str(p.absolute()), | ||
| ) | ||
| se1.train([(s, i) for i, s in enumerate(SENTENCES)]) | ||
| se2.train([(s, i) for i, s in enumerate(SENTENCES)]) | ||
@@ -234,8 +290,11 @@ self.assertTrue(p_target.exists()) | ||
| se = Average(W2V) | ||
| se.word_weights = np.full(20, 2., dtype=np.float32) | ||
| se.word_weights = np.full(20, 2.0, dtype=np.float32) | ||
| with self.assertRaises(ValueError): | ||
| se._check_parameter_sanity() | ||
| if __name__ == '__main__': | ||
| logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) | ||
| unittest.main() | ||
| if __name__ == "__main__": | ||
| logging.basicConfig( | ||
| format="%(asctime)s : %(levelname)s : %(message)s", level=logging.DEBUG | ||
| ) | ||
| unittest.main() |
+108
-73
| #!/usr/bin/env python | ||
| # -*- coding: utf-8 -*- | ||
| # Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| # Copyright (C) 2019 Oliver Borchers | ||
| # Author: Oliver Borchers | ||
| # Copyright (C) Oliver Borchers Oliver Borchers | ||
@@ -79,6 +79,6 @@ """ | ||
| self.assertTrue(isinstance(se.wv, BaseKeyedVectors)) | ||
| def test_model_w_language(self): | ||
| se = BaseSentence2VecModel(W2V, lang_freq="en") | ||
| freq = int((2**31 - 1) * get_frequency_dict("en", wordlist="best")["help"]) | ||
| freq = int((2 ** 31 - 1) * get_frequency_dict("en", wordlist="best")["help"]) | ||
| self.assertEqual(freq, se.wv.vocab["help"].count) | ||
@@ -153,3 +153,3 @@ self.assertEqual(21, se.wv.vocab["79"].count) | ||
| class BadIterator(): | ||
| class BadIterator: | ||
| def __init__(self): | ||
@@ -161,7 +161,7 @@ pass | ||
| with self.assertRaises(TypeError): | ||
| se._check_input_data_sanity(data_iterable = None) | ||
| se._check_input_data_sanity(data_iterable=None) | ||
| with self.assertRaises(TypeError): | ||
| se._check_input_data_sanity(data_iterable = "Hello there!") | ||
| se._check_input_data_sanity(data_iterable="Hello there!") | ||
| with self.assertRaises(TypeError): | ||
| se._check_input_data_sanity(data_iterable = BadIterator()) | ||
| se._check_input_data_sanity(data_iterable=BadIterator()) | ||
@@ -175,7 +175,9 @@ def test_scan_w_list(self): | ||
| output = str(BaseSentence2VecModel(W2V)) | ||
| self.assertEqual("BaseSentence2VecModel based on Word2VecKeyedVectors, size=0", output) | ||
| self.assertEqual( | ||
| "BaseSentence2VecModel based on Word2VecKeyedVectors, size=0", output | ||
| ) | ||
| def test_scan_w_ituple(self): | ||
| se = BaseSentence2VecModel(W2V) | ||
| id_sent = [(s, i) for i,s in enumerate(SENTENCES)] | ||
| id_sent = [(s, i) for i, s in enumerate(SENTENCES)] | ||
| stats = se.scan_sentences(id_sent, progress_per=0) | ||
@@ -191,3 +193,3 @@ | ||
| se = BaseSentence2VecModel(W2V) | ||
| id_sent = [(s, str(i)) for i,s in enumerate(SENTENCES)] | ||
| id_sent = [(s, str(i)) for i, s in enumerate(SENTENCES)] | ||
| with self.assertRaises(TypeError): | ||
@@ -200,3 +202,8 @@ se.scan_sentences(id_sent) | ||
| SENTENCES[i] = [] | ||
| self.assertEqual(3, se.scan_sentences([(s, i) for i,s in enumerate(SENTENCES)])["empty_sentences"]) | ||
| self.assertEqual( | ||
| 3, | ||
| se.scan_sentences([(s, i) for i, s in enumerate(SENTENCES)])[ | ||
| "empty_sentences" | ||
| ], | ||
| ) | ||
@@ -206,7 +213,7 @@ def test_scan_w_wrong_input(self): | ||
| sentences = ["the dog hit the car", "he was very fast"] | ||
| with self.assertRaises(TypeError): | ||
| se.scan_sentences(sentences) | ||
| with self.assertRaises(TypeError): | ||
| se.scan_sentences([(s, i) for i,s in enumerate(sentences)]) | ||
| se.scan_sentences([(s, i) for i, s in enumerate(sentences)]) | ||
| with self.assertRaises(TypeError): | ||
@@ -216,9 +223,11 @@ se.scan_sentences([list(range(10) for _ in range(2))]) | ||
| with self.assertRaises(RuntimeError): | ||
| se.scan_sentences([(s, i+1) for i,s in enumerate(SENTENCES)]) | ||
| se.scan_sentences([(s, i + 1) for i, s in enumerate(SENTENCES)]) | ||
| with self.assertRaises(ValueError): | ||
| se.scan_sentences([(s, i-1) for i,s in enumerate(SENTENCES)]) | ||
| se.scan_sentences([(s, i - 1) for i, s in enumerate(SENTENCES)]) | ||
| def test_scan_w_many_to_one_input(self): | ||
| se = BaseSentence2VecModel(W2V) | ||
| output = se.scan_sentences([(s, 0) for i,s in enumerate(SENTENCES)])["max_index"] | ||
| output = se.scan_sentences([(s, 0) for i, s in enumerate(SENTENCES)])[ | ||
| "max_index" | ||
| ] | ||
| self.assertEqual(1, output) | ||
@@ -235,3 +244,3 @@ | ||
| with self.assertRaises(NotImplementedError): | ||
| se.train([(s, i) for i,s in enumerate(SENTENCES)]) | ||
| se.train([(s, i) for i, s in enumerate(SENTENCES)]) | ||
@@ -254,5 +263,5 @@ def test_log_end(self): | ||
| with self.assertRaises(NotImplementedError): | ||
| se._check_dtype_santiy() | ||
| se._check_dtype_santiy() | ||
| with self.assertRaises(NotImplementedError): | ||
| se._post_inference_calls() | ||
| se._post_inference_calls() | ||
@@ -265,3 +274,3 @@ def test_check_pre_train_san_no_wv(self): | ||
| with self.assertRaises(RuntimeError): | ||
| se._check_pre_training_sanity(1,1,1) | ||
| se._check_pre_training_sanity(1, 1, 1) | ||
@@ -274,3 +283,3 @@ def test_check_pre_train_san_no_wv_len(self): | ||
| with self.assertRaises(RuntimeError): | ||
| se._check_pre_training_sanity(1,1,1) | ||
| se._check_pre_training_sanity(1, 1, 1) | ||
@@ -283,7 +292,7 @@ def test_check_pre_train_san_no_ngrams_vectors(self): | ||
| with self.assertRaises(RuntimeError): | ||
| se._check_pre_training_sanity(1,1,1) | ||
| se._check_pre_training_sanity(1, 1, 1) | ||
| se.wv.vectors_ngrams = [1] | ||
| se.wv.vectors_vocab = [] | ||
| with self.assertRaises(RuntimeError): | ||
| se._check_pre_training_sanity(1,1,1) | ||
| se._check_pre_training_sanity(1, 1, 1) | ||
@@ -296,3 +305,3 @@ def test_check_pre_train_san_no_sv_vecs(self): | ||
| with self.assertRaises(RuntimeError): | ||
| se._check_pre_training_sanity(1,1,1) | ||
| se._check_pre_training_sanity(1, 1, 1) | ||
@@ -305,3 +314,3 @@ def test_check_pre_train_san_no_word_weights(self): | ||
| with self.assertRaises(RuntimeError): | ||
| se._check_pre_training_sanity(1,1,1) | ||
| se._check_pre_training_sanity(1, 1, 1) | ||
@@ -314,3 +323,3 @@ def test_check_pre_train_san_incos_len(self): | ||
| with self.assertRaises(RuntimeError): | ||
| se._check_pre_training_sanity(1,1,1) | ||
| se._check_pre_training_sanity(1, 1, 1) | ||
@@ -322,10 +331,10 @@ def test_check_pre_train_dtypes(self): | ||
| se.wv.vectors = np.zeros((len(se.wv.vocab),20), dtype=np.float64) | ||
| se.wv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=np.float64) | ||
| with self.assertRaises(TypeError): | ||
| se._check_pre_training_sanity(1,1,1) | ||
| se.wv.vectors = np.zeros((len(se.wv.vocab),20), dtype=np.float32) | ||
| se._check_pre_training_sanity(1, 1, 1) | ||
| se.wv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=np.float32) | ||
| se.wv.vectors_ngrams = np.ones(len(se.wv.vocab), dtype=np.float16) | ||
| with self.assertRaises(TypeError): | ||
| se._check_pre_training_sanity(1,1,1) | ||
| se._check_pre_training_sanity(1, 1, 1) | ||
| se.wv.vectors_ngrams = np.ones(len(se.wv.vocab), dtype=np.float32) | ||
@@ -335,13 +344,13 @@ | ||
| with self.assertRaises(TypeError): | ||
| se._check_pre_training_sanity(1,1,1) | ||
| se._check_pre_training_sanity(1, 1, 1) | ||
| se.wv.vectors_vocab = np.ones(len(se.wv.vocab), dtype=np.float32) | ||
| se.sv.vectors = np.zeros((len(se.wv.vocab),20), dtype=int) | ||
| se.sv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=int) | ||
| with self.assertRaises(TypeError): | ||
| se._check_pre_training_sanity(1,1,1) | ||
| se.sv.vectors = np.zeros((len(se.wv.vocab),20), dtype=np.float32) | ||
| se._check_pre_training_sanity(1, 1, 1) | ||
| se.sv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=np.float32) | ||
| se.word_weights = np.ones(len(se.wv.vocab), dtype=bool) | ||
| with self.assertRaises(TypeError): | ||
| se._check_pre_training_sanity(1,1,1) | ||
| se._check_pre_training_sanity(1, 1, 1) | ||
| se.word_weights = np.ones(len(se.wv.vocab), dtype=np.float32) | ||
@@ -358,12 +367,11 @@ | ||
| # Just throws multiple warnings warning | ||
| se._check_pre_training_sanity(1,1,1) | ||
| se._check_pre_training_sanity(1, 1, 1) | ||
| with self.assertRaises(ValueError): | ||
| se._check_pre_training_sanity(0,1,1) | ||
| se._check_pre_training_sanity(0, 1, 1) | ||
| with self.assertRaises(ValueError): | ||
| se._check_pre_training_sanity(1,0,1) | ||
| se._check_pre_training_sanity(1, 0, 1) | ||
| with self.assertRaises(ValueError): | ||
| se._check_pre_training_sanity(1,1,0) | ||
| se._check_pre_training_sanity(1, 1, 0) | ||
| def test_post_training_sanity(self): | ||
@@ -375,6 +383,6 @@ w2v = Word2Vec() | ||
| with self.assertRaises(ValueError): | ||
| se._check_post_training_sanity(0,1) | ||
| se._check_post_training_sanity(0, 1) | ||
| with self.assertRaises(ValueError): | ||
| se._check_post_training_sanity(1,0) | ||
| se._check_post_training_sanity(1, 0) | ||
| def test_move_ndarray_to_disk_w2v(self): | ||
@@ -384,5 +392,7 @@ se = BaseSentence2VecModel(W2V) | ||
| p_target = Path("fse/test/test_data/test_vecs_wv.vectors") | ||
| se.wv.vectors[0,1] = 10 | ||
| se.wv.vectors[0, 1] = 10 | ||
| vecs = se.wv.vectors.copy() | ||
| output = se._move_ndarray_to_disk(se.wv.vectors, name="wv", mapfile_path=str(p.absolute())) | ||
| output = se._move_ndarray_to_disk( | ||
| se.wv.vectors, name="wv", mapfile_path=str(p.absolute()) | ||
| ) | ||
| self.assertTrue(p_target.exists()) | ||
@@ -429,11 +439,18 @@ self.assertFalse(output.flags.writeable) | ||
| se = BaseSentence2VecModel(W2V, workers=2) | ||
| def temp_train_job(data_iterable, target, memory): | ||
| v1 = v2 = sum(1 for _ in data_iterable) | ||
| return v1*2, v2*3 | ||
| return v1 * 2, v2 * 3 | ||
| se._do_train_job = temp_train_job | ||
| job_output = se._train_manager(data_iterable=[(s, i) for i,s in enumerate(SENTENCES)], total_sentences=len(SENTENCES),report_delay=0.01) | ||
| self.assertEqual((100,200,300), job_output) | ||
| job_output = se._train_manager( | ||
| data_iterable=[(s, i) for i, s in enumerate(SENTENCES)], | ||
| total_sentences=len(SENTENCES), | ||
| report_delay=0.01, | ||
| ) | ||
| self.assertEqual((100, 200, 300), job_output) | ||
| def test_infer_method(self): | ||
| se = BaseSentence2VecModel(W2V) | ||
| def temp_train_job(data_iterable, target, memory): | ||
@@ -444,6 +461,8 @@ for i in data_iterable: | ||
| def pass_method(**kwargs): pass | ||
| def pass_method(**kwargs): | ||
| pass | ||
| se._post_inference_calls = pass_method | ||
| se._do_train_job = temp_train_job | ||
| output = se.infer([(s, i) for i,s in enumerate(SENTENCES)]) | ||
| output = se.infer([(s, i) for i, s in enumerate(SENTENCES)]) | ||
| self.assertTrue((100 == output).all()) | ||
@@ -453,10 +472,15 @@ | ||
| se = BaseSentence2VecModel(W2V) | ||
| from fse.models.average_inner import MAX_WORDS_IN_BATCH | ||
| from fse.models.average_inner import train_average_cy | ||
| def _do_train_job(data_iterable, target, memory): | ||
| eff_sentences, eff_words = train_average_cy(model=se, indexed_sentences=data_iterable, target=target, memory=memory) | ||
| eff_sentences, eff_words = train_average_cy( | ||
| model=se, indexed_sentences=data_iterable, target=target, memory=memory | ||
| ) | ||
| return eff_sentences, eff_words | ||
| def pass_method(**kwargs): pass | ||
| def pass_method(**kwargs): | ||
| pass | ||
| se._post_inference_calls = pass_method | ||
@@ -473,3 +497,3 @@ se._do_train_job = _do_train_job | ||
| bs += len(s) | ||
| sents = [(s, i) for i,s in enumerate(tmp)] | ||
| sents = [(s, i) for i, s in enumerate(tmp)] | ||
| output = se.infer(sents) | ||
@@ -481,2 +505,3 @@ output = output[i:] | ||
| se = BaseSentence2VecModel(W2V) | ||
| def temp_train_job(data_iterable, target, memory): | ||
@@ -486,6 +511,9 @@ for i in data_iterable: | ||
| return target | ||
| def pass_method(**kwargs): pass | ||
| def pass_method(**kwargs): | ||
| pass | ||
| se._post_inference_calls = pass_method | ||
| se._do_train_job = temp_train_job | ||
| output = se.infer([(s, 0) for i,s in enumerate(SENTENCES)]) | ||
| output = se.infer([(s, 0) for i, s in enumerate(SENTENCES)]) | ||
| self.assertTrue((100 == output).all()) | ||
@@ -496,2 +524,3 @@ self.assertEqual((1, 5), output.shape) | ||
| se = BaseSentence2VecModel(W2V) | ||
| def temp_train_job(data_iterable, target, memory): | ||
@@ -501,11 +530,14 @@ for i in data_iterable: | ||
| return target | ||
| def pass_method(**kwargs): pass | ||
| def pass_method(**kwargs): | ||
| pass | ||
| se._post_inference_calls = pass_method | ||
| se._do_train_job = temp_train_job | ||
| output = se.infer([(s, i) for i,s in enumerate(SENTENCES)], use_norm=True) | ||
| output = se.infer([(s, i) for i, s in enumerate(SENTENCES)], use_norm=True) | ||
| self.assertTrue(np.allclose(1., np.sqrt(np.sum(output[0]**2)))) | ||
| self.assertTrue(np.allclose(1.0, np.sqrt(np.sum(output[0] ** 2)))) | ||
| class TestBaseSentence2VecPreparerFunctions(unittest.TestCase): | ||
| def test_reset_vectors(self): | ||
@@ -515,3 +547,3 @@ se = BaseSentence2VecModel(W2V) | ||
| trainables.reset_vectors(se.sv, 20) | ||
| self.assertEqual((20,DIM), se.sv.vectors.shape) | ||
| self.assertEqual((20, DIM), se.sv.vectors.shape) | ||
| self.assertEqual(np.float32, se.sv.vectors.dtype) | ||
@@ -528,3 +560,3 @@ self.assertTrue((EPS == se.sv.vectors).all()) | ||
| self.assertTrue(p_target.exists()) | ||
| self.assertEqual((20,DIM), se.sv.vectors.shape) | ||
| self.assertEqual((20, DIM), se.sv.vectors.shape) | ||
| self.assertEqual(np.float32, se.sv.vectors.dtype) | ||
@@ -539,5 +571,5 @@ self.assertTrue((EPS == se.sv.vectors).all()) | ||
| trainables.reset_vectors(se.sv, 20) | ||
| se.sv.vectors[:] = 1. | ||
| se.sv.vectors[:] = 1.0 | ||
| trainables.update_vectors(se.sv, 10) | ||
| self.assertEqual((30,DIM), se.sv.vectors.shape) | ||
| self.assertEqual((30, DIM), se.sv.vectors.shape) | ||
| self.assertEqual(np.float32, se.sv.vectors.dtype) | ||
@@ -554,6 +586,6 @@ self.assertTrue((np.ones((20, DIM)) == se.sv.vectors[:20]).all()) | ||
| trainables.reset_vectors(se.sv, 20) | ||
| se.sv.vectors[:] = 1. | ||
| se.sv.vectors[:] = 1.0 | ||
| trainables.update_vectors(se.sv, 10) | ||
| self.assertTrue(p_target.exists()) | ||
| self.assertEqual((30,DIM), se.sv.vectors.shape) | ||
| self.assertEqual((30, DIM), se.sv.vectors.shape) | ||
| self.assertEqual(np.float32, se.sv.vectors.dtype) | ||
@@ -569,8 +601,11 @@ self.assertTrue((np.ones((20, DIM)) == se.sv.vectors[:20]).all()) | ||
| trainables.prepare_vectors(se.sv, 20, update=False) | ||
| self.assertEqual((20,DIM), se.sv.vectors.shape) | ||
| self.assertEqual((20, DIM), se.sv.vectors.shape) | ||
| trainables.prepare_vectors(se.sv, 40, update=True) | ||
| self.assertEqual((60,DIM), se.sv.vectors.shape) | ||
| self.assertEqual((60, DIM), se.sv.vectors.shape) | ||
| if __name__ == '__main__': | ||
| logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) | ||
| if __name__ == "__main__": | ||
| logging.basicConfig( | ||
| format="%(asctime)s : %(levelname)s : %(message)s", level=logging.DEBUG | ||
| ) | ||
| unittest.main() |
+47
-29
| #!/usr/bin/env python | ||
| # -*- coding: utf-8 -*- | ||
| # Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| # Copyright (C) 2019 Oliver Borchers | ||
| # Author: Oliver Borchers | ||
| # Copyright (C) Oliver Borchers Oliver Borchers | ||
@@ -17,9 +17,17 @@ | ||
| from fse.inputs import BaseIndexedList, IndexedList, SplitIndexedList, CSplitIndexedList, \ | ||
| CIndexedList, CSplitCIndexedList, IndexedLineDocument, SplitCIndexedList | ||
| from fse.inputs import ( | ||
| BaseIndexedList, | ||
| IndexedList, | ||
| SplitIndexedList, | ||
| CSplitIndexedList, | ||
| CIndexedList, | ||
| CSplitCIndexedList, | ||
| IndexedLineDocument, | ||
| SplitCIndexedList, | ||
| ) | ||
| logger = logging.getLogger(__name__) | ||
| class TestBaseIndexedList(unittest.TestCase): | ||
| def setUp(self): | ||
@@ -33,3 +41,3 @@ self.list_a = ["the dog is good", "it's nice and comfy"] | ||
| self.ll = BaseIndexedList(self.list_a, self.list_b, self.list_c) | ||
| def test_init(self): | ||
@@ -56,3 +64,3 @@ _ = BaseIndexedList(self.list_a) | ||
| self.l._check_str_type([]) | ||
| def test__len(self): | ||
@@ -62,4 +70,3 @@ self.assertEqual(2, len(self.l)) | ||
| def test__str(self): | ||
| self.assertEqual("[\'the dog is good\', \"it\'s nice and comfy\"]", | ||
| str(self.l)) | ||
| self.assertEqual("['the dog is good', \"it's nice and comfy\"]", str(self.l)) | ||
@@ -77,7 +84,7 @@ def test__getitem(self): | ||
| self.assertEqual("is it me?", self.ll.items[0]) | ||
| def test_append(self): | ||
| self.ll.append("is it me?") | ||
| self.assertEqual("is it me?", self.ll.items[-1]) | ||
| def test_extend(self): | ||
@@ -91,6 +98,7 @@ self.ll.extend(self.list_a) | ||
| def test_extend_ndarr(self): | ||
| l = BaseIndexedList(np.array([str(i) for i in [1,2,3,4]])) | ||
| l.extend(np.array([str(i) for i in [1,2,3,4]])) | ||
| l = BaseIndexedList(np.array([str(i) for i in [1, 2, 3, 4]])) | ||
| l.extend(np.array([str(i) for i in [1, 2, 3, 4]])) | ||
| self.assertEqual(8, len(l)) | ||
| class TestIndexedList(unittest.TestCase): | ||
@@ -112,7 +120,8 @@ def setUp(self): | ||
| class TestCIndexedList(unittest.TestCase): | ||
| def setUp(self): | ||
| self.list_a = ["The Dog is good", "it's nice and comfy"] | ||
| self.il = CIndexedList(self.list_a, custom_index=[1,1]) | ||
| self.il = CIndexedList(self.list_a, custom_index=[1, 1]) | ||
| def test_cust_index(self): | ||
@@ -130,3 +139,3 @@ self.assertEqual(1, self.il[0][1]) | ||
| self.il.__setitem__(0, "the") | ||
| with self.assertRaises(NotImplementedError): | ||
@@ -139,2 +148,3 @@ self.il.insert(0, "the") | ||
| class TestCSplitIndexedList(unittest.TestCase): | ||
@@ -147,11 +157,12 @@ def setUp(self): | ||
| return input.lower().split() | ||
| def test_getitem(self): | ||
| self.assertEqual("the dog is good".split(), self.il[0][0]) | ||
| class TestSplitCIndexedList(unittest.TestCase): | ||
| def setUp(self): | ||
| self.list_a = ["The Dog is good", "it's nice and comfy"] | ||
| self.il = SplitCIndexedList(self.list_a, custom_index=[1,1]) | ||
| self.il = SplitCIndexedList(self.list_a, custom_index=[1, 1]) | ||
| def test_getitem(self): | ||
@@ -165,3 +176,3 @@ self.assertEqual(("The Dog is good".split(), 1), self.il[0]) | ||
| self.il.__setitem__(0, "the") | ||
| with self.assertRaises(NotImplementedError): | ||
@@ -174,13 +185,16 @@ self.il.insert(0, "the") | ||
| class TestCSplitCIndexedList(unittest.TestCase): | ||
| def setUp(self): | ||
| self.list_a = ["The Dog is good", "it's nice and comfy"] | ||
| self.il = CSplitCIndexedList(self.list_a, custom_split=self.split_func, custom_index=[1,1]) | ||
| self.il = CSplitCIndexedList( | ||
| self.list_a, custom_split=self.split_func, custom_index=[1, 1] | ||
| ) | ||
| def split_func(self, input): | ||
| return input.lower().split() | ||
| def test_getitem(self): | ||
| self.assertEqual(("the dog is good".split(), 1), self.il[0]) | ||
| def test_mutable_funcs(self): | ||
@@ -191,3 +205,3 @@ with self.assertRaises(NotImplementedError): | ||
| self.il.__setitem__(0, "the") | ||
| with self.assertRaises(NotImplementedError): | ||
@@ -200,4 +214,4 @@ self.il.insert(0, "the") | ||
| class TestIndexedLineDocument(unittest.TestCase): | ||
| def setUp(self): | ||
@@ -210,3 +224,5 @@ self.p = "fse/test/test_data/test_sentences.txt" | ||
| self.assertEqual("Save yourself money and buy it direct from lg", self.doc[19]) | ||
| self.assertEqual("I am not sure if it is a tracfone problem or the battery", self.doc[-1]) | ||
| self.assertEqual( | ||
| "I am not sure if it is a tracfone problem or the battery", self.doc[-1] | ||
| ) | ||
@@ -223,4 +239,6 @@ def test_yield(self): | ||
| if __name__ == '__main__': | ||
| logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) | ||
| unittest.main() | ||
| if __name__ == "__main__": | ||
| logging.basicConfig( | ||
| format="%(asctime)s : %(levelname)s : %(message)s", level=logging.DEBUG | ||
| ) | ||
| unittest.main() |
| #!/usr/bin/env python | ||
| # -*- coding: utf-8 -*- | ||
| # Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| # Copyright (C) 2019 Oliver Borchers | ||
| # Author: Oliver Borchers | ||
| # Copyright (C) Oliver Borchers Oliver Borchers | ||
@@ -34,10 +34,11 @@ | ||
| class TestSentenceVectorsFunctions(unittest.TestCase): | ||
| def setUp(self): | ||
| self.sv = SentenceVectors(2) | ||
| self.sv.vectors = np.arange(10).reshape(5,2) | ||
| self.sv.vectors = np.arange(10).reshape(5, 2) | ||
| def test_getitem(self): | ||
| self.assertTrue(([0,1] == self.sv[0]).all()) | ||
| self.assertTrue(([[0,1],[4,5]] == self.sv[[0,2]]).all()) | ||
| self.assertTrue(([0, 1] == self.sv[0]).all()) | ||
| self.assertTrue(([[0, 1], [4, 5]] == self.sv[[0, 2]]).all()) | ||
@@ -54,6 +55,6 @@ def test_isin(self): | ||
| v1 = self.sv.vectors[0] | ||
| v1 = v1 / np.sqrt(np.sum(v1**2)) | ||
| v1 = v1 / np.sqrt(np.sum(v1 ** 2)) | ||
| v2 = self.sv.vectors[1] | ||
| v2 = v2 / np.sqrt(np.sum(v2**2)) | ||
| v2 = v2 / np.sqrt(np.sum(v2 ** 2)) | ||
@@ -65,4 +66,4 @@ self.assertTrue(np.allclose(v1, self.sv.vectors_norm[0])) | ||
| def test_get_vector(self): | ||
| self.assertTrue(([0,1] == self.sv.get_vector(0)).all()) | ||
| self.assertTrue(([2,3] == self.sv.get_vector(1)).all()) | ||
| self.assertTrue(([0, 1] == self.sv.get_vector(0)).all()) | ||
| self.assertTrue(([2, 3] == self.sv.get_vector(1)).all()) | ||
@@ -98,6 +99,4 @@ def test_init_sims_w_replace(self): | ||
| sv.vectors = np.ones(shape, dtype=np.float32) | ||
| memvecs = np.memmap( | ||
| p_target, dtype=np.float32, | ||
| mode='w+', shape=shape) | ||
| memvecs = np.memmap(p_target, dtype=np.float32, mode="w+", shape=shape) | ||
| memvecs[:] = sv.vectors[:] | ||
@@ -122,9 +121,9 @@ del memvecs | ||
| v1 = self.sv.vectors[0] | ||
| v1 = v1 / np.sqrt(np.sum(v1**2)) | ||
| v1 = v1 / np.sqrt(np.sum(v1 ** 2)) | ||
| v2 = self.sv.vectors[1] | ||
| v2 = v2 / np.sqrt(np.sum(v2**2)) | ||
| v2 = v2 / np.sqrt(np.sum(v2 ** 2)) | ||
| self.assertTrue(np.allclose(v1.dot(v2), self.sv.similarity(0,1))) | ||
| self.assertTrue(np.allclose(1-v1.dot(v2), self.sv.distance(0,1))) | ||
| self.assertTrue(np.allclose(v1.dot(v2), self.sv.similarity(0, 1))) | ||
| self.assertTrue(np.allclose(1 - v1.dot(v2), self.sv.distance(0, 1))) | ||
@@ -161,3 +160,3 @@ def test_most_similar(self): | ||
| m.sv.init_sims() | ||
| v = m.sv[[0,1]] | ||
| v = m.sv[[0, 1]] | ||
| o = m.sv.most_similar(positive=v) | ||
@@ -170,2 +169,3 @@ self.assertEqual(1, o[0][0]) | ||
| pass | ||
| sentences = IndexedLineDocument(CORPUS) | ||
@@ -203,3 +203,5 @@ m = Average(W2V) | ||
| o = m.sv.most_similar(positive=1, topn=20, restrict_size=(5, 25), indexable=sentences) | ||
| o = m.sv.most_similar( | ||
| positive=1, topn=20, restrict_size=(5, 25), indexable=sentences | ||
| ) | ||
| self.assertEqual(20, len(o)) | ||
@@ -231,16 +233,27 @@ self.assertEqual(9, o[0][1]) | ||
| def test_similar_by_sentence_wrong_model(self): | ||
| sentences = IndexedLineDocument(CORPUS) | ||
| m = Average(W2V) | ||
| m.train(sentences) | ||
| with self.assertRaises(RuntimeError): | ||
| m.sv.similar_by_sentence( | ||
| sentence=["the", "product", "is", "good"], model=W2V | ||
| ) | ||
| def test_l2_norm(self): | ||
| out = np.random.normal(size=(200,50)).astype(np.float32) | ||
| out = np.random.normal(size=(200, 50)).astype(np.float32) | ||
| result = _l2_norm(out, False) | ||
| lens = np.sqrt(np.sum((result**2), axis=-1)) | ||
| lens = np.sqrt(np.sum((result ** 2), axis=-1)) | ||
| self.assertTrue(np.allclose(1, lens, atol=1e-6)) | ||
| out = np.random.normal(size=(200,50)).astype(np.float32) | ||
| out = np.random.normal(size=(200, 50)).astype(np.float32) | ||
| out = _l2_norm(out, True) | ||
| lens = np.sqrt(np.sum((out**2), axis=-1)) | ||
| lens = np.sqrt(np.sum((out ** 2), axis=-1)) | ||
| self.assertTrue(np.allclose(1, lens, atol=1e-6)) | ||
| if __name__ == '__main__': | ||
| logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) | ||
| if __name__ == "__main__": | ||
| logging.basicConfig( | ||
| format="%(asctime)s : %(levelname)s : %(message)s", level=logging.DEBUG | ||
| ) | ||
| unittest.main() |
+50
-14
@@ -8,3 +8,7 @@ import logging | ||
| from fse.models.sif import SIF, compute_principal_components, remove_principal_components | ||
| from fse.models.sif import ( | ||
| SIF, | ||
| compute_principal_components, | ||
| remove_principal_components, | ||
| ) | ||
| from fse.inputs import IndexedLineDocument | ||
@@ -27,6 +31,6 @@ | ||
| self.model = SIF(W2V, lang_freq="en") | ||
| def test_parameter_sanity(self): | ||
| with self.assertRaises(ValueError): | ||
| m = SIF(W2V, alpha= -1) | ||
| m = SIF(W2V, alpha=-1) | ||
| m._check_parameter_sanity() | ||
@@ -48,3 +52,3 @@ with self.assertRaises(ValueError): | ||
| self.assertTrue(np.allclose(self.model.sv.vectors, 0, atol=1e-5)) | ||
| def test_post_train_calls_no_removal(self): | ||
@@ -55,3 +59,3 @@ self.model.components = 0 | ||
| self.assertTrue(np.allclose(self.model.sv.vectors, 1, atol=1e-5)) | ||
| def test_post_inference_calls(self): | ||
@@ -70,3 +74,3 @@ self.model.sv.vectors = np.ones((200, 10), dtype=np.float32) | ||
| self.model._post_inference_calls(output=None) | ||
| def test_post_inference_calls_no_removal(self): | ||
@@ -83,5 +87,8 @@ self.model.components = 0 | ||
| self.model._check_dtype_santiy() | ||
| def test_dtype_sanity_svd_vals(self): | ||
| self.model.svd_res = (np.ones_like(self.model.word_weights, dtype=int), np.array(0, dtype=np.float32)) | ||
| self.model.svd_res = ( | ||
| np.ones_like(self.model.word_weights, dtype=int), | ||
| np.array(0, dtype=np.float32), | ||
| ) | ||
| with self.assertRaises(TypeError): | ||
@@ -91,6 +98,9 @@ self.model._check_dtype_santiy() | ||
| def test_dtype_sanity_svd_vecs(self): | ||
| self.model.svd_res = (np.array(0, dtype=np.float32), np.ones_like(self.model.word_weights, dtype=int)) | ||
| self.model.svd_res = ( | ||
| np.array(0, dtype=np.float32), | ||
| np.ones_like(self.model.word_weights, dtype=int), | ||
| ) | ||
| with self.assertRaises(TypeError): | ||
| self.model._check_dtype_santiy() | ||
| def test_compute_sif_weights(self): | ||
@@ -109,7 +119,33 @@ cs = 1095661426 | ||
| output = self.model.train(self.sentences) | ||
| self.assertEqual((100,1450), output) | ||
| self.assertEqual((100, 1450), output) | ||
| self.assertTrue(np.isfinite(self.model.sv.vectors).all()) | ||
| self.assertEqual(2, len(self.model.svd_res)) | ||
| if __name__ == '__main__': | ||
| logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) | ||
| unittest.main() | ||
| def test_save_issue(self): | ||
| model = SIF(W2V) | ||
| model.train(self.sentences) | ||
| p = Path("fse/test/test_data/test_emb.model") | ||
| model.save(str(p)) | ||
| model = SIF.load(str(p)) | ||
| p.unlink() | ||
| self.assertEqual(2, len(model.svd_res)) | ||
| model.sv.similar_by_sentence("test sentence".split(), model=model) | ||
| def test_broken_vocab(self): | ||
| w2v = Word2Vec(min_count=1, size=DIM) | ||
| w2v.build_vocab([l.split() for l in open(CORPUS, "r")]) | ||
| for k in w2v.wv.vocab: | ||
| w2v.wv.vocab[k].count = np.nan | ||
| model = SIF(w2v) | ||
| with self.assertRaises(RuntimeError): | ||
| model.train(self.sentences) | ||
| if __name__ == "__main__": | ||
| logging.basicConfig( | ||
| format="%(asctime)s : %(levelname)s : %(message)s", level=logging.DEBUG | ||
| ) | ||
| unittest.main() |
+35
-14
@@ -21,2 +21,3 @@ import logging | ||
| class TestuSIFFunctions(unittest.TestCase): | ||
@@ -26,6 +27,6 @@ def setUp(self): | ||
| self.model = uSIF(W2V, lang_freq="en") | ||
| def test_parameter_sanity(self): | ||
| with self.assertRaises(ValueError): | ||
| m = uSIF(W2V, length= 0) | ||
| m = uSIF(W2V, length=0) | ||
| m._check_parameter_sanity() | ||
@@ -45,3 +46,3 @@ with self.assertRaises(ValueError): | ||
| self.assertTrue(np.allclose(self.model.sv.vectors, 0, atol=1e-5)) | ||
| def test_post_train_calls_no_removal(self): | ||
@@ -52,3 +53,3 @@ self.model.components = 0 | ||
| self.assertTrue(np.allclose(self.model.sv.vectors, 1, atol=1e-5)) | ||
| def test_post_inference_calls(self): | ||
@@ -67,3 +68,3 @@ self.model.sv.vectors = np.ones((200, 10), dtype=np.float32) | ||
| self.model._post_inference_calls(output=None) | ||
| def test_post_inference_calls_no_removal(self): | ||
@@ -80,5 +81,8 @@ self.model.components = 0 | ||
| self.model._check_dtype_santiy() | ||
| def test_dtype_sanity_svd_vals(self): | ||
| self.model.svd_res = (np.ones_like(self.model.word_weights, dtype=int), np.array(0, dtype=np.float32)) | ||
| self.model.svd_res = ( | ||
| np.ones_like(self.model.word_weights, dtype=int), | ||
| np.array(0, dtype=np.float32), | ||
| ) | ||
| with self.assertRaises(TypeError): | ||
@@ -88,6 +92,9 @@ self.model._check_dtype_santiy() | ||
| def test_dtype_sanity_svd_vecs(self): | ||
| self.model.svd_res = (np.array(0, dtype=np.float32), np.ones_like(self.model.word_weights, dtype=int)) | ||
| self.model.svd_res = ( | ||
| np.array(0, dtype=np.float32), | ||
| np.ones_like(self.model.word_weights, dtype=int), | ||
| ) | ||
| with self.assertRaises(TypeError): | ||
| self.model._check_dtype_santiy() | ||
| def test_compute_usif_weights(self): | ||
@@ -99,3 +106,3 @@ w = "Good" | ||
| a = 0.17831555484795414 | ||
| usif = a / ((a/2) + pw) | ||
| usif = a / ((a / 2) + pw) | ||
| self.model._compute_usif_weights() | ||
@@ -106,7 +113,21 @@ self.assertTrue(np.allclose(self.model.word_weights[idx], usif)) | ||
| output = self.model.train(self.sentences) | ||
| self.assertEqual((100,1450), output) | ||
| self.assertEqual((100, 1450), output) | ||
| self.assertTrue(np.isfinite(self.model.sv.vectors).all()) | ||
| if __name__ == '__main__': | ||
| logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) | ||
| unittest.main() | ||
| def test_broken_vocab(self): | ||
| w2v = Word2Vec(min_count=1, size=DIM) | ||
| w2v.build_vocab([l.split() for l in open(CORPUS, "r")]) | ||
| for k in w2v.wv.vocab: | ||
| w2v.wv.vocab[k].count = np.nan | ||
| model = uSIF(w2v) | ||
| with self.assertRaises(RuntimeError): | ||
| model.train(self.sentences) | ||
| if __name__ == "__main__": | ||
| logging.basicConfig( | ||
| format="%(asctime)s : %(levelname)s : %(message)s", level=logging.DEBUG | ||
| ) | ||
| unittest.main() |
+66
-18
@@ -5,2 +5,3 @@ import logging | ||
| import numpy as np | ||
| from numpy.testing import assert_allclose, assert_raises | ||
@@ -13,6 +14,5 @@ from fse.models.utils import compute_principal_components, remove_principal_components | ||
| class TestUtils(unittest.TestCase): | ||
| def test_compute_components(self): | ||
| m = np.random.uniform(size=(500, 10)).astype(np.float32) | ||
| out = compute_principal_components(vectors = m) | ||
| out = compute_principal_components(vectors=m) | ||
| self.assertEqual(2, len(out)) | ||
@@ -23,27 +23,75 @@ self.assertEqual(1, len(out[1])) | ||
| m = np.random.uniform(size=(500, 10)) | ||
| out = compute_principal_components(vectors = m, components=5) | ||
| out = compute_principal_components(vectors=m, components=5) | ||
| self.assertEqual(2, len(out)) | ||
| self.assertEqual(5, len(out[1])) | ||
| def test_compute_large_components(self): | ||
| m = np.random.uniform(size=(int(2e6), 100)).astype(np.float32) | ||
| out = compute_principal_components(vectors=m, cache_size_gb=0.2) | ||
| self.assertEqual(2, len(out)) | ||
| self.assertEqual(1, len(out[1])) | ||
| self.assertEqual(np.float32, out[1].dtype) | ||
| def test_remove_components_inplace(self): | ||
| m = np.ones((500,10), dtype=np.float32) | ||
| out = compute_principal_components(vectors = m) | ||
| m = np.ones((500, 10), dtype=np.float32) | ||
| c = np.copy(m) | ||
| out = compute_principal_components(vectors=m) | ||
| remove_principal_components(m, svd_res=out) | ||
| self.assertTrue(np.allclose(0., m, atol=1e-5)) | ||
| assert_allclose(m, 0.0, atol=1e-5) | ||
| with assert_raises(AssertionError): | ||
| assert_allclose(m, c) | ||
| def test_remove_components(self): | ||
| m = np.ones((500,10), dtype=np.float32) | ||
| out = compute_principal_components(vectors = m) | ||
| m = np.ones((500, 10), dtype=np.float32) | ||
| c = np.copy(m) | ||
| out = compute_principal_components(vectors=m) | ||
| res = remove_principal_components(m, svd_res=out, inplace=False) | ||
| self.assertTrue(np.allclose(1., res, atol=1e-5)) | ||
| assert_allclose(res, 0.0, atol=1e-5) | ||
| assert_allclose(m, c) | ||
| def test_remove_weighted_components(self): | ||
| m = np.ones((500,10), dtype=np.float32) | ||
| out = compute_principal_components(vectors = m) | ||
| def test_remove_weighted_components_inplace(self): | ||
| m = np.ones((500, 10), dtype=np.float32) | ||
| c = np.copy(m) | ||
| out = compute_principal_components(vectors=m) | ||
| remove_principal_components(m, svd_res=out, weights=np.array([0.5])) | ||
| self.assertTrue(np.allclose(0.75, m)) | ||
| assert_allclose(m, 0.75, atol=1e-5) | ||
| with assert_raises(AssertionError): | ||
| assert_allclose(m, c) | ||
| def test_remove_weighted_components(self): | ||
| m = np.ones((500, 10), dtype=np.float32) | ||
| c = np.copy(m) | ||
| out = compute_principal_components(vectors=m) | ||
| res = remove_principal_components( | ||
| m, svd_res=out, weights=np.array([0.5]), inplace=False | ||
| ) | ||
| assert_allclose(res, 0.75, atol=1e-5) | ||
| assert_allclose(m, c) | ||
| if __name__ == '__main__': | ||
| logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) | ||
| unittest.main() | ||
| def test_madvise(self): | ||
| from pathlib import Path | ||
| from sys import platform | ||
| from fse.models.utils import set_madvise_for_mmap | ||
| if platform in ["linux", "linux2", "darwin", "aix"]: | ||
| p = Path("fse/test/test_data/test_vectors") | ||
| madvise = set_madvise_for_mmap(True) | ||
| shape = (500, 10) | ||
| mat = np.random.normal(size=shape) | ||
| memvecs = np.memmap(p, dtype=np.float32, mode="w+", shape=shape) | ||
| memvecs[:] = mat[:] | ||
| del memvecs | ||
| mat = np.memmap(p, dtype=np.float32, mode="r", shape=shape) | ||
| self.assertEqual( | ||
| madvise(mat.ctypes.data, mat.size * mat.dtype.itemsize, 1), 0 | ||
| ) | ||
| p.unlink() | ||
| if __name__ == "__main__": | ||
| logging.basicConfig( | ||
| format="%(asctime)s : %(levelname)s : %(message)s", level=logging.DEBUG | ||
| ) | ||
| unittest.main() |
+0
-1
@@ -6,4 +6,3 @@ recursive-include fse/test/test_data * | ||
| include fse/models/average_inner.c | ||
| include fse/models/average_inner.pyx | ||
| include fse/models/average_inner.pxd |
+4
-4
| Metadata-Version: 1.0 | ||
| Name: fse | ||
| Version: 0.1.15 | ||
| Version: 0.1.17 | ||
| Summary: Fast Sentence Embeddings for Gensim | ||
| Home-page: https://github.com/oborchers/Fast_Sentence_Embeddings | ||
| Home-page: UNKNOWN | ||
| Author: Oliver Borchers | ||
| Author-email: borchers@bwl.uni-mannheim.de | ||
| License: GPL-3.0 | ||
| Author-email: o.borchers@oxolo.com | ||
| License: UNKNOWN | ||
| Description: UNKNOWN | ||
| Platform: UNKNOWN |
+36
-6
@@ -1,3 +0,9 @@ | ||
| [](https://travis-ci.com/oborchers/Fast_Sentence_Embeddings) | ||
| [](https://coveralls.io/github/oborchers/Fast_Sentence_Embeddings?branch=master) | ||
| <p align="center"> | ||
| <a href="https://travis-ci.com/oborchers/Fast_Sentence_Embeddings"><img alt="Build Status" src="https://travis-ci.com/oborchers/Fast_Sentence_Embeddings.svg?branch=master"></a> | ||
| <a href="https://coveralls.io/github/oborchers/Fast_Sentence_Embeddings?branch=master"><img alt="Coverage Status" src="https://coveralls.io/repos/github/oborchers/Fast_Sentence_Embeddings/badge.svg?branch=master"></a> | ||
| <a href="https://pepy.tech/project/fse"><img alt="Downloads" src="https://pepy.tech/badge/fse"></a> | ||
| <a href="https://lgtm.com/projects/g/oborchers/Fast_Sentence_Embeddings/context:python"><img alt="Language grade: Python" src="https://img.shields.io/lgtm/grade/python/g/oborchers/Fast_Sentence_Embeddings.svg"></a> | ||
| <a href="https://github.com/psf/black"><img alt="Code style: black" src="https://img.shields.io/badge/code%20style-black-000000.svg"></a> | ||
| <a href="https://img.shields.io/github/license/oborchers/Fast_Sentence_Embeddings.svg?style=flat"><img alt="License: GPL3" src="https://img.shields.io/github/license/oborchers/Fast_Sentence_Embeddings.svg?style=flat"></a> | ||
| </p> | ||
@@ -9,8 +15,26 @@ Fast Sentence Embeddings (fse) | ||
| **Disclaimer**: I am working full time. Unfortunately, I have yet to find time to add all the features I'd like to see. Especially the API needs some overhaul and we need support for gensim 4.0.0. | ||
| I am looking for active contributors to keep this package alive. Please feel free to ping me at <o.borchers@oxolo.com> if you are interested. | ||
| Audience | ||
| ------------ | ||
| This package builds upon Gensim and is intenteded to compute sentence/paragraph vectors for large databases. Use this package if: | ||
| - (Sentence) Transformers are too slow | ||
| - Your dataset is too large for existing solutions (spacy) | ||
| - Using GPUs is not an option. | ||
| The average (online) inference time for a well optimized (and batched) sentence-transformer is around 1ms-10ms per sentence. | ||
| If that is not enough and you are willing to sacrifice a bit in terms of quality, this is your package. | ||
| Features | ||
| ------------ | ||
| Find the corresponding blog post(s) here: https://medium.com/@oliverbor/fse-2b1ffa791cf9 (will be updated soon) | ||
| Find the corresponding blog post(s) here (code may be outdated): | ||
| - [Visualizing 100,000 Amazon Products](https://towardsdatascience.com/vis-amz-83dea6fcb059) | ||
| - [Sentence Embeddings. Fast, please!](https://towardsdatascience.com/fse-2b1ffa791cf9) | ||
| **fse** implements three algorithms for sentence embeddings. You can choose | ||
@@ -51,2 +75,3 @@ between *unweighted sentence averages*, *smooth inverse frequency averages*, and *unsupervised smooth inverse frequency averages*. | ||
| Installation | ||
@@ -56,3 +81,3 @@ ------------ | ||
| This software depends on NumPy, Scipy, Scikit-learn, Gensim, and Wordfreq. | ||
| You must have them installed prior to installing fse. Required Python version is 3.6. | ||
| You must have them installed prior to installing fse. | ||
@@ -144,2 +169,7 @@ As with gensim, it is also recommended you install a BLAS library before installing fse. | ||
| 0.1.17: | ||
| - Fixed dependency issue where you cannot install fse properly | ||
| - Updated readme | ||
| - Updated travis python versions (3.6, 3.9) | ||
| 0.1.15 from 0.1.11: | ||
@@ -174,5 +204,5 @@ - Fixed major FT Ngram computation bug | ||
| Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| Author: Oliver Borchers | ||
| Copyright (C) 2019 Oliver Borchers | ||
| Copyright (C) 2021 Oliver Borchers | ||
@@ -179,0 +209,0 @@ Citation |
+139
-83
| #!/usr/bin/env python | ||
| # -*- coding: utf-8 -*- | ||
| ''' | ||
| Run with: | ||
| # Author: Oliver Borchers | ||
| # For License information, see corresponding LICENSE file. | ||
| sudo python ./setup.py install | ||
| ''' | ||
| """Template setup.py Read more on | ||
| https://docs.python.org/3.7/distutils/setupscript.html.""" | ||
| import distutils | ||
| import itertools | ||
| import os | ||
| import platform | ||
| import sys | ||
| import warnings | ||
| from setuptools import setup, find_packages, Extension | ||
| import shutil | ||
| from setuptools import Extension, find_packages, setup | ||
| from setuptools.command.build_ext import build_ext | ||
| if sys.version_info[:2] < (3, 6): | ||
| raise Exception('This version of fse needs Python 3.6 or later.') | ||
| NAME = "fse" | ||
| VERSION = "0.1.17" | ||
| DESCRIPTION = "Fast Sentence Embeddings for Gensim" | ||
| AUTHOR = "Oliver Borchers" | ||
| AUTHOR_EMAIL = "o.borchers@oxolo.com" | ||
| URL = "https://github.com/oborchers/Fast_Sentence_Embeddings" | ||
| LICENSE = "GPL-3.0" | ||
| REQUIRES_PYTHON = ">=3.6" | ||
| NUMPY_STR = "numpy >= 1.11.3" | ||
| CYTHON_STR = "Cython==0.29.14" | ||
| class custom_build_ext(build_ext): | ||
| '''Allow C extension building to fail. | ||
| ''' | ||
| warning_message = ''' | ||
| ******************************************************************** | ||
| WARNING: %s could not be compiled. %s | ||
| INSTALL_REQUIRES = [ | ||
| NUMPY_STR, | ||
| "scipy >= 0.18.1", | ||
| "smart_open >= 1.5.0", | ||
| "scikit-learn >= 0.19.1", | ||
| "gensim<4", | ||
| "wordfreq >= 2.2.1", | ||
| "psutil", | ||
| ] | ||
| SETUP_REQUIRES = [NUMPY_STR] | ||
| Here are some hints for popular operating systems: | ||
| c_extensions = { | ||
| "fse.models.average_inner": "fse/models/average_inner.c", | ||
| } | ||
| cpp_extensions = {} | ||
| If you are seeing this message on Linux you probably need to | ||
| install GCC and/or the Python development package for your | ||
| version of Python. | ||
| Debian and Ubuntu users should issue the following command: | ||
| def need_cython(): | ||
| """Return True if we need Cython to translate any of the extensions. | ||
| $ sudo apt-get install build-essential python-dev | ||
| If the extensions have already been translated to C/C++, then we don't need to | ||
| install Cython and perform the translation. | ||
| """ | ||
| expected = list(c_extensions.values()) + list(cpp_extensions.values()) | ||
| return any([not os.path.isfile(f) for f in expected]) | ||
| RedHat, CentOS, and Fedora users should issue the following command: | ||
| $ sudo yum install gcc python-devel | ||
| def make_c_ext(use_cython=False): | ||
| for module, source in c_extensions.items(): | ||
| if use_cython: | ||
| source = source.replace(".c", ".pyx") | ||
| extra_args = [] | ||
| # extra_args.extend(["-g", "-O0"]) # uncomment if optimization limiting crash info | ||
| yield Extension( | ||
| module, | ||
| sources=[source], | ||
| language="c", | ||
| extra_compile_args=extra_args, | ||
| ) | ||
| If you are seeing this message on OSX please read the documentation | ||
| here: | ||
| http://api.mongodb.org/python/current/installation.html#osx | ||
| ******************************************************************** | ||
| ''' | ||
| def make_cpp_ext(use_cython=False): | ||
| extra_args = [] | ||
| system = platform.system() | ||
| def run(self): | ||
| try: | ||
| build_ext.run(self) | ||
| except Exception: | ||
| e = sys.exc_info()[1] | ||
| sys.stdout.write('%s\n' % str(e)) | ||
| warnings.warn( | ||
| self.warning_message + | ||
| 'Extension modules' + | ||
| 'There was an issue with your platform configuration - see above.') | ||
| if system == "Linux": | ||
| extra_args.append("-std=c++11") | ||
| elif system == "Darwin": | ||
| extra_args.extend(["-stdlib=libc++", "-std=c++11"]) | ||
| # extra_args.extend(["-g", "-O0"]) # uncomment if | ||
| # optimization limiting crash info | ||
| for module, source in cpp_extensions.items(): | ||
| if use_cython: | ||
| source = source.replace(".cpp", ".pyx") | ||
| yield Extension( | ||
| module, | ||
| sources=[source], | ||
| language="c++", | ||
| extra_compile_args=extra_args, | ||
| extra_link_args=extra_args, | ||
| ) | ||
| def build_extension(self, ext): | ||
| name = ext.name | ||
| try: | ||
| build_ext.build_extension(self, ext) | ||
| except Exception: | ||
| e = sys.exc_info()[1] | ||
| sys.stdout.write('%s\n' % str(e)) | ||
| warnings.warn( | ||
| self.warning_message + | ||
| 'The %s extension module' % (name,) + | ||
| 'The output above this warning shows how the compilation failed.') | ||
| # | ||
| # We use use_cython=False here for two reasons: | ||
| # | ||
| # 1. Cython may not be available at this stage | ||
| # 2. The actual translation from Cython to C/C++ happens inside CustomBuildExt | ||
| # | ||
| ext_modules = list( | ||
| itertools.chain(make_c_ext(use_cython=False), make_cpp_ext(use_cython=False)) | ||
| ) | ||
| class CustomBuildExt(build_ext): | ||
| """Custom build_ext action with bootstrapping. | ||
| We need this in order to use numpy and Cython in this script without importing them | ||
| at module level, because they may not be available yet. | ||
| """ | ||
| # | ||
| # http://stackoverflow.com/questions/19919905/how-to-bootstrap-numpy-installation-in-setup-py | ||
| # | ||
| def finalize_options(self): | ||
| build_ext.finalize_options(self) | ||
| if isinstance(__builtins__, dict): | ||
| __builtins__['__NUMPY_SETUP__'] = False | ||
| else: | ||
| __builtins__.__NUMPY_SETUP__ = False | ||
| # Prevent numpy from thinking it is still in its setup process: | ||
| # https://docs.python.org/2/library/__builtin__.html#module-__builtin__ | ||
| __builtins__.__NUMPY_SETUP__ = False | ||
| import numpy | ||
| self.include_dirs.append(numpy.get_include()) | ||
| mod_dir = os.path.join(os.path.dirname(__file__), 'fse', 'models') | ||
| fse_dir = os.path.join(os.path.dirname(__file__), 'fse') | ||
| if need_cython(): | ||
| import Cython.Build | ||
| cmdclass = {'build_ext': custom_build_ext} | ||
| Cython.Build.cythonize(list(make_c_ext(use_cython=True))) | ||
| Cython.Build.cythonize(list(make_cpp_ext(use_cython=True))) | ||
| setup( | ||
| name='fse', | ||
| version='0.1.15', | ||
| description='Fast Sentence Embeddings for Gensim', | ||
| author=u'Oliver Borchers', | ||
| author_email='borchers@bwl.uni-mannheim.de', | ||
| class CleanExt(distutils.cmd.Command): | ||
| description = "Remove C sources, C++ sources and binaries for gensim extensions" | ||
| user_options = [] | ||
| url="https://github.com/oborchers/Fast_Sentence_Embeddings", | ||
| def initialize_options(self): | ||
| pass | ||
| license='GPL-3.0', | ||
| def finalize_options(self): | ||
| pass | ||
| ext_modules=[ | ||
| Extension('fse.models.average_inner', | ||
| sources=['./fse/models/average_inner.c'], | ||
| include_dirs=[mod_dir]), | ||
| ], | ||
| cmdclass=cmdclass, | ||
| packages=find_packages(), | ||
| def run(self): | ||
| for root, dirs, files in os.walk("gensim"): | ||
| files = [ | ||
| os.path.join(root, f) | ||
| for f in files | ||
| if os.path.splitext(f)[1] in (".c", ".cpp", ".so") | ||
| ] | ||
| for f in files: | ||
| self.announce("removing %s" % f, level=distutils.log.INFO) | ||
| os.unlink(f) | ||
| zip_safe=False, | ||
| if os.path.isdir("build"): | ||
| self.announce("recursively removing build", level=distutils.log.INFO) | ||
| shutil.rmtree("build") | ||
| test_suite="fse.test", | ||
| install_requires=[ | ||
| 'numpy >= 1.11.3', | ||
| 'scipy >= 0.18.1', | ||
| 'smart_open >= 1.5.0', | ||
| 'scikit-learn >= 0.19.1', | ||
| 'gensim >= 3.8.0', | ||
| 'wordfreq >= 2.2.1', | ||
| 'psutil' | ||
| ], | ||
| cmdclass = {"build_ext": CustomBuildExt, "clean_ext": CleanExt} | ||
| if need_cython(): | ||
| INSTALL_REQUIRES.append(CYTHON_STR) | ||
| SETUP_REQUIRES.append(CYTHON_STR) | ||
| setup( | ||
| name=NAME, | ||
| version=VERSION, | ||
| description=DESCRIPTION, | ||
| author=AUTHOR, | ||
| author_email=AUTHOR_EMAIL, | ||
| packages=find_packages(), | ||
| requires_python=REQUIRES_PYTHON, | ||
| install_requires=INSTALL_REQUIRES, | ||
| setup_requires=SETUP_REQUIRES, | ||
| ext_modules=ext_modules, | ||
| cmdclass=cmdclass, | ||
| zip_safe=False, | ||
| include_package_data=True, | ||
| ) |
| # Auto detect text files and perform LF normalization | ||
| * text=auto |
-75
| # Compiled source # | ||
| ################### | ||
| *.com | ||
| *.class | ||
| *.dll | ||
| *.exe | ||
| *.o | ||
| *.so | ||
| *.pyc | ||
| # Packages # | ||
| ############ | ||
| # it's better to unpack these files and commit the raw source | ||
| # git has its own built in compression methods | ||
| *.7z | ||
| *.dmg | ||
| *.gz | ||
| *.iso | ||
| *.jar | ||
| *.rar | ||
| *.tar | ||
| *.zip | ||
| # Logs and databases # | ||
| ###################### | ||
| *.log | ||
| *.sql | ||
| *.sqlite | ||
| *.pkl | ||
| *.bak | ||
| *.npy | ||
| *.npz | ||
| *.code-workspace | ||
| # OS generated files # | ||
| ###################### | ||
| .DS_Store? | ||
| .DS_Store | ||
| ehthumbs.db | ||
| Icon? | ||
| Thumbs.db | ||
| *.icloud | ||
| # Folders # | ||
| ########### | ||
| legacy | ||
| latex | ||
| draft | ||
| fse.egg-info/ | ||
| # Other # | ||
| ######### | ||
| .ipynb_checkpoints/ | ||
| .settings/ | ||
| .vscode/ | ||
| .eggs | ||
| fse*.egg-info | ||
| *.pptx | ||
| *.doc | ||
| *.docx | ||
| *.dict | ||
| .coverage | ||
| *.bak | ||
| /build/ | ||
| /dist/ | ||
| *.prof | ||
| *.lprof | ||
| *.bin | ||
| *.old | ||
| *.model | ||
| *_out.txt | ||
| *.html | ||
| vectors | ||
| *.vectors | ||
| *.joblib |
Sorry, the diff of this file is not supported yet
-674
| GNU GENERAL PUBLIC LICENSE | ||
| Version 3, 29 June 2007 | ||
| Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/> | ||
| Everyone is permitted to copy and distribute verbatim copies | ||
| of this license document, but changing it is not allowed. | ||
| Preamble | ||
| The GNU General Public License is a free, copyleft license for | ||
| software and other kinds of works. | ||
| The licenses for most software and other practical works are designed | ||
| to take away your freedom to share and change the works. By contrast, | ||
| the GNU General Public License is intended to guarantee your freedom to | ||
| share and change all versions of a program--to make sure it remains free | ||
| software for all its users. We, the Free Software Foundation, use the | ||
| GNU General Public License for most of our software; it applies also to | ||
| any other work released this way by its authors. You can apply it to | ||
| your programs, too. | ||
| When we speak of free software, we are referring to freedom, not | ||
| price. Our General Public Licenses are designed to make sure that you | ||
| have the freedom to distribute copies of free software (and charge for | ||
| them if you wish), that you receive source code or can get it if you | ||
| want it, that you can change the software or use pieces of it in new | ||
| free programs, and that you know you can do these things. | ||
| To protect your rights, we need to prevent others from denying you | ||
| these rights or asking you to surrender the rights. Therefore, you have | ||
| certain responsibilities if you distribute copies of the software, or if | ||
| you modify it: responsibilities to respect the freedom of others. | ||
| For example, if you distribute copies of such a program, whether | ||
| gratis or for a fee, you must pass on to the recipients the same | ||
| freedoms that you received. You must make sure that they, too, receive | ||
| or can get the source code. And you must show them these terms so they | ||
| know their rights. | ||
| Developers that use the GNU GPL protect your rights with two steps: | ||
| (1) assert copyright on the software, and (2) offer you this License | ||
| giving you legal permission to copy, distribute and/or modify it. | ||
| For the developers' and authors' protection, the GPL clearly explains | ||
| that there is no warranty for this free software. For both users' and | ||
| authors' sake, the GPL requires that modified versions be marked as | ||
| changed, so that their problems will not be attributed erroneously to | ||
| authors of previous versions. | ||
| Some devices are designed to deny users access to install or run | ||
| modified versions of the software inside them, although the manufacturer | ||
| can do so. This is fundamentally incompatible with the aim of | ||
| protecting users' freedom to change the software. The systematic | ||
| pattern of such abuse occurs in the area of products for individuals to | ||
| use, which is precisely where it is most unacceptable. Therefore, we | ||
| have designed this version of the GPL to prohibit the practice for those | ||
| products. If such problems arise substantially in other domains, we | ||
| stand ready to extend this provision to those domains in future versions | ||
| of the GPL, as needed to protect the freedom of users. | ||
| Finally, every program is threatened constantly by software patents. | ||
| States should not allow patents to restrict development and use of | ||
| software on general-purpose computers, but in those that do, we wish to | ||
| avoid the special danger that patents applied to a free program could | ||
| make it effectively proprietary. To prevent this, the GPL assures that | ||
| patents cannot be used to render the program non-free. | ||
| The precise terms and conditions for copying, distribution and | ||
| modification follow. | ||
| TERMS AND CONDITIONS | ||
| 0. Definitions. | ||
| "This License" refers to version 3 of the GNU General Public License. | ||
| "Copyright" also means copyright-like laws that apply to other kinds of | ||
| works, such as semiconductor masks. | ||
| "The Program" refers to any copyrightable work licensed under this | ||
| License. Each licensee is addressed as "you". "Licensees" and | ||
| "recipients" may be individuals or organizations. | ||
| To "modify" a work means to copy from or adapt all or part of the work | ||
| in a fashion requiring copyright permission, other than the making of an | ||
| exact copy. The resulting work is called a "modified version" of the | ||
| earlier work or a work "based on" the earlier work. | ||
| A "covered work" means either the unmodified Program or a work based | ||
| on the Program. | ||
| To "propagate" a work means to do anything with it that, without | ||
| permission, would make you directly or secondarily liable for | ||
| infringement under applicable copyright law, except executing it on a | ||
| computer or modifying a private copy. Propagation includes copying, | ||
| distribution (with or without modification), making available to the | ||
| public, and in some countries other activities as well. | ||
| To "convey" a work means any kind of propagation that enables other | ||
| parties to make or receive copies. Mere interaction with a user through | ||
| a computer network, with no transfer of a copy, is not conveying. | ||
| An interactive user interface displays "Appropriate Legal Notices" | ||
| to the extent that it includes a convenient and prominently visible | ||
| feature that (1) displays an appropriate copyright notice, and (2) | ||
| tells the user that there is no warranty for the work (except to the | ||
| extent that warranties are provided), that licensees may convey the | ||
| work under this License, and how to view a copy of this License. If | ||
| the interface presents a list of user commands or options, such as a | ||
| menu, a prominent item in the list meets this criterion. | ||
| 1. Source Code. | ||
| The "source code" for a work means the preferred form of the work | ||
| for making modifications to it. "Object code" means any non-source | ||
| form of a work. | ||
| A "Standard Interface" means an interface that either is an official | ||
| standard defined by a recognized standards body, or, in the case of | ||
| interfaces specified for a particular programming language, one that | ||
| is widely used among developers working in that language. | ||
| The "System Libraries" of an executable work include anything, other | ||
| than the work as a whole, that (a) is included in the normal form of | ||
| packaging a Major Component, but which is not part of that Major | ||
| Component, and (b) serves only to enable use of the work with that | ||
| Major Component, or to implement a Standard Interface for which an | ||
| implementation is available to the public in source code form. A | ||
| "Major Component", in this context, means a major essential component | ||
| (kernel, window system, and so on) of the specific operating system | ||
| (if any) on which the executable work runs, or a compiler used to | ||
| produce the work, or an object code interpreter used to run it. | ||
| The "Corresponding Source" for a work in object code form means all | ||
| the source code needed to generate, install, and (for an executable | ||
| work) run the object code and to modify the work, including scripts to | ||
| control those activities. However, it does not include the work's | ||
| System Libraries, or general-purpose tools or generally available free | ||
| programs which are used unmodified in performing those activities but | ||
| which are not part of the work. For example, Corresponding Source | ||
| includes interface definition files associated with source files for | ||
| the work, and the source code for shared libraries and dynamically | ||
| linked subprograms that the work is specifically designed to require, | ||
| such as by intimate data communication or control flow between those | ||
| subprograms and other parts of the work. | ||
| The Corresponding Source need not include anything that users | ||
| can regenerate automatically from other parts of the Corresponding | ||
| Source. | ||
| The Corresponding Source for a work in source code form is that | ||
| same work. | ||
| 2. Basic Permissions. | ||
| All rights granted under this License are granted for the term of | ||
| copyright on the Program, and are irrevocable provided the stated | ||
| conditions are met. This License explicitly affirms your unlimited | ||
| permission to run the unmodified Program. The output from running a | ||
| covered work is covered by this License only if the output, given its | ||
| content, constitutes a covered work. This License acknowledges your | ||
| rights of fair use or other equivalent, as provided by copyright law. | ||
| You may make, run and propagate covered works that you do not | ||
| convey, without conditions so long as your license otherwise remains | ||
| in force. You may convey covered works to others for the sole purpose | ||
| of having them make modifications exclusively for you, or provide you | ||
| with facilities for running those works, provided that you comply with | ||
| the terms of this License in conveying all material for which you do | ||
| not control copyright. Those thus making or running the covered works | ||
| for you must do so exclusively on your behalf, under your direction | ||
| and control, on terms that prohibit them from making any copies of | ||
| your copyrighted material outside their relationship with you. | ||
| Conveying under any other circumstances is permitted solely under | ||
| the conditions stated below. Sublicensing is not allowed; section 10 | ||
| makes it unnecessary. | ||
| 3. Protecting Users' Legal Rights From Anti-Circumvention Law. | ||
| No covered work shall be deemed part of an effective technological | ||
| measure under any applicable law fulfilling obligations under article | ||
| 11 of the WIPO copyright treaty adopted on 20 December 1996, or | ||
| similar laws prohibiting or restricting circumvention of such | ||
| measures. | ||
| When you convey a covered work, you waive any legal power to forbid | ||
| circumvention of technological measures to the extent such circumvention | ||
| is effected by exercising rights under this License with respect to | ||
| the covered work, and you disclaim any intention to limit operation or | ||
| modification of the work as a means of enforcing, against the work's | ||
| users, your or third parties' legal rights to forbid circumvention of | ||
| technological measures. | ||
| 4. Conveying Verbatim Copies. | ||
| You may convey verbatim copies of the Program's source code as you | ||
| receive it, in any medium, provided that you conspicuously and | ||
| appropriately publish on each copy an appropriate copyright notice; | ||
| keep intact all notices stating that this License and any | ||
| non-permissive terms added in accord with section 7 apply to the code; | ||
| keep intact all notices of the absence of any warranty; and give all | ||
| recipients a copy of this License along with the Program. | ||
| You may charge any price or no price for each copy that you convey, | ||
| and you may offer support or warranty protection for a fee. | ||
| 5. Conveying Modified Source Versions. | ||
| You may convey a work based on the Program, or the modifications to | ||
| produce it from the Program, in the form of source code under the | ||
| terms of section 4, provided that you also meet all of these conditions: | ||
| a) The work must carry prominent notices stating that you modified | ||
| it, and giving a relevant date. | ||
| b) The work must carry prominent notices stating that it is | ||
| released under this License and any conditions added under section | ||
| 7. This requirement modifies the requirement in section 4 to | ||
| "keep intact all notices". | ||
| c) You must license the entire work, as a whole, under this | ||
| License to anyone who comes into possession of a copy. This | ||
| License will therefore apply, along with any applicable section 7 | ||
| additional terms, to the whole of the work, and all its parts, | ||
| regardless of how they are packaged. This License gives no | ||
| permission to license the work in any other way, but it does not | ||
| invalidate such permission if you have separately received it. | ||
| d) If the work has interactive user interfaces, each must display | ||
| Appropriate Legal Notices; however, if the Program has interactive | ||
| interfaces that do not display Appropriate Legal Notices, your | ||
| work need not make them do so. | ||
| A compilation of a covered work with other separate and independent | ||
| works, which are not by their nature extensions of the covered work, | ||
| and which are not combined with it such as to form a larger program, | ||
| in or on a volume of a storage or distribution medium, is called an | ||
| "aggregate" if the compilation and its resulting copyright are not | ||
| used to limit the access or legal rights of the compilation's users | ||
| beyond what the individual works permit. Inclusion of a covered work | ||
| in an aggregate does not cause this License to apply to the other | ||
| parts of the aggregate. | ||
| 6. Conveying Non-Source Forms. | ||
| You may convey a covered work in object code form under the terms | ||
| of sections 4 and 5, provided that you also convey the | ||
| machine-readable Corresponding Source under the terms of this License, | ||
| in one of these ways: | ||
| a) Convey the object code in, or embodied in, a physical product | ||
| (including a physical distribution medium), accompanied by the | ||
| Corresponding Source fixed on a durable physical medium | ||
| customarily used for software interchange. | ||
| b) Convey the object code in, or embodied in, a physical product | ||
| (including a physical distribution medium), accompanied by a | ||
| written offer, valid for at least three years and valid for as | ||
| long as you offer spare parts or customer support for that product | ||
| model, to give anyone who possesses the object code either (1) a | ||
| copy of the Corresponding Source for all the software in the | ||
| product that is covered by this License, on a durable physical | ||
| medium customarily used for software interchange, for a price no | ||
| more than your reasonable cost of physically performing this | ||
| conveying of source, or (2) access to copy the | ||
| Corresponding Source from a network server at no charge. | ||
| c) Convey individual copies of the object code with a copy of the | ||
| written offer to provide the Corresponding Source. This | ||
| alternative is allowed only occasionally and noncommercially, and | ||
| only if you received the object code with such an offer, in accord | ||
| with subsection 6b. | ||
| d) Convey the object code by offering access from a designated | ||
| place (gratis or for a charge), and offer equivalent access to the | ||
| Corresponding Source in the same way through the same place at no | ||
| further charge. You need not require recipients to copy the | ||
| Corresponding Source along with the object code. If the place to | ||
| copy the object code is a network server, the Corresponding Source | ||
| may be on a different server (operated by you or a third party) | ||
| that supports equivalent copying facilities, provided you maintain | ||
| clear directions next to the object code saying where to find the | ||
| Corresponding Source. Regardless of what server hosts the | ||
| Corresponding Source, you remain obligated to ensure that it is | ||
| available for as long as needed to satisfy these requirements. | ||
| e) Convey the object code using peer-to-peer transmission, provided | ||
| you inform other peers where the object code and Corresponding | ||
| Source of the work are being offered to the general public at no | ||
| charge under subsection 6d. | ||
| A separable portion of the object code, whose source code is excluded | ||
| from the Corresponding Source as a System Library, need not be | ||
| included in conveying the object code work. | ||
| A "User Product" is either (1) a "consumer product", which means any | ||
| tangible personal property which is normally used for personal, family, | ||
| or household purposes, or (2) anything designed or sold for incorporation | ||
| into a dwelling. In determining whether a product is a consumer product, | ||
| doubtful cases shall be resolved in favor of coverage. For a particular | ||
| product received by a particular user, "normally used" refers to a | ||
| typical or common use of that class of product, regardless of the status | ||
| of the particular user or of the way in which the particular user | ||
| actually uses, or expects or is expected to use, the product. A product | ||
| is a consumer product regardless of whether the product has substantial | ||
| commercial, industrial or non-consumer uses, unless such uses represent | ||
| the only significant mode of use of the product. | ||
| "Installation Information" for a User Product means any methods, | ||
| procedures, authorization keys, or other information required to install | ||
| and execute modified versions of a covered work in that User Product from | ||
| a modified version of its Corresponding Source. The information must | ||
| suffice to ensure that the continued functioning of the modified object | ||
| code is in no case prevented or interfered with solely because | ||
| modification has been made. | ||
| If you convey an object code work under this section in, or with, or | ||
| specifically for use in, a User Product, and the conveying occurs as | ||
| part of a transaction in which the right of possession and use of the | ||
| User Product is transferred to the recipient in perpetuity or for a | ||
| fixed term (regardless of how the transaction is characterized), the | ||
| Corresponding Source conveyed under this section must be accompanied | ||
| by the Installation Information. But this requirement does not apply | ||
| if neither you nor any third party retains the ability to install | ||
| modified object code on the User Product (for example, the work has | ||
| been installed in ROM). | ||
| The requirement to provide Installation Information does not include a | ||
| requirement to continue to provide support service, warranty, or updates | ||
| for a work that has been modified or installed by the recipient, or for | ||
| the User Product in which it has been modified or installed. Access to a | ||
| network may be denied when the modification itself materially and | ||
| adversely affects the operation of the network or violates the rules and | ||
| protocols for communication across the network. | ||
| Corresponding Source conveyed, and Installation Information provided, | ||
| in accord with this section must be in a format that is publicly | ||
| documented (and with an implementation available to the public in | ||
| source code form), and must require no special password or key for | ||
| unpacking, reading or copying. | ||
| 7. Additional Terms. | ||
| "Additional permissions" are terms that supplement the terms of this | ||
| License by making exceptions from one or more of its conditions. | ||
| Additional permissions that are applicable to the entire Program shall | ||
| be treated as though they were included in this License, to the extent | ||
| that they are valid under applicable law. If additional permissions | ||
| apply only to part of the Program, that part may be used separately | ||
| under those permissions, but the entire Program remains governed by | ||
| this License without regard to the additional permissions. | ||
| When you convey a copy of a covered work, you may at your option | ||
| remove any additional permissions from that copy, or from any part of | ||
| it. (Additional permissions may be written to require their own | ||
| removal in certain cases when you modify the work.) You may place | ||
| additional permissions on material, added by you to a covered work, | ||
| for which you have or can give appropriate copyright permission. | ||
| Notwithstanding any other provision of this License, for material you | ||
| add to a covered work, you may (if authorized by the copyright holders of | ||
| that material) supplement the terms of this License with terms: | ||
| a) Disclaiming warranty or limiting liability differently from the | ||
| terms of sections 15 and 16 of this License; or | ||
| b) Requiring preservation of specified reasonable legal notices or | ||
| author attributions in that material or in the Appropriate Legal | ||
| Notices displayed by works containing it; or | ||
| c) Prohibiting misrepresentation of the origin of that material, or | ||
| requiring that modified versions of such material be marked in | ||
| reasonable ways as different from the original version; or | ||
| d) Limiting the use for publicity purposes of names of licensors or | ||
| authors of the material; or | ||
| e) Declining to grant rights under trademark law for use of some | ||
| trade names, trademarks, or service marks; or | ||
| f) Requiring indemnification of licensors and authors of that | ||
| material by anyone who conveys the material (or modified versions of | ||
| it) with contractual assumptions of liability to the recipient, for | ||
| any liability that these contractual assumptions directly impose on | ||
| those licensors and authors. | ||
| All other non-permissive additional terms are considered "further | ||
| restrictions" within the meaning of section 10. If the Program as you | ||
| received it, or any part of it, contains a notice stating that it is | ||
| governed by this License along with a term that is a further | ||
| restriction, you may remove that term. If a license document contains | ||
| a further restriction but permits relicensing or conveying under this | ||
| License, you may add to a covered work material governed by the terms | ||
| of that license document, provided that the further restriction does | ||
| not survive such relicensing or conveying. | ||
| If you add terms to a covered work in accord with this section, you | ||
| must place, in the relevant source files, a statement of the | ||
| additional terms that apply to those files, or a notice indicating | ||
| where to find the applicable terms. | ||
| Additional terms, permissive or non-permissive, may be stated in the | ||
| form of a separately written license, or stated as exceptions; | ||
| the above requirements apply either way. | ||
| 8. Termination. | ||
| You may not propagate or modify a covered work except as expressly | ||
| provided under this License. Any attempt otherwise to propagate or | ||
| modify it is void, and will automatically terminate your rights under | ||
| this License (including any patent licenses granted under the third | ||
| paragraph of section 11). | ||
| However, if you cease all violation of this License, then your | ||
| license from a particular copyright holder is reinstated (a) | ||
| provisionally, unless and until the copyright holder explicitly and | ||
| finally terminates your license, and (b) permanently, if the copyright | ||
| holder fails to notify you of the violation by some reasonable means | ||
| prior to 60 days after the cessation. | ||
| Moreover, your license from a particular copyright holder is | ||
| reinstated permanently if the copyright holder notifies you of the | ||
| violation by some reasonable means, this is the first time you have | ||
| received notice of violation of this License (for any work) from that | ||
| copyright holder, and you cure the violation prior to 30 days after | ||
| your receipt of the notice. | ||
| Termination of your rights under this section does not terminate the | ||
| licenses of parties who have received copies or rights from you under | ||
| this License. If your rights have been terminated and not permanently | ||
| reinstated, you do not qualify to receive new licenses for the same | ||
| material under section 10. | ||
| 9. Acceptance Not Required for Having Copies. | ||
| You are not required to accept this License in order to receive or | ||
| run a copy of the Program. Ancillary propagation of a covered work | ||
| occurring solely as a consequence of using peer-to-peer transmission | ||
| to receive a copy likewise does not require acceptance. However, | ||
| nothing other than this License grants you permission to propagate or | ||
| modify any covered work. These actions infringe copyright if you do | ||
| not accept this License. Therefore, by modifying or propagating a | ||
| covered work, you indicate your acceptance of this License to do so. | ||
| 10. Automatic Licensing of Downstream Recipients. | ||
| Each time you convey a covered work, the recipient automatically | ||
| receives a license from the original licensors, to run, modify and | ||
| propagate that work, subject to this License. You are not responsible | ||
| for enforcing compliance by third parties with this License. | ||
| An "entity transaction" is a transaction transferring control of an | ||
| organization, or substantially all assets of one, or subdividing an | ||
| organization, or merging organizations. If propagation of a covered | ||
| work results from an entity transaction, each party to that | ||
| transaction who receives a copy of the work also receives whatever | ||
| licenses to the work the party's predecessor in interest had or could | ||
| give under the previous paragraph, plus a right to possession of the | ||
| Corresponding Source of the work from the predecessor in interest, if | ||
| the predecessor has it or can get it with reasonable efforts. | ||
| You may not impose any further restrictions on the exercise of the | ||
| rights granted or affirmed under this License. For example, you may | ||
| not impose a license fee, royalty, or other charge for exercise of | ||
| rights granted under this License, and you may not initiate litigation | ||
| (including a cross-claim or counterclaim in a lawsuit) alleging that | ||
| any patent claim is infringed by making, using, selling, offering for | ||
| sale, or importing the Program or any portion of it. | ||
| 11. Patents. | ||
| A "contributor" is a copyright holder who authorizes use under this | ||
| License of the Program or a work on which the Program is based. The | ||
| work thus licensed is called the contributor's "contributor version". | ||
| A contributor's "essential patent claims" are all patent claims | ||
| owned or controlled by the contributor, whether already acquired or | ||
| hereafter acquired, that would be infringed by some manner, permitted | ||
| by this License, of making, using, or selling its contributor version, | ||
| but do not include claims that would be infringed only as a | ||
| consequence of further modification of the contributor version. For | ||
| purposes of this definition, "control" includes the right to grant | ||
| patent sublicenses in a manner consistent with the requirements of | ||
| this License. | ||
| Each contributor grants you a non-exclusive, worldwide, royalty-free | ||
| patent license under the contributor's essential patent claims, to | ||
| make, use, sell, offer for sale, import and otherwise run, modify and | ||
| propagate the contents of its contributor version. | ||
| In the following three paragraphs, a "patent license" is any express | ||
| agreement or commitment, however denominated, not to enforce a patent | ||
| (such as an express permission to practice a patent or covenant not to | ||
| sue for patent infringement). To "grant" such a patent license to a | ||
| party means to make such an agreement or commitment not to enforce a | ||
| patent against the party. | ||
| If you convey a covered work, knowingly relying on a patent license, | ||
| and the Corresponding Source of the work is not available for anyone | ||
| to copy, free of charge and under the terms of this License, through a | ||
| publicly available network server or other readily accessible means, | ||
| then you must either (1) cause the Corresponding Source to be so | ||
| available, or (2) arrange to deprive yourself of the benefit of the | ||
| patent license for this particular work, or (3) arrange, in a manner | ||
| consistent with the requirements of this License, to extend the patent | ||
| license to downstream recipients. "Knowingly relying" means you have | ||
| actual knowledge that, but for the patent license, your conveying the | ||
| covered work in a country, or your recipient's use of the covered work | ||
| in a country, would infringe one or more identifiable patents in that | ||
| country that you have reason to believe are valid. | ||
| If, pursuant to or in connection with a single transaction or | ||
| arrangement, you convey, or propagate by procuring conveyance of, a | ||
| covered work, and grant a patent license to some of the parties | ||
| receiving the covered work authorizing them to use, propagate, modify | ||
| or convey a specific copy of the covered work, then the patent license | ||
| you grant is automatically extended to all recipients of the covered | ||
| work and works based on it. | ||
| A patent license is "discriminatory" if it does not include within | ||
| the scope of its coverage, prohibits the exercise of, or is | ||
| conditioned on the non-exercise of one or more of the rights that are | ||
| specifically granted under this License. You may not convey a covered | ||
| work if you are a party to an arrangement with a third party that is | ||
| in the business of distributing software, under which you make payment | ||
| to the third party based on the extent of your activity of conveying | ||
| the work, and under which the third party grants, to any of the | ||
| parties who would receive the covered work from you, a discriminatory | ||
| patent license (a) in connection with copies of the covered work | ||
| conveyed by you (or copies made from those copies), or (b) primarily | ||
| for and in connection with specific products or compilations that | ||
| contain the covered work, unless you entered into that arrangement, | ||
| or that patent license was granted, prior to 28 March 2007. | ||
| Nothing in this License shall be construed as excluding or limiting | ||
| any implied license or other defenses to infringement that may | ||
| otherwise be available to you under applicable patent law. | ||
| 12. No Surrender of Others' Freedom. | ||
| If conditions are imposed on you (whether by court order, agreement or | ||
| otherwise) that contradict the conditions of this License, they do not | ||
| excuse you from the conditions of this License. If you cannot convey a | ||
| covered work so as to satisfy simultaneously your obligations under this | ||
| License and any other pertinent obligations, then as a consequence you may | ||
| not convey it at all. For example, if you agree to terms that obligate you | ||
| to collect a royalty for further conveying from those to whom you convey | ||
| the Program, the only way you could satisfy both those terms and this | ||
| License would be to refrain entirely from conveying the Program. | ||
| 13. Use with the GNU Affero General Public License. | ||
| Notwithstanding any other provision of this License, you have | ||
| permission to link or combine any covered work with a work licensed | ||
| under version 3 of the GNU Affero General Public License into a single | ||
| combined work, and to convey the resulting work. The terms of this | ||
| License will continue to apply to the part which is the covered work, | ||
| but the special requirements of the GNU Affero General Public License, | ||
| section 13, concerning interaction through a network will apply to the | ||
| combination as such. | ||
| 14. Revised Versions of this License. | ||
| The Free Software Foundation may publish revised and/or new versions of | ||
| the GNU General Public License from time to time. Such new versions will | ||
| be similar in spirit to the present version, but may differ in detail to | ||
| address new problems or concerns. | ||
| Each version is given a distinguishing version number. If the | ||
| Program specifies that a certain numbered version of the GNU General | ||
| Public License "or any later version" applies to it, you have the | ||
| option of following the terms and conditions either of that numbered | ||
| version or of any later version published by the Free Software | ||
| Foundation. If the Program does not specify a version number of the | ||
| GNU General Public License, you may choose any version ever published | ||
| by the Free Software Foundation. | ||
| If the Program specifies that a proxy can decide which future | ||
| versions of the GNU General Public License can be used, that proxy's | ||
| public statement of acceptance of a version permanently authorizes you | ||
| to choose that version for the Program. | ||
| Later license versions may give you additional or different | ||
| permissions. However, no additional obligations are imposed on any | ||
| author or copyright holder as a result of your choosing to follow a | ||
| later version. | ||
| 15. Disclaimer of Warranty. | ||
| THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY | ||
| APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT | ||
| HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY | ||
| OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, | ||
| THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM | ||
| IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF | ||
| ALL NECESSARY SERVICING, REPAIR OR CORRECTION. | ||
| 16. Limitation of Liability. | ||
| IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING | ||
| WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS | ||
| THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY | ||
| GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE | ||
| USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF | ||
| DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD | ||
| PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), | ||
| EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF | ||
| SUCH DAMAGES. | ||
| 17. Interpretation of Sections 15 and 16. | ||
| If the disclaimer of warranty and limitation of liability provided | ||
| above cannot be given local legal effect according to their terms, | ||
| reviewing courts shall apply local law that most closely approximates | ||
| an absolute waiver of all civil liability in connection with the | ||
| Program, unless a warranty or assumption of liability accompanies a | ||
| copy of the Program in return for a fee. | ||
| END OF TERMS AND CONDITIONS | ||
| How to Apply These Terms to Your New Programs | ||
| If you develop a new program, and you want it to be of the greatest | ||
| possible use to the public, the best way to achieve this is to make it | ||
| free software which everyone can redistribute and change under these terms. | ||
| To do so, attach the following notices to the program. It is safest | ||
| to attach them to the start of each source file to most effectively | ||
| state the exclusion of warranty; and each file should have at least | ||
| the "copyright" line and a pointer to where the full notice is found. | ||
| <one line to give the program's name and a brief idea of what it does.> | ||
| Copyright (C) <year> <name of author> | ||
| This program is free software: you can redistribute it and/or modify | ||
| it under the terms of the GNU General Public License as published by | ||
| the Free Software Foundation, either version 3 of the License, or | ||
| (at your option) any later version. | ||
| This program is distributed in the hope that it will be useful, | ||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| GNU General Public License for more details. | ||
| You should have received a copy of the GNU General Public License | ||
| along with this program. If not, see <https://www.gnu.org/licenses/>. | ||
| Also add information on how to contact you by electronic and paper mail. | ||
| If the program does terminal interaction, make it output a short | ||
| notice like this when it starts in an interactive mode: | ||
| <program> Copyright (C) <year> <name of author> | ||
| This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. | ||
| This is free software, and you are welcome to redistribute it | ||
| under certain conditions; type `show c' for details. | ||
| The hypothetical commands `show w' and `show c' should show the appropriate | ||
| parts of the General Public License. Of course, your program's commands | ||
| might be different; for a GUI interface, you would use an "about box". | ||
| You should also get your employer (if you work as a programmer) or school, | ||
| if any, to sign a "copyright disclaimer" for the program, if necessary. | ||
| For more information on this, and how to apply and follow the GNU GPL, see | ||
| <https://www.gnu.org/licenses/>. | ||
| The GNU General Public License does not permit incorporating your program | ||
| into proprietary programs. If your program is a subroutine library, you | ||
| may consider it more useful to permit linking proprietary applications with | ||
| the library. If this is what you want to do, use the GNU Lesser General | ||
| Public License instead of this License. But first, please read | ||
| <https://www.gnu.org/licenses/why-not-lgpl.html>. |
Sorry, the diff of this file is too big to display
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
3737
21.65%562455
-15.32%33
-10.81%