fse
Advanced tools
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is too big to display
| Metadata-Version: 1.0 | ||
| Name: fse | ||
| Version: 0.0.2 | ||
| Version: 0.0.4 | ||
| Summary: Fast Sentence Embeddings for Gensim | ||
@@ -5,0 +5,0 @@ Home-page: https://github.com/oborchers/Fast_Sentence_Embeddings |
@@ -1,5 +0,4 @@ | ||
| README.md | ||
| setup.py | ||
| ./fse/exp/sif_variants_cy.pyx | ||
| ./fse/models/sentence2vec_inner.pyx | ||
| ./fse/exp/sif_variants_cy.c | ||
| ./fse/models/sentence2vec_inner.c | ||
| fse/__init__.py | ||
@@ -6,0 +5,0 @@ fse/benchmark_speed.py |
+1
-1
| Metadata-Version: 1.0 | ||
| Name: fse | ||
| Version: 0.0.2 | ||
| Version: 0.0.4 | ||
| Summary: Fast Sentence Embeddings for Gensim | ||
@@ -5,0 +5,0 @@ Home-page: https://github.com/oborchers/Fast_Sentence_Embeddings |
+3
-3
@@ -87,3 +87,3 @@ #!/usr/bin/env python | ||
| name='fse', | ||
| version='0.0.2', | ||
| version='0.0.4', | ||
| description='Fast Sentence Embeddings for Gensim', | ||
@@ -98,6 +98,6 @@ | ||
| Extension('fse.models.sentence2vec_inner', | ||
| sources=['./fse/models/sentence2vec_inner.pyx'], | ||
| sources=['./fse/models/sentence2vec_inner.c'], | ||
| include_dirs=[mod_dir]), | ||
| Extension('fse.exp.sif_variants_cy', | ||
| sources=['./fse/exp/sif_variants_cy.pyx'], | ||
| sources=['./fse/exp/sif_variants_cy.c'], | ||
| include_dirs=[dev_dir]), | ||
@@ -104,0 +104,0 @@ ], |
#!/usr/bin/env cython
# cython: boundscheck=False
# cython: wraparound=False
# cython: cdivision=True
# cython: embedsignature=True
# coding: utf-8
"""Optimized cython functions for computing SIF embeddings"""
import numpy as np
cimport numpy as np
import cython
import scipy.linalg.blas as fblas
# Extract the raw C function pointer that scipy wraps around each BLAS routine
cdef extern from "../voidptr.h":
    void* PyCObject_AsVoidPtr(object obj)
# Precision types used throughout: single-precision floats, 32-bit word indices
ctypedef np.float32_t REAL_t
ctypedef np.int32_t INT_t
# Function-pointer typedefs for the level-1 BLAS routines used below
# (saxpy: y += alpha * x; sscal: x *= alpha)
ctypedef void (*saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil
cdef saxpy_ptr saxpy=<saxpy_ptr>PyCObject_AsVoidPtr(fblas.saxpy._cpointer)
ctypedef void (*sscal_ptr) (const int *N, const float *alpha, const float *X, const int *incX) nogil
cdef sscal_ptr sscal=<sscal_ptr>PyCObject_AsVoidPtr(fblas.sscal._cpointer)
# Constants passed by pointer to the BLAS calls
cdef REAL_t ONEF = <REAL_t>1.0
cdef int ONE = 1
def sif_embeddings_6(sentences, model):
    """Compute unweighted sentence embeddings using typed memoryviews.

    Parameters
    ----------
    sentences : iterable of iterable of int
        Each sentence is a sequence of word indices into ``model.wv.sif_vectors``.
    model : gensim model
        Must expose ``vector_size`` and ``wv.sif_vectors`` (float32 matrix).

    Returns
    -------
    np.ndarray
        Matrix of shape (len(sentences), vector_size) holding the mean word
        vector of each non-empty sentence; rows for empty sentences stay zero.
    """
    cdef int size = model.vector_size
    cdef float[:,:] vectors = model.wv.sif_vectors
    cdef int sentence_index, word_index, d, count = 0
    cdef float inv = 1.
    output = np.zeros((len(sentences), size), dtype=np.float32)
    # Memoryview onto the output array so the loops below run at C speed
    cdef float[:,:] sv = output
    for sentence_index, sentence in enumerate(sentences):
        if len(sentence) > 0:
            count = 0
            for word_index in sentence:
                count += 1
                for d in range(size):
                    sv[sentence_index, d] += vectors[word_index, d]
            # Divide the accumulated sum by the word count; the len() guard
            # above ensures count > 0 here.
            inv = (1./ <float>count)
            for d in range(size):
                sv[sentence_index, d] *= inv
    return output
def sif_embeddings_7(sentences, model):
    """Compute unweighted sentence embeddings, delegating the per-sentence
    accumulation to a nogil C helper (:func:`sif_embeddings_7_cloop`).

    Parameters
    ----------
    sentences : iterable of int32 buffers
        Each sentence must be assignable to an ``int[:]`` memoryview
        (e.g. a contiguous int32 numpy array of word indices).
    model : gensim model
        Must expose ``vector_size`` and ``wv.sif_vectors`` (float32 matrix).

    Returns
    -------
    np.ndarray
        Matrix of shape (len(sentences), vector_size); rows for empty
        sentences remain zero.
    """
    cdef int size = model.vector_size
    cdef float[:,:] vectors = model.wv.sif_vectors
    output = np.zeros((len(sentences), size), dtype=np.float32)
    cdef float[:,:] sv = output
    cdef int[:] sentence_view
    cdef int sentence_len
    for i in xrange(len(sentences)):
        if len(sentences[i]) > 0:
            sentence_view = sentences[i]
            sentence_len = len(sentences[i])
            # All heavy lifting happens in the nogil helper
            sif_embeddings_7_cloop(size, sentence_view, sentence_len, i, vectors, sv)
    return output
cdef void sif_embeddings_7_cloop(int size, int[:] sentence_view, int sentence_len, int sentence_idx, float[:,:] vectors, float[:,:] summary_vectors) nogil:
    # Accumulate the word vectors of one sentence into
    # summary_vectors[sentence_idx] and divide by the word count.
    # Runs without the GIL; the caller guarantees sentence_len > 0,
    # so the final division is safe.
    cdef int i,d, word_index, count = 0
    cdef float inv = 1.
    for i in xrange(sentence_len):
        word_index = sentence_view[i]
        count += 1
        for d in xrange(size):
            summary_vectors[sentence_idx, d] += vectors[word_index, d]
    # count == sentence_len at this point
    inv = (1./ <float>count)
    for d in xrange(size):
        summary_vectors[sentence_idx, d] *= inv
def sif_embeddings_8(sentences, model):
    """Compute unweighted sentence embeddings using raw data pointers and
    BLAS calls (:func:`sif_embeddings_8_inner`).

    Parameters
    ----------
    sentences : iterable of int32 numpy arrays
        Each sentence is a contiguous int32 array of word indices
        (its data pointer is handed straight to the C helper).
    model : gensim model
        Must expose ``vector_size`` and ``wv.sif_vectors`` (contiguous
        float32 matrix).

    Returns
    -------
    np.ndarray
        Matrix of shape (len(sentences), vector_size); rows for empty
        sentences remain zero.
    """
    cdef int size = model.vector_size
    # Declare sentence_len as a C int (previously untyped, which forced a
    # Python-object conversion on every call) — consistent with sif_embeddings_7.
    cdef int sentence_len
    cdef REAL_t *vectors = <REAL_t *>(np.PyArray_DATA(model.wv.sif_vectors))
    output = np.zeros((len(sentences), size), dtype=np.float32)
    cdef REAL_t *sv = <REAL_t *>(np.PyArray_DATA(output))
    cdef INT_t *sentence_view
    for i in xrange(len(sentences)):
        if len(sentences[i]):
            sentence_view = <INT_t *>(np.PyArray_DATA(sentences[i]))
            sentence_len = len(sentences[i])
            sif_embeddings_8_inner(size, sentence_view, sentence_len, i, vectors, sv)
    return output
cdef void sif_embeddings_8_inner(const int size, const INT_t *sentence_view, const int sentence_len,
                                 const int sentence_idx, const REAL_t *vectors, REAL_t *summary_vectors) nogil:
    # BLAS-based accumulation for one sentence, operating on raw pointers.
    # saxpy adds each word vector into the sentence's output row; sscal then
    # rescales the row by 1/count to form the mean. The caller guarantees
    # sentence_len > 0, so count is never zero at the division.
    cdef int i,d, word_index
    cdef REAL_t inv = ONEF, count = <REAL_t> 0.
    for i in xrange(sentence_len):
        count += ONEF
        word_index = sentence_view[i]
        # y += 1.0 * x  where x is the word vector, y the output row
        saxpy(&size, &ONEF, &vectors[word_index * size], &ONE, &summary_vectors[sentence_idx * size], &ONE)
    inv = ONEF / count
    # Row *= 1/count
    sscal(&size, &inv, &summary_vectors[sentence_idx * size], &ONE)
#!/usr/bin/env cython
# cython: boundscheck=False
# cython: wraparound=False
# cython: cdivision=True
# cython: embedsignature=True
# coding: utf-8
# Author: Oliver Borchers <borchers@bwl.uni-mannheim.de>
# Copyright (C) 2019 Oliver Borchers
"""Optimized cython functions for computing sentence embeddings"""
import numpy as np
cimport numpy as np
import cython
import scipy.linalg.blas as fblas
# Extract the raw C function pointer that scipy wraps around each BLAS routine
cdef extern from "../voidptr.h":
    void* PyCObject_AsVoidPtr(object obj)
# Python-level dtype aliases matching the C typedefs below
REAL = np.float32
INT = np.intc
ctypedef np.float32_t REAL_t
ctypedef np.int32_t INT_t
# Type definitions and pointers for BLAS routines
# See http://www.netlib.org/blas/#_level_1 for more information
# (saxpy: y += alpha * x; sscal: x *= alpha)
ctypedef void (*saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil
cdef saxpy_ptr saxpy=<saxpy_ptr>PyCObject_AsVoidPtr(fblas.saxpy._cpointer)
ctypedef void (*sscal_ptr) (const int *N, const float *alpha, const float *X, const int *incX) nogil
cdef sscal_ptr sscal=<sscal_ptr>PyCObject_AsVoidPtr(fblas.sscal._cpointer)
# Constants passed by pointer to the BLAS calls; EPS seeds the output matrix
# to avoid divisions by zero downstream
cdef REAL_t ONEF = <REAL_t>1.0
cdef int ONE = 1
cdef REAL_t EPS = <REAL_t>1e-8
full = np.full
empty = np.empty
def s2v_train(sentences, len_sentences, wv, weights):
    """Train sentence embedding on a list of sentences

    Called internally from :meth:`~fse.models.sentence2vec.Sentence2Vec.train`.

    Parameters
    ----------
    sentences : iterable of list of str
        The corpus used to train the model.
    len_sentences : int
        Length of the sentence iterable
    wv : :class:`~gensim.models.keyedvectors.BaseKeyedVectors`
        The BaseKeyedVectors instance containing the vectors used for training
    weights : np.ndarray
        Weights used in the summation of the vectors

    Returns
    -------
    np.ndarray
        The sentence embedding matrix of dim len(sentences) * vector_size
    int
        Number of words in the vocabulary actually used for training.
    int
        Number of sentences used for training.
    """
    # Setup variables
    cdef int len_sen = len_sentences, effective_words = 0, effective_sentences = 0
    cdef int i, sentence_len, size = wv.vector_size
    cdef REAL_t *vectors = <REAL_t *>(np.PyArray_DATA(wv.vectors))
    cdef REAL_t *vec_weights = <REAL_t *>(np.PyArray_DATA(weights))
    # Start from a matrix of EPS values to prohibit divisions by zero in
    # subsequent applications. A single full() call replaces the previous
    # empty() allocation that was then overwritten row-by-row, which created
    # one throwaway temporary array per sentence.
    output = full((len_sen, size), EPS, dtype=REAL)
    cdef REAL_t *output_view = <REAL_t *>(np.PyArray_DATA(output))
    cdef INT_t *sentence_view
    cdef str w
    vlookup = wv.vocab
    as_array = np.asarray
    for i, s in enumerate(sentences):
        # Map in-vocabulary words to their indices; OOV words are skipped
        sentence_idx = as_array([vlookup[w].index for w in s if w in vlookup], dtype=INT)
        sentence_len = len(sentence_idx)
        if sentence_len:
            effective_words += sentence_len
            effective_sentences += ONE
            sentence_view = <INT_t *>(np.PyArray_DATA(sentence_idx))
            s2v_train_core(size, sentence_view, sentence_len, i, vectors, output_view, vec_weights)
    return output, effective_words, effective_sentences
cdef void s2v_train_core(const int size, const INT_t *sentence_view, const int sentence_len,
                         const int sentence_idx, const REAL_t *vectors, REAL_t *sent_vecs,
                         const REAL_t *vec_weights) nogil:
    # Weighted accumulation for one sentence, operating on raw pointers.
    # For each word, saxpy adds vec_weights[word] * word_vector into the
    # sentence's output row; sscal then rescales the row by 1/count.
    # The caller guarantees sentence_len > 0, so count is never zero here.
    cdef int i,d, word_index
    cdef REAL_t inv = ONEF, count = <REAL_t> 0.
    for i in xrange(sentence_len):
        count += ONEF
        word_index = sentence_view[i]
        # row += weight[word] * vectors[word]
        saxpy(&size, &vec_weights[word_index], &vectors[word_index * size], &ONE, &sent_vecs[sentence_idx * size], &ONE)
    inv = ONEF / count
    # row *= 1/count
    sscal(&size, &inv, &sent_vecs[sentence_idx * size], &ONE)
-104
| Fast Sentence Embeddings (fse) | ||
| ================================== | ||
| Fast Sentence Embeddings is a Python library that serves as an addition to Gensim. This library is intended to compute *summary vectors* for large collections of sentences or documents. | ||
| Features | ||
| ------------ | ||
| Find the corresponding blog post here: https://medium.com/@oliverbor/fse-2b1ffa791cf9 | ||
| **fse** implements two algorithms for sentence embeddings. You can choose | ||
| between *unweighted sentence averages* and *smooth inverse frequency averages*. | ||
| In order to use the **fse** model, you first need some pre-trained embedding | ||
| gensim embedding model, which is then used by **fse** to compute the sentence embeddings. | ||
| After computing sentence embeddings, you can use them in supervised or | ||
| unsupervised NLP applications, as they serve as a formidable baseline. | ||
| The models here are based on the smooth inverse frequency embeddings [1] | ||
| and the deep-averaging networks [2]. | ||
| Credit is due to Radim Řehůřek and all contributors for the **awesome** library | ||
| and code that gensim provides. | ||
| Installation | ||
| ------------ | ||
| This software depends on [NumPy, Scipy, Scikit-learn, Gensim, and Wordfreq]. | ||
| You must have them installed prior to installing fse. | ||
| As with gensim, it is also recommended you install a fast BLAS library | ||
| before installing fse. | ||
| The simple way to install **fse** is: | ||
| pip install fse | ||
| In case you want to build from the source, just run: | ||
| python setup.py install | ||
| Exemplary application | ||
| ------------- | ||
| In order to use **fse** you must first estimate a Gensim model which contains a | ||
| gensim.models.keyedvectors.BaseKeyedVectors class, for example | ||
| *Word2Vec* or *Fasttext*. Then you can proceed to compute sentence embeddings | ||
| for a corpus. | ||
| The current version does not offer multi-core support out of the box. | ||
| from gensim.models import Word2Vec | ||
| sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] | ||
| model = Word2Vec(sentences, min_count=1) | ||
| from fse.models import Sentence2Vec | ||
| se = Sentence2Vec(model) | ||
| sentences_emb = se.train(sentences) | ||
| Sentence Prediction.ipynb contains an example of how to use the library | ||
| with a pre-trained Word2Vec model. compute_sif.py trains a Word2Vec model | ||
| on a corpus (i.e, brown) and benchmark_speed.py reproduces the results from | ||
| the Medium post. | ||
| ToDos | ||
| ------------- | ||
| **[ ]** Various Bugfixes | ||
| **[ ]** Feature Testing | ||
| **[ ]** Support TaggedLineDocument from Doc2Vec | ||
| **[ ]** Multi Core Implementation | ||
| **[ ]** Direct to disc-estimation to avoid RAM shortage (perhaps?) | ||
| **[ ]** Propose as a gensim feature (perhaps?) | ||
| Literature | ||
| ------------- | ||
| 1. Arora S, Liang Y, Ma T (2017) A Simple but Tough-to-Beat Baseline for Sentence | ||
| Embeddings. Int. Conf. Learn. Represent. (Toulon, France), 1–16. | ||
| 2. Iyyer M, Manjunatha V, Boyd-Graber J, Daumé III H (2015) Deep Unordered | ||
| Composition Rivals Syntactic Methods for Text Classification. Proc. 53rd Annu. | ||
| Meet. Assoc. Comput. Linguist. 7th Int. Jt. Conf. Nat. Lang. Process., 1681–1691. | ||
| 3. Eneko Agirre, Daniel Cer, Mona Diab, Iñigo Lopez-Gazpio, Lucia Specia. Semeval-2017 Task 1: Semantic Textual Similarity Multilingual and Crosslingual Focused Evaluation. Proceedings of SemEval 2017. | ||
| 4. Duong, Chi Thang, Remi Lebret, and Karl Aberer. “Multimodal Classification for Analysing Social Media.” The 27th European Conference on Machine Learning and Principles and Practice of Knowledge Discovery in Databases (ECML-PKDD), 2017 | ||
| Credits | ||
| ------------- | ||
| The STS dataset was released by [3]. | ||
| The Reddit dataset was released by [4]: https://emoclassifier.github.io | ||
| Copyright | ||
| ------------- | ||
| Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| Copyright (C) 2019 Oliver Borchers |
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
1297233
2982.56%18
-5.26%