You're Invited:Meet the Socket Team at RSAC and BSidesSF 2026, March 23–26.RSVP
Socket
Book a DemoSign in
Socket

fse

Package Overview
Dependencies
Maintainers
1
Versions
10
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

fse - pypi Package Compare versions

Comparing version
0.0.2
to
0.0.4
fse/exp/sif_variants_cy.c

Sorry, the diff of this file is too big to display

Sorry, the diff of this file is too big to display

+1
-1
Metadata-Version: 1.0
Name: fse
Version: 0.0.2
Version: 0.0.4
Summary: Fast Sentence Embeddings for Gensim

@@ -5,0 +5,0 @@ Home-page: https://github.com/oborchers/Fast_Sentence_Embeddings

@@ -1,5 +0,4 @@

README.md
setup.py
./fse/exp/sif_variants_cy.pyx
./fse/models/sentence2vec_inner.pyx
./fse/exp/sif_variants_cy.c
./fse/models/sentence2vec_inner.c
fse/__init__.py

@@ -6,0 +5,0 @@ fse/benchmark_speed.py

Metadata-Version: 1.0
Name: fse
Version: 0.0.2
Version: 0.0.4
Summary: Fast Sentence Embeddings for Gensim

@@ -5,0 +5,0 @@ Home-page: https://github.com/oborchers/Fast_Sentence_Embeddings

@@ -87,3 +87,3 @@ #!/usr/bin/env python

name='fse',
version='0.0.2',
version='0.0.4',
description='Fast Sentence Embeddings for Gensim',

@@ -98,6 +98,6 @@

Extension('fse.models.sentence2vec_inner',
sources=['./fse/models/sentence2vec_inner.pyx'],
sources=['./fse/models/sentence2vec_inner.c'],
include_dirs=[mod_dir]),
Extension('fse.exp.sif_variants_cy',
sources=['./fse/exp/sif_variants_cy.pyx'],
sources=['./fse/exp/sif_variants_cy.c'],
include_dirs=[dev_dir]),

@@ -104,0 +104,0 @@ ],

#!/usr/bin/env cython
# cython: boundscheck=False
# cython: wraparound=False
# cython: cdivision=True
# cython: embedsignature=True
# coding: utf-8
"""Optimized cython functions for computing SIF embeddings"""
import numpy as np
cimport numpy as np
import cython
import scipy.linalg.blas as fblas
cdef extern from "../voidptr.h":
void* PyCObject_AsVoidPtr(object obj)
ctypedef np.float32_t REAL_t
ctypedef np.int32_t INT_t
ctypedef void (*saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil
cdef saxpy_ptr saxpy=<saxpy_ptr>PyCObject_AsVoidPtr(fblas.saxpy._cpointer)
ctypedef void (*sscal_ptr) (const int *N, const float *alpha, const float *X, const int *incX) nogil
cdef sscal_ptr sscal=<sscal_ptr>PyCObject_AsVoidPtr(fblas.sscal._cpointer)
cdef REAL_t ONEF = <REAL_t>1.0
cdef int ONE = 1
# Variant 6: compute one embedding per sentence by averaging rows of
# model.wv.sif_vectors through typed memoryviews (pure C inner loops, no BLAS).
# Returns a (len(sentences), size) float32 ndarray; rows for empty sentences
# stay all-zero.
# NOTE(review): indentation was stripped when this diff page was scraped; the
# original .pyx is indentation-sensitive, so only comments are added here.
def sif_embeddings_6(sentences, model):
cdef int size = model.vector_size
# 2-D view over the precomputed SIF word vectors — assumes a C-contiguous
# float32 array on model.wv; TODO confirm
cdef float[:,:] vectors = model.wv.sif_vectors
cdef int sentence_index, word_index, d, count = 0
cdef float inv = 1.
# np_sum is bound but never used in this variant (leftover from a prior one)
np_sum = np.sum
output = np.zeros((len(sentences), size), dtype=np.float32)
# sv is a writable view aliasing `output`, so writes land in the result array
cdef float[:,:] sv = output
for sentence_index, sentence in enumerate(sentences):
# skip empty sentences to avoid dividing by a zero count
if len(sentence) > 0:
count = 0
# presumably each sentence is a sequence of int word indices — verify caller
for word_index in sentence:
count += 1
# accumulate this word's vector into the sentence row
for d in range(size):
sv[sentence_index, d] += vectors[word_index, d]
# single division by the word count turns the sum into a mean
inv = (1./ <float>count)
for d in range(size):
sv[sentence_index, d] *= inv
return output
# Variant 7: same averaging as variant 6, but the per-sentence work is moved
# into the nogil helper sif_embeddings_7_cloop. The int[:] view assignment
# requires each sentence to expose an int32 buffer (e.g. an int32 ndarray) —
# TODO confirm at the call site.
# NOTE(review): source indentation was lost in the page scrape.
def sif_embeddings_7(sentences, model):
cdef int size = model.vector_size
cdef float[:,:] vectors = model.wv.sif_vectors
# np_sum / np_asarray are bound but unused in this variant (dead locals)
np_sum = np.sum
np_asarray = np.asarray
output = np.zeros((len(sentences), size), dtype=np.float32)
# sv aliases `output` so the C helper writes directly into the result
cdef float[:,:] sv = output
cdef int[:] sentence_view
cdef int sentence_len
for i in xrange(len(sentences)):
# empty sentences are skipped; their output row stays zero
if len(sentences[i]) > 0:
sentence_view = sentences[i]
sentence_len = len(sentences[i])
sif_embeddings_7_cloop(size, sentence_view, sentence_len, i, vectors, sv)
return output
# GIL-free inner loop for variant 7: sums the word vectors of one sentence into
# summary_vectors[sentence_idx, :], then scales the row by 1/word_count (mean).
# NOTE(review): source indentation was lost in the page scrape.
cdef void sif_embeddings_7_cloop(int size, int[:] sentence_view, int sentence_len, int sentence_idx, float[:,:] vectors, float[:,:] summary_vectors) nogil:
cdef int i,d, word_index, count = 0
cdef float inv = 1.
for i in xrange(sentence_len):
word_index = sentence_view[i]
count += 1
for d in xrange(size):
summary_vectors[sentence_idx, d] += vectors[word_index, d]
# count == sentence_len here; with cdivision=True there is no zero-check,
# so the caller must guarantee sentence_len > 0 (it does, via len(...) > 0)
inv = (1./ <float>count)
for d in xrange(size):
summary_vectors[sentence_idx, d] *= inv
# Variant 8: raw-pointer + BLAS version. PyArray_DATA yields bare C pointers
# into the numpy buffers, and the accumulation is delegated to saxpy/sscal in
# the nogil helper sif_embeddings_8_inner.
# NOTE(review): source indentation was lost in the page scrape.
def sif_embeddings_8(sentences, model):
cdef int size = model.vector_size
# assumes model.wv.sif_vectors is a C-contiguous float32 ndarray — TODO confirm
cdef REAL_t *vectors = <REAL_t *>(np.PyArray_DATA(model.wv.sif_vectors))
output = np.zeros((len(sentences), size), dtype=np.float32)
# sv points at the freshly zeroed result buffer; the helper accumulates into it
cdef REAL_t *sv = <REAL_t *>(np.PyArray_DATA(output))
cdef INT_t *sentence_view
for i in xrange(len(sentences)):
if len(sentences[i]):
# each sentence must be an int32 ndarray for this pointer cast to be valid
sentence_view = <INT_t *>(np.PyArray_DATA(sentences[i]))
sentence_len = len(sentences[i])
sif_embeddings_8_inner(size, sentence_view, sentence_len, i, vectors, sv)
return output
# GIL-free BLAS loop for variant 8: saxpy adds each word vector (alpha = 1.0)
# into the sentence row, then sscal multiplies the row by 1/count — an
# unweighted mean of the sentence's word vectors.
# NOTE(review): source indentation was lost in the page scrape.
cdef void sif_embeddings_8_inner(const int size, const INT_t *sentence_view, const int sentence_len,
const int sentence_idx, const REAL_t *vectors, REAL_t *summary_vectors) nogil:
cdef int i,d, word_index
cdef REAL_t inv = ONEF, count = <REAL_t> 0.
for i in xrange(sentence_len):
count += ONEF
word_index = sentence_view[i]
# summary_vectors[row] += 1.0 * vectors[word_index] (contiguous, stride 1)
saxpy(&size, &ONEF, &vectors[word_index * size], &ONE, &summary_vectors[sentence_idx * size], &ONE)
# cdivision=True: no zero-check — caller guarantees sentence_len > 0
inv = ONEF / count
sscal(&size, &inv, &summary_vectors[sentence_idx * size], &ONE)
#!/usr/bin/env cython
# cython: boundscheck=False
# cython: wraparound=False
# cython: cdivision=True
# cython: embedsignature=True
# coding: utf-8
# Author: Oliver Borchers <borchers@bwl.uni-mannheim.de>
# Copyright (C) 2019 Oliver Borchers
"""Optimized cython functions for computing sentence embeddings"""
import numpy as np
cimport numpy as np
import cython
import scipy.linalg.blas as fblas
cdef extern from "../voidptr.h":
void* PyCObject_AsVoidPtr(object obj)
REAL = np.float32
INT = np.intc
ctypedef np.float32_t REAL_t
ctypedef np.int32_t INT_t
# Type definitions and pointers for BLAS routines
# See http://www.netlib.org/blas/#_level_1 for more information
ctypedef void (*saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil
cdef saxpy_ptr saxpy=<saxpy_ptr>PyCObject_AsVoidPtr(fblas.saxpy._cpointer)
ctypedef void (*sscal_ptr) (const int *N, const float *alpha, const float *X, const int *incX) nogil
cdef sscal_ptr sscal=<sscal_ptr>PyCObject_AsVoidPtr(fblas.sscal._cpointer)
cdef REAL_t ONEF = <REAL_t>1.0
cdef int ONE = 1
cdef REAL_t EPS = <REAL_t>1e-8
full = np.full
empty = np.empty
# Weighted sentence-embedding training loop: maps each sentence's in-vocabulary
# tokens to indices, then delegates the weighted BLAS accumulation to
# s2v_train_core. Rows for sentences that end up empty keep their EPS fill.
# NOTE(review): source indentation was lost in the page scrape; only comments
# are added here.
def s2v_train(sentences, len_sentences, wv, weights):
"""Train sentence embedding on a list of sentences
Called internally from :meth:`~fse.models.sentence2vec.Sentence2Vec.train`.
Parameters
----------
sentences : iterable of list of str
The corpus used to train the model.
len_sentences : int
Length of the sentence iterable
wv : :class:`~gensim.models.keyedvectors.BaseKeyedVectors`
The BaseKeyedVectors instance containing the vectors used for training
weights : np.ndarray
Weights used in the summation of the vectors
Returns
-------
np.ndarray
The sentence embedding matrix of dim len(sentences) * vector_size
int
Number of words in the vocabulary actually used for training.
int
Number of sentences used for training.
"""
# Setup variables
cdef int len_sen = len_sentences, effective_words = 0, effective_sentences = 0
cdef int i, sentence_len, size = wv.vector_size
# raw float32 pointers into the gensim word vectors and the weight array —
# assumes both are C-contiguous float32 ndarrays; TODO confirm at call site
cdef REAL_t *vectors = <REAL_t *>(np.PyArray_DATA(wv.vectors))
cdef REAL_t *vec_weights = <REAL_t *>(np.PyArray_DATA(weights))
# Materialize output array iteratively
# We do start from a matrix with EPS values to prohibit divisions by zero in subsequent applications
output = empty((len_sen, size), dtype=REAL)
for i in xrange(len_sen):
output[i] = full(size, EPS, dtype=REAL)
cdef REAL_t *output_view = <REAL_t *>(np.PyArray_DATA(output))
cdef INT_t *sentence_view
cdef str w
vlookup = wv.vocab
# hoist the numpy lookup out of the loop (as_array is called per sentence)
as_array = np.asarray
for i, s in enumerate(sentences):
# keep only in-vocabulary tokens; OOV words are silently dropped
sentence_idx = as_array([vlookup[w].index for w in s if w in vlookup], dtype=INT)
sentence_len = len(sentence_idx)
# sentences empty after OOV filtering are skipped (their row stays EPS)
if sentence_len:
effective_words += sentence_len
effective_sentences += ONE
sentence_view = <INT_t *>(np.PyArray_DATA(sentence_idx))
s2v_train_core(size, sentence_view, sentence_len, i, vectors, output_view, vec_weights)
return output, effective_words, effective_sentences
# GIL-free core for s2v_train: for each word index w in the sentence it does
# sent_vecs[row] += vec_weights[w] * vectors[w] via saxpy, then scales the row
# by 1/word_count via sscal — i.e. a per-word-weighted average of word vectors.
# NOTE(review): source indentation was lost in the page scrape.
cdef void s2v_train_core(const int size, const INT_t *sentence_view, const int sentence_len,
const int sentence_idx, const REAL_t *vectors, REAL_t *sent_vecs,
const REAL_t *vec_weights) nogil:
cdef int i,d, word_index
cdef REAL_t inv = ONEF, count = <REAL_t> 0.
for i in xrange(sentence_len):
count += ONEF
word_index = sentence_view[i]
# the per-word scalar weight vec_weights[word_index] is the saxpy alpha
saxpy(&size, &vec_weights[word_index], &vectors[word_index * size], &ONE, &sent_vecs[sentence_idx * size], &ONE)
# cdivision=True: no zero-check — caller guarantees sentence_len > 0
inv = ONEF / count
sscal(&size, &inv, &sent_vecs[sentence_idx * size], &ONE)
Fast Sentence Embeddings (fse)
==================================
Fast Sentence Embeddings is a Python library that serves as an addition to Gensim. This library is intended to compute *summary vectors* for large collections of sentences or documents.
Features
------------
Find the corresponding blog post here: https://medium.com/@oliverbor/fse-2b1ffa791cf9
**fse** implements two algorithms for sentence embeddings. You can choose
between *unweighted sentence averages* and *smooth inverse frequency averages*.
In order to use the **fse** model, you first need some pre-trained embedding
gensim embedding model, which is then used by **fse** to compute the sentence embeddings.
After computing sentence embeddings, you can use them in supervised or
unsupervised NLP applications, as they serve as a formidable baseline.
The models here are based on the smooth inverse frequency embeddings [1]
and the deep-averaging networks [2].
Credit is due to Radim Řehůřek and all contributors for the **awesome** library
and code that gensim provides.
Installation
------------
This software depends on [NumPy, Scipy, Scikit-learn, Gensim, and Wordfreq].
You must have them installed prior to installing fse.
As with gensim, it is also recommended you install a fast BLAS library
before installing fse.
The simple way to install **fse** is:
pip install fse
In case you want to build from the source, just run:
python setup.py install
Exemplary application
-------------
In order to use **fse** you must first estimate a Gensim model which contains a
gensim.models.keyedvectors.BaseKeyedVectors class, for example
*Word2Vec* or *Fasttext*. Then you can proceed to compute sentence embeddings
for a corpus.
The current version does not offer multi-core support out of the box.
from gensim.models import Word2Vec
sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
model = Word2Vec(sentences, min_count=1)
from fse.models import Sentence2Vec
se = Sentence2Vec(model)
sentences_emb = se.train(sentences)
Sentence Prediction.ipynb contains an example of how to use the library
with a pre-trained Word2Vec model. compute_sif.py trains a Word2Vec model
on a corpus (e.g., brown) and benchmark_speed.py reproduces the results from
the Medium post.
ToDos
-------------
**[ ]** Various Bugfixes
**[ ]** Feature Testing
**[ ]** Support TaggedLineDocument from Doc2Vec
**[ ]** Multi Core Implementation
**[ ]** Direct-to-disc estimation to avoid RAM shortage (perhaps?)
**[ ]** Propose as a gensim feature (perhaps?)
Literature
-------------
1. Arora S, Liang Y, Ma T (2017) A Simple but Tough-to-Beat Baseline for Sentence
Embeddings. Int. Conf. Learn. Represent. (Toulon, France), 1–16.
2. Iyyer M, Manjunatha V, Boyd-Graber J, Daumé III H (2015) Deep Unordered
Composition Rivals Syntactic Methods for Text Classification. Proc. 53rd Annu.
Meet. Assoc. Comput. Linguist. 7th Int. Jt. Conf. Nat. Lang. Process., 1681–1691.
3. Eneko Agirre, Daniel Cer, Mona Diab, Iñigo Lopez-Gazpio, Lucia Specia. Semeval-2017 Task 1: Semantic Textual Similarity Multilingual and Crosslingual Focused Evaluation. Proceedings of SemEval 2017.
4. Duong, Chi Thang, Remi Lebret, and Karl Aberer. “Multimodal Classification for Analysing Social Media.” The 27th European Conference on Machine Learning and Principles and Practice of Knowledge Discovery in Databases (ECML-PKDD), 2017
Credits
-------------
The STS dataset was released by [3].
The Reddit dataset was released by [4]: https://emoclassifier.github.io
Copyright
-------------
Author: Oliver Borchers <borchers@bwl.uni-mannheim.de>
Copyright (C) 2019 Oliver Borchers