fse
Advanced tools
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is too big to display
| Metadata-Version: 1.0 | ||
| Name: fse | ||
| Version: 0.0.2 | ||
| Version: 0.0.4 | ||
| Summary: Fast Sentence Embeddings for Gensim | ||
@@ -5,0 +5,0 @@ Home-page: https://github.com/oborchers/Fast_Sentence_Embeddings |
@@ -1,5 +0,4 @@ | ||
| README.md | ||
| setup.py | ||
| ./fse/exp/sif_variants_cy.pyx | ||
| ./fse/models/sentence2vec_inner.pyx | ||
| ./fse/exp/sif_variants_cy.c | ||
| ./fse/models/sentence2vec_inner.c | ||
| fse/__init__.py | ||
@@ -6,0 +5,0 @@ fse/benchmark_speed.py |
+1
-1
| Metadata-Version: 1.0 | ||
| Name: fse | ||
| Version: 0.0.2 | ||
| Version: 0.0.4 | ||
| Summary: Fast Sentence Embeddings for Gensim | ||
@@ -5,0 +5,0 @@ Home-page: https://github.com/oborchers/Fast_Sentence_Embeddings |
+3
-3
@@ -87,3 +87,3 @@ #!/usr/bin/env python | ||
| name='fse', | ||
| version='0.0.2', | ||
| version='0.0.4', | ||
| description='Fast Sentence Embeddings for Gensim', | ||
@@ -98,6 +98,6 @@ | ||
| Extension('fse.models.sentence2vec_inner', | ||
| sources=['./fse/models/sentence2vec_inner.pyx'], | ||
| sources=['./fse/models/sentence2vec_inner.c'], | ||
| include_dirs=[mod_dir]), | ||
| Extension('fse.exp.sif_variants_cy', | ||
| sources=['./fse/exp/sif_variants_cy.pyx'], | ||
| sources=['./fse/exp/sif_variants_cy.c'], | ||
| include_dirs=[dev_dir]), | ||
@@ -104,0 +104,0 @@ ], |
#!/usr/bin/env cython
# cython: boundscheck=False
# cython: wraparound=False
# cython: cdivision=True
# cython: embedsignature=True
# coding: utf-8
"""Optimized cython functions for computing SIF embeddings"""
import numpy as np
cimport numpy as np
import cython
import scipy.linalg.blas as fblas
# Extract the raw C function pointer that scipy wraps around each BLAS routine
cdef extern from "../voidptr.h":
    void* PyCObject_AsVoidPtr(object obj)
# Precision types used throughout: single-precision floats, 32-bit word indices
ctypedef np.float32_t REAL_t
ctypedef np.int32_t INT_t
# Function-pointer typedefs for the level-1 BLAS routines used below
# (saxpy: y += alpha * x; sscal: x *= alpha)
ctypedef void (*saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil
cdef saxpy_ptr saxpy=<saxpy_ptr>PyCObject_AsVoidPtr(fblas.saxpy._cpointer)
ctypedef void (*sscal_ptr) (const int *N, const float *alpha, const float *X, const int *incX) nogil
cdef sscal_ptr sscal=<sscal_ptr>PyCObject_AsVoidPtr(fblas.sscal._cpointer)
# Constants passed by pointer to the BLAS calls
cdef REAL_t ONEF = <REAL_t>1.0
cdef int ONE = 1
def sif_embeddings_6(sentences, model):
    """Compute unweighted sentence embeddings using typed memoryviews.

    Parameters
    ----------
    sentences : iterable of iterable of int
        Each sentence is a sequence of word indices into ``model.wv.sif_vectors``.
    model : gensim model
        Must expose ``vector_size`` and ``wv.sif_vectors`` (float32 matrix).

    Returns
    -------
    np.ndarray
        Matrix of shape (len(sentences), vector_size) holding the mean word
        vector of each non-empty sentence; rows for empty sentences stay zero.
    """
    cdef int size = model.vector_size
    cdef float[:,:] vectors = model.wv.sif_vectors
    cdef int sentence_index, word_index, d, count = 0
    cdef float inv = 1.
    output = np.zeros((len(sentences), size), dtype=np.float32)
    # Memoryview onto the output array so the loops below run at C speed
    cdef float[:,:] sv = output
    for sentence_index, sentence in enumerate(sentences):
        if len(sentence) > 0:
            count = 0
            for word_index in sentence:
                count += 1
                for d in range(size):
                    sv[sentence_index, d] += vectors[word_index, d]
            # Divide the accumulated sum by the word count; the len() guard
            # above ensures count > 0 here.
            inv = (1./ <float>count)
            for d in range(size):
                sv[sentence_index, d] *= inv
    return output
def sif_embeddings_7(sentences, model):
    """Compute unweighted sentence embeddings, delegating the per-sentence
    accumulation to a nogil C helper (:func:`sif_embeddings_7_cloop`).

    Parameters
    ----------
    sentences : iterable of int32 buffers
        Each sentence must be assignable to an ``int[:]`` memoryview
        (e.g. a contiguous int32 numpy array of word indices).
    model : gensim model
        Must expose ``vector_size`` and ``wv.sif_vectors`` (float32 matrix).

    Returns
    -------
    np.ndarray
        Matrix of shape (len(sentences), vector_size); rows for empty
        sentences remain zero.
    """
    cdef int size = model.vector_size
    cdef float[:,:] vectors = model.wv.sif_vectors
    output = np.zeros((len(sentences), size), dtype=np.float32)
    cdef float[:,:] sv = output
    cdef int[:] sentence_view
    cdef int sentence_len
    for i in xrange(len(sentences)):
        if len(sentences[i]) > 0:
            sentence_view = sentences[i]
            sentence_len = len(sentences[i])
            # All heavy lifting happens in the nogil helper
            sif_embeddings_7_cloop(size, sentence_view, sentence_len, i, vectors, sv)
    return output
cdef void sif_embeddings_7_cloop(int size, int[:] sentence_view, int sentence_len, int sentence_idx, float[:,:] vectors, float[:,:] summary_vectors) nogil:
    # Accumulate the word vectors of one sentence into
    # summary_vectors[sentence_idx] and divide by the word count.
    # Runs without the GIL; the caller guarantees sentence_len > 0,
    # so the final division is safe.
    cdef int i,d, word_index, count = 0
    cdef float inv = 1.
    for i in xrange(sentence_len):
        word_index = sentence_view[i]
        count += 1
        for d in xrange(size):
            summary_vectors[sentence_idx, d] += vectors[word_index, d]
    # count == sentence_len at this point
    inv = (1./ <float>count)
    for d in xrange(size):
        summary_vectors[sentence_idx, d] *= inv
def sif_embeddings_8(sentences, model):
    """Compute unweighted sentence embeddings using raw data pointers and
    BLAS calls (:func:`sif_embeddings_8_inner`).

    Parameters
    ----------
    sentences : iterable of int32 numpy arrays
        Each sentence is a contiguous int32 array of word indices
        (its data pointer is handed straight to the C helper).
    model : gensim model
        Must expose ``vector_size`` and ``wv.sif_vectors`` (contiguous
        float32 matrix).

    Returns
    -------
    np.ndarray
        Matrix of shape (len(sentences), vector_size); rows for empty
        sentences remain zero.
    """
    cdef int size = model.vector_size
    # Declare sentence_len as a C int (previously untyped, which forced a
    # Python-object conversion on every call) — consistent with sif_embeddings_7.
    cdef int sentence_len
    cdef REAL_t *vectors = <REAL_t *>(np.PyArray_DATA(model.wv.sif_vectors))
    output = np.zeros((len(sentences), size), dtype=np.float32)
    cdef REAL_t *sv = <REAL_t *>(np.PyArray_DATA(output))
    cdef INT_t *sentence_view
    for i in xrange(len(sentences)):
        if len(sentences[i]):
            sentence_view = <INT_t *>(np.PyArray_DATA(sentences[i]))
            sentence_len = len(sentences[i])
            sif_embeddings_8_inner(size, sentence_view, sentence_len, i, vectors, sv)
    return output
cdef void sif_embeddings_8_inner(const int size, const INT_t *sentence_view, const int sentence_len,
                                 const int sentence_idx, const REAL_t *vectors, REAL_t *summary_vectors) nogil:
    # BLAS-based accumulation for one sentence, operating on raw pointers.
    # saxpy adds each word vector into the sentence's output row; sscal then
    # rescales the row by 1/count to form the mean. The caller guarantees
    # sentence_len > 0, so count is never zero at the division.
    cdef int i,d, word_index
    cdef REAL_t inv = ONEF, count = <REAL_t> 0.
    for i in xrange(sentence_len):
        count += ONEF
        word_index = sentence_view[i]
        # y += 1.0 * x  where x is the word vector, y the output row
        saxpy(&size, &ONEF, &vectors[word_index * size], &ONE, &summary_vectors[sentence_idx * size], &ONE)
    inv = ONEF / count
    # Row *= 1/count
    sscal(&size, &inv, &summary_vectors[sentence_idx * size], &ONE)
#!/usr/bin/env cython
# cython: boundscheck=False
# cython: wraparound=False
# cython: cdivision=True
# cython: embedsignature=True
# coding: utf-8
# Author: Oliver Borchers <borchers@bwl.uni-mannheim.de>
# Copyright (C) 2019 Oliver Borchers
"""Optimized cython functions for computing sentence embeddings"""
import numpy as np
cimport numpy as np
import cython
import scipy.linalg.blas as fblas
# Extract the raw C function pointer that scipy wraps around each BLAS routine
cdef extern from "../voidptr.h":
    void* PyCObject_AsVoidPtr(object obj)
# Python-level dtype aliases matching the C typedefs below
REAL = np.float32
INT = np.intc
ctypedef np.float32_t REAL_t
ctypedef np.int32_t INT_t
# Type definitions and pointers for BLAS routines
# See http://www.netlib.org/blas/#_level_1 for more information
# (saxpy: y += alpha * x; sscal: x *= alpha)
ctypedef void (*saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil
cdef saxpy_ptr saxpy=<saxpy_ptr>PyCObject_AsVoidPtr(fblas.saxpy._cpointer)
ctypedef void (*sscal_ptr) (const int *N, const float *alpha, const float *X, const int *incX) nogil
cdef sscal_ptr sscal=<sscal_ptr>PyCObject_AsVoidPtr(fblas.sscal._cpointer)
# Constants passed by pointer to the BLAS calls; EPS seeds the output matrix
# to avoid divisions by zero downstream
cdef REAL_t ONEF = <REAL_t>1.0
cdef int ONE = 1
cdef REAL_t EPS = <REAL_t>1e-8
full = np.full
empty = np.empty
def s2v_train(sentences, len_sentences, wv, weights):
    """Train sentence embedding on a list of sentences

    Called internally from :meth:`~fse.models.sentence2vec.Sentence2Vec.train`.

    Parameters
    ----------
    sentences : iterable of list of str
        The corpus used to train the model.
    len_sentences : int
        Length of the sentence iterable
    wv : :class:`~gensim.models.keyedvectors.BaseKeyedVectors`
        The BaseKeyedVectors instance containing the vectors used for training
    weights : np.ndarray
        Weights used in the summation of the vectors

    Returns
    -------
    np.ndarray
        The sentence embedding matrix of dim len(sentences) * vector_size
    int
        Number of words in the vocabulary actually used for training.
    int
        Number of sentences used for training.
    """
    # Setup variables
    cdef int len_sen = len_sentences, effective_words = 0, effective_sentences = 0
    cdef int i, sentence_len, size = wv.vector_size
    cdef REAL_t *vectors = <REAL_t *>(np.PyArray_DATA(wv.vectors))
    cdef REAL_t *vec_weights = <REAL_t *>(np.PyArray_DATA(weights))
    # Start from a matrix of EPS values to prohibit divisions by zero in
    # subsequent applications. A single full() call replaces the previous
    # empty() allocation that was then overwritten row-by-row, which created
    # one throwaway temporary array per sentence.
    output = full((len_sen, size), EPS, dtype=REAL)
    cdef REAL_t *output_view = <REAL_t *>(np.PyArray_DATA(output))
    cdef INT_t *sentence_view
    cdef str w
    vlookup = wv.vocab
    as_array = np.asarray
    for i, s in enumerate(sentences):
        # Map in-vocabulary words to their indices; OOV words are skipped
        sentence_idx = as_array([vlookup[w].index for w in s if w in vlookup], dtype=INT)
        sentence_len = len(sentence_idx)
        if sentence_len:
            effective_words += sentence_len
            effective_sentences += ONE
            sentence_view = <INT_t *>(np.PyArray_DATA(sentence_idx))
            s2v_train_core(size, sentence_view, sentence_len, i, vectors, output_view, vec_weights)
    return output, effective_words, effective_sentences
cdef void s2v_train_core(const int size, const INT_t *sentence_view, const int sentence_len,
                         const int sentence_idx, const REAL_t *vectors, REAL_t *sent_vecs,
                         const REAL_t *vec_weights) nogil:
    # Weighted accumulation for one sentence, operating on raw pointers.
    # For each word, saxpy adds vec_weights[word] * word_vector into the
    # sentence's output row; sscal then rescales the row by 1/count.
    # The caller guarantees sentence_len > 0, so count is never zero here.
    cdef int i,d, word_index
    cdef REAL_t inv = ONEF, count = <REAL_t> 0.
    for i in xrange(sentence_len):
        count += ONEF
        word_index = sentence_view[i]
        # row += weight[word] * vectors[word]
        saxpy(&size, &vec_weights[word_index], &vectors[word_index * size], &ONE, &sent_vecs[sentence_idx * size], &ONE)
    inv = ONEF / count
    # row *= 1/count
    sscal(&size, &inv, &sent_vecs[sentence_idx * size], &ONE)
-104
| Fast Sentence Embeddings (fse) | ||
| ================================== | ||
| Fast Sentence Embeddings is a Python library that serves as an addition to Gensim. This library is intended to compute *summary vectors* for large collections of sentences or documents. | ||
| Features | ||
| ------------ | ||
| Find the corresponding blog post here: https://medium.com/@oliverbor/fse-2b1ffa791cf9 | ||
| **fse** implements two algorithms for sentence embeddings. You can choose | ||
| between *unweighted sentence averages* and *smooth inverse frequency averages*. | ||
| In order to use the **fse** model, you first need some pre-trained embedding | ||
| gensim embedding model, which is then used by **fse** to compute the sentence embeddings. | ||
| After computing sentence embeddings, you can use them in supervised or | ||
| unsupervised NLP applications, as they serve as a formidable baseline. | ||
| The models here are based on the smooth inverse frequency embeddings [1] | ||
| and the deep-averaging networks [2]. | ||
| Credit is due to Radim Řehůřek and all contributors for the **awesome** library | ||
| and code that gensim provides. | ||
| Installation | ||
| ------------ | ||
| This software depends on [NumPy, Scipy, Scikit-learn, Gensim, and Wordfreq]. | ||
| You must have them installed prior to installing fse. | ||
| As with gensim, it is also recommended you install a fast BLAS library | ||
| before installing fse. | ||
| The simple way to install **fse** is: | ||
| pip install fse | ||
| In case you want to build from the source, just run: | ||
| python setup.py install | ||
| Exemplary application | ||
| ------------- | ||
| In order to use **fse** you must first estimate a Gensim model which contains a | ||
| gensim.models.keyedvectors.BaseKeyedVectors class, for example | ||
| *Word2Vec* or *Fasttext*. Then you can proceed to compute sentence embeddings | ||
| for a corpus. | ||
| The current version does not offer multi-core support out of the box. | ||
| from gensim.models import Word2Vec | ||
| sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] | ||
| model = Word2Vec(sentences, min_count=1) | ||
| from fse.models import Sentence2Vec | ||
| se = Sentence2Vec(model) | ||
| sentences_emb = se.train(sentences) | ||
| Sentence Prediction.ipynb contains an example of how to use the library | ||
| with a pre-trained Word2Vec model. compute_sif.py trains a Word2Vec model | ||
| on a corpus (i.e, brown) and benchmark_speed.py reproduces the results from | ||
| the Medium post. | ||
| ToDos | ||
| ------------- | ||
| **[ ]** Various Bugfixes | ||
| **[ ]** Feature Testing | ||
| **[ ]** Support TaggedLineDocument from Doc2Vec | ||
| **[ ]** Multi Core Implementation | ||
| **[ ]** Direct to disc-estimation to avoid RAM shortage (perhaps?) | ||
| **[ ]** Propose as a gensim feature (perhaps?) | ||
| Literature | ||
| ------------- | ||
| 1. Arora S, Liang Y, Ma T (2017) A Simple but Tough-to-Beat Baseline for Sentence | ||
| Embeddings. Int. Conf. Learn. Represent. (Toulon, France), 1–16. | ||
| 2. Iyyer M, Manjunatha V, Boyd-Graber J, Daumé III H (2015) Deep Unordered | ||
| Composition Rivals Syntactic Methods for Text Classification. Proc. 53rd Annu. | ||
| Meet. Assoc. Comput. Linguist. 7th Int. Jt. Conf. Nat. Lang. Process., 1681–1691. | ||
| 3. Eneko Agirre, Daniel Cer, Mona Diab, Iñigo Lopez-Gazpio, Lucia Specia. Semeval-2017 Task 1: Semantic Textual Similarity Multilingual and Crosslingual Focused Evaluation. Proceedings of SemEval 2017. | ||
| 4. Duong, Chi Thang, Remi Lebret, and Karl Aberer. “Multimodal Classification for Analysing Social Media.” The 27th European Conference on Machine Learning and Principles and Practice of Knowledge Discovery in Databases (ECML-PKDD), 2017 | ||
| Credits | ||
| ------------- | ||
| The STS dataset was released by [3]. | ||
| The Reddit dataset was released by [4]: https://emoclassifier.github.io | ||
| Copyright | ||
| ------------- | ||
| Author: Oliver Borchers <borchers@bwl.uni-mannheim.de> | ||
| Copyright (C) 2019 Oliver Borchers |
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
1297233
2982.56%18
-5.26%