Metadata-Version: 1.0
		Name: fse
		Version: 0.0.1
		Version: 0.0.2
		Summary: Fast Sentence Embeddings for Gensim
		@@ -5,0 +5,0 @@ Home-page: https://github.com/oborchers/Fast_Sentence_Embeddings

+7

-7

fse.egg-info/requires.txt

		@@ -1,7 +0,7 @@
		numpy>=1.11.3
		scipy>=0.18.1
		six>=1.5.0
		smart_open>=1.5.0
		scikit-learn>=0.19.1
		gensim>=3.4.0
		wordfreq>=2.2.1
		numpy >= 1.11.3
		scipy >= 0.18.1
		six >= 1.5.0
		smart_open >= 1.5.0
		scikit-learn >= 0.19.1
		gensim >= 3.4.0
		wordfreq >= 2.2.1

+1

-1

fse/benchmark_speed.py

		@@ -58,3 +58,3 @@ import numpy as np
		parser.add_argument("-size", help="Set the size of the embedding", type=int, default=100)
		parser.add_argument("-excel", help="Output results as excel file", type=int, default=0)
		parser.add_argument("-excel", help="Output results as excel file", type=int, default=1)
		args = parser.parse_args()
		@@ -61,0 +61,0 @@

+26

-16

fse/models/sentence2vec.py

		@@ -23,4 +23,6 @@ #!/usr/bin/env python
		from numpy import float32 as REAL, sum as np_sum, vstack, zeros, ones,\
		dtype, sqrt, newaxis
		dtype, sqrt, newaxis, empty, full

		EPS = 1e-8

		try:
		@@ -32,7 +34,6 @@ # Import cython functions
		CY_ROUTINES = 0
		logger.warning("ImportError of Cython functions: %s", e)
		warnings.warn("ImportError of Cython functions: Training will be slow. Install C-Compiler and re-compile.")

		def s2v_train(sentences, wv, weights):
		"""Train sentence embedding on a list of sentences.
		This method is numpy only and is much slower than the Cython variant.
		def s2v_train(sentences, len_sentences, wv, weights):
		"""Train sentence embedding on a list of sentences

		@@ -45,2 +46,4 @@ Called internally from :meth:`~fse.models.sentence2vec.Sentence2Vec.train`.
		The corpus used to train the model.
		len_sentences : int
		Length of the sentence iterable
		wv : :class:`~gensim.models.keyedvectors.BaseKeyedVectors`
		@@ -60,19 +63,25 @@ The BaseKeyedVectors instance containing the vectors used for training
		"""

		size = wv.vector_size
		vlookup = wv.vocab
		vectors = wv
		output = []
		vectors = wv.vectors

		w_trans = weights[:, None]

		output = empty((len_sentences, size), dtype=REAL)
		for i in range(len_sentences):
		output[i] = full(size, EPS, dtype=REAL)

		effective_words = 0
		effective_sentences = 0

		for s in sentences:
		idx = [vlookup[w].index for w in s if w in vlookup]
		v = np_sum(vectors.vectors[idx] * weights[idx][:, None], axis=0)
		if len(idx) > 0:
		effective_words += len(idx)
		for i, s in enumerate(sentences):
		sentence_idx = [vlookup[w].index for w in s if w in vlookup]
		if len(sentence_idx):
		v = np_sum(vectors[sentence_idx] * w_trans[sentence_idx], axis=0)
		effective_words += len(sentence_idx)
		effective_sentences += 1
		v *= 1/len(idx)
		output.append(v)
		return vstack(output).astype(REAL), effective_words, effective_sentences
		v *= 1/len(sentence_idx)
		output[i] = v

		return output.astype(REAL), effective_words, effective_sentences

		@@ -343,2 +352,3 @@ class Sentence2Vec():
		"""

		if sentences is None:
		@@ -345,0 +355,0 @@ raise RuntimeError("Provide sentences object")

+1

-1

PKG-INFO

		Metadata-Version: 1.0
		Name: fse
		Version: 0.0.1
		Version: 0.0.2
		Summary: Fast Sentence Embeddings for Gensim
		@@ -5,0 +5,0 @@ Home-page: https://github.com/oborchers/Fast_Sentence_Embeddings

+21

-7

README.md

		@@ -10,2 +10,4 @@ Fast Sentence Embeddings (fse)

		Find the corresponding blog post here: https://medium.com/@oliverbor/fse-2b1ffa791cf9

		fse implements two algorithms for sentence embeddings. You can choose
		@@ -35,8 +37,7 @@ between unweighted sentence averages and smooth inverse frequency averages.

		The simple way to install gensim is:
		The simple way to install fse is:

		pip install git+https://github.com/oborchers/Fast_Sentence_Embeddings
		pip install fse

		Or, if you have instead downloaded and unzipped the [source tar.gz]
		package, you’d run:
		In case you want to build from the source, just run:

		@@ -70,9 +71,13 @@ python setup.py install
		-------------
		[ ] Bugfixes
		[ ] Various Bugfixes

		[ ] Feature Testing

		[ ] Support TaggedLineDocument from Doc2Vec

		[ ] Multi Core Implementation

		[ ] Direct to disc-estimation to avoid RAM shortage
		[ ] Direct to disc-estimation to avoid RAM shortage (perhaps?)

		[ ] Add as a gensim feature (perhaps?)
		[ ] Propose as a gensim feature (perhaps?)

		@@ -88,2 +93,11 @@ Literature

		3. Eneko Agirre, Daniel Cer, Mona Diab, Iñigo Lopez-Gazpio, Lucia Specia. Semeval-2017 Task 1: Semantic Textual Similarity Multilingual and Crosslingual Focused Evaluation. Proceedings of SemEval 2017.

		4. Duong, Chi Thang, Remi Lebret, and Karl Aberer. “Multimodal Classification for Analysing Social Media.” The 27th European Conference on Machine Learning and Principles and Practice of Knowledge Discovery in Databases (ECML-PKDD), 2017.

		Credits
		-------------
		The STS dataset was released by [3].
		The Reddit dataset was released by [4]: https://emoclassifier.github.io

		Copyright
		@@ -90,0 +104,0 @@ -------------

+1

-0

setup.cfg

		[egg_info]
		tag_build =
		tag_date = 0
		tag_svn_revision = 0

+1

-1

setup.py

		@@ -87,3 +87,3 @@ #!/usr/bin/env python
		name='fse',
		version='0.0.1',
		version='0.0.2',
		description='Fast Sentence Embeddings for Gensim',
		@@ -90,0 +90,0 @@

fse - pypi Package Compare versions

Improved metrics