fse
Advanced tools
| Metadata-Version: 1.0 | ||
| Name: fse | ||
| Version: 0.0.1 | ||
| Version: 0.0.2 | ||
| Summary: Fast Sentence Embeddings for Gensim | ||
@@ -5,0 +5,0 @@ Home-page: https://github.com/oborchers/Fast_Sentence_Embeddings |
@@ -1,7 +0,7 @@ | ||
| numpy>=1.11.3 | ||
| scipy>=0.18.1 | ||
| six>=1.5.0 | ||
| smart_open>=1.5.0 | ||
| scikit-learn>=0.19.1 | ||
| gensim>=3.4.0 | ||
| wordfreq>=2.2.1 | ||
| numpy >= 1.11.3 | ||
| scipy >= 0.18.1 | ||
| six >= 1.5.0 | ||
| smart_open >= 1.5.0 | ||
| scikit-learn >= 0.19.1 | ||
| gensim >= 3.4.0 | ||
| wordfreq >= 2.2.1 |
@@ -58,3 +58,3 @@ import numpy as np | ||
| parser.add_argument("-size", help="Set the size of the embedding", type=int, default=100) | ||
| parser.add_argument("-excel", help="Output results as excel file", type=int, default=0) | ||
| parser.add_argument("-excel", help="Output results as excel file", type=int, default=1) | ||
| args = parser.parse_args() | ||
@@ -61,0 +61,0 @@ |
@@ -23,4 +23,6 @@ #!/usr/bin/env python | ||
| from numpy import float32 as REAL, sum as np_sum, vstack, zeros, ones,\ | ||
| dtype, sqrt, newaxis | ||
| dtype, sqrt, newaxis, empty, full | ||
| EPS = 1e-8 | ||
| try: | ||
@@ -32,7 +34,6 @@ # Import cython functions | ||
| CY_ROUTINES = 0 | ||
| logger.warning("ImportError of Cython functions: %s", e) | ||
| warnings.warn("ImportError of Cython functions: Training will be slow. Install C-Compiler and re-compile.") | ||
| def s2v_train(sentences, wv, weights): | ||
| """Train sentence embedding on a list of sentences. | ||
| This method is numpy only and is much slower than the cython variant. | ||
| def s2v_train(sentences, len_sentences, wv, weights): | ||
| """Train sentence embedding on a list of sentences | ||
@@ -45,2 +46,4 @@ Called internally from :meth:`~fse.models.sentence2vec.Sentence2Vec.train`. | ||
| The corpus used to train the model. | ||
| len_sentences : int | ||
| Length of the sentence iterable | ||
| wv : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` | ||
@@ -60,19 +63,25 @@ The BaseKeyedVectors instance containing the vectors used for training | ||
| """ | ||
| size = wv.vector_size | ||
| vlookup = wv.vocab | ||
| vectors = wv | ||
| output = [] | ||
| vectors = wv.vectors | ||
| w_trans = weights[:, None] | ||
| output = empty((len_sentences, size), dtype=REAL) | ||
| for i in range(len_sentences): | ||
| output[i] = full(size, EPS, dtype=REAL) | ||
| effective_words = 0 | ||
| effective_sentences = 0 | ||
| for s in sentences: | ||
| idx = [vlookup[w].index for w in s if w in vlookup] | ||
| v = np_sum(vectors.vectors[idx] * weights[idx][:, None], axis=0) | ||
| if len(idx) > 0: | ||
| effective_words += len(idx) | ||
| for i, s in enumerate(sentences): | ||
| sentence_idx = [vlookup[w].index for w in s if w in vlookup] | ||
| if len(sentence_idx): | ||
| v = np_sum(vectors[sentence_idx] * w_trans[sentence_idx], axis=0) | ||
| effective_words += len(sentence_idx) | ||
| effective_sentences += 1 | ||
| v *= 1/len(idx) | ||
| output.append(v) | ||
| return vstack(output).astype(REAL), effective_words, effective_sentences | ||
| v *= 1/len(sentence_idx) | ||
| output[i] = v | ||
| return output.astype(REAL), effective_words, effective_sentences | ||
@@ -343,2 +352,3 @@ class Sentence2Vec(): | ||
| """ | ||
| if sentences is None: | ||
@@ -345,0 +355,0 @@ raise RuntimeError("Provide sentences object") |
+1
-1
| Metadata-Version: 1.0 | ||
| Name: fse | ||
| Version: 0.0.1 | ||
| Version: 0.0.2 | ||
| Summary: Fast Sentence Embeddings for Gensim | ||
@@ -5,0 +5,0 @@ Home-page: https://github.com/oborchers/Fast_Sentence_Embeddings |
+21
-7
@@ -10,2 +10,4 @@ Fast Sentence Embeddings (fse) | ||
| Find the corresponding blog post here: https://medium.com/@oliverbor/fse-2b1ffa791cf9 | ||
| **fse** implements two algorithms for sentence embeddings. You can choose | ||
@@ -35,8 +37,7 @@ between *unweighted sentence averages* and *smooth inverse frequency averages*. | ||
| The simple way to install gensim is: | ||
| The simple way to install **fse** is: | ||
| pip install git+https://github.com/oborchers/Fast_Sentence_Embeddings | ||
| pip install fse | ||
| Or, if you have instead downloaded and unzipped the [source tar.gz] | ||
| package, you’d run: | ||
| In case you want to build from the source, just run: | ||
@@ -70,9 +71,13 @@ python setup.py install | ||
| ------------- | ||
| **[ ]** Bugfixes | ||
| **[ ]** Various Bugfixes | ||
| **[ ]** Feature Testing | ||
| **[ ]** Support TaggedLineDocument from Doc2Vec | ||
| **[ ]** Multi Core Implementation | ||
| **[ ]** Direct to disc-estimation to avoid RAM shortage | ||
| **[ ]** Direct to disc-estimation to avoid RAM shortage (perhaps?) | ||
| **[ ]** Add as a gensim feature (perhaps?) | ||
| **[ ]** Propose as a gensim feature (perhaps?) | ||
@@ -88,2 +93,11 @@ Literature | ||
| 3. Eneko Agirre, Daniel Cer, Mona Diab, Iñigo Lopez-Gazpio, Lucia Specia. Semeval-2017 Task 1: Semantic Textual Similarity Multilingual and Crosslingual Focused Evaluation. Proceedings of SemEval 2017. | ||
| 4. Duong, Chi Thang, Remi Lebret, and Karl Aberer. “Multimodal Classification for Analysing Social Media.” The 27th European Conference on Machine Learning and Principles and Practice of Knowledge Discovery in Databases (ECML-PKDD), 2017 | ||
| Credits | ||
| ------------- | ||
| The STS dataset was released by [3]. | ||
| The Reddit dataset was released by [4]: https://emoclassifier.github.io | ||
| Copyright | ||
@@ -90,0 +104,0 @@ ------------- |
+1
-0
| [egg_info] | ||
| tag_build = | ||
| tag_date = 0 | ||
| tag_svn_revision = 0 | ||
+1
-1
@@ -87,3 +87,3 @@ #!/usr/bin/env python | ||
| name='fse', | ||
| version='0.0.1', | ||
| version='0.0.2', | ||
| description='Fast Sentence Embeddings for Gensim', | ||
@@ -90,0 +90,0 @@ |
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
42083
2.54%687
0.88%