natasha
Advanced tools
| import pytest | ||
| from natasha.span import ( | ||
| Span, | ||
| envelop_spans | ||
| ) | ||
# Each case is [spans, envelopes, expected groups]; every pair is (start, stop).
tests = [
    # No envelopes: nothing is produced.
    [[(0, 1)], [], []],
    # An envelope with no spans still produces one (empty) group.
    [[], [(0, 1)], [[]]],
    # Spans that start before the envelope are skipped.
    [[(0, 1), (1, 2)], [(1, 2)], [[(1, 2)]]],
    # Spans that end after the envelope are left out of its group.
    [[(0, 1), (1, 2)], [(0, 1)], [[(0, 1)]]],
    # One group per envelope, in envelope order.
    [[(0, 1), (1, 2)], [(0, 1), (1, 2)], [[(0, 1)], [(1, 2)]]],
]
def adapt_spans(spans):
    """Lazily turn ``(start, stop)`` pairs into ``Span`` objects with ``type=None``."""
    for pair in spans:
        begin, end = pair
        yield Span(begin, end, type=None)
@pytest.mark.parametrize('test', tests)
def test_envelope_spans(test):
    """Check that ``envelop_spans`` groups the case's spans as the case expects."""
    raw_spans, raw_envelopes, raw_target = test
    # Expected output: one list of Span objects per target group.
    expected = [list(adapt_spans(group)) for group in raw_target]
    actual = list(envelop_spans(
        list(adapt_spans(raw_spans)),
        list(adapt_spans(raw_envelopes)),
    ))
    assert actual == expected
| Metadata-Version: 2.1 | ||
| Name: natasha | ||
| Version: 1.2.0 | ||
| Version: 1.3.0 | ||
| Summary: Named-entity recognition for russian language | ||
@@ -332,336 +332,8 @@ Home-page: https://github.com/natasha/natasha | ||
| ### Segmentation | ||
| * Segmentation — <a href="https://github.com/natasha/razdel#quality-performance">Razdel evaluation section</a> | ||
| * Embedding — <a href="https://github.com/natasha/navec#evaluation">Navec evaluation section</a> | ||
| * Morphology — <a href="https://github.com/natasha/slovnet#morphology-1">Slovnet Morph evaluation section</a> | ||
| * Syntax — <a href="https://github.com/natasha/slovnet#syntax-1">Slovnet Syntax evaluation section</a> | ||
| * NER — <a href="https://github.com/natasha/slovnet#ner-1">Slovnet NER evaluation section</a> | ||
| Natasha uses <a href="https://github.com/natasha/razdel">Razdel</a> for text segmentation. | ||
| `errors` — number of errors aggregated over 4 datasets, see <a href="https://github.com/natasha/razdel#quality-performance">Razdel evaluation section</a> for more info. | ||
| <!--- token ---> | ||
| <table border="0" class="dataframe"> | ||
| <thead> | ||
| <tr style="text-align: right;"> | ||
| <th></th> | ||
| <th>errors</th> | ||
| <th>time</th> | ||
| </tr> | ||
| </thead> | ||
| <tbody> | ||
| <tr> | ||
| <th>razdel.tokenize</th> | ||
| <td>5439</td> | ||
| <td>9.898350</td> | ||
| </tr> | ||
| <tr> | ||
| <th>mystem</th> | ||
| <td>12192</td> | ||
| <td>17.210470</td> | ||
| </tr> | ||
| <tr> | ||
| <th>spacy</th> | ||
| <td>12288</td> | ||
| <td>19.920618</td> | ||
| </tr> | ||
| <tr> | ||
| <th>nltk.word_tokenize</th> | ||
| <td>130119</td> | ||
| <td>12.405366</td> | ||
| </tr> | ||
| </tbody> | ||
| </table> | ||
| <!--- token ---> | ||
| <!--- sent ---> | ||
| <table border="0" class="dataframe"> | ||
| <thead> | ||
| <tr style="text-align: right;"> | ||
| <th></th> | ||
| <th>errors</th> | ||
| <th>time</th> | ||
| </tr> | ||
| </thead> | ||
| <tbody> | ||
| <tr> | ||
| <th>razdel.sentenize</th> | ||
| <td>32106</td> | ||
| <td>21.989045</td> | ||
| </tr> | ||
| <tr> | ||
| <th>deeppavlov/rusenttokenize</th> | ||
| <td>41722</td> | ||
| <td>32.535322</td> | ||
| </tr> | ||
| <tr> | ||
| <th>nltk.sent_tokenize</th> | ||
| <td>60378</td> | ||
| <td>29.916063</td> | ||
| </tr> | ||
| </tbody> | ||
| </table> | ||
| <!--- sent ---> | ||
| ### Embedding | ||
| Natasha uses <a href="https://github.com/natasha/navec">Navec pretrained embeddings</a>. | ||
| `precision` — Average precision over 4 datasets, see <a href="https://github.com/natasha/navec#evaluation">Navec evaluation section</a> for more info. | ||
| <!--- emb1 ---> | ||
| <table border="0" class="dataframe"> | ||
| <thead> | ||
| <tr style="text-align: right;"> | ||
| <th></th> | ||
| <th>type</th> | ||
| <th>precision</th> | ||
| <th>init, s</th> | ||
| <th>disk, mb</th> | ||
| <th>ram, mb</th> | ||
| <th>vocab</th> | ||
| </tr> | ||
| </thead> | ||
| <tbody> | ||
| <tr> | ||
| <th>hudlit_12B_500K_300d_100q</th> | ||
| <td>navec</td> | ||
| <td>0.825</td> | ||
| <td>1.0</td> | ||
| <td>50.6</td> | ||
| <td>95.3</td> | ||
| <td>500K</td> | ||
| </tr> | ||
| <tr> | ||
| <th>news_1B_250K_300d_100q</th> | ||
| <td>navec</td> | ||
| <td>0.775</td> | ||
| <td>0.5</td> | ||
| <td>25.4</td> | ||
| <td>47.7</td> | ||
| <td>250K</td> | ||
| </tr> | ||
| <tr> | ||
| <th>ruscorpora_upos_cbow_300_20_2019</th> | ||
| <td>w2v</td> | ||
| <td>0.777</td> | ||
| <td>12.1</td> | ||
| <td>220.6</td> | ||
| <td>236.1</td> | ||
| <td>189K</td> | ||
| </tr> | ||
| <tr> | ||
| <th>ruwikiruscorpora_upos_skipgram_300_2_2019</th> | ||
| <td>w2v</td> | ||
| <td>0.776</td> | ||
| <td>15.7</td> | ||
| <td>290.0</td> | ||
| <td>309.4</td> | ||
| <td>248K</td> | ||
| </tr> | ||
| <tr> | ||
| <th>tayga_upos_skipgram_300_2_2019</th> | ||
| <td>w2v</td> | ||
| <td>0.795</td> | ||
| <td>15.7</td> | ||
| <td>290.7</td> | ||
| <td>310.9</td> | ||
| <td>249K</td> | ||
| </tr> | ||
| <tr> | ||
| <th>tayga_none_fasttextcbow_300_10_2019</th> | ||
| <td>fasttext</td> | ||
| <td>0.706</td> | ||
| <td>11.3</td> | ||
| <td>2741.9</td> | ||
| <td>2746.9</td> | ||
| <td>192K</td> | ||
| </tr> | ||
| <tr> | ||
| <th>araneum_none_fasttextcbow_300_5_2018</th> | ||
| <td>fasttext</td> | ||
| <td>0.720</td> | ||
| <td>7.8</td> | ||
| <td>2752.1</td> | ||
| <td>2754.7</td> | ||
| <td>195K</td> | ||
| </tr> | ||
| </tbody> | ||
| </table> | ||
| <!--- emb1 ---> | ||
| ### Morphology | ||
| Natasha uses <a href="https://github.com/natasha/slovnet#morphology">Slovnet morphology tagger</a>. | ||
| `accuracy` — accuracy on news dataset, see <a href="https://github.com/natasha/slovnet#morphology-1">Slovnet evaluation section</a> for more. | ||
| <!--- morph1 ---> | ||
| <table border="0" class="dataframe"> | ||
| <thead> | ||
| <tr style="text-align: right;"> | ||
| <th></th> | ||
| <th>accuracy</th> | ||
| <th>init, s</th> | ||
| <th>disk, mb</th> | ||
| <th>ram, mb</th> | ||
| <th>speed, sents/s</th> | ||
| </tr> | ||
| </thead> | ||
| <tbody> | ||
| <tr> | ||
| <th>slovnet</th> | ||
| <td>0.961</td> | ||
| <td>1.0</td> | ||
| <td>27</td> | ||
| <td>115</td> | ||
| <td>532.0</td> | ||
| </tr> | ||
| <tr> | ||
| <th>deeppavlov_bert</th> | ||
| <td>0.951</td> | ||
| <td>20.0</td> | ||
| <td>1393</td> | ||
| <td>8704</td> | ||
| <td>85.0 (gpu)</td> | ||
| </tr> | ||
| <tr> | ||
| <th>deeppavlov</th> | ||
| <td>0.940</td> | ||
| <td>4.0</td> | ||
| <td>32</td> | ||
| <td>10240</td> | ||
| <td>90.0 (gpu)</td> | ||
| </tr> | ||
| <tr> | ||
| <th>spacy</th> | ||
| <td>0.919</td> | ||
| <td>10.9</td> | ||
| <td>89</td> | ||
| <td>579</td> | ||
| <td>30.6</td> | ||
| </tr> | ||
| <tr> | ||
| <th>udpipe</th> | ||
| <td>0.918</td> | ||
| <td>6.9</td> | ||
| <td>45</td> | ||
| <td>242</td> | ||
| <td>56.2</td> | ||
| </tr> | ||
| </tbody> | ||
| </table> | ||
| <!--- morph1 ---> | ||
| ### Syntax | ||
| Natasha uses <a href="https://github.com/natasha/slovnet#syntax">Slovnet syntax parser</a>. | ||
| `uas`, `las` — accuracy on news dataset, see <a href="https://github.com/natasha/slovnet#syntax-1">Slovnet evaluation section</a> for more. | ||
| <!--- syntax1 ---> | ||
| <table border="0" class="dataframe"> | ||
| <thead> | ||
| <tr style="text-align: right;"> | ||
| <th></th> | ||
| <th>uas</th> | ||
| <th>las</th> | ||
| <th>init, s</th> | ||
| <th>disk, mb</th> | ||
| <th>ram, mb</th> | ||
| <th>speed, sents/s</th> | ||
| </tr> | ||
| </thead> | ||
| <tbody> | ||
| <tr> | ||
| <th>slovnet</th> | ||
| <td>0.907</td> | ||
| <td>0.880</td> | ||
| <td>1.0</td> | ||
| <td>27</td> | ||
| <td>125</td> | ||
| <td>450.0</td> | ||
| </tr> | ||
| <tr> | ||
| <th>deeppavlov_bert</th> | ||
| <td>0.962</td> | ||
| <td>0.910</td> | ||
| <td>34.0</td> | ||
| <td>1427</td> | ||
| <td>8704</td> | ||
| <td>75.0 (gpu)</td> | ||
| </tr> | ||
| <tr> | ||
| <th>spacy</th> | ||
| <td>0.876</td> | ||
| <td>0.818</td> | ||
| <td>10.9</td> | ||
| <td>89</td> | ||
| <td>579</td> | ||
| <td>31.6</td> | ||
| </tr> | ||
| <tr> | ||
| <th>udpipe</th> | ||
| <td>0.873</td> | ||
| <td>0.823</td> | ||
| <td>6.9</td> | ||
| <td>45</td> | ||
| <td>242</td> | ||
| <td>56.2</td> | ||
| </tr> | ||
| </tbody> | ||
| </table> | ||
| <!--- syntax1 ---> | ||
| ### NER | ||
| Natasha uses <a href="https://github.com/natasha/slovnet#ner">Slovnet NER tagger</a>. | ||
| `f1` — score aggregated over 4 datasets, see <a href="https://github.com/natasha/slovnet#ner-1">Slovnet evaluation section</a> for more. | ||
| <!--- ner1 ---> | ||
| <table border="0" class="dataframe"> | ||
| <thead> | ||
| <tr style="text-align: right;"> | ||
| <th></th> | ||
| <th>PER/LOC/ORG f1</th> | ||
| <th>init, s</th> | ||
| <th>disk, mb</th> | ||
| <th>ram, mb</th> | ||
| <th>speed, articles/s</th> | ||
| </tr> | ||
| </thead> | ||
| <tbody> | ||
| <tr> | ||
| <th>slovnet</th> | ||
| <td>0.97/0.91/0.85</td> | ||
| <td>1.0</td> | ||
| <td>27</td> | ||
| <td>205</td> | ||
| <td>25.3</td> | ||
| </tr> | ||
| <tr> | ||
| <th>deeppavlov_bert</th> | ||
| <td>0.98/0.92/0.86</td> | ||
| <td>34.5</td> | ||
| <td>2048</td> | ||
| <td>6144</td> | ||
| <td>13.1 (gpu)</td> | ||
| </tr> | ||
| <tr> | ||
| <th>deeppavlov</th> | ||
| <td>0.92/0.86/0.76</td> | ||
| <td>5.9</td> | ||
| <td>1024</td> | ||
| <td>3072</td> | ||
| <td>24.3 (gpu)</td> | ||
| </tr> | ||
| <tr> | ||
| <th>pullenti</th> | ||
| <td>0.92/0.82/0.64</td> | ||
| <td>2.9</td> | ||
| <td>16</td> | ||
| <td>253</td> | ||
| <td>6.0</td> | ||
| </tr> | ||
| </tbody> | ||
| </table> | ||
| <!--- ner1 ---> | ||
| ## Support | ||
@@ -671,3 +343,3 @@ | ||
| - Issues — https://github.com/natasha/natasha/issues | ||
| - Commercial support — http://lab.alexkuk.ru/natasha | ||
| - Commercial support — https://lab.alexkuk.ru | ||
@@ -674,0 +346,0 @@ ## Development |
@@ -45,2 +45,3 @@ README.md | ||
| natasha/tests/test_money.py | ||
| natasha/tests/test_name.py | ||
| natasha/tests/test_name.py | ||
| natasha/tests/test_span.py |
@@ -20,2 +20,2 @@ | ||
| __version__ = '1.2.0' | ||
| __version__ = '1.3.0' |
+14
-32
@@ -23,34 +23,16 @@ | ||
def append_sentinel(items, sentinel=None):
    """Yield every element of ``items``, then one trailing ``sentinel``.

    Args:
        items: Any iterable; its elements are yielded unchanged, in order.
        sentinel: Value yielded once after ``items`` is exhausted
            (``None`` by default).
    """
    yield from items
    yield sentinel
def envelop_spans(spans, envelopes):
    """Group spans by the envelope that contains them.

    Yields exactly one list per envelope, in envelope order. A span is put
    into the current envelope's list when ``span.start >= envelope.start``
    and ``span.stop <= envelope.stop``. An envelope that contains no span
    yields an empty list.

    Args:
        spans: Iterable of objects with ``start`` and ``stop`` attributes.
        envelopes: Iterable of objects with ``start`` and ``stop`` attributes.

    Yields:
        list: The spans that fall inside the corresponding envelope.

    NOTE(review): the single forward pass only makes sense if both inputs
    are ordered by position - confirm against callers.
    """
    # Materialise once so that generators are accepted as well as sequences.
    spans = list(spans)
    total = len(spans)
    index = 0
    for envelope in envelopes:
        chunk = []
        while index < total:
            span = spans[index]
            if span.start < envelope.start:
                # Starts before this envelope: it is dropped for good.
                index += 1
            elif span.stop <= envelope.stop:
                chunk.append(span)
                index += 1
            else:
                # Runs past this envelope: leave it for the next one.
                break
        yield chunk
+7
-335
| Metadata-Version: 2.1 | ||
| Name: natasha | ||
| Version: 1.2.0 | ||
| Version: 1.3.0 | ||
| Summary: Named-entity recognition for russian language | ||
@@ -332,336 +332,8 @@ Home-page: https://github.com/natasha/natasha | ||
| ### Segmentation | ||
| * Segmentation — <a href="https://github.com/natasha/razdel#quality-performance">Razdel evaluation section</a> | ||
| * Embedding — <a href="https://github.com/natasha/navec#evaluation">Navec evaluation section</a> | ||
| * Morphology — <a href="https://github.com/natasha/slovnet#morphology-1">Slovnet Morph evaluation section</a> | ||
| * Syntax — <a href="https://github.com/natasha/slovnet#syntax-1">Slovnet Syntax evaluation section</a> | ||
| * NER — <a href="https://github.com/natasha/slovnet#ner-1">Slovnet NER evaluation section</a> | ||
| Natasha uses <a href="https://github.com/natasha/razdel">Razdel</a> for text segmentation. | ||
| `errors` — number of errors aggregated over 4 datasets, see <a href="https://github.com/natasha/razdel#quality-performance">Razdel evaluation section</a> for more info. | ||
| <!--- token ---> | ||
| <table border="0" class="dataframe"> | ||
| <thead> | ||
| <tr style="text-align: right;"> | ||
| <th></th> | ||
| <th>errors</th> | ||
| <th>time</th> | ||
| </tr> | ||
| </thead> | ||
| <tbody> | ||
| <tr> | ||
| <th>razdel.tokenize</th> | ||
| <td>5439</td> | ||
| <td>9.898350</td> | ||
| </tr> | ||
| <tr> | ||
| <th>mystem</th> | ||
| <td>12192</td> | ||
| <td>17.210470</td> | ||
| </tr> | ||
| <tr> | ||
| <th>spacy</th> | ||
| <td>12288</td> | ||
| <td>19.920618</td> | ||
| </tr> | ||
| <tr> | ||
| <th>nltk.word_tokenize</th> | ||
| <td>130119</td> | ||
| <td>12.405366</td> | ||
| </tr> | ||
| </tbody> | ||
| </table> | ||
| <!--- token ---> | ||
| <!--- sent ---> | ||
| <table border="0" class="dataframe"> | ||
| <thead> | ||
| <tr style="text-align: right;"> | ||
| <th></th> | ||
| <th>errors</th> | ||
| <th>time</th> | ||
| </tr> | ||
| </thead> | ||
| <tbody> | ||
| <tr> | ||
| <th>razdel.sentenize</th> | ||
| <td>32106</td> | ||
| <td>21.989045</td> | ||
| </tr> | ||
| <tr> | ||
| <th>deeppavlov/rusenttokenize</th> | ||
| <td>41722</td> | ||
| <td>32.535322</td> | ||
| </tr> | ||
| <tr> | ||
| <th>nltk.sent_tokenize</th> | ||
| <td>60378</td> | ||
| <td>29.916063</td> | ||
| </tr> | ||
| </tbody> | ||
| </table> | ||
| <!--- sent ---> | ||
| ### Embedding | ||
| Natasha uses <a href="https://github.com/natasha/navec">Navec pretrained embeddings</a>. | ||
| `precision` — Average precision over 4 datasets, see <a href="https://github.com/natasha/navec#evaluation">Navec evaluation section</a> for more info. | ||
| <!--- emb1 ---> | ||
| <table border="0" class="dataframe"> | ||
| <thead> | ||
| <tr style="text-align: right;"> | ||
| <th></th> | ||
| <th>type</th> | ||
| <th>precision</th> | ||
| <th>init, s</th> | ||
| <th>disk, mb</th> | ||
| <th>ram, mb</th> | ||
| <th>vocab</th> | ||
| </tr> | ||
| </thead> | ||
| <tbody> | ||
| <tr> | ||
| <th>hudlit_12B_500K_300d_100q</th> | ||
| <td>navec</td> | ||
| <td>0.825</td> | ||
| <td>1.0</td> | ||
| <td>50.6</td> | ||
| <td>95.3</td> | ||
| <td>500K</td> | ||
| </tr> | ||
| <tr> | ||
| <th>news_1B_250K_300d_100q</th> | ||
| <td>navec</td> | ||
| <td>0.775</td> | ||
| <td>0.5</td> | ||
| <td>25.4</td> | ||
| <td>47.7</td> | ||
| <td>250K</td> | ||
| </tr> | ||
| <tr> | ||
| <th>ruscorpora_upos_cbow_300_20_2019</th> | ||
| <td>w2v</td> | ||
| <td>0.777</td> | ||
| <td>12.1</td> | ||
| <td>220.6</td> | ||
| <td>236.1</td> | ||
| <td>189K</td> | ||
| </tr> | ||
| <tr> | ||
| <th>ruwikiruscorpora_upos_skipgram_300_2_2019</th> | ||
| <td>w2v</td> | ||
| <td>0.776</td> | ||
| <td>15.7</td> | ||
| <td>290.0</td> | ||
| <td>309.4</td> | ||
| <td>248K</td> | ||
| </tr> | ||
| <tr> | ||
| <th>tayga_upos_skipgram_300_2_2019</th> | ||
| <td>w2v</td> | ||
| <td>0.795</td> | ||
| <td>15.7</td> | ||
| <td>290.7</td> | ||
| <td>310.9</td> | ||
| <td>249K</td> | ||
| </tr> | ||
| <tr> | ||
| <th>tayga_none_fasttextcbow_300_10_2019</th> | ||
| <td>fasttext</td> | ||
| <td>0.706</td> | ||
| <td>11.3</td> | ||
| <td>2741.9</td> | ||
| <td>2746.9</td> | ||
| <td>192K</td> | ||
| </tr> | ||
| <tr> | ||
| <th>araneum_none_fasttextcbow_300_5_2018</th> | ||
| <td>fasttext</td> | ||
| <td>0.720</td> | ||
| <td>7.8</td> | ||
| <td>2752.1</td> | ||
| <td>2754.7</td> | ||
| <td>195K</td> | ||
| </tr> | ||
| </tbody> | ||
| </table> | ||
| <!--- emb1 ---> | ||
| ### Morphology | ||
| Natasha uses <a href="https://github.com/natasha/slovnet#morphology">Slovnet morphology tagger</a>. | ||
| `accuracy` — accuracy on news dataset, see <a href="https://github.com/natasha/slovnet#morphology-1">Slovnet evaluation section</a> for more. | ||
| <!--- morph1 ---> | ||
| <table border="0" class="dataframe"> | ||
| <thead> | ||
| <tr style="text-align: right;"> | ||
| <th></th> | ||
| <th>accuracy</th> | ||
| <th>init, s</th> | ||
| <th>disk, mb</th> | ||
| <th>ram, mb</th> | ||
| <th>speed, sents/s</th> | ||
| </tr> | ||
| </thead> | ||
| <tbody> | ||
| <tr> | ||
| <th>slovnet</th> | ||
| <td>0.961</td> | ||
| <td>1.0</td> | ||
| <td>27</td> | ||
| <td>115</td> | ||
| <td>532.0</td> | ||
| </tr> | ||
| <tr> | ||
| <th>deeppavlov_bert</th> | ||
| <td>0.951</td> | ||
| <td>20.0</td> | ||
| <td>1393</td> | ||
| <td>8704</td> | ||
| <td>85.0 (gpu)</td> | ||
| </tr> | ||
| <tr> | ||
| <th>deeppavlov</th> | ||
| <td>0.940</td> | ||
| <td>4.0</td> | ||
| <td>32</td> | ||
| <td>10240</td> | ||
| <td>90.0 (gpu)</td> | ||
| </tr> | ||
| <tr> | ||
| <th>spacy</th> | ||
| <td>0.919</td> | ||
| <td>10.9</td> | ||
| <td>89</td> | ||
| <td>579</td> | ||
| <td>30.6</td> | ||
| </tr> | ||
| <tr> | ||
| <th>udpipe</th> | ||
| <td>0.918</td> | ||
| <td>6.9</td> | ||
| <td>45</td> | ||
| <td>242</td> | ||
| <td>56.2</td> | ||
| </tr> | ||
| </tbody> | ||
| </table> | ||
| <!--- morph1 ---> | ||
| ### Syntax | ||
| Natasha uses <a href="https://github.com/natasha/slovnet#syntax">Slovnet syntax parser</a>. | ||
| `uas`, `las` — accuracy on news dataset, see <a href="https://github.com/natasha/slovnet#syntax-1">Slovnet evaluation section</a> for more. | ||
| <!--- syntax1 ---> | ||
| <table border="0" class="dataframe"> | ||
| <thead> | ||
| <tr style="text-align: right;"> | ||
| <th></th> | ||
| <th>uas</th> | ||
| <th>las</th> | ||
| <th>init, s</th> | ||
| <th>disk, mb</th> | ||
| <th>ram, mb</th> | ||
| <th>speed, sents/s</th> | ||
| </tr> | ||
| </thead> | ||
| <tbody> | ||
| <tr> | ||
| <th>slovnet</th> | ||
| <td>0.907</td> | ||
| <td>0.880</td> | ||
| <td>1.0</td> | ||
| <td>27</td> | ||
| <td>125</td> | ||
| <td>450.0</td> | ||
| </tr> | ||
| <tr> | ||
| <th>deeppavlov_bert</th> | ||
| <td>0.962</td> | ||
| <td>0.910</td> | ||
| <td>34.0</td> | ||
| <td>1427</td> | ||
| <td>8704</td> | ||
| <td>75.0 (gpu)</td> | ||
| </tr> | ||
| <tr> | ||
| <th>spacy</th> | ||
| <td>0.876</td> | ||
| <td>0.818</td> | ||
| <td>10.9</td> | ||
| <td>89</td> | ||
| <td>579</td> | ||
| <td>31.6</td> | ||
| </tr> | ||
| <tr> | ||
| <th>udpipe</th> | ||
| <td>0.873</td> | ||
| <td>0.823</td> | ||
| <td>6.9</td> | ||
| <td>45</td> | ||
| <td>242</td> | ||
| <td>56.2</td> | ||
| </tr> | ||
| </tbody> | ||
| </table> | ||
| <!--- syntax1 ---> | ||
| ### NER | ||
| Natasha uses <a href="https://github.com/natasha/slovnet#ner">Slovnet NER tagger</a>. | ||
| `f1` — score aggregated over 4 datasets, see <a href="https://github.com/natasha/slovnet#ner-1">Slovnet evaluation section</a> for more. | ||
| <!--- ner1 ---> | ||
| <table border="0" class="dataframe"> | ||
| <thead> | ||
| <tr style="text-align: right;"> | ||
| <th></th> | ||
| <th>PER/LOC/ORG f1</th> | ||
| <th>init, s</th> | ||
| <th>disk, mb</th> | ||
| <th>ram, mb</th> | ||
| <th>speed, articles/s</th> | ||
| </tr> | ||
| </thead> | ||
| <tbody> | ||
| <tr> | ||
| <th>slovnet</th> | ||
| <td>0.97/0.91/0.85</td> | ||
| <td>1.0</td> | ||
| <td>27</td> | ||
| <td>205</td> | ||
| <td>25.3</td> | ||
| </tr> | ||
| <tr> | ||
| <th>deeppavlov_bert</th> | ||
| <td>0.98/0.92/0.86</td> | ||
| <td>34.5</td> | ||
| <td>2048</td> | ||
| <td>6144</td> | ||
| <td>13.1 (gpu)</td> | ||
| </tr> | ||
| <tr> | ||
| <th>deeppavlov</th> | ||
| <td>0.92/0.86/0.76</td> | ||
| <td>5.9</td> | ||
| <td>1024</td> | ||
| <td>3072</td> | ||
| <td>24.3 (gpu)</td> | ||
| </tr> | ||
| <tr> | ||
| <th>pullenti</th> | ||
| <td>0.92/0.82/0.64</td> | ||
| <td>2.9</td> | ||
| <td>16</td> | ||
| <td>253</td> | ||
| <td>6.0</td> | ||
| </tr> | ||
| </tbody> | ||
| </table> | ||
| <!--- ner1 ---> | ||
| ## Support | ||
@@ -671,3 +343,3 @@ | ||
| - Issues — https://github.com/natasha/natasha/issues | ||
| - Commercial support — http://lab.alexkuk.ru/natasha | ||
| - Commercial support — https://lab.alexkuk.ru | ||
@@ -674,0 +346,0 @@ ## Development |
+6
-334
@@ -324,336 +324,8 @@ | ||
| ### Segmentation | ||
| * Segmentation — <a href="https://github.com/natasha/razdel#quality-performance">Razdel evaluation section</a> | ||
| * Embedding — <a href="https://github.com/natasha/navec#evaluation">Navec evaluation section</a> | ||
| * Morphology — <a href="https://github.com/natasha/slovnet#morphology-1">Slovnet Morph evaluation section</a> | ||
| * Syntax — <a href="https://github.com/natasha/slovnet#syntax-1">Slovnet Syntax evaluation section</a> | ||
| * NER — <a href="https://github.com/natasha/slovnet#ner-1">Slovnet NER evaluation section</a> | ||
| Natasha uses <a href="https://github.com/natasha/razdel">Razdel</a> for text segmentation. | ||
| `errors` — number of errors aggregated over 4 datasets, see <a href="https://github.com/natasha/razdel#quality-performance">Razdel evaluation section</a> for more info. | ||
| <!--- token ---> | ||
| <table border="0" class="dataframe"> | ||
| <thead> | ||
| <tr style="text-align: right;"> | ||
| <th></th> | ||
| <th>errors</th> | ||
| <th>time</th> | ||
| </tr> | ||
| </thead> | ||
| <tbody> | ||
| <tr> | ||
| <th>razdel.tokenize</th> | ||
| <td>5439</td> | ||
| <td>9.898350</td> | ||
| </tr> | ||
| <tr> | ||
| <th>mystem</th> | ||
| <td>12192</td> | ||
| <td>17.210470</td> | ||
| </tr> | ||
| <tr> | ||
| <th>spacy</th> | ||
| <td>12288</td> | ||
| <td>19.920618</td> | ||
| </tr> | ||
| <tr> | ||
| <th>nltk.word_tokenize</th> | ||
| <td>130119</td> | ||
| <td>12.405366</td> | ||
| </tr> | ||
| </tbody> | ||
| </table> | ||
| <!--- token ---> | ||
| <!--- sent ---> | ||
| <table border="0" class="dataframe"> | ||
| <thead> | ||
| <tr style="text-align: right;"> | ||
| <th></th> | ||
| <th>errors</th> | ||
| <th>time</th> | ||
| </tr> | ||
| </thead> | ||
| <tbody> | ||
| <tr> | ||
| <th>razdel.sentenize</th> | ||
| <td>32106</td> | ||
| <td>21.989045</td> | ||
| </tr> | ||
| <tr> | ||
| <th>deeppavlov/rusenttokenize</th> | ||
| <td>41722</td> | ||
| <td>32.535322</td> | ||
| </tr> | ||
| <tr> | ||
| <th>nltk.sent_tokenize</th> | ||
| <td>60378</td> | ||
| <td>29.916063</td> | ||
| </tr> | ||
| </tbody> | ||
| </table> | ||
| <!--- sent ---> | ||
| ### Embedding | ||
| Natasha uses <a href="https://github.com/natasha/navec">Navec pretrained embeddings</a>. | ||
| `precision` — Average precision over 4 datasets, see <a href="https://github.com/natasha/navec#evaluation">Navec evaluation section</a> for more info. | ||
| <!--- emb1 ---> | ||
| <table border="0" class="dataframe"> | ||
| <thead> | ||
| <tr style="text-align: right;"> | ||
| <th></th> | ||
| <th>type</th> | ||
| <th>precision</th> | ||
| <th>init, s</th> | ||
| <th>disk, mb</th> | ||
| <th>ram, mb</th> | ||
| <th>vocab</th> | ||
| </tr> | ||
| </thead> | ||
| <tbody> | ||
| <tr> | ||
| <th>hudlit_12B_500K_300d_100q</th> | ||
| <td>navec</td> | ||
| <td>0.825</td> | ||
| <td>1.0</td> | ||
| <td>50.6</td> | ||
| <td>95.3</td> | ||
| <td>500K</td> | ||
| </tr> | ||
| <tr> | ||
| <th>news_1B_250K_300d_100q</th> | ||
| <td>navec</td> | ||
| <td>0.775</td> | ||
| <td>0.5</td> | ||
| <td>25.4</td> | ||
| <td>47.7</td> | ||
| <td>250K</td> | ||
| </tr> | ||
| <tr> | ||
| <th>ruscorpora_upos_cbow_300_20_2019</th> | ||
| <td>w2v</td> | ||
| <td>0.777</td> | ||
| <td>12.1</td> | ||
| <td>220.6</td> | ||
| <td>236.1</td> | ||
| <td>189K</td> | ||
| </tr> | ||
| <tr> | ||
| <th>ruwikiruscorpora_upos_skipgram_300_2_2019</th> | ||
| <td>w2v</td> | ||
| <td>0.776</td> | ||
| <td>15.7</td> | ||
| <td>290.0</td> | ||
| <td>309.4</td> | ||
| <td>248K</td> | ||
| </tr> | ||
| <tr> | ||
| <th>tayga_upos_skipgram_300_2_2019</th> | ||
| <td>w2v</td> | ||
| <td>0.795</td> | ||
| <td>15.7</td> | ||
| <td>290.7</td> | ||
| <td>310.9</td> | ||
| <td>249K</td> | ||
| </tr> | ||
| <tr> | ||
| <th>tayga_none_fasttextcbow_300_10_2019</th> | ||
| <td>fasttext</td> | ||
| <td>0.706</td> | ||
| <td>11.3</td> | ||
| <td>2741.9</td> | ||
| <td>2746.9</td> | ||
| <td>192K</td> | ||
| </tr> | ||
| <tr> | ||
| <th>araneum_none_fasttextcbow_300_5_2018</th> | ||
| <td>fasttext</td> | ||
| <td>0.720</td> | ||
| <td>7.8</td> | ||
| <td>2752.1</td> | ||
| <td>2754.7</td> | ||
| <td>195K</td> | ||
| </tr> | ||
| </tbody> | ||
| </table> | ||
| <!--- emb1 ---> | ||
| ### Morphology | ||
| Natasha uses <a href="https://github.com/natasha/slovnet#morphology">Slovnet morphology tagger</a>. | ||
| `accuracy` — accuracy on news dataset, see <a href="https://github.com/natasha/slovnet#morphology-1">Slovnet evaluation section</a> for more. | ||
| <!--- morph1 ---> | ||
| <table border="0" class="dataframe"> | ||
| <thead> | ||
| <tr style="text-align: right;"> | ||
| <th></th> | ||
| <th>accuracy</th> | ||
| <th>init, s</th> | ||
| <th>disk, mb</th> | ||
| <th>ram, mb</th> | ||
| <th>speed, sents/s</th> | ||
| </tr> | ||
| </thead> | ||
| <tbody> | ||
| <tr> | ||
| <th>slovnet</th> | ||
| <td>0.961</td> | ||
| <td>1.0</td> | ||
| <td>27</td> | ||
| <td>115</td> | ||
| <td>532.0</td> | ||
| </tr> | ||
| <tr> | ||
| <th>deeppavlov_bert</th> | ||
| <td>0.951</td> | ||
| <td>20.0</td> | ||
| <td>1393</td> | ||
| <td>8704</td> | ||
| <td>85.0 (gpu)</td> | ||
| </tr> | ||
| <tr> | ||
| <th>deeppavlov</th> | ||
| <td>0.940</td> | ||
| <td>4.0</td> | ||
| <td>32</td> | ||
| <td>10240</td> | ||
| <td>90.0 (gpu)</td> | ||
| </tr> | ||
| <tr> | ||
| <th>spacy</th> | ||
| <td>0.919</td> | ||
| <td>10.9</td> | ||
| <td>89</td> | ||
| <td>579</td> | ||
| <td>30.6</td> | ||
| </tr> | ||
| <tr> | ||
| <th>udpipe</th> | ||
| <td>0.918</td> | ||
| <td>6.9</td> | ||
| <td>45</td> | ||
| <td>242</td> | ||
| <td>56.2</td> | ||
| </tr> | ||
| </tbody> | ||
| </table> | ||
| <!--- morph1 ---> | ||
| ### Syntax | ||
| Natasha uses <a href="https://github.com/natasha/slovnet#syntax">Slovnet syntax parser</a>. | ||
| `uas`, `las` — accuracy on news dataset, see <a href="https://github.com/natasha/slovnet#syntax-1">Slovnet evaluation section</a> for more. | ||
| <!--- syntax1 ---> | ||
| <table border="0" class="dataframe"> | ||
| <thead> | ||
| <tr style="text-align: right;"> | ||
| <th></th> | ||
| <th>uas</th> | ||
| <th>las</th> | ||
| <th>init, s</th> | ||
| <th>disk, mb</th> | ||
| <th>ram, mb</th> | ||
| <th>speed, sents/s</th> | ||
| </tr> | ||
| </thead> | ||
| <tbody> | ||
| <tr> | ||
| <th>slovnet</th> | ||
| <td>0.907</td> | ||
| <td>0.880</td> | ||
| <td>1.0</td> | ||
| <td>27</td> | ||
| <td>125</td> | ||
| <td>450.0</td> | ||
| </tr> | ||
| <tr> | ||
| <th>deeppavlov_bert</th> | ||
| <td>0.962</td> | ||
| <td>0.910</td> | ||
| <td>34.0</td> | ||
| <td>1427</td> | ||
| <td>8704</td> | ||
| <td>75.0 (gpu)</td> | ||
| </tr> | ||
| <tr> | ||
| <th>spacy</th> | ||
| <td>0.876</td> | ||
| <td>0.818</td> | ||
| <td>10.9</td> | ||
| <td>89</td> | ||
| <td>579</td> | ||
| <td>31.6</td> | ||
| </tr> | ||
| <tr> | ||
| <th>udpipe</th> | ||
| <td>0.873</td> | ||
| <td>0.823</td> | ||
| <td>6.9</td> | ||
| <td>45</td> | ||
| <td>242</td> | ||
| <td>56.2</td> | ||
| </tr> | ||
| </tbody> | ||
| </table> | ||
| <!--- syntax1 ---> | ||
| ### NER | ||
| Natasha uses <a href="https://github.com/natasha/slovnet#ner">Slovnet NER tagger</a>. | ||
| `f1` — score aggregated over 4 datasets, see <a href="https://github.com/natasha/slovnet#ner-1">Slovnet evaluation section</a> for more. | ||
| <!--- ner1 ---> | ||
| <table border="0" class="dataframe"> | ||
| <thead> | ||
| <tr style="text-align: right;"> | ||
| <th></th> | ||
| <th>PER/LOC/ORG f1</th> | ||
| <th>init, s</th> | ||
| <th>disk, mb</th> | ||
| <th>ram, mb</th> | ||
| <th>speed, articles/s</th> | ||
| </tr> | ||
| </thead> | ||
| <tbody> | ||
| <tr> | ||
| <th>slovnet</th> | ||
| <td>0.97/0.91/0.85</td> | ||
| <td>1.0</td> | ||
| <td>27</td> | ||
| <td>205</td> | ||
| <td>25.3</td> | ||
| </tr> | ||
| <tr> | ||
| <th>deeppavlov_bert</th> | ||
| <td>0.98/0.92/0.86</td> | ||
| <td>34.5</td> | ||
| <td>2048</td> | ||
| <td>6144</td> | ||
| <td>13.1 (gpu)</td> | ||
| </tr> | ||
| <tr> | ||
| <th>deeppavlov</th> | ||
| <td>0.92/0.86/0.76</td> | ||
| <td>5.9</td> | ||
| <td>1024</td> | ||
| <td>3072</td> | ||
| <td>24.3 (gpu)</td> | ||
| </tr> | ||
| <tr> | ||
| <th>pullenti</th> | ||
| <td>0.92/0.82/0.64</td> | ||
| <td>2.9</td> | ||
| <td>16</td> | ||
| <td>253</td> | ||
| <td>6.0</td> | ||
| </tr> | ||
| </tbody> | ||
| </table> | ||
| <!--- ner1 ---> | ||
| ## Support | ||
@@ -663,3 +335,3 @@ | ||
| - Issues — https://github.com/natasha/natasha/issues | ||
| - Commercial support — http://lab.alexkuk.ru/natasha | ||
| - Commercial support — https://lab.alexkuk.ru | ||
@@ -666,0 +338,0 @@ ## Development |
+1
-1
| [bumpversion] | ||
| current_version = 1.2.0 | ||
| current_version = 1.3.0 | ||
| files = setup.py natasha/__init__.py | ||
@@ -4,0 +4,0 @@ commit = True |
+1
-1
@@ -15,3 +15,3 @@ | ||
| name='natasha', | ||
| version='1.2.0', | ||
| version='1.3.0', | ||
@@ -18,0 +18,0 @@ description='Named-entity recognition for russian language', |
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
47
2.17%3689
0.99%37427377
-0.06%