hunspell-reader
Advanced tools
Comparing version 2.0.4 to 2.1.0
# Release Notes | ||
## 2.1.0 | ||
- Add an Iterable Reader, which works much better for very large dictionaries. | ||
## 2.0.0 | ||
- Move to RxJs 6 and Node 8 | ||
## 1.2.1 | ||
@@ -4,0 +10,0 @@ - Update packages. |
@@ -1,2 +0,2 @@ | ||
#!/usr/bin/env node --max_old_space_size=8192 | ||
#!/usr/bin/env node | ||
export {}; |
@@ -1,15 +0,21 @@ | ||
#!/usr/bin/env node --max_old_space_size=8192 | ||
#!/usr/bin/env node | ||
"use strict"; | ||
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { | ||
return new (P || (P = Promise))(function (resolve, reject) { | ||
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } | ||
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } | ||
function step(result) { result.done ? resolve(result.value) : new P(function (resolve) { resolve(result.value); }).then(fulfilled, rejected); } | ||
step((generator = generator.apply(thisArg, _arguments || [])).next()); | ||
}); | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
const commander = require("commander"); | ||
const HunspellReader_1 = require("./HunspellReader"); | ||
const IterableHunspellReader_1 = require("./IterableHunspellReader"); | ||
const fs = require("fs"); | ||
const rxjs_stream_1 = require("rxjs-stream"); | ||
const fs_extra_1 = require("fs-extra"); | ||
const rxjs_1 = require("rxjs"); | ||
const operators_1 = require("rxjs/operators"); | ||
const path = require("path"); | ||
// import * as monitor from './monitor'; | ||
const util_1 = require("./util"); | ||
const gensequence_1 = require("gensequence"); | ||
const uniqueHistorySize = 500000; | ||
const packageInfo = require('../package.json'); | ||
const version = packageInfo['version']; | ||
let displayHelp = true; | ||
commander | ||
@@ -22,47 +28,46 @@ .version(version); | ||
.option('-u, --unique', 'make sure the words are unique.') | ||
.option('-i, --ignore_case', 'used with --unique and --sort') | ||
.option('-l, --lower_case', 'output in lower case') | ||
.option('-T, --no-transform', 'Do not apply the prefix and suffix transforms. Root words only.') | ||
.description('Output all the words in the <hunspell.dic> file.') | ||
.action((hunspellDicFilename, options) => { | ||
const { sort = false, unique = false, ignore_case: ignoreCase = false, output: outputFile, lower_case: lowerCase = false, transform = true, } = options; | ||
notify('Write words', !!outputFile); | ||
notify(`Sort: ${yesNo(sort)}`, !!outputFile); | ||
notify(`Unique: ${yesNo(unique)}`, !!outputFile); | ||
notify(`Ignore Case: ${yesNo(ignoreCase)}`, !!outputFile); | ||
const pOutputStream = createWriteStream(outputFile); | ||
const baseFile = hunspellDicFilename.replace(/(\.dic)?$/, ''); | ||
const dicFile = baseFile + '.dic'; | ||
const affFile = baseFile + '.aff'; | ||
notify(`Dic file: ${dicFile}`, !!outputFile); | ||
notify(`Aff file: ${affFile}`, !!outputFile); | ||
notify(`Generating Words`, !!outputFile); | ||
const pReader = HunspellReader_1.HunspellReader.createFromFiles(affFile, dicFile); | ||
const pWordReader = transform ? pReader.then(reader => reader.readWords()) : pReader.then(reader => reader.readRootWords()); | ||
const wordsRx = rxjs_1.from(pWordReader).pipe(operators_1.map(words => words.pipe(operators_1.map(a => a.trim()), operators_1.filter(a => !!a))), operators_1.map(wordsRx => unique ? makeUnique(wordsRx, ignoreCase) : wordsRx), operators_1.map(wordsRx => sort ? sortWordList(wordsRx, ignoreCase) : wordsRx), operators_1.map(wordsRx => lowerCase ? wordsRx.pipe(operators_1.map(a => a.toLowerCase())) : wordsRx), operators_1.flatMap(words => words), operators_1.map(word => word + '\n')); | ||
pOutputStream.then(writeStream => { | ||
rxjs_stream_1.rxToStream(wordsRx.pipe(operators_1.bufferCount(1024), operators_1.map(words => words.join('')))).pipe(writeStream); | ||
.action(function (hunspellDicFilename, options) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
displayHelp = false; | ||
const { sort = false, unique = false, output: outputFile, lower_case: lowerCase = false, transform = true, } = options; | ||
const log = (msg) => notify(msg, !!outputFile); | ||
log('Write words'); | ||
log(`Sort: ${yesNo(sort)}`); | ||
log(`Unique: ${yesNo(unique)}`); | ||
const baseFile = hunspellDicFilename.replace(/\.(dic|aff)$/, ''); | ||
const dicFile = baseFile + '.dic'; | ||
const affFile = baseFile + '.aff'; | ||
log(`Dic file: ${dicFile}`); | ||
log(`Aff file: ${affFile}`); | ||
log(`Generating Words...`); | ||
const reader = yield IterableHunspellReader_1.IterableHunspellReader.createFromFiles(affFile, dicFile); | ||
const seqWords = transform ? reader.seqWords() : reader.seqRootWords(); | ||
const normalize = lowerCase ? (a) => a.toLowerCase() : (a) => a; | ||
const filterUnique = unique ? util_1.uniqueFilter(uniqueHistorySize) : (_) => true; | ||
const fd = outputFile ? fs.openSync(outputFile, 'w') : 1; | ||
const words = seqWords | ||
.map(a => a.trim()) | ||
.filter(a => !!a) | ||
.map(normalize) | ||
.map(a => a + '\n') | ||
.filter(filterUnique); | ||
if (sort) { | ||
log('Sorting...'); | ||
const data = words.toArray().sort().join(''); | ||
fs.writeSync(fd, data); | ||
} | ||
else { | ||
gensequence_1.genSequence(util_1.batch(words, 1000)).forEach(w => fs.writeSync(fd, w.join(''))); | ||
} | ||
fs.closeSync(fd); | ||
log('Done.'); | ||
}); | ||
}); | ||
commander.parse(process.argv); | ||
if (!commander.args.length) { | ||
if (displayHelp) { | ||
commander.help(); | ||
} | ||
function createWriteStream(filename) { | ||
return !filename | ||
? Promise.resolve(process.stdout) | ||
: fs_extra_1.mkdirp(path.dirname(filename)).then(() => fs.createWriteStream(filename)); | ||
} | ||
function sortWordList(words, ignoreCase) { | ||
const compStr = (a, b) => a < b ? -1 : (a > b ? 1 : 0); | ||
const fnComp = ignoreCase | ||
? ((a, b) => compStr(a.toLowerCase(), b.toLowerCase())) | ||
: compStr; | ||
return words.pipe(operators_1.toArray(), operators_1.flatMap(a => a.sort(fnComp))); | ||
} | ||
function makeUnique(words, ignoreCase) { | ||
const found = new Set(); | ||
const normalize = ignoreCase ? (a => a.toLowerCase()) : (a => a); | ||
return words.pipe(operators_1.filter(w => !found.has(normalize(w))), operators_1.tap(w => found.add(normalize(w)))); | ||
} | ||
function notify(message, useStdOut = true) { | ||
@@ -69,0 +74,0 @@ if (useStdOut) { |
import { Aff, AffWord } from './aff'; | ||
import { Observable } from 'rxjs'; | ||
export interface WordInfo { | ||
word: string; | ||
rules: string; | ||
} | ||
import { WordInfo } from './types'; | ||
export interface HunspellSrcInfo { | ||
@@ -8,0 +5,0 @@ aff: Aff; |
export * from './HunspellReader'; | ||
export * from './IterableHunspellReader'; |
@@ -7,2 +7,3 @@ "use strict"; | ||
__export(require("./HunspellReader")); | ||
__export(require("./IterableHunspellReader")); | ||
//# sourceMappingURL=index.js.map |
export declare function hrTimeToSeconds([seconds, nanoseconds]: number[]): number; | ||
export declare function uniqueFilter<T>(historySize: number): (i: T) => boolean; | ||
export declare function batch<T>(i: Iterable<T>, size: number): Iterable<T[]>; |
@@ -7,2 +7,36 @@ "use strict"; | ||
exports.hrTimeToSeconds = hrTimeToSeconds; | ||
/**
 * Build a stateful predicate that answers "is this the first time this value
 * has been seen?" while keeping only a limited history.
 *
 * Two sets are kept: a current generation, which receives every value that
 * passes through, and a previous generation, which is only consulted. Once the
 * current set holds `historySize` entries the previous set is emptied and the
 * two swap roles. A repeated value is therefore rejected only while it is
 * still present in one of the two generations; a value that has aged out of
 * both is reported as new again.
 *
 * @param {number} historySize entry count at which the generations rotate.
 * @returns {function(*): boolean} predicate returning `true` for a value not
 *   found in either generation and `false` otherwise.
 */
function uniqueFilter(historySize) {
    let current = new Set();
    let previous = new Set();
    return (w) => {
        if (current.has(w))
            return false;
        // Still remember the value in the current generation even when the
        // previous one already knows it, so it survives the next rotation.
        const isNew = !previous.has(w);
        current.add(w);
        if (current.size >= historySize) {
            // Drop the oldest generation and start filling it afresh.
            previous.clear();
            [current, previous] = [previous, current];
        }
        return isNew;
    };
}
exports.uniqueFilter = uniqueFilter; | ||
/**
 * Lazily split an iterable into consecutive arrays ("batches").
 *
 * Items are collected in order; a batch is emitted as soon as it holds at
 * least `size` items, and any remainder is emitted as a final, shorter batch.
 * An empty input produces no batches.
 *
 * The threshold is tested with `>=` rather than `===` so that a fractional or
 * non-positive `size` still produces batches instead of silently gathering
 * the whole input into a single array.
 *
 * @param {Iterable<*>} i source of items.
 * @param {number} size number of items per batch.
 * @returns {Iterable<Array<*>>} the batches, in source order.
 */
function* batch(i, size) {
    let data = [];
    for (const t of i) {
        data.push(t);
        if (data.length >= size) {
            yield data;
            // Start a new array; the yielded one now belongs to the consumer.
            data = [];
        }
    }
    if (data.length) {
        yield data;
    }
}
exports.batch = batch; | ||
//# sourceMappingURL=util.js.map |
{ | ||
"name": "hunspell-reader", | ||
"version": "2.0.4", | ||
"version": "2.1.0", | ||
"description": "A library for reading Hunspell Dictionary Files", | ||
@@ -39,19 +39,19 @@ "bin": "./dist/app.js", | ||
"devDependencies": { | ||
"@types/chai": "^4.1.6", | ||
"@types/chai": "^4.1.7", | ||
"@types/fs-extra": "^5.0.4", | ||
"@types/mocha": "^5.2.5", | ||
"@types/node": "^8.10.34", | ||
"@types/node": "^8.10.39", | ||
"chai": "^4.2.0", | ||
"coveralls": "^3.0.2", | ||
"mocha": "^5.2.0", | ||
"nyc": "^13.0.1", | ||
"rimraf": "^2.6.2", | ||
"ts-node": "^6.2.0", | ||
"typescript": "^3.1.1" | ||
"nyc": "^13.1.0", | ||
"rimraf": "^2.6.3", | ||
"ts-node": "^7.0.1", | ||
"typescript": "^3.2.2" | ||
}, | ||
"dependencies": { | ||
"commander": "^2.18.0", | ||
"cspell-lib": "^3.0.4", | ||
"fs-extra": "^7.0.0", | ||
"gensequence": "^2.1.1", | ||
"commander": "^2.19.0", | ||
"cspell-lib": "^3.0.5", | ||
"fs-extra": "^7.0.1", | ||
"gensequence": "^2.1.2", | ||
"rxjs": "^6.3.3", | ||
@@ -58,0 +58,0 @@ "rxjs-stream": "^3.0.1" |
@@ -5,2 +5,3 @@ import * as util from 'util'; | ||
import * as GS from 'gensequence'; | ||
import { Dictionary } from './types'; | ||
@@ -60,3 +61,3 @@ // cSpell:enableCompoundWords | ||
export interface AffInfo extends AffTransformFlags { | ||
SET?: string; | ||
SET?: string; // Characterset encoding of the .aff and .dic file | ||
TRY?: string; | ||
@@ -63,0 +64,0 @@ KEY?: string; |
104
src/app.ts
@@ -1,16 +0,17 @@ | ||
#!/usr/bin/env node --max_old_space_size=8192 | ||
#!/usr/bin/env node | ||
// cSpell:ignore findup | ||
import * as commander from 'commander'; | ||
import { HunspellReader } from './HunspellReader'; | ||
import { IterableHunspellReader } from './IterableHunspellReader'; | ||
import * as fs from 'fs'; | ||
import {rxToStream} from 'rxjs-stream'; | ||
import {mkdirp} from 'fs-extra'; | ||
import {from, Observable} from 'rxjs'; | ||
import {map, flatMap, filter, bufferCount, tap, toArray} from 'rxjs/operators'; | ||
import * as path from 'path'; | ||
// import * as monitor from './monitor'; | ||
import { uniqueFilter, batch } from './util'; | ||
import { genSequence } from 'gensequence'; | ||
const uniqueHistorySize = 500000; | ||
const packageInfo = require('../package.json'); | ||
const version = packageInfo['version']; | ||
let displayHelp = true; | ||
commander | ||
@@ -24,11 +25,10 @@ .version(version); | ||
.option('-u, --unique', 'make sure the words are unique.') | ||
.option('-i, --ignore_case', 'used with --unique and --sort') | ||
.option('-l, --lower_case', 'output in lower case') | ||
.option('-T, --no-transform', 'Do not apply the prefix and suffix transforms. Root words only.') | ||
.description('Output all the words in the <hunspell.dic> file.') | ||
.action((hunspellDicFilename, options) => { | ||
.action(async function(hunspellDicFilename, options) { | ||
displayHelp = false; | ||
const { | ||
sort = false, | ||
unique = false, | ||
ignore_case: ignoreCase = false, | ||
output: outputFile, | ||
@@ -38,31 +38,35 @@ lower_case: lowerCase = false, | ||
} = options; | ||
notify('Write words', !!outputFile); | ||
notify(`Sort: ${yesNo(sort)}`, !!outputFile); | ||
notify(`Unique: ${yesNo(unique)}`, !!outputFile); | ||
notify(`Ignore Case: ${yesNo(ignoreCase)}`, !!outputFile); | ||
const pOutputStream = createWriteStream(outputFile); | ||
const baseFile = hunspellDicFilename.replace(/(\.dic)?$/, ''); | ||
const log = (msg: string) => notify(msg, !!outputFile); | ||
log('Write words'); | ||
log(`Sort: ${yesNo(sort)}`); | ||
log(`Unique: ${yesNo(unique)}`); | ||
const baseFile = hunspellDicFilename.replace(/\.(dic|aff)$/, ''); | ||
const dicFile = baseFile + '.dic'; | ||
const affFile = baseFile + '.aff'; | ||
notify(`Dic file: ${dicFile}`, !!outputFile); | ||
notify(`Aff file: ${affFile}`, !!outputFile); | ||
notify(`Generating Words`, !!outputFile); | ||
const pReader = HunspellReader.createFromFiles(affFile, dicFile); | ||
const pWordReader = transform ? pReader.then(reader => reader.readWords()) : pReader.then(reader => reader.readRootWords()); | ||
log(`Dic file: ${dicFile}`); | ||
log(`Aff file: ${affFile}`); | ||
log(`Generating Words...`); | ||
const reader = await IterableHunspellReader.createFromFiles(affFile, dicFile); | ||
const seqWords = transform ? reader.seqWords() : reader.seqRootWords(); | ||
const normalize = lowerCase ? (a: string) => a.toLowerCase() : (a: string) => a; | ||
const filterUnique = unique ? uniqueFilter(uniqueHistorySize) : (_: string) => true; | ||
const fd = outputFile ? fs.openSync(outputFile, 'w') : 1; | ||
const wordsRx = from(pWordReader).pipe( | ||
map(words => words.pipe( | ||
map(a => a.trim()), | ||
filter(a => !!a), | ||
)), | ||
map(wordsRx => unique ? makeUnique(wordsRx, ignoreCase) : wordsRx), | ||
map(wordsRx => sort ? sortWordList(wordsRx, ignoreCase) : wordsRx), | ||
map(wordsRx => lowerCase ? wordsRx.pipe(map(a => a.toLowerCase())) : wordsRx), | ||
flatMap(words => words), | ||
map(word => word + '\n'), | ||
); | ||
const words = seqWords | ||
.map(a => a.trim()) | ||
.filter(a => !!a) | ||
.map(normalize) | ||
.map(a => a + '\n') | ||
.filter(filterUnique); | ||
pOutputStream.then(writeStream => { | ||
rxToStream(wordsRx.pipe(bufferCount(1024),map(words => words.join('')))).pipe(writeStream); | ||
}); | ||
if (sort) { | ||
log('Sorting...'); | ||
const data = words.toArray().sort().join(''); | ||
fs.writeSync(fd, data); | ||
} else { | ||
genSequence(batch(words, 1000)).forEach(w => fs.writeSync(fd, w.join(''))); | ||
} | ||
fs.closeSync(fd); | ||
log('Done.'); | ||
}); | ||
@@ -72,32 +76,6 @@ | ||
if (!commander.args.length) { | ||
if (displayHelp) { | ||
commander.help(); | ||
} | ||
function createWriteStream(filename?: string): Promise<NodeJS.WritableStream> { | ||
return !filename | ||
? Promise.resolve(process.stdout) | ||
: mkdirp(path.dirname(filename)).then(() => fs.createWriteStream(filename)); | ||
} | ||
function sortWordList(words: Observable<string>, ignoreCase: boolean) { | ||
const compStr = (a, b) => a < b ? -1 : (a > b ? 1 : 0); | ||
const fnComp: (a: string, b: string) => number = ignoreCase | ||
? ((a, b) => compStr(a.toLowerCase(), b.toLowerCase())) | ||
: compStr; | ||
return words.pipe( | ||
toArray(), | ||
flatMap(a => a.sort(fnComp)), | ||
); | ||
} | ||
function makeUnique(words: Observable<string>, ignoreCase: boolean) { | ||
const found = new Set<string>(); | ||
const normalize: (a: string) => string = ignoreCase ? (a => a.toLowerCase()) : (a => a); | ||
return words.pipe( | ||
filter(w => !found.has(normalize(w))), | ||
tap(w => found.add(normalize(w))), | ||
); | ||
} | ||
function notify(message: any, useStdOut = true) { | ||
@@ -104,0 +82,0 @@ if (useStdOut) { |
@@ -0,1 +1,2 @@ | ||
import { Dictionary } from './types'; | ||
@@ -2,0 +3,0 @@ export interface ConvItem { |
@@ -7,9 +7,4 @@ import {parseAffFileToAff} from './affReader'; | ||
import * as monitor from './monitor'; | ||
import { WordInfo } from './types'; | ||
export interface WordInfo { | ||
word: string; | ||
rules: string; | ||
} | ||
export interface HunspellSrcInfo { | ||
@@ -28,3 +23,2 @@ aff: Aff; | ||
/** | ||
@@ -31,0 +25,0 @@ * @internal |
export * from './HunspellReader'; | ||
export * from './IterableHunspellReader'; |
@@ -6,1 +6,35 @@ | ||
} | ||
/**
 * Build a stateful predicate that answers "is this the first time this value
 * has been seen?" while keeping only a limited history.
 *
 * Two sets are kept: a current generation, which receives every value that
 * passes through, and a previous generation, which is only consulted. Once the
 * current set holds `historySize` entries the previous set is emptied and the
 * two swap roles. A repeated value is therefore rejected only while it is
 * still present in one of the two generations; a value that has aged out of
 * both is reported as new again.
 *
 * @param historySize - entry count at which the generations rotate.
 * @returns a predicate returning `true` for a value not found in either
 *   generation and `false` otherwise.
 */
export function uniqueFilter<T>(historySize: number): (i: T) => boolean {
    let current = new Set<T>();
    let previous = new Set<T>();
    return (w: T) => {
        if (current.has(w)) return false;
        // Still remember the value in the current generation even when the
        // previous one already knows it, so it survives the next rotation.
        const isNew = !previous.has(w);
        current.add(w);
        if (current.size >= historySize) {
            // Drop the oldest generation and start filling it afresh.
            previous.clear();
            [current, previous] = [previous, current];
        }
        return isNew;
    };
}
/**
 * Lazily split an iterable into consecutive arrays ("batches").
 *
 * Items are collected in order; a batch is emitted as soon as it holds at
 * least `size` items, and any remainder is emitted as a final, shorter batch.
 * An empty input produces no batches.
 *
 * The threshold is tested with `>=` rather than `===` so that a fractional or
 * non-positive `size` still produces batches instead of silently gathering
 * the whole input into a single array.
 *
 * @param i - source of items.
 * @param size - number of items per batch.
 * @returns the batches, in source order.
 */
export function *batch<T>(i: Iterable<T>, size: number): Iterable<T[]> {
    let data: T[] = [];
    for (const t of i) {
        data.push(t);
        if (data.length >= size) {
            yield data;
            // Start a new array; the yielded one now belongs to the consumer.
            data = [];
        }
    }
    if (data.length) {
        yield data;
    }
}
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
86804
48
1632
Updated commander@^2.19.0
Updated cspell-lib@^3.0.5
Updated fs-extra@^7.0.1
Updated gensequence@^2.1.2