transcript-parser
Advanced tools
Comparing version 0.3.0 to 0.4.0
@@ -10,2 +10,3 @@ 'use strict'; | ||
const TranscriptParser = require('../app.js'); | ||
const Readable = require('stream').Readable; | ||
@@ -18,3 +19,3 @@ const tp = new TranscriptParser(); | ||
var s = new Readable(); | ||
/*********************** | ||
@@ -48,5 +49,10 @@ * Benchmarks | ||
console.log('Async Parse #3:', msg); | ||
}).catch(e => console.error(e)); | ||
s.push(firstTranscript, 'utf8'); | ||
s.push(null); | ||
console.log('Starting stream parse'); | ||
return timePromise(() => Promise.fromCallback(cb => tp.parseStream(s, cb)) ); | ||
}).then(msg => { | ||
console.log('Stream Parse #1:', msg); | ||
}).catch(e => console.error(e.stack)); | ||
/*********************** | ||
@@ -90,7 +96,7 @@ * Functions | ||
//Reads the benchmark transcript stored at transcripts/<name>.txt next to this file, decoded as UTF-8
//Returns whatever fs.readFileAsync returns (presumably a promise from a promisified fs - confirm against the file header)
//The previous body carried a second, unreachable return that differed only in the spelling of the encoding name
function readTranscript(name) {
  return fs.readFileAsync(`${__dirname}/transcripts/${name}.txt`, {encoding: 'utf8'});
}
//Synchronously reads the benchmark transcript stored at transcripts/<name>.txt next to this file
//Returns the file contents as a UTF-8 string
//The previous body carried a second, unreachable return that differed only in the spelling of the encoding name
function readTranscriptSync(name) {
  return fs.readFileSync(`${__dirname}/transcripts/${name}.txt`, {encoding: 'utf8'});
}
@@ -8,2 +8,3 @@ 'use strict'; | ||
const Promise = require('bluebird'); | ||
const byline = require('byline'); | ||
@@ -21,12 +22,13 @@ | ||
removeUnknownSpeakers: false, | ||
aliases: {} | ||
aliases: {}, | ||
regex: { | ||
newLine: /\r?\n/, | ||
action: /\([A-Z\ ]+\)\ ?/, | ||
speaker: /^((?:\[\d{1,2}:\d{1,2}:\d{1,2}\]\ ?)?[A-Z\d\ \/,.\-\(\)]+?)(?:\ ?\[[A-z\ ]+\])? ?:\ ?/, | ||
timestamp: /\ ?\[\d{1,2}:\d{1,2}:\d{1,2}\]\ ?/, | ||
annotation: /\[.+?\]\ ?/ | ||
} | ||
}; | ||
this.settings = _.assign(this.defaultSettings, options); | ||
this.regex = { | ||
newLine: /\r?\n/, | ||
action: /\([A-Z\ ]+\)\ ?/, | ||
speaker: /^((?:\[\d{1,2}:\d{1,2}:\d{1,2}\]\ ?)?[A-Z\d\ \/,.\-\(\)]+)(?: \[.+\])?:\ ?/, | ||
timestamp: /\[\d{1,2}:\d{1,2}:\d{1,2}\]\ ?/, | ||
annotation: /\[.+?\]\ ?/ | ||
}; | ||
this.regex = this.settings.regex; | ||
}; | ||
@@ -45,10 +47,11 @@ | ||
.filter(line => line.length > 0); //Remove blank lines | ||
lines = (this.settings.removeActions) ? lines.map(line => line.split(this.regex.action).join('')): lines; | ||
lines = this.settings.removeActions ? lines.map(line => removeAll(line, this.regex.action)): lines; | ||
if(this.settings.removeAnnotations) { | ||
//Remove annotations | ||
lines = lines.map(line => line.split(this.regex.annotation).join('')); | ||
lines = lines.map(line => removeAll(line, this.regex.annotation)); | ||
} else if(this.settings.removeTimestamps) { | ||
//Remove timestamps | ||
lines = lines.map(line => line.split(this.regex.timestamp).join('')); | ||
lines = lines.map(line => removeAll(line, this.regex.timestamp)); | ||
} | ||
lines = lines.filter(line => line.length > 0); //Remove newely blank lines | ||
@@ -67,4 +70,4 @@ //Output object | ||
if(lines[i].match(this.regex.speaker)) { | ||
//Regex match | ||
speaker = this.regex.speaker.exec(lines[i])[1]; | ||
//Regex match - is speaker | ||
speaker = this.regex.speaker.exec(lines[i])[1].trim(); | ||
//Remove the speaker from the line | ||
@@ -110,3 +113,3 @@ lines[i] = lines[i].replace(this.regex.speaker, ''); | ||
if(this.settings.removeActions) { | ||
return Promise.map(lines, line => line.split(this.regex.action).join('')); | ||
return Promise.map(lines, line => removeAll(line, this.regex.action)); | ||
} | ||
@@ -117,13 +120,18 @@ return Promise.resolve(lines); | ||
//Remove annotations | ||
return Promise.map(lines, line => line.split(this.regex.annotation).join('')); | ||
return Promise.map(lines, line => removeAll(line, this.regex.annotation)); | ||
} else if(this.settings.removeTimestamps) { | ||
//Remove timestamps | ||
return Promise.map(lines, line => line.split(this.regex.timestamp).join('')); | ||
return Promise.map(lines, line => removeAll(line, this.regex.timestamp)); | ||
} | ||
return Promise.resolve(lines); | ||
}).then(lines => { | ||
}) | ||
.then(lines => { | ||
//Remove newly blank lines | ||
return Promise.filter(lines, line => line.length > 0); | ||
}) | ||
.then(lines => { | ||
return Promise.each(lines, (line) => { | ||
if(line.match(this.regex.speaker)) { | ||
//Regex match | ||
speaker = this.regex.speaker.exec(line)[1]; | ||
speaker = this.regex.speaker.exec(line)[1].trim(); | ||
//Remove the speaker from the line | ||
@@ -258,1 +266,62 @@ line = line.replace(this.regex.speaker, ''); | ||
}; | ||
//Parses a transcript from a readable stream, line by line
//  inputStream - the Readable to consume; it is split into lines with byline.createStream()
//  cb(err, output) - called exactly once: with (null, output) when the line stream ends,
//                    or with the error if either stream emits 'error' or a line fails to process
//output.speaker maps each speaker name to the array of their lines, output.order lists the
//speaker of every kept line in the order the lines were read
proto.parseStream = function(inputStream, cb) {
  const stream = byline.createStream(inputStream);
  //Output object: speakers with their lines, and the list of speakers in order
  const output = {speaker: {}, order: []};
  //Lines seen before the first speaker match are filed under 'none'
  let speaker = 'none';
  //Guards the callback so that it can never fire more than once
  let finished = false;

  const finish = (err) => {
    if(finished) return;
    finished = true;
    if(err) return cb(err);
    cb(null, output);
  };

  const addLine = (rawLine) => {
    let filteredLine = this.filterLine(rawLine);
    //filterLine returns null when nothing is left of the line
    if(!filteredLine) return;

    if(filteredLine.match(this.regex.speaker)) {
      //Regex match - is speaker
      speaker = this.regex.speaker.exec(filteredLine)[1].trim();
      //Remove the speaker from the line
      filteredLine = filteredLine.replace(this.regex.speaker, '');
    }

    //Drop the line if the speaker is unknown and the setting to remove unknown speakers is on
    if(speaker === 'none' && this.settings.removeUnknownSpeakers) return;

    //If the speaker's key doesn't already exist, start it with an empty array
    if(!(speaker in output.speaker)) {
      output.speaker[speaker] = [];
    }
    //Add the text to the output speaker's key and speaker name to the order array
    output.speaker[speaker].push(filteredLine);
    output.order.push(speaker);
  };

  stream.on('readable', () => {
    if(finished) return;
    try {
      //A single 'readable' event can cover several buffered lines, so drain until read()
      //returns null; a null here only means "nothing buffered right now", not end of input
      let line;
      while((line = stream.read()) !== null) {
        addLine(line);
      }
    } catch(err) {
      //Report processing failures through the callback instead of throwing from the event handler
      finish(err);
    }
  });
  //'end' is the signal that all of the input has been consumed
  stream.on('end', () => finish());
  //Listen on both streams: an error on the source is not forwarded to the line stream
  stream.on('error', finish);
  inputStream.on('error', finish);
};
//Applies the removals enabled in this.settings to a single line
//Actions are stripped first; after that annotations are stripped, or, only when
//annotation removal is off, timestamps
//Returns the remaining text, or null when the line ends up empty
proto.filterLine = function(line) {
  //Non-string input is converted with toString() before filtering
  let text = (typeof line === 'string') ? line : line.toString();
  const settings = this.settings;

  if(settings.removeActions) {
    text = removeAll(text, this.regex.action);
  }

  if(settings.removeAnnotations) {
    text = removeAll(text, this.regex.annotation);
  } else if(settings.removeTimestamps) {
    text = removeAll(text, this.regex.timestamp);
  }

  return (text.length > 0) ? text : null;
};
//Removes every occurrence of a pattern from a string
//  text  - the string to clean
//  regex - a RegExp (any flags) or a plain string separator
//Returns the text with all matches deleted
function removeAll(text, regex) {
  //Non-RegExp patterns cannot contain capture groups, so split/join is already correct for them
  if(!(regex instanceof RegExp)) return text.split(regex).join('');
  //split() splices captured groups back into its result, which would leave part of each match
  //behind for patterns with capture groups; a global replace deletes the whole match instead.
  //A fresh copy of the pattern is used so the caller's regex (and its lastIndex) is never touched
  const flags = regex.global ? regex.flags : regex.flags + 'g';
  return text.replace(new RegExp(regex.source, flags), '');
}
{ | ||
"name": "transcript-parser", | ||
"version": "0.3.0", | ||
"version": "0.4.0", | ||
"description": "Parses plaintext speech/debate/radio transcripts into JavaScript objects.", | ||
@@ -38,4 +38,5 @@ "main": "app.js", | ||
"bluebird": "^3.3.4", | ||
"byline": "^4.2.1", | ||
"lodash": "^4.9.0" | ||
} | ||
} |
@@ -18,5 +18,21 @@ transcript-parser | ||
//Do not use fs.readFileSync in production | ||
const output = tp.parseOneSync(fs.readFileSync('transcript.txt', {encoding: 'UTF-8'})); | ||
console.log(output); | ||
//Synchronous example | ||
const parsed = tp.parseOneSync(fs.readFileSync('transcript.txt', {encoding: 'UTF-8'})); | ||
console.log(parsed); | ||
//Asynchronous example | ||
fs.readFile('transcript.txt', (err, data) => { | ||
if(err) return console.error('Error:', err); | ||
tp.parseOne(data, (err, parsed) => { | ||
if(err) return console.error('Error:', err); | ||
console.log(parsed); | ||
}); | ||
}); | ||
//Stream example | ||
const stream = fs.createReadStream('transcript.txt', 'utf8'); | ||
tp.parseStream(stream, (err, parsed) => { | ||
if(err) return console.error('Error:', err); | ||
console.log(parsed); | ||
}); | ||
@@ -52,2 +68,20 @@ | ||
### .parseStream() | ||
The `parseStream()` method parses a [`Stream`](https://nodejs.org/api/stream.html) and passes an object representing the transcript to the callback. | ||
This is the preferred method for parsing transcripts asynchronously, as it doesn't have to load the entire transcript into memory (unlike `parseOne()`). | ||
#### Syntax | ||
`tp.parseStream(stream, callback)` | ||
##### Parameters | ||
- `stream` | ||
+ The `Readable` stream to read. | ||
- `callback(err, data)` | ||
+ A callback to be executed on function completion or error. | ||
### .parseOneSync() | ||
@@ -80,3 +114,3 @@ | ||
- `callback(err, data)` | ||
+ A callback to be executed on function completion. | ||
+ A callback to be executed on function completion or error. | ||
@@ -115,3 +149,3 @@ | ||
- `callback(err, resolved)` | ||
+ A callback to be executed on function completion. | ||
+ A callback to be executed on function completion or error. | ||
@@ -21,2 +21,22 @@ 'use strict'; | ||
describe('#parseStream()', function() { | ||
const tp = new TranscriptParser(); | ||
it('should parse a transcript correctly', function(done) { | ||
readSample(1) | ||
.bind({}) | ||
.then(info => { | ||
var stream = fs.createReadStream(path.join(TEST_DIR, '1.txt'), 'utf8'); | ||
return Promise.fromCallback(cb => tp.parseStream(stream, cb)); | ||
}).then(result => { | ||
this.result = result; | ||
return readExpected(1); | ||
}).then(expected => { | ||
this.result.should.be.eql(JSON.parse(expected)); | ||
done(); | ||
}) | ||
.catch(e => done(e)); | ||
}); | ||
}); | ||
/* | ||
@@ -61,3 +81,3 @@ * For the synchronous parseOne method | ||
const parser = new TranscriptParser({removeAnnotations: false, removeTimestamps: true}); | ||
var result = parser.parseOneSync('[20:20:34] BERMAN: [2:1:41] The [first] name...'); | ||
var result = parser.parseOneSync('[20:20:34] BERMAN [2:1:41] : The [first] name...'); | ||
result.speaker.should.eql({ | ||
@@ -146,3 +166,3 @@ 'BERMAN': [ | ||
const parser = new TranscriptParser({removeAnnotations: false, removeTimestamps: true}); | ||
parser.parseOne('[20:20:34] BERMAN: [2:1:41] The [first] name...', | ||
parser.parseOne('[20:20:34] BERMAN: The [first] name...', | ||
(err, result) => { | ||
@@ -345,7 +365,7 @@ if(err) return done(err); | ||
//Reads the sample transcript <sampleName>.txt from TEST_DIR, decoded as UTF-8
//Returns the value of fs.readFileAsync (used as a promise by the tests)
//The previous body carried a second, unreachable return that differed only in the spelling of the encoding name
function readSample(sampleName) {
  return fs.readFileAsync(path.join(TEST_DIR, `${sampleName}.txt`), {encoding: 'utf8'});
}
//Reads the expected-output file <expectedName>.txt from EXPECTED_DIR, decoded as UTF-8
//Returns the value of fs.readFileAsync (used as a promise by the tests)
//The previous body carried a second, unreachable return that differed only in the spelling of the encoding name
function readExpected(expectedName) {
  return fs.readFileAsync(path.join(EXPECTED_DIR, `${expectedName}.txt`), {encoding: 'utf8'});
}
@@ -60,3 +60,3 @@ 'use strict'; | ||
'[20:20:34] BERMAN: [2:1:41] The...'.split(regex.timestamp).join('') | ||
.should.equal('BERMAN: The...'); | ||
.should.equal('BERMAN:The...'); | ||
}); | ||
@@ -63,0 +63,0 @@ }); |
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is too big to display
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
487547
761
148
0
3
+ Added byline@^4.2.1
+ Added byline@4.2.2 (transitive)