transcript-parser
Advanced tools
Comparing version 0.1.1 to 0.2.0
117
app.js
"use strict"; | ||
/*********************** | ||
Module dependencies | ||
* Module dependencies | ||
***********************/ | ||
// const S = require('string'); | ||
const _ = require('lodash'); | ||
const Promise = require('bluebird'); | ||
/*********************** | ||
Object creation | ||
* Object creation | ||
***********************/ | ||
@@ -33,4 +34,8 @@ const TranscriptParser = function (options) { | ||
const proto = TranscriptParser.prototype; | ||
const tp = this; | ||
proto.parseOne = function(transcript) { | ||
/*********************** | ||
* Synchronous parseOne method | ||
***********************/ | ||
proto.parseOneSync = function(transcript) { | ||
var lines = transcript.split(this.regex.newLine) | ||
@@ -81,6 +86,67 @@ .filter(line => line.length > 0); //Remove blank lines | ||
proto.resolveAliases = function(data) { | ||
var aliases = this.settings.aliases; | ||
/*********************** | ||
* Asynchronous parseOne method | ||
***********************/ | ||
/**
 * Asynchronously parses a plaintext transcript into an object of the form
 * `{speaker: {name: [lines...]}, order: [name, ...]}`.
 *
 * `speaker` maps each speaker name to the lines attributed to them, and
 * `order` lists the speaker of every kept line, in transcript order.
 * Lines that appear before any speaker has been detected are attributed
 * to the placeholder speaker 'none', and are dropped entirely when
 * `settings.removeUnknownSpeakers` is set.
 *
 * @param {string} transcript - The raw transcript text.
 * @param {function(Error, Object)} [cb] - Node-style callback. When omitted,
 *   use the returned promise instead.
 * @returns {Promise<Object>} Promise for the parsed output. It rejects if
 *   parsing fails (the error is also passed to `cb` when one is given).
 */
proto.parseOne = function(transcript, cb) {
  const settings = this.settings;
  const regex = this.regex;
  //Output object: speakers with their lines, plus the speakers in order
  const output = {speaker: {}, order: []};
  //Current speaker; stays 'none' until a line matches the speaker pattern
  let speaker = 'none';
  //Builds a mapper that deletes every match of `pattern` from a line
  const strip = pattern => line => line.split(pattern).join('');
  const hasOwn = Object.prototype.hasOwnProperty;

  //Promise.try turns a synchronous failure (e.g. a non-string transcript)
  //into a rejection so it reaches the callback instead of being thrown
  return Promise.try(() => transcript.split(regex.newLine))
    //Remove blank lines
    .filter(line => line.length > 0)
    .then(lines => {
      if(!settings.removeActions) return lines;
      return Promise.map(lines, strip(regex.action));
    }).then(lines => {
      if(settings.removeAnnotations) {
        //Remove annotations. Timestamps are only stripped separately when
        //annotations are kept.
        //NOTE(review): presumably the annotation pattern also covers
        //timestamps - confirm against the regex definitions
        return Promise.map(lines, strip(regex.annotation));
      }
      if(settings.removeTimestamps) {
        //Remove timestamps
        return Promise.map(lines, strip(regex.timestamp));
      }
      return lines;
    }).each(line => {
      let text = line;
      if(text.match(regex.speaker)) {
        //The first capture group of the speaker pattern is the speaker name
        speaker = regex.speaker.exec(text)[1];
        //Remove the speaker from the line
        text = text.replace(regex.speaker, '');
      }
      //Skip lines with no known speaker when the setting asks for it
      if(speaker === 'none' && settings.removeUnknownSpeakers) return;
      //Own-property check: a speaker called e.g. "constructor" must not be
      //mistaken for an existing key inherited from Object.prototype
      if(!hasOwn.call(output.speaker, speaker)) {
        output.speaker[speaker] = [];
      }
      //Add the text to the speaker's lines and the name to the order array
      output.speaker[speaker].push(text);
      output.order.push(speaker);
    })
    .then(() => output)
    //asCallback invokes cb exactly once; an exception thrown by cb itself
    //is not fed back into cb as an error
    .asCallback(cb);
};
/*********************** | ||
* Synchronous resolveAliases method | ||
***********************/ | ||
proto.resolveAliasesSync = function(data) { | ||
const aliases = this.settings.aliases; | ||
if(_.isEmpty(aliases)) return data; | ||
var speakers = data.speaker; | ||
const speakers = data.speaker; | ||
@@ -122,3 +188,42 @@ for(var speaker in speakers) { | ||
/*********************** | ||
* Asynchronous resolveAliases method | ||
***********************/ | ||
/**
 * Asynchronously resolves speaker aliases in a parsed transcript.
 *
 * `settings.aliases` maps a canonical ("true") name to a list of regular
 * expressions. Every speaker whose name matches one of those expressions
 * has their lines merged into the canonical speaker's list, and every
 * entry of `data.order` is renamed accordingly. When a name matches more
 * than one alias, the first alias (in key order) wins for both `speaker`
 * and `order`. `data` is modified in place and is also the resolved value.
 *
 * @param {Object} data - Parsed transcript with `speaker` and `order` keys.
 * @param {function(Error, Object)} [cb] - Node-style callback. When omitted,
 *   use the returned promise instead.
 * @returns {Promise<Object>} Promise for `data` (the same object).
 */
proto.resolveAliases = function(data, cb) {
  const aliases = this.settings.aliases;
  const hasOwn = Object.prototype.hasOwnProperty;

  return Promise.try(() => {
    //Nothing to resolve: hand back the very same object
    if(_.isEmpty(aliases)) return data;

    const speakers = data.speaker;
    //Cache of name -> canonical name so each distinct name is matched
    //against the alias patterns only once
    const resolved = new Map();
    const canonicalName = name => {
      if(!resolved.has(name)) {
        //String#search ignores lastIndex, so patterns with the `g` flag
        //behave the same on every call (unlike RegExp#test)
        const trueName = _.findKey(aliases, patterns =>
          _.some(patterns, regex => name.search(regex) !== -1));
        resolved.set(name, trueName === undefined ? name : trueName);
      }
      return resolved.get(name);
    };

    //_.keys takes a snapshot, so adding/deleting keys below is safe
    _.keys(speakers).forEach(speakerName => {
      const trueName = canonicalName(speakerName);
      //Already canonical (or no alias matched): merging a speaker into
      //itself would duplicate its lines and then delete them
      if(trueName === speakerName) return;
      //Append to the canonical speaker if it exists, otherwise move the list
      speakers[trueName] = hasOwn.call(speakers, trueName) ?
        _.concat(speakers[trueName], speakers[speakerName]) :
        speakers[speakerName];
      //Delete the old key
      delete speakers[speakerName];
    });

    //Rename the entries of the order list in place
    data.order.forEach((speaker, speakerIndex) => {
      data.order[speakerIndex] = canonicalName(speaker);
    });

    return data;
  })
  //asCallback invokes cb exactly once and always asynchronously
  .asCallback(cb);
};
module.exports = TranscriptParser; |
{ | ||
"name": "transcript-parser", | ||
"version": "0.1.1", | ||
"version": "0.2.0", | ||
"description": "Parses plaintext speech/debate/radio transcripts into JavaScript objects.", | ||
@@ -5,0 +5,0 @@ "main": "app.js", |
@@ -18,4 +18,4 @@ transcript-parser | ||
//Do not use readFileSync in production | ||
const output = tp.parseOne(fs.readFileSync('transcript.txt', {encoding: 'UTF-8'})); | ||
//Do not use fs.readFileSync in production | ||
const output = tp.parseOneSync(fs.readFileSync('transcript.txt', {encoding: 'UTF-8'})); | ||
console.log(output); | ||
@@ -52,2 +52,16 @@ | ||
### .parseOneSync() | ||
The `parseOneSync()` method parses a string and returns an object representing it. | ||
#### Syntax | ||
`tp.parseOneSync(transcript)` | ||
##### Parameters | ||
- `transcript` | ||
+ The transcript, as a `string`. | ||
### .parseOne() | ||
@@ -59,3 +73,3 @@ | ||
`tp.parseOne(transcript)` | ||
`tp.parseOne(transcript, callback)` | ||
@@ -65,5 +79,23 @@ ##### Parameters | ||
- `transcript` | ||
- The transcript, as a `string`. | ||
+ The transcript, as a `string`. | ||
- `callback(err, data)` | ||
+ A callback to be executed on function completion. | ||
### .resolveAliasesSync() | ||
The `resolveAliasesSync()` method resolves all aliases specified in the configuration passed to the `TranscriptParser`'s constructor (see above). | ||
Renames the names in the `order` list to match the new names in the transcript. Note that there is a significant performance penalty, so don't use this method unless you need it. | ||
#### Syntax | ||
`tp.resolveAliasesSync(data)` | ||
##### Parameters | ||
- `data` | ||
+ The transcript object after being parsed. | ||
### .resolveAliases() | ||
@@ -77,3 +109,3 @@ | ||
`tp.resolveAliases(data)` | ||
`tp.resolveAliases(data, callback)` | ||
@@ -83,2 +115,5 @@ ##### Parameters | ||
- `data` | ||
- The transcript object after being parsed. | ||
+ The transcript object after being parsed. | ||
- `callback(err, resolved)` | ||
+ A callback to be executed on function completion. | ||
@@ -15,3 +15,7 @@ "use strict"; | ||
describe('#parseOne()', function(){ | ||
/* | ||
* For the synchronous parseOne method | ||
* | ||
*/ | ||
describe('#parseOneSync()', function(){ | ||
const tp = new TranscriptParser(); | ||
@@ -21,3 +25,3 @@ | ||
const parser = new TranscriptParser(); | ||
var result = parser.parseOne('PERSON A: Hello, (PAUSES) (DRINKS WATER) my name is Bob.(APPLAUSE)'); | ||
var result = parser.parseOneSync('PERSON A: Hello, (PAUSES) (DRINKS WATER) my name is Bob.(APPLAUSE)'); | ||
result.speaker.should.eql({ | ||
@@ -32,3 +36,3 @@ 'PERSON A': [ | ||
const parser = new TranscriptParser({removeActions: false}); | ||
var result = parser.parseOne('PERSON A: Hello, (PAUSES) (DRINKS WATER) my name is Bob.(APPLAUSE)'); | ||
var result = parser.parseOneSync('PERSON A: Hello, (PAUSES) (DRINKS WATER) my name is Bob.(APPLAUSE)'); | ||
result.speaker.should.eql({ | ||
@@ -43,3 +47,3 @@ 'PERSON A': [ | ||
const parser = new TranscriptParser({removeAnnotations: false, removeTimestamps: false}); | ||
var result = parser.parseOne('[20:20:34] BERMAN: [2:1:41] The...'); | ||
var result = parser.parseOneSync('[20:20:34] BERMAN: [2:1:41] The...'); | ||
result.speaker.should.eql({ | ||
@@ -54,3 +58,3 @@ '[20:20:34] BERMAN': [ | ||
const parser = new TranscriptParser({removeAnnotations: false, removeTimestamps: true}); | ||
var result = parser.parseOne('[20:20:34] BERMAN: [2:1:41] The [first] name...'); | ||
var result = parser.parseOneSync('[20:20:34] BERMAN: [2:1:41] The [first] name...'); | ||
result.speaker.should.eql({ | ||
@@ -65,3 +69,3 @@ 'BERMAN': [ | ||
const parser = new TranscriptParser({removeUnknownSpeakers: true}); | ||
var result = parser.parseOne('The quick [brown] fox jumps over the (lazy) dog.'); | ||
var result = parser.parseOneSync('The quick [brown] fox jumps over the (lazy) dog.'); | ||
result.should.eql({ | ||
@@ -77,3 +81,3 @@ speaker: {}, | ||
.then(info => { | ||
this.result = tp.parseOne(info); | ||
this.result = tp.parseOneSync(info); | ||
return readExpected(1); | ||
@@ -86,6 +90,108 @@ }).then(expected => { | ||
}); | ||
}); | ||
describe('#resolveAliases()', function () { | ||
/* | ||
* For the asynchronous parseOne method | ||
* | ||
*/ | ||
// Tests for the asynchronous, callback-based parseOne method.
// Each case mirrors the corresponding parseOneSync test.
describe('#parseOne()', function(){
  const tp = new TranscriptParser();

  it('should remove actions by default', function(done) {
    const parser = new TranscriptParser();
    parser.parseOne('PERSON A: Hello, (PAUSES) (DRINKS WATER) my name is Bob.(APPLAUSE)',
      function(err, result) {
        if(err) return done(err);
        result.speaker.should.eql({
          'PERSON A': [
            'Hello, my name is Bob.'
          ]
        });
        done();
      });
  });

  it('should respect the removeActions setting', function(done) {
    const parser = new TranscriptParser({removeActions: false});
    parser.parseOne('PERSON A: Hello, (PAUSES) (DRINKS WATER) my name is Bob.(APPLAUSE)',
      function(err, result) {
        if(err) return done(err);
        result.speaker.should.eql({
          'PERSON A': [
            'Hello, (PAUSES) (DRINKS WATER) my name is Bob.(APPLAUSE)'
          ]
        });
        done();
      });
  });

  it('should respect the removeTimestamps setting', function(done) {
    const parser = new TranscriptParser({removeAnnotations: false, removeTimestamps: false});
    parser.parseOne('[20:20:34] BERMAN: [2:1:41] The...',
      function(err, result) {
        if(err) return done(err);
        result.speaker.should.eql({
          '[20:20:34] BERMAN': [
            '[2:1:41] The...'
          ]
        });
        done();
      });
  });

  it('should be able to remove timestamps without removing annotations', function(done) {
    const parser = new TranscriptParser({removeAnnotations: false, removeTimestamps: true});
    parser.parseOne('[20:20:34] BERMAN: [2:1:41] The [first] name...',
      function(err, result) {
        if(err) return done(err);
        result.speaker.should.eql({
          'BERMAN': [
            'The [first] name...'
          ]
        });
        done();
      });
  });

  it('should respect the remove unknown speakers setting', function(done) {
    const parser = new TranscriptParser({removeUnknownSpeakers: true});
    parser.parseOne('The quick [brown] fox jumps over the (lazy) dog.',
      function(err, result) {
        if(err) return done(err);
        result.should.eql({
          speaker: {},
          order: []
        });
        done();
      });
  });

  it('should parse a transcript correctly', function(done) {
    // Held in a closure variable: arrow functions do not see a
    // bluebird-bound `this`, so storing it there would leak onto the
    // mocha test context
    let parsed;
    readSample(1)
      .then(info => Promise.fromCallback(cb => tp.parseOne(info, cb)))
      .then(result => {
        parsed = result;
        return readExpected(1);
      }).then(expected => {
        parsed.should.be.eql(JSON.parse(expected));
        done();
      })
      .catch(e => done(e));
  });
});
/* | ||
* For the synchronous resolveAliases method | ||
* | ||
*/ | ||
describe('#resolveAliasesSync()', function () { | ||
it('should resolve aliases correctly', function(done) { | ||
@@ -98,4 +204,4 @@ const tp = new TranscriptParser({ | ||
.then(info => { | ||
this.result = tp.parseOne(info); | ||
this.result = tp.resolveAliases(this.result); | ||
this.result = tp.parseOneSync(info); | ||
this.result = tp.resolveAliasesSync(this.result); | ||
return readExpected(2); | ||
@@ -113,4 +219,4 @@ }).then(expected => { | ||
.then(info => { | ||
var parsed = tp.parseOne(info); | ||
var resolved = tp.resolveAliases(parsed); | ||
var parsed = tp.parseOneSync(info); | ||
var resolved = tp.resolveAliasesSync(parsed); | ||
parsed.should.equal(resolved); | ||
@@ -121,4 +227,47 @@ done(); | ||
}); | ||
}); | ||
/* | ||
* For the asynchronous resolveAliases method | ||
* | ||
*/ | ||
// Tests for the asynchronous, callback-based resolveAliases method.
describe('#resolveAliases()', function () {
  it('should resolve aliases correctly', function(done) {
    const tp = new TranscriptParser({
      aliases: { "DONALD TRUMP": [ /.*TRUMP.*/ ] }
    });
    // Closure variable instead of `this`: arrow functions do not see a
    // bluebird-bound `this`, so it would land on the mocha test context
    let resolved;
    readSample(2)
      .then(info => Promise.fromCallback(cb => tp.parseOne(info, cb)))
      .then(parsed => Promise.fromCallback(cb => tp.resolveAliases(parsed, cb)))
      .then(result => {
        resolved = result;
        return readExpected(2);
      }).then(expected => {
        resolved.should.eql(JSON.parse(expected));
        done();
      })
      .catch(e => done(e));
  });

  it('should return unchanged data if aliases are not set', function(done) {
    const tp = new TranscriptParser({aliases: {}});
    let parsed;
    readSample(2)
      .then(info => Promise.fromCallback(cb => tp.parseOne(info, cb)))
      .then(result => {
        parsed = result;
        return Promise.fromCallback(cb => tp.resolveAliases(parsed, cb));
      }).then(resolved => {
        // Identity check: with no aliases the very same object comes back
        parsed.should.equal(resolved);
        done();
      })
      .catch(e => done(e));
  });
});
}); | ||
@@ -125,0 +274,0 @@ |
157352
496
114