transcript-parser
Advanced tools
Comparing version 0.2.0 to 0.3.0
227
app.js
@@ -1,226 +0,3 @@ | ||
"use strict"; | ||
/*********************** | ||
* Module dependencies | ||
***********************/ | ||
// const S = require('string'); | ||
const _ = require('lodash'); | ||
const Promise = require('bluebird'); | ||
'use strict'; | ||
/*********************** | ||
* Object creation | ||
***********************/ | ||
/**
 * Parses plaintext transcripts into `{speaker, order}` objects.
 *
 * @param {Object} [options] Overrides for the default settings:
 *   - removeActions: strip upper-case parenthesised actions, e.g. "(APPLAUSE)"
 *   - removeAnnotations: strip bracketed annotations, e.g. "[inaudible]"
 *   - removeTimestamps: strip "[hh:mm:ss]" timestamps (only consulted when
 *     removeAnnotations is false, since annotations already cover timestamps)
 *   - removeUnknownSpeakers: drop lines seen before any speaker is named
 *   - aliases: map of true name -> list of patterns matching alternate names
 */
const TranscriptParser = function (options) {
  this.defaultSettings = {
    removeActions: true,
    removeAnnotations: true,
    removeTimestamps: true, //Overridden by removeAnnotations
    removeUnknownSpeakers: false,
    aliases: {}
  };
  //Merge into a fresh object so defaultSettings keeps the pristine defaults
  this.settings = Object.assign({}, this.defaultSettings, options);
  this.regex = {
    newLine: /\r?\n/,
    action: /\([A-Z\ ]+\)\ ?/,
    speaker: /^((?:\[\d{1,2}:\d{1,2}:\d{1,2}\]\ ?)?[A-Z\d\ \/,.\-\(\)]+)(?: \[.+\])?:\ ?/,
    timestamp: /\[\d{1,2}:\d{1,2}:\d{1,2}\]\ ?/,
    annotation: /\[.+?\]\ ?/
  };
};

const proto = TranscriptParser.prototype;

/***********************
 * Private helpers
 ***********************/
//Own-property check that is safe for keys such as "constructor"
function hasOwn(object, key) {
  return Object.prototype.hasOwnProperty.call(object, key);
}

//Removes every occurrence of `pattern` from `text`.
//split() splits on all matches even without the g flag; the patterns used
//here contain no capture groups, so nothing extra is spliced back in.
function stripAll(text, pattern) {
  return text.split(pattern).join('');
}

//Returns the first true name whose pattern list matches `name`, or null.
//String#search is used because it ignores lastIndex, so patterns carrying
//the g flag behave the same on every call.
function findTrueName(aliases, name) {
  for (const trueName of Object.keys(aliases)) {
    const patterns = aliases[trueName] || [];
    const matched = Object.keys(patterns)
      .some(key => name.search(patterns[key]) !== -1);
    if (matched) return trueName;
  }
  return null;
}

//Hands the outcome of `pending` to a node-style callback when one is given.
//The two-argument then() guarantees the callback runs exactly once: an
//exception thrown by the callback rejects the returned promise instead of
//triggering a second invocation.
function deliver(pending, cb) {
  if (typeof cb !== 'function') return pending;
  return pending.then(
    result => { cb(null, result); },
    err => { cb(err); }
  );
}

/***********************
 * Synchronous parseOne method
 *
 * Splits `transcript` into non-blank lines, strips actions/annotations/
 * timestamps according to the settings and groups the remaining text by
 * speaker. Lines without a speaker prefix belong to the most recent speaker
 * ('none' until one is seen).
 *
 * Returns {speaker: {name: [lines]}, order: [name per line]}.
 * Throws a TypeError when `transcript` is not a string.
 ***********************/
proto.parseOneSync = function (transcript) {
  const settings = this.settings;
  const regex = this.regex;

  //Remove blank lines
  let lines = transcript.split(regex.newLine)
    .filter(line => line.length > 0);

  if (settings.removeActions) {
    lines = lines.map(line => stripAll(line, regex.action));
  }
  if (settings.removeAnnotations) {
    //Annotations include timestamps
    lines = lines.map(line => stripAll(line, regex.annotation));
  } else if (settings.removeTimestamps) {
    lines = lines.map(line => stripAll(line, regex.timestamp));
  }

  //speaker: the speakers and their lines; order: the speaker of each line
  const output = { speaker: {}, order: [] };
  //Current speaker
  let speaker = 'none';

  for (const line of lines) {
    let text = line;
    const match = regex.speaker.exec(line);
    if (match) {
      speaker = match[1];
      //The pattern is anchored at the start, so drop the matched prefix
      text = line.slice(match[0].length);
    }
    //Skip text that has no known speaker when configured to do so
    if (speaker === 'none' && settings.removeUnknownSpeakers) continue;

    if (!hasOwn(output.speaker, speaker)) {
      output.speaker[speaker] = [];
    }
    output.speaker[speaker].push(text);
    output.order.push(speaker);
  }
  return output;
};

/***********************
 * Asynchronous parseOne method
 *
 * Same result as parseOneSync. With a callback, it is called once as
 * cb(err, output) and the returned promise resolves to undefined; without
 * one, the returned promise resolves to the output. Invalid input is
 * reported through the callback/promise rather than thrown.
 ***********************/
proto.parseOne = function (transcript, cb) {
  const pending = Promise.resolve().then(() => this.parseOneSync(transcript));
  return deliver(pending, cb);
};

/***********************
 * Synchronous resolveAliases method
 *
 * Renames speakers in `data` (as produced by parseOne) to the true names
 * configured in settings.aliases. `data` is modified in place and returned.
 * The first alias entry with a matching pattern wins; a speaker already
 * stored under its true name is left untouched.
 ***********************/
proto.resolveAliasesSync = function (data) {
  const aliases = this.settings.aliases;
  if (!aliases || Object.keys(aliases).length === 0) return data;

  const speakers = data.speaker;
  //Iterate over a snapshot of the keys: entries are added/removed below
  for (const name of Object.keys(speakers)) {
    const trueName = findTrueName(aliases, name);
    if (trueName === null || trueName === name) continue;

    //Append to the true name's lines if present, otherwise move the list
    speakers[trueName] = hasOwn(speakers, trueName) ?
      speakers[trueName].concat(speakers[name]) :
      speakers[name];
    //Delete the old key
    delete speakers[name];
  }

  //Fix the names in the order array
  data.order = data.order.map(name => {
    const trueName = findTrueName(aliases, name);
    return trueName === null ? name : trueName;
  });
  return data;
};

/***********************
 * Asynchronous resolveAliases method
 *
 * Same result as resolveAliasesSync. With a callback, it is called once as
 * cb(err, data) (always asynchronously) and the returned promise resolves
 * to undefined; without one, the returned promise resolves to the data.
 ***********************/
proto.resolveAliases = function (data, cb) {
  const pending = Promise.resolve().then(() => this.resolveAliasesSync(data));
  return deliver(pending, cb);
};
module.exports = TranscriptParser; | ||
module.exports = require('./lib/parser'); |
{ | ||
"name": "transcript-parser", | ||
"version": "0.2.0", | ||
"version": "0.3.0", | ||
"description": "Parses plaintext speech/debate/radio transcripts into JavaScript objects.", | ||
@@ -8,3 +8,4 @@ "main": "app.js", | ||
"test": "node ./node_modules/mocha/bin/mocha", | ||
"travis-test": "node ./node_modules/istanbul/lib/cli.js cover node_modules/mocha/bin/_mocha" | ||
"travis-test": "node ./node_modules/istanbul/lib/cli.js cover node_modules/mocha/bin/_mocha", | ||
"benchmark": "node ./benchmark/benchmark.js" | ||
}, | ||
@@ -11,0 +12,0 @@ "repository": { |
@@ -1,2 +0,6 @@ | ||
"use strict"; | ||
'use strict'; | ||
/*********************** | ||
* Test dependencies | ||
***********************/ | ||
const Promise = require('bluebird'); | ||
@@ -7,3 +11,3 @@ const path = require('path'); | ||
const chai = require('chai'); | ||
chai.should(); | ||
const should = chai.should(); | ||
@@ -13,3 +17,5 @@ const TEST_DIR = path.join(__dirname, 'transcripts'); | ||
/*********************** | ||
* Tests | ||
***********************/ | ||
describe('TranscriptParser', function() { | ||
@@ -112,12 +118,12 @@ | ||
const parser = new TranscriptParser({removeActions: false}); | ||
var result = parser.parseOne('PERSON A: Hello, (PAUSES) (DRINKS WATER) my name is Bob.(APPLAUSE)', | ||
function(err, result) { | ||
if(err) return done(err); | ||
result.speaker.should.eql({ | ||
'PERSON A': [ | ||
'Hello, (PAUSES) (DRINKS WATER) my name is Bob.(APPLAUSE)' | ||
] | ||
parser.parseOne('PERSON A: Hello, (PAUSES) (DRINKS WATER) my name is Bob.(APPLAUSE)', | ||
(err, result) => { | ||
if(err) return done(err); | ||
result.speaker.should.eql({ | ||
'PERSON A': [ | ||
'Hello, (PAUSES) (DRINKS WATER) my name is Bob.(APPLAUSE)' | ||
] | ||
}); | ||
done(); | ||
}); | ||
done(); | ||
}); | ||
}); | ||
@@ -127,11 +133,11 @@ | ||
const parser = new TranscriptParser({removeAnnotations: false, removeTimestamps: false}); | ||
var result = parser.parseOne('[20:20:34] BERMAN: [2:1:41] The...', | ||
function(err, result) { | ||
if(err) return done(err); | ||
result.speaker.should.eql({ | ||
'[20:20:34] BERMAN': [ | ||
'[2:1:41] The...' | ||
] | ||
}); | ||
done(); | ||
parser.parseOne('[20:20:34] BERMAN: [2:1:41] The...', | ||
(err, result) => { | ||
if(err) return done(err); | ||
result.speaker.should.eql({ | ||
'[20:20:34] BERMAN': [ | ||
'[2:1:41] The...' | ||
] | ||
}); | ||
done(); | ||
}); | ||
@@ -142,11 +148,9 @@ }); | ||
const parser = new TranscriptParser({removeAnnotations: false, removeTimestamps: true}); | ||
var result = parser.parseOne('[20:20:34] BERMAN: [2:1:41] The [first] name...', | ||
function(err, result) { | ||
if(err) return done(err); | ||
result.speaker.should.eql({ | ||
'BERMAN': [ | ||
'The [first] name...' | ||
] | ||
}); | ||
done(); | ||
parser.parseOne('[20:20:34] BERMAN: [2:1:41] The [first] name...', | ||
(err, result) => { | ||
if(err) return done(err); | ||
result.speaker.should.eql({ | ||
'BERMAN': ['The [first] name...'] | ||
}); | ||
done(); | ||
}); | ||
@@ -157,10 +161,10 @@ }); | ||
const parser = new TranscriptParser({removeUnknownSpeakers: true}); | ||
var result = parser.parseOne('The quick [brown] fox jumps over the (lazy) dog.', | ||
function(err, result) { | ||
if(err) return done(err); | ||
result.should.eql({ | ||
speaker: {}, | ||
order: [] | ||
}); | ||
done(); | ||
parser.parseOne('The quick [brown] fox jumps over the (lazy) dog.', | ||
(err, result) => { | ||
if(err) return done(err); | ||
result.should.eql({ | ||
speaker: {}, | ||
order: [] | ||
}); | ||
done(); | ||
}); | ||
@@ -187,2 +191,32 @@ }); | ||
// parseOne() called without a callback must return a promise that resolves
// to the parsed transcript.
it('should return a promise when callback is not set', function(done) {
  // Carries the parsed output from one promise step to the next. A local is
  // used because arrow functions ignore any bound `this`, so storing it on
  // `this` would write onto the mocha test context.
  let parsed;
  readSample(1)
    .then(info => {
      return tp.parseOne(info);
    })
    .then(result => {
      parsed = result;
      return readExpected(1);
    })
    .then(expected => {
      parsed.should.be.eql(JSON.parse(expected));
      done();
    })
    .catch(e => done(e));
});
// parseOne(null) must report an error through both the promise and the
// callback interface.
it('should handle errors properly', function(done) {
  tp.parseOne(null)
    .then(() => {
      // Reaching here means the promise fulfilled; fail explicitly so the
      // rejection handler below cannot mask it.
      throw new Error('Expected parseOne(null) to reject');
    }, err => {
      should.exist(err);
    })
    .then(() => {
      tp.parseOne(null, function(err, output) {
        should.exist(err);
        should.not.exist(output);
        done();
      });
    })
    .catch(e => done(e));
});
}); | ||
@@ -252,2 +286,22 @@ | ||
// resolveAliases() called without a callback must return a promise that
// resolves to the alias-resolved data.
it('should return a promise when callback is not set', function(done) {
  const tp = new TranscriptParser({
    aliases: { "DONALD TRUMP": [ /.*TRUMP.*/ ] }
  });
  // Carries the resolved output from one promise step to the next. A local
  // is used because arrow functions ignore any bound `this`, so storing it
  // on `this` would write onto the mocha test context.
  let resolved;
  readSample(2)
    .then(info => {
      return tp.parseOne(info);
    }).then(result => {
      return tp.resolveAliases(result);
    }).then(result => {
      resolved = result;
      return readExpected(2);
    }).then(expected => {
      resolved.should.eql(JSON.parse(expected));
      done();
    })
    .catch(e => done(e));
});
it('should return unchanged data if aliases are not set', function(done) { | ||
@@ -261,5 +315,10 @@ const tp = new TranscriptParser({aliases: {}}); | ||
this.parsed = parsed; | ||
//With callback | ||
return Promise.fromCallback(cb => tp.resolveAliases(parsed, cb)); | ||
}).then(resolved => { | ||
this.parsed.should.equal(resolved); | ||
//With Promise | ||
return tp.resolveAliases(this.parsed); | ||
}).then(resolved => { | ||
this.parsed.should.equal(resolved); | ||
done(); | ||
@@ -269,2 +328,20 @@ }) | ||
}); | ||
// resolveAliases(null) must report an error through both the promise and the
// callback interface.
it('should handle errors properly', function(done) {
  const tp = new TranscriptParser({
    aliases: { "DONALD TRUMP": [ /.*TRUMP.*/ ] }
  });
  tp.resolveAliases(null)
    .then(() => {
      // Reaching here means the promise fulfilled; fail explicitly so the
      // rejection handler below cannot mask it.
      throw new Error('Expected resolveAliases(null) to reject');
    }, err => {
      should.exist(err);
    })
    .then(() => {
      tp.resolveAliases(null, (err, output) => {
        should.exist(err);
        should.not.exist(output);
        done();
      });
    })
    .catch(e => done(e));
});
}); | ||
@@ -271,0 +348,0 @@ |
@@ -1,2 +0,6 @@ | ||
"use strict"; | ||
'use strict'; | ||
/*********************** | ||
* Test dependencies | ||
***********************/ | ||
const TranscriptParser = require('../app.js'); | ||
@@ -7,2 +11,5 @@ const chai = require('chai'); | ||
/*********************** | ||
* Tests | ||
***********************/ | ||
describe('TranscriptParser', function() { | ||
@@ -9,0 +16,0 @@ const transcriptParser = new TranscriptParser(); |
@@ -9,2 +9,3 @@ FREDERICK RYAN JR., WASHINGTON POST PUBLISHER: Mr. Trump, welcome to the Washington Post... | ||
FRED HIATT, WASHINGTON POST EDITORIAL PAGE EDITOR: Do you want to start out? | ||
FRED HIATT, WASHINGTON POST EDITORIAL PAGE EDITOR: Do you want to start out? | ||
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Major refactor
Supply chain risk: This package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
483357
19
675
2
1