transcript-parser
Advanced tools
Comparing version 0.5.0 to 0.6.0
'use strict'; | ||
module.exports = require('./lib/parser'); | ||
module.exports = require('./lib/parser'); |
@@ -8,3 +8,3 @@ 'use strict'; | ||
const Promise = require('bluebird'); | ||
const byline = require('byline'); | ||
const readline = require('readline'); | ||
@@ -15,3 +15,3 @@ | ||
***********************/ | ||
const TranscriptParser = function (options) { | ||
const TranscriptParser = function(options) { | ||
options = options || {}; | ||
@@ -46,14 +46,17 @@ this.defaultSettings = { | ||
proto.parseOneSync = function(transcript) { | ||
var lines = transcript.split(this.regex.newLine) | ||
.filter(line => line.length > 0); //Remove blank lines | ||
lines = this.settings.removeActions ? lines.map(line => removeAll(line, this.regex.action)): lines; | ||
if(this.settings.removeAnnotations) { | ||
//Remove annotations | ||
lines = lines.map(line => removeAll(line, this.regex.annotation)); | ||
} else if(this.settings.removeTimestamps) { | ||
//Remove timestamps | ||
lines = lines.map(line => removeAll(line, this.regex.timestamp)); | ||
} | ||
lines = lines.filter(line => line.length > 0); //Remove newly blank lines | ||
let lines = transcript.split(this.regex.newLine); | ||
lines = _.map(lines, line => { | ||
if(line.length <= 0) return ''; | ||
if(this.settings.removeActions) | ||
line = removeAll(line, this.regex.action); | ||
if(this.settings.removeAnnotations) | ||
line = removeAll(line, this.regex.annotation); | ||
else if(this.settings.removeTimestamps) | ||
line = removeAll(line, this.regex.timestamp); | ||
return line; | ||
}); | ||
lines = _.filter(lines, line => (line.length > 0)); //Remove newly blank lines | ||
//Output object | ||
@@ -66,17 +69,17 @@ const output = {}; | ||
//Current speaker | ||
var speaker = 'none'; | ||
//Are we ignoring the line because of a blacklisted speaker? | ||
var ignore = false; | ||
let speaker = 'none'; //Current speaker | ||
let ignore = false; //Are we ignoring the line because of a blacklisted speaker? | ||
let match; | ||
for(var i = 0; i < lines.length; i++) { | ||
if(lines[i].match(this.regex.speaker)) { | ||
//Regex match - is speaker | ||
speaker = this.regex.speaker.exec(lines[i])[1].trim(); | ||
_.each(lines, (line) => { | ||
if((match = this.regex.speaker.exec(line)) !== null) { | ||
//Regex match | ||
speaker = match[1].trim(); | ||
//Remove the speaker from the line | ||
lines[i] = lines[i].replace(this.regex.speaker, ''); | ||
//Ignore the speaker if he is in our blacklist | ||
line = line.replace(this.regex.speaker, ''); | ||
ignore = (this.settings.blacklist.indexOf(speaker) > -1); | ||
} | ||
if(ignore || (speaker === 'none' && this.settings.removeUnknownSpeakers)) continue; | ||
//If speaker was blacklisted, return | ||
if(ignore || (speaker === 'none' && this.settings.removeUnknownSpeakers)) return; | ||
//If the speaker's key doesn't already exist | ||
@@ -88,5 +91,5 @@ if(!(speaker in output.speaker)) { | ||
//Add the text to the output speaker's key and speaker name to the order array | ||
output.speaker[speaker].push(lines[i]); | ||
output.speaker[speaker].push(line); | ||
output.order.push(speaker); | ||
} | ||
}); | ||
return output; | ||
@@ -99,3 +102,3 @@ }; | ||
proto.parseOne = function(transcript, cb) { | ||
const hasCallback = (typeof cb !== 'undefined' && cb !== null); | ||
const hasCallback = (typeof cb !== 'undefined' && cb !== null); | ||
//Output object | ||
@@ -107,60 +110,46 @@ const output = {}; | ||
output.order = []; | ||
//Current speaker | ||
var speaker = 'none'; | ||
let speaker = 'none'; //Current speaker | ||
//Convert synchronous errors to asynchronous ones | ||
try { | ||
return Promise.try(() => { | ||
//Remove blank lines | ||
return Promise.filter(transcript.split(this.regex.newLine), line => line.length > 0) | ||
.then(lines => { | ||
if(this.settings.removeActions) { | ||
return Promise.map(lines, line => removeAll(line, this.regex.action)); | ||
} | ||
return Promise.resolve(lines); | ||
}).then(lines => { | ||
if(this.settings.removeAnnotations) { | ||
//Remove annotations | ||
return Promise.map(lines, line => removeAll(line, this.regex.annotation)); | ||
} else if(this.settings.removeTimestamps) { | ||
//Remove timestamps | ||
return Promise.map(lines, line => removeAll(line, this.regex.timestamp)); | ||
} | ||
return Promise.resolve(lines); | ||
}) | ||
.then(lines => { | ||
//Remove newly blank lines | ||
return Promise.filter(lines, line => line.length > 0); | ||
}) | ||
.then(lines => { | ||
var ignore = false; | ||
return Promise.each(lines, (line) => { | ||
if(line.match(this.regex.speaker)) { | ||
//Regex match | ||
speaker = this.regex.speaker.exec(line)[1].trim(); | ||
//Remove the speaker from the line | ||
line = line.replace(this.regex.speaker, ''); | ||
ignore = (this.settings.blacklist.indexOf(speaker) > -1); | ||
} | ||
//If speaker was blacklisted, return | ||
if(ignore || (speaker === 'none' && this.settings.removeUnknownSpeakers)) return; | ||
//If the speaker's key doesn't already exist | ||
if(!(speaker in output.speaker)) { | ||
//Set the output's speaker key to a new empty array | ||
output.speaker[speaker] = []; | ||
} | ||
//Add the text to the output speaker's key and speaker name to the order array | ||
output.speaker[speaker].push(line); | ||
output.order.push(speaker); | ||
}); | ||
}).then(() => { | ||
if(hasCallback) cb(null, output); | ||
return Promise.resolve(output); | ||
}).catch(err => { | ||
if(hasCallback) cb(err); | ||
else return this.reject(err); | ||
}); | ||
} catch(err) { | ||
if(hasCallback) cb(err); | ||
else return Promise.reject(err); | ||
} | ||
let lines = _.filter(transcript.split(this.regex.newLine), line => (line.length > 0)); | ||
let ignore = false, match = null; | ||
lines = _.map(lines, line => { | ||
if(this.settings.removeActions) | ||
line = removeAll(line, this.regex.action); | ||
if(this.settings.removeAnnotations) | ||
line = removeAll(line, this.regex.annotation); | ||
else if(this.settings.removeTimestamps) | ||
line = removeAll(line, this.regex.timestamp); | ||
return line; | ||
}); | ||
lines = _.filter(lines, line => (line.length > 0)); //Remove newly blank lines | ||
_.each(lines, (line) => { | ||
if((match = this.regex.speaker.exec(line)) !== null) { | ||
//Regex match | ||
speaker = match[1].trim(); | ||
//Remove the speaker from the line | ||
line = line.replace(this.regex.speaker, ''); | ||
ignore = (this.settings.blacklist.indexOf(speaker) > -1); | ||
} | ||
//If speaker was blacklisted, return | ||
if(ignore || (speaker === 'none' && this.settings.removeUnknownSpeakers)) return; | ||
//If the speaker's key doesn't already exist | ||
if(!(speaker in output.speaker)) { | ||
//Set the output's speaker key to a new empty array | ||
output.speaker[speaker] = []; | ||
} | ||
//Add the text to the output speaker's key and speaker name to the order array | ||
output.speaker[speaker].push(line); | ||
output.order.push(speaker); | ||
}); | ||
if(hasCallback) cb(null, output); | ||
return Promise.resolve(output); | ||
}).catch(err => { | ||
if(hasCallback) cb(err); | ||
else return Promise.reject(err); | ||
}); | ||
}; | ||
@@ -183,7 +172,7 @@ | ||
//Add the lines from the regex-matched speaker | ||
//to the new speaker if the new speaker exists | ||
//to the new speaker if the new speaker exists | ||
speakers[newName] = _.concat(lines, speakers[newName]); | ||
} else { | ||
//Otherwise, make a new list | ||
speakers[newName] = lines.slice(); | ||
//Otherwise, make a new list | ||
speakers[newName] = lines; | ||
} | ||
@@ -198,3 +187,3 @@ //Delete the old key | ||
}); | ||
//Fix the names in the order array | ||
@@ -223,10 +212,8 @@ data.order = data.order.map(speaker => { | ||
if(_.isEmpty(aliases)) { | ||
if(hasCallback) cb(null, data); | ||
return Promise.resolve(data); | ||
if(hasCallback) return cb(null, data); | ||
else return Promise.resolve(data); | ||
} | ||
//Convert synchronous errors to asynchronous ones | ||
try { | ||
return Promise.try(() => { | ||
const speakers = data.speaker; | ||
return Promise.all(_.keys(speakers).map(speakerName => { | ||
@@ -241,4 +228,4 @@ return Promise.all(_.keys(aliases).map(trueName => { | ||
_.concat(speakers[speakerName], speakers[trueName]) : | ||
//Otherwise, make a new list | ||
speakers[trueName] = speakers[speakerName]; | ||
//Otherwise, make a new list | ||
speakers[speakerName]; | ||
//Delete the old key | ||
@@ -250,28 +237,27 @@ delete speakers[speakerName]; | ||
})); | ||
})).then(() => { | ||
return Promise.each(data.order, (speaker, speakerIndex) => { | ||
return Promise.all(_.map(aliases, (alias, trueName) => { | ||
return Promise.all(_.map(alias, (regex) => { | ||
if(speaker.search(regex) !== -1) { | ||
data.order[speakerIndex] = trueName; | ||
return; | ||
} | ||
})); | ||
})); | ||
}).then(() => { | ||
return Promise.each(data.order, (speaker, speakerIndex) => { | ||
return Promise.all(_.map(aliases, (alias, trueName) => { | ||
return Promise.all(_.map(alias, (regex) => { | ||
if(speaker.search(regex) !== -1) { | ||
data.order[speakerIndex] = trueName; | ||
return; | ||
} | ||
})); | ||
}); | ||
}).then(() => { | ||
if(hasCallback) cb(null, data); | ||
return Promise.resolve(data); | ||
}).catch(err => { | ||
if(hasCallback) cb(err); | ||
else return this.reject(err); | ||
})); | ||
}); | ||
} catch(err) { | ||
}).then(() => { | ||
if(hasCallback) cb(null, data); | ||
return Promise.resolve(data); | ||
}).catch(err => { | ||
if(hasCallback) cb(err); | ||
else return Promise.reject(err); | ||
} | ||
}); | ||
}; | ||
proto.parseStream = function(inputStream, cb) { | ||
const stream = byline.createStream(inputStream); | ||
const lineStream = readline.createInterface({ | ||
input: inputStream | ||
}); | ||
//Output object | ||
@@ -283,32 +269,30 @@ const output = {}; | ||
output.order = []; | ||
var speaker = 'none'; | ||
var ignore = false; | ||
stream.on('readable', () => { | ||
const line = stream.read(); | ||
if(line === null) return cb(null, output); | ||
let speaker = 'none'; | ||
let ignore = false, match; | ||
var filteredLine = this.filterLine(line); | ||
if(filteredLine) { | ||
if(filteredLine.match(this.regex.speaker)) { | ||
//Regex match - is speaker | ||
speaker = this.regex.speaker.exec(filteredLine)[1].trim(); | ||
//Remove the speaker from the line | ||
filteredLine = filteredLine.replace(this.regex.speaker, ''); | ||
ignore = (this.settings.blacklist.indexOf(speaker) > -1); | ||
} | ||
if(ignore || (speaker === 'none' && this.settings.removeSpeakers)) return; | ||
//If the speaker's key doesn't already exist | ||
if(!(speaker in output.speaker)) { | ||
//Set the output's speaker key to a new empty array | ||
output.speaker[speaker] = []; | ||
} | ||
//If the speaker is defined or the setting to remove undefined speakers is false | ||
if(!this.settings.removeUnknownSpeakers) { | ||
//Add the text to the output speaker's key and speaker name to the order array | ||
output.speaker[speaker].push(filteredLine); | ||
output.order.push(speaker); | ||
} | ||
} | ||
lineStream.on('line', line => { | ||
if(line === null) return; | ||
line = this.filterLine(line); | ||
if(!line) return; | ||
if((match = this.regex.speaker.exec(line)) !== null) { | ||
speaker = match[1].trim(); //Regex match - is speaker | ||
//Remove the speaker from the line | ||
line = line.replace(this.regex.speaker, ''); | ||
ignore = (this.settings.blacklist.indexOf(speaker) > -1); | ||
} | ||
if(ignore || (speaker === 'none' && this.settings.removeUnknownSpeakers)) return; | ||
//If the speaker's key doesn't already exist | ||
if(!(speaker in output.speaker)) { | ||
//Set the output's speaker key to a new empty array | ||
output.speaker[speaker] = []; | ||
} | ||
//If the speaker is defined or the setting to remove undefined speakers is false | ||
//Add the text to the output speaker's key and speaker name to the order array | ||
output.speaker[speaker].push(line); | ||
output.order.push(speaker); | ||
}).on('close', () => { | ||
return cb(null, output); | ||
}); | ||
@@ -321,9 +305,10 @@ | ||
proto.filterLine = function(line) { | ||
if(typeof line !== 'string') {line = line.toString();} | ||
line = this.settings.removeActions ? removeAll(line, this.regex.action) : line; | ||
if(this.settings.removeAnnotations) { | ||
if(typeof line !== 'string') line = line.toString(); | ||
if(this.settings.removeActions) | ||
line = removeAll(line, this.regex.action); | ||
if(this.settings.removeAnnotations) | ||
line = removeAll(line, this.regex.annotation); | ||
} else if(this.settings.removeTimestamps) { | ||
else if(this.settings.removeTimestamps) | ||
line = removeAll(line, this.regex.timestamp); | ||
} | ||
if(line.length <= 0) return null; | ||
@@ -335,2 +320,2 @@ return line; | ||
return text.split(regex).join(''); | ||
} | ||
} |
{ | ||
"name": "transcript-parser", | ||
"version": "0.5.0", | ||
"version": "0.6.0", | ||
"description": "Parses plaintext speech/debate/radio transcripts into JavaScript objects.", | ||
"main": "app.js", | ||
"scripts": { | ||
"test": "node ./node_modules/mocha/bin/mocha", | ||
"travis-test": "node ./node_modules/istanbul/lib/cli.js cover node_modules/mocha/bin/_mocha", | ||
"test": "mocha", | ||
"travis-test": "node ./node_modules/istanbul/lib/cli.js cover ./node_modules/mocha/bin/_mocha", | ||
"benchmark": "node ./benchmark/benchmark.js" | ||
@@ -30,2 +30,3 @@ }, | ||
"devDependencies": { | ||
"benchmark": "^2.1.1", | ||
"chai": "^3.5.0", | ||
@@ -39,5 +40,4 @@ "chai-as-promised": "^5.3.0", | ||
"bluebird": "^3.3.4", | ||
"byline": "^4.2.1", | ||
"lodash": "^4.9.0" | ||
} | ||
} |
@@ -8,4 +8,6 @@ transcript-parser | ||
Parses plaintext speech/debate/radio transcripts into JavaScript objects. It is still in early development and is not stable. Pull requests are welcome. | ||
Parses plaintext speech/debate/radio transcripts into JavaScript objects. It is still in early development. Pull requests are welcome. | ||
Tested for Node.js versions >= 4.4.6 | ||
## Usage | ||
@@ -23,3 +25,3 @@ | ||
//Asyncronous example | ||
//Asynchronous example | ||
fs.readFile('transcript.txt', (err, data) => { | ||
@@ -60,2 +62,6 @@ if(err) return console.error('Error:', err); | ||
+ Specifies if the parser should remove lines that have no associated speaker. | ||
+ If false, lines that have no associated speaker will be stored under the key `none`. | ||
- `blacklist` | ||
+ default: `[]` | ||
+ A list of speakers (as strings) that the parser should ignore. | ||
- `aliases` | ||
@@ -62,0 +68,0 @@ + default: `{}` |
Sorry, the diff of this file is not supported yet
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Major refactor
Supply chain risk: This package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Filesystem access
Supply chain risk: This package accesses the file system and could potentially read sensitive data.
Found 1 instance in 1 package
2
154
0
17844
6
6
275
1
- Removed byline@^4.2.1
- Removed byline@4.2.2 (transitive)