transcript-parser
Advanced tools
Comparing version 0.3.0 to 0.4.0
@@ -10,2 +10,3 @@ 'use strict'; | ||
const TranscriptParser = require('../app.js'); | ||
const Readable = require('stream').Readable; | ||
@@ -18,3 +19,3 @@ const tp = new TranscriptParser(); | ||
var s = new Readable(); | ||
/*********************** | ||
@@ -48,5 +49,10 @@ * Benchmarks | ||
console.log('Async Parse #3:', msg); | ||
}).catch(e => console.error(e)); | ||
s.push(firstTranscript, 'utf8'); | ||
s.push(null); | ||
console.log('Starting stream parse'); | ||
return timePromise(() => Promise.fromCallback(cb => tp.parseStream(s, cb)) ); | ||
}).then(msg => { | ||
console.log('Stream Parse #1:', msg); | ||
}).catch(e => console.error(e.stack)); | ||
/*********************** | ||
@@ -90,7 +96,7 @@ * Functions | ||
function readTranscript(name) { | ||
return fs.readFileAsync(__dirname + '/transcripts/'+name+'.txt', {encoding: 'UTF-8'}); | ||
return fs.readFileAsync(__dirname + '/transcripts/'+name+'.txt', {encoding: 'utf8'}); | ||
} | ||
function readTranscriptSync(name) { | ||
return fs.readFileSync(__dirname + '/transcripts/'+name+'.txt', {encoding: 'UTF-8'}); | ||
return fs.readFileSync(__dirname + '/transcripts/'+name+'.txt', {encoding: 'utf8'}); | ||
} |
@@ -8,2 +8,3 @@ 'use strict'; | ||
const Promise = require('bluebird'); | ||
const byline = require('byline'); | ||
@@ -21,12 +22,13 @@ | ||
removeUnknownSpeakers: false, | ||
aliases: {} | ||
aliases: {}, | ||
regex: { | ||
newLine: /\r?\n/, | ||
action: /\([A-Z\ ]+\)\ ?/, | ||
speaker: /^((?:\[\d{1,2}:\d{1,2}:\d{1,2}\]\ ?)?[A-Z\d\ \/,.\-\(\)]+?)(?:\ ?\[[A-z\ ]+\])? ?:\ ?/, | ||
timestamp: /\ ?\[\d{1,2}:\d{1,2}:\d{1,2}\]\ ?/, | ||
annotation: /\[.+?\]\ ?/ | ||
} | ||
}; | ||
this.settings = _.assign(this.defaultSettings, options); | ||
this.regex = { | ||
newLine: /\r?\n/, | ||
action: /\([A-Z\ ]+\)\ ?/, | ||
speaker: /^((?:\[\d{1,2}:\d{1,2}:\d{1,2}\]\ ?)?[A-Z\d\ \/,.\-\(\)]+)(?: \[.+\])?:\ ?/, | ||
timestamp: /\[\d{1,2}:\d{1,2}:\d{1,2}\]\ ?/, | ||
annotation: /\[.+?\]\ ?/ | ||
}; | ||
this.regex = this.settings.regex; | ||
}; | ||
@@ -45,10 +47,11 @@ | ||
.filter(line => line.length > 0); //Remove blank lines | ||
lines = (this.settings.removeActions) ? lines.map(line => line.split(this.regex.action).join('')): lines; | ||
lines = this.settings.removeActions ? lines.map(line => removeAll(line, this.regex.action)): lines; | ||
if(this.settings.removeAnnotations) { | ||
//Remove annotations | ||
lines = lines.map(line => line.split(this.regex.annotation).join('')); | ||
lines = lines.map(line => removeAll(line, this.regex.annotation)); | ||
} else if(this.settings.removeTimestamps) { | ||
//Remove timestamps | ||
lines = lines.map(line => line.split(this.regex.timestamp).join('')); | ||
lines = lines.map(line => removeAll(line, this.regex.timestamp)); | ||
} | ||
lines = lines.filter(line => line.length > 0); //Remove newly blank lines
@@ -67,4 +70,4 @@ //Output object | ||
if(lines[i].match(this.regex.speaker)) { | ||
//Regex match | ||
speaker = this.regex.speaker.exec(lines[i])[1]; | ||
//Regex match - is speaker | ||
speaker = this.regex.speaker.exec(lines[i])[1].trim(); | ||
//Remove the speaker from the line | ||
@@ -110,3 +113,3 @@ lines[i] = lines[i].replace(this.regex.speaker, ''); | ||
if(this.settings.removeActions) { | ||
return Promise.map(lines, line => line.split(this.regex.action).join('')); | ||
return Promise.map(lines, line => removeAll(line, this.regex.action)); | ||
} | ||
@@ -117,13 +120,18 @@ return Promise.resolve(lines); | ||
//Remove annotations | ||
return Promise.map(lines, line => line.split(this.regex.annotation).join('')); | ||
return Promise.map(lines, line => removeAll(line, this.regex.annotation)); | ||
} else if(this.settings.removeTimestamps) { | ||
//Remove timestamps | ||
return Promise.map(lines, line => line.split(this.regex.timestamp).join('')); | ||
return Promise.map(lines, line => removeAll(line, this.regex.timestamp)); | ||
} | ||
return Promise.resolve(lines); | ||
}).then(lines => { | ||
}) | ||
.then(lines => { | ||
//Remove newly blank lines | ||
return Promise.filter(lines, line => line.length > 0); | ||
}) | ||
.then(lines => { | ||
return Promise.each(lines, (line) => { | ||
if(line.match(this.regex.speaker)) { | ||
//Regex match | ||
speaker = this.regex.speaker.exec(line)[1]; | ||
speaker = this.regex.speaker.exec(line)[1].trim(); | ||
//Remove the speaker from the line | ||
@@ -258,1 +266,62 @@ line = line.replace(this.regex.speaker, ''); | ||
}; | ||
/**
 * Parses a transcript from a readable stream, one line at a time.
 *
 * The input is split into lines with `byline`; each line is passed through
 * `filterLine()` and then attributed to the most recently seen speaker.
 *
 * @param {stream.Readable} inputStream - The stream containing the transcript text.
 * @param {function(?Error, Object=)} cb - Called exactly once: with an error if the
 *   stream (or line handling) fails, otherwise with `{speaker: {...}, order: [...]}`
 *   once the stream has ended.
 */
proto.parseStream = function(inputStream, cb) {
  const stream = byline.createStream(inputStream);
  //Output object
  const output = {
    //Object containing the speakers and their lines
    speaker: {},
    //List of the speakers, in order
    order: []
  };
  var speaker = 'none';

  //Make sure the callback is only ever invoked once
  var finished = false;
  const finish = (err) => {
    if(finished) return;
    finished = true;
    if(err) return cb(err);
    cb(null, output);
  };

  //Filters a single raw line and records it under the current speaker
  const handleLine = (line) => {
    var filteredLine = this.filterLine(line);
    if(!filteredLine) return;

    if(filteredLine.match(this.regex.speaker)) {
      //Regex match - is speaker
      speaker = this.regex.speaker.exec(filteredLine)[1].trim();
      //Remove the speaker from the line
      filteredLine = filteredLine.replace(this.regex.speaker, '');
    }

    //Skip lines without a known speaker when the setting asks for it
    if(speaker === 'none' && this.settings.removeUnknownSpeakers) return;

    //If the speaker's key doesn't already exist, start a new list of lines
    if(!(speaker in output.speaker)) {
      output.speaker[speaker] = [];
    }
    //Add the text to the output speaker's key and speaker name to the order array
    output.speaker[speaker].push(filteredLine);
    output.order.push(speaker);
  };

  stream.on('readable', () => {
    var line;
    try {
      //read() returns null when the buffer is empty, which does not necessarily
      //mean the stream has ended - so drain here and wait for 'end' to finish
      while((line = stream.read()) !== null) {
        handleLine(line);
      }
    } catch(e) {
      finish(e);
    }
  });
  stream.on('end', () => finish());
  stream.on('error', finish);
  //pipe() does not forward errors, so listen on the source stream as well
  inputStream.on('error', finish);
};
//Applies the configured removals (actions, annotations, timestamps) to one line
//Non-string input is converted with toString() first
//Returns null when nothing is left of the line after filtering
proto.filterLine = function(line) {
  const settings = this.settings;
  const patterns = this.regex;
  var text = (typeof line === 'string') ? line : line.toString();

  if(settings.removeActions) {
    text = removeAll(text, patterns.action);
  }

  //Annotations take precedence; timestamps are only stripped separately
  //when annotations are being kept
  if(settings.removeAnnotations) {
    text = removeAll(text, patterns.annotation);
  } else if(settings.removeTimestamps) {
    text = removeAll(text, patterns.timestamp);
  }

  return (text.length > 0) ? text : null;
};
/**
 * Removes every match of a pattern from a piece of text.
 *
 * @param {string} text - The text to strip matches from.
 * @param {RegExp|string} regex - The pattern (or literal substring) to remove.
 * @returns {string} The text with all matches removed.
 */
function removeAll(text, regex) {
  //Literal (non-RegExp) separators keep the original split/join behaviour
  if(!(regex instanceof RegExp)) {
    return text.split(regex).join('');
  }
  //split() splices captured groups back into its result, so a pattern with a
  //capture group would not actually be removed; a global replace avoids that.
  //A fresh RegExp is built so the caller's object (and its lastIndex) is untouched.
  const flags = regex.flags.replace(/[gy]/g, '') + 'g';
  return text.replace(new RegExp(regex.source, flags), '');
}
{ | ||
"name": "transcript-parser", | ||
"version": "0.3.0", | ||
"version": "0.4.0", | ||
"description": "Parses plaintext speech/debate/radio transcripts into JavaScript objects.", | ||
@@ -38,4 +38,5 @@ "main": "app.js", | ||
"bluebird": "^3.3.4", | ||
"byline": "^4.2.1", | ||
"lodash": "^4.9.0" | ||
} | ||
} |
@@ -18,5 +18,21 @@ transcript-parser | ||
//Do not use fs.readFileSync in production | ||
const output = tp.parseOneSync(fs.readFileSync('transcript.txt', {encoding: 'UTF-8'})); | ||
console.log(output); | ||
//Synchronous example | ||
const parsed = tp.parseOneSync(fs.readFileSync('transcript.txt', {encoding: 'UTF-8'})); | ||
console.log(parsed); | ||
//Asynchronous example
//The file is read as a UTF-8 string, matching the synchronous example above
fs.readFile('transcript.txt', {encoding: 'utf8'}, (err, data) => {
  if(err) return console.error('Error:', err);
  //parseOne takes a Node-style callback: (err, parsed)
  tp.parseOne(data, (err, parsed) => {
    if(err) return console.error('Error:', err);
    console.log(parsed);
  });
});
//Stream example | ||
const stream = fs.createReadStream('transcript.txt', 'utf8'); | ||
tp.parseStream(stream, (err, parsed) => { | ||
if(err) return console.error('Error:', err); | ||
console.log(parsed); | ||
}); | ||
@@ -52,2 +68,20 @@ | ||
### .parseStream() | ||
The `parseStream()` method parses a [`Stream`](https://nodejs.org/api/stream.html) and returns an object representing it. | ||
This is the preferred method for parsing large transcripts asynchronously, as it does not have to load the entire transcript into memory (unlike `parseOne()`).
#### Syntax | ||
`tp.parseStream(stream, callback)`
##### Parameters | ||
- `stream` | ||
+ The `Readable` stream to read. | ||
- `callback(err, data)` | ||
+ A callback to be executed on function completion or error. | ||
### .parseOneSync() | ||
@@ -80,3 +114,3 @@ | ||
- `callback(err, data)` | ||
+ A callback to be exectuted on function completion. | ||
+ A callback to be executed on function completion or error.
@@ -115,3 +149,3 @@ | ||
- `callback(err, resolved)` | ||
+ A callback to be executed on function completion. | ||
+ A callback to be executed on function completion or error. | ||
@@ -21,2 +21,22 @@ 'use strict'; | ||
/*
 * For the stream-based parseStream method
 */
describe('#parseStream()', function() {
  const tp = new TranscriptParser();

  it('should parse a transcript correctly', function() {
    const stream = fs.createReadStream(path.join(TEST_DIR, '1.txt'), 'utf8');
    //Returning the promise lets mocha report a rejection as the test failure,
    //so no done() callback or shared state on the test context is needed
    return Promise.fromCallback(cb => tp.parseStream(stream, cb))
      .then(result => {
        return readExpected(1).then(expected => {
          result.should.be.eql(JSON.parse(expected));
        });
      });
  });
});
/* | ||
@@ -61,3 +81,3 @@ * For the synchronous parseOne method | ||
const parser = new TranscriptParser({removeAnnotations: false, removeTimestamps: true}); | ||
var result = parser.parseOneSync('[20:20:34] BERMAN: [2:1:41] The [first] name...'); | ||
var result = parser.parseOneSync('[20:20:34] BERMAN [2:1:41] : The [first] name...'); | ||
result.speaker.should.eql({ | ||
@@ -146,3 +166,3 @@ 'BERMAN': [ | ||
const parser = new TranscriptParser({removeAnnotations: false, removeTimestamps: true}); | ||
parser.parseOne('[20:20:34] BERMAN: [2:1:41] The [first] name...', | ||
parser.parseOne('[20:20:34] BERMAN: The [first] name...', | ||
(err, result) => { | ||
@@ -345,7 +365,7 @@ if(err) return done(err); | ||
function readSample(sampleName) { | ||
return fs.readFileAsync(path.join(TEST_DIR, sampleName+'.txt'), {encoding: 'UTF-8'}); | ||
return fs.readFileAsync(path.join(TEST_DIR, sampleName+'.txt'), {encoding: 'utf8'}); | ||
} | ||
function readExpected(expectedName) { | ||
return fs.readFileAsync(path.join(EXPECTED_DIR, expectedName+'.txt'), {encoding: 'UTF-8'}); | ||
return fs.readFileAsync(path.join(EXPECTED_DIR, expectedName+'.txt'), {encoding: 'utf8'}); | ||
} |
@@ -60,3 +60,3 @@ 'use strict'; | ||
'[20:20:34] BERMAN: [2:1:41] The...'.split(regex.timestamp).join('') | ||
.should.equal('BERMAN: The...'); | ||
.should.equal('BERMAN:The...'); | ||
}); | ||
@@ -63,0 +63,0 @@ }); |
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is too big to display
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
487547
761
148
0
3
+ Addedbyline@^4.2.1
+ Addedbyline@4.2.2(transitive)