transcript-parser
Advanced tools
Comparing version 0.0.1 to 0.0.2
38
app.js
@@ -15,3 +15,6 @@ "use strict"; | ||
this.defaultSettings = { | ||
removeActions: true | ||
removeActions: true, | ||
removeAnnotations: true, | ||
removeTimestamps: true, //Overridden by removeAnnotations | |
removeUnknownSpeaker: false | ||
}; | ||
@@ -21,4 +24,4 @@ this.settings = _.assign(this.defaultSettings, options); | ||
newLine: /\r?\n/, | ||
newLineOrAction: /(?:\r?\n|\([A-Z\ ]+\))/, | ||
speaker: /^([A-Z\d\ \/,.\-\(\)]+)(?: \[.+\])?:/, | ||
action: /\([A-Z\ ]+\)\ ?/, | ||
speaker: /^([A-Z\d\ \/,.\-\(\)]+)(?: \[.+\])?:\ ?/, | ||
timestamp: /\[\d{1,2}:\d{1,2}:\d{1,2}\]\ ?/, | ||
@@ -33,12 +36,21 @@ annotation: /\[.+?\]\ ?/ | ||
proto.parseOne = function(transcript) { | ||
const lines = transcript.split(this.settings.removeActions? | ||
this.regex.newLineOrAction : this.regex.newLine) | ||
//Remove blank lines | ||
.filter(line => line.length > 0) | ||
var lines = transcript.split(this.regex.newLine) | ||
.filter(line => line.length > 0); //Remove blank lines | ||
lines = (this.settings.removeActions) ? lines.map(line => line.split(this.regex.action).join('')): lines; | ||
if(this.settings.removeAnnotations) { | ||
//Remove annotations | ||
.map(line => line.split(this.regex.annotation).join('')); | ||
lines = lines.map(line => line.split(this.regex.annotation).join('')); | ||
} else if(this.settings.removeTimestamps) { | ||
//Remove timestamps | ||
lines = lines.map(line => line.split(this.regex.timestamp).join('')); | ||
} | ||
//Output object | ||
const output = {}; | ||
//Object containing the speakers and their lines | ||
output.speaker = {}; | ||
//List of the speakers, in order | ||
output.order = []; | ||
//Current speaker | ||
var speaker = 'none'; | ||
@@ -51,8 +63,10 @@ | ||
} | ||
if(!(speaker in output.speaker)) { | ||
if(!(speaker in output.speaker) && | ||
(!this.settings.removeUnknownSpeaker || speaker !== 'none')) { | ||
output.speaker[speaker] = []; | ||
} | ||
output.speaker[speaker].push(lines[i]); | ||
output.order.push(speaker); | ||
if(!this.settings.removeUnknownSpeaker || speaker !== 'none') { | ||
output.speaker[speaker].push(lines[i]); | ||
output.order.push(speaker); | ||
} | ||
} | ||
@@ -59,0 +73,0 @@ return output; |
{ | ||
"name": "transcript-parser", | ||
"version": "0.0.1", | ||
"version": "0.0.2", | ||
"description": "Parses plaintext speech/debate/radio transcripts into JavaScript objects.", | ||
@@ -5,0 +5,0 @@ "main": "app.js", |
transcript-parser | ||
================= | ||
[![Build Status](https://travis-ci.org/willshiao/transcript-parser.svg?branch=master)](https://travis-ci.org/willshiao/transcript-parser) | ||
Parses plaintext speech/debate/radio transcripts into JavaScript objects. | ||
## Description | ||
Parses plaintext speech/debate/radio transcripts into JavaScript objects. It is still in early development and is not stable. Pull requests are welcome. | ||
## Usage | ||
`npm install transcript-parser` | ||
const fs = require('fs'); | ||
const TranscriptParser = require('transcript-parser'); | ||
const tp = new TranscriptParser(); | ||
//Do not use readFileSync in production | ||
const output = tp.parseOne(fs.readFileSync('transcript.txt', {encoding: 'UTF-8'})); | ||
console.log(output); | ||
## Config | ||
The constructor for `TranscriptParser` accepts an options argument. | ||
Options: | ||
- removeActions | ||
+ default: `true` | ||
+ Specifies if the parser should remove actions (e.g. "(APPLAUSE)"). | ||
- removeAnnotations | ||
+ default: `true` | ||
+ Specifies if the parser should remove annotations (surrounded by `[]`). | ||
- removeTimestamps | ||
+ default: `true` | ||
+ **Overridden by `removeAnnotations`: when that option is true, timestamps are removed regardless of this setting.** | |
+ Specifies if the parser should remove timestamps (in the `[##:##:##]` format). | ||
- removeUnknownSpeaker | ||
+ default: `false` | ||
+ Specifies if the parser should remove lines that have no associated speaker. | ||
## Documentation | ||
### .parseOne() | ||
The `parseOne()` method parses a string and returns an object representing it. | ||
#### Syntax | ||
`tp.parseOne(transcript)` | |
##### Parameters | ||
- `transcript` | ||
- The transcript, as a `string`. |
@@ -9,4 +9,26 @@ "use strict"; | ||
describe('TranscriptParser', function() { | ||
const tp = new TranscriptParser(); | ||
describe('#parse()', function(){ | ||
describe('contructor', function() { | ||
it('should remove actions by default', function() { | ||
const tp = new TranscriptParser(); | ||
var result = tp.parseOne('PERSON A: Hello, (PAUSES) (DRINKS WATER) my name is Bob.(APPLAUSE)'); | ||
result.speaker.should.eql({ | ||
'PERSON A': [ | ||
'Hello, my name is Bob.' | ||
] | ||
}); | ||
}); | ||
it('should respect the removeActions setting', function() { | ||
const tp = new TranscriptParser({removeActions: false}); | ||
var result = tp.parseOne('PERSON A: Hello, (PAUSES) (DRINKS WATER) my name is Bob.(APPLAUSE)'); | ||
result.speaker.should.eql({ | ||
'PERSON A': [ | ||
'Hello, (PAUSES) (DRINKS WATER) my name is Bob.(APPLAUSE)' | ||
] | ||
}); | ||
}); | ||
}); | ||
describe('#parseOne()', function(){ | ||
const tp = new TranscriptParser(); | ||
it('should parse a transcript with no errors', function(done) { | ||
@@ -13,0 +35,0 @@ fs.readFileAsync('test/transcripts/sample_1.txt', {encoding: 'UTF-8'}) |
"use strict"; | ||
const TranscriptParser = require('../app.js'); | ||
const chai = require('chai'); | ||
chai.should(); | ||
describe('TranscriptParser', function() { | ||
@@ -19,10 +19,6 @@ const transcriptParser = new TranscriptParser(); | ||
describe('.newLineOrAction', function() { | ||
it('should split newlines', function() { | ||
const testStr = 'a\nb\r\nc'; | ||
testStr.split(regex.newLineOrAction).should.eql(['a','b','c']); | ||
}); | ||
describe('.action', function() { | ||
it('should split actions', function() { | ||
const testStr = 'The(LOUD APPLAUSE)chicken(SILENCE)crossed(LAUGHTER)'; | ||
testStr.split(regex.newLineOrAction).should.eql(['The','chicken','crossed','']); | ||
const testStr = 'The (LOUD APPLAUSE) chicken (SILENCE) crossed (LAUGHTER)'; | ||
testStr.split(regex.action).should.eql(['The ','chicken ','crossed ','']); | ||
}); | ||
@@ -29,0 +25,0 @@ }); |
75676
150
56