Huge News!Announcing our $40M Series B led by Abstract Ventures.Learn More
Socket
Sign inDemoInstall
Socket

transcript-parser

Package Overview
Dependencies
Maintainers
1
Versions
13
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

transcript-parser - npm Package Compare versions

Comparing version 0.3.0 to 0.4.0

16

benchmark/benchmark.js

@@ -10,2 +10,3 @@ 'use strict';

const TranscriptParser = require('../app.js');
const Readable = require('stream').Readable;

@@ -18,3 +19,3 @@ const tp = new TranscriptParser();

var s = new Readable();
/***********************

@@ -48,5 +49,10 @@ * Benchmarks

console.log('Async Parse #3:', msg);
}).catch(e => console.error(e));
s.push(firstTranscript, 'utf8');
s.push(null);
console.log('Starting stream parse');
return timePromise(() => Promise.fromCallback(cb => tp.parseStream(s, cb)) );
}).then(msg => {
console.log('Stream Parse #1:', msg);
}).catch(e => console.error(e.stack));
/***********************

@@ -90,7 +96,7 @@ * Functions

function readTranscript(name) {
return fs.readFileAsync(__dirname + '/transcripts/'+name+'.txt', {encoding: 'UTF-8'});
return fs.readFileAsync(__dirname + '/transcripts/'+name+'.txt', {encoding: 'utf8'});
}
function readTranscriptSync(name) {
return fs.readFileSync(__dirname + '/transcripts/'+name+'.txt', {encoding: 'UTF-8'});
return fs.readFileSync(__dirname + '/transcripts/'+name+'.txt', {encoding: 'utf8'});
}

@@ -8,2 +8,3 @@ 'use strict';

const Promise = require('bluebird');
const byline = require('byline');

@@ -21,12 +22,13 @@

removeUnknownSpeakers: false,
aliases: {}
aliases: {},
regex: {
newLine: /\r?\n/,
action: /\([A-Z\ ]+\)\ ?/,
speaker: /^((?:\[\d{1,2}:\d{1,2}:\d{1,2}\]\ ?)?[A-Z\d\ \/,.\-\(\)]+?)(?:\ ?\[[A-z\ ]+\])? ?:\ ?/,
timestamp: /\ ?\[\d{1,2}:\d{1,2}:\d{1,2}\]\ ?/,
annotation: /\[.+?\]\ ?/
}
};
this.settings = _.assign(this.defaultSettings, options);
this.regex = {
newLine: /\r?\n/,
action: /\([A-Z\ ]+\)\ ?/,
speaker: /^((?:\[\d{1,2}:\d{1,2}:\d{1,2}\]\ ?)?[A-Z\d\ \/,.\-\(\)]+)(?: \[.+\])?:\ ?/,
timestamp: /\[\d{1,2}:\d{1,2}:\d{1,2}\]\ ?/,
annotation: /\[.+?\]\ ?/
};
this.regex = this.settings.regex;
};

@@ -45,10 +47,11 @@

.filter(line => line.length > 0); //Remove blank lines
lines = (this.settings.removeActions) ? lines.map(line => line.split(this.regex.action).join('')): lines;
lines = this.settings.removeActions ? lines.map(line => removeAll(line, this.regex.action)): lines;
if(this.settings.removeAnnotations) {
//Remove annotations
lines = lines.map(line => line.split(this.regex.annotation).join(''));
lines = lines.map(line => removeAll(line, this.regex.annotation));
} else if(this.settings.removeTimestamps) {
//Remove timestamps
lines = lines.map(line => line.split(this.regex.timestamp).join(''));
lines = lines.map(line => removeAll(line, this.regex.timestamp));
}
lines = lines.filter(line => line.length > 0); //Remove newly blank lines

@@ -67,4 +70,4 @@ //Output object

if(lines[i].match(this.regex.speaker)) {
//Regex match
speaker = this.regex.speaker.exec(lines[i])[1];
//Regex match - is speaker
speaker = this.regex.speaker.exec(lines[i])[1].trim();
//Remove the speaker from the line

@@ -110,3 +113,3 @@ lines[i] = lines[i].replace(this.regex.speaker, '');

if(this.settings.removeActions) {
return Promise.map(lines, line => line.split(this.regex.action).join(''));
return Promise.map(lines, line => removeAll(line, this.regex.action));
}

@@ -117,13 +120,18 @@ return Promise.resolve(lines);

//Remove annotations
return Promise.map(lines, line => line.split(this.regex.annotation).join(''));
return Promise.map(lines, line => removeAll(line, this.regex.annotation));
} else if(this.settings.removeTimestamps) {
//Remove timestamps
return Promise.map(lines, line => line.split(this.regex.timestamp).join(''));
return Promise.map(lines, line => removeAll(line, this.regex.timestamp));
}
return Promise.resolve(lines);
}).then(lines => {
})
.then(lines => {
//Remove newly blank lines
return Promise.filter(lines, line => line.length > 0);
})
.then(lines => {
return Promise.each(lines, (line) => {
if(line.match(this.regex.speaker)) {
//Regex match
speaker = this.regex.speaker.exec(line)[1];
speaker = this.regex.speaker.exec(line)[1].trim();
//Remove the speaker from the line

@@ -258,1 +266,62 @@ line = line.replace(this.regex.speaker, '');

};
//Parses a transcript from a readable stream, one line at a time.
//  inputStream - the stream to read; it is split into lines with byline
//  cb(err, output) - called exactly once: with the error if either stream
//                    emits 'error', otherwise with the output object once
//                    the line stream has ended
//The output object has two keys:
//  speaker - maps each speaker name to an array of that speaker's lines
//  order   - the speaker name of every stored line, in input order
proto.parseStream = function(inputStream, cb) {
  const stream = byline.createStream(inputStream);
  //Output object: speakers with their lines, and the speakers in order
  const output = {speaker: {}, order: []};
  //Lines seen before any speaker prefix are attributed to 'none'
  let speaker = 'none';
  let finished = false;

  //Guarantees the callback fires only once, whichever event arrives first
  const finish = err => {
    if(finished) return;
    finished = true;
    if(err) return cb(err);
    cb(null, output);
  };

  //Errors are listened for on both streams and reported through the callback
  inputStream.on('error', finish);
  stream.on('error', finish);
  //Completion is signalled by 'end' rather than by a null read
  stream.on('end', () => finish(null));

  stream.on('readable', () => {
    let line;
    //Drain everything that is currently buffered; a null read only means
    //that nothing more is available right now
    while((line = stream.read()) !== null) {
      let filteredLine = this.filterLine(line);
      //Skip lines that were completely removed by the filters
      if(!filteredLine) continue;

      if(filteredLine.match(this.regex.speaker)) {
        //Regex match - is speaker
        speaker = this.regex.speaker.exec(filteredLine)[1].trim();
        //Remove the speaker from the line
        filteredLine = filteredLine.replace(this.regex.speaker, '');
      }

      //Drop the line if the speaker is unknown and unknown speakers are removed
      if(speaker === 'none' && this.settings.removeUnknownSpeakers) continue;

      //If the speaker's key doesn't already exist, start an empty list
      if(!(speaker in output.speaker)) {
        output.speaker[speaker] = [];
      }
      //Add the text to the speaker's list and the speaker name to the order array
      output.speaker[speaker].push(filteredLine);
      output.order.push(speaker);
    }
  });
};
//Applies the configured removal settings to a single line.
//Non-string input is converted with toString() first.
//Actions are stripped when settings.removeActions is set. After that either
//annotations (settings.removeAnnotations) or, failing that, timestamps
//(settings.removeTimestamps) are stripped.
//Returns the remaining text, or null when nothing is left of the line.
proto.filterLine = function(line) {
  let text = (typeof line === 'string') ? line : line.toString();

  if(this.settings.removeActions) {
    text = removeAll(text, this.regex.action);
  }

  if(this.settings.removeAnnotations) {
    text = removeAll(text, this.regex.annotation);
  } else if(this.settings.removeTimestamps) {
    text = removeAll(text, this.regex.timestamp);
  }

  return (text.length <= 0) ? null : text;
};
//Removes every match of `regex` from `text` and returns the result.
//  text  - the string to clean
//  regex - a RegExp (its global flag is not required), or a literal
//          string separator
//A global replace is used for RegExp patterns instead of split/join,
//because split() inserts the text of capture groups into its result,
//which would leave captured text in the output.
function removeAll(text, regex) {
  if(!(regex instanceof RegExp)) {
    //Literal separators have no capture groups, so split/join is safe
    return text.split(regex).join('');
  }
  //Rebuild the pattern with the global flag, keeping the matching-mode flags
  const flags = 'g' +
    (regex.ignoreCase ? 'i' : '') +
    (regex.multiline ? 'm' : '') +
    (regex.dotAll ? 's' : '') +
    (regex.unicode ? 'u' : '');
  return text.replace(new RegExp(regex.source, flags), '');
}
{
"name": "transcript-parser",
"version": "0.3.0",
"version": "0.4.0",
"description": "Parses plaintext speech/debate/radio transcripts into JavaScript objects.",

@@ -38,4 +38,5 @@ "main": "app.js",

"bluebird": "^3.3.4",
"byline": "^4.2.1",
"lodash": "^4.9.0"
}
}

@@ -18,5 +18,21 @@ transcript-parser

//Do not use fs.readFileSync in production
const output = tp.parseOneSync(fs.readFileSync('transcript.txt', {encoding: 'UTF-8'}));
console.log(output);
//Synchronous example
const parsed = tp.parseOneSync(fs.readFileSync('transcript.txt', {encoding: 'UTF-8'}));
console.log(parsed);
//Asynchronous example
//NOTE(review): readFile is called without an encoding, so `data` is a Buffer - confirm parseOne() accepts Buffers
fs.readFile('transcript.txt', (err, data) => {
  if(err) return console.error('Error:', err);
  //The callback takes (err, parsed); both names must be inside the parameter list
  tp.parseOne(data, (err, parsed) => {
    if(err) return console.error('Error:', err);
    console.log(parsed);
  });
});
//Stream example
const stream = fs.createReadStream('transcript.txt', 'utf8');
tp.parseStream(stream, (err, parsed) => {
if(err) return console.error('Error:', err);
console.log(parsed);
});

@@ -52,2 +68,20 @@

### .parseStream()
The `parseStream()` method parses a [`Stream`](https://nodejs.org/api/stream.html) and passes an object representing it to the callback.
This is the preferred method for parsing streams asynchronously as it doesn't have to load the entire transcript into memory (unlike `parseOne()`).
#### Syntax
`tp.parseStream(stream, callback)`
##### Parameters
- `stream`
+ The `Readable` stream to read.
- `callback(err, data)`
+ A callback to be executed on function completion or error.
### .parseOneSync()

@@ -80,3 +114,3 @@

- `callback(err, data)`
+ A callback to be exectuted on function completion.
+ A callback to be executed on function completion or error.

@@ -115,3 +149,3 @@

- `callback(err, resolved)`
+ A callback to be executed on function completion.
+ A callback to be executed on function completion or error.

@@ -21,2 +21,22 @@ 'use strict';

//Tests for the stream-based parseStream method
describe('#parseStream()', function() {
  const tp = new TranscriptParser();

  //Parses sample 1 through a file stream and compares the result with the
  //expected JSON. The promise is returned so that mocha reports rejections.
  it('should parse a transcript correctly', function() {
    //Holds the parsed output between promise steps
    let actual;
    //Reading the sample first rejects with a read error if the file is missing
    return readSample(1)
      .then(() => {
        const stream = fs.createReadStream(path.join(TEST_DIR, '1.txt'), 'utf8');
        return Promise.fromCallback(cb => tp.parseStream(stream, cb));
      })
      .then(result => {
        actual = result;
        return readExpected(1);
      })
      .then(expected => {
        actual.should.be.eql(JSON.parse(expected));
      });
  });
});
/*

@@ -61,3 +81,3 @@ * For the synchronous parseOne method

const parser = new TranscriptParser({removeAnnotations: false, removeTimestamps: true});
var result = parser.parseOneSync('[20:20:34] BERMAN: [2:1:41] The [first] name...');
var result = parser.parseOneSync('[20:20:34] BERMAN [2:1:41] : The [first] name...');
result.speaker.should.eql({

@@ -146,3 +166,3 @@ 'BERMAN': [

const parser = new TranscriptParser({removeAnnotations: false, removeTimestamps: true});
parser.parseOne('[20:20:34] BERMAN: [2:1:41] The [first] name...',
parser.parseOne('[20:20:34] BERMAN: The [first] name...',
(err, result) => {

@@ -345,7 +365,7 @@ if(err) return done(err);

function readSample(sampleName) {
return fs.readFileAsync(path.join(TEST_DIR, sampleName+'.txt'), {encoding: 'UTF-8'});
return fs.readFileAsync(path.join(TEST_DIR, sampleName+'.txt'), {encoding: 'utf8'});
}
function readExpected(expectedName) {
return fs.readFileAsync(path.join(EXPECTED_DIR, expectedName+'.txt'), {encoding: 'UTF-8'});
return fs.readFileAsync(path.join(EXPECTED_DIR, expectedName+'.txt'), {encoding: 'utf8'});
}

@@ -60,3 +60,3 @@ 'use strict';

'[20:20:34] BERMAN: [2:1:41] The...'.split(regex.timestamp).join('')
.should.equal('BERMAN: The...');
.should.equal('BERMAN:The...');
});

@@ -63,0 +63,0 @@ });

Sorry, the diff of this file is too big to display

Sorry, the diff of this file is too big to display

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc