Huge News! Announcing our $40M Series B led by Abstract Ventures. Learn More
Socket
Sign in | Demo | Install
Socket

@conscia/tika

Package Overview
Dependencies
Maintainers
3
Versions
21
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

@conscia/tika - npm Package Compare versions

Comparing version 1.7.5 to 1.8.0

.editorconfig

2

package.json
{
"name": "@conscia/tika",
"version": "1.7.5",
"version": "1.8.0",
"description": "Apache Tika bridge. Text extraction, metadata extraction, mimetype detection and language detection.",

@@ -5,0 +5,0 @@ "scripts": {

@@ -6,482 +6,482 @@ /*jshint node:true*/

var assert = require('assert');
var tika = require('../');
const assert = require('assert');
const tika = require('../');
suite('document tests', function() {
test('detect txt content-type', function(done) {
tika.type('test/data/file.txt', function(err, contentType) {
assert.ifError(err);
assert.equal(typeof contentType, 'string');
assert.equal(contentType, 'text/plain');
done();
});
});
suite('document tests', () => {
test('detect txt content-type', done => {
tika.type('test/data/file.txt', (err, contentType) => {
assert.ifError(err);
assert.equal(typeof contentType, 'string');
assert.equal(contentType, 'text/plain');
done();
});
});
test('detect txt content-type and charset', function(done) {
tika.typeAndCharset('test/data/file.txt', function(err, contentType) {
assert.ifError(err);
assert.equal(typeof contentType, 'string');
assert.equal(contentType, 'text/plain; charset=ISO-8859-1');
done();
});
});
test('detect txt content-type and charset', done => {
tika.typeAndCharset('test/data/file.txt', (err, contentType) => {
assert.ifError(err);
assert.equal(typeof contentType, 'string');
assert.equal(contentType, 'text/plain; charset=ISO-8859-1');
done();
});
});
test('extract from txt', function(done) {
tika.text('test/data/file.txt', function(err, text) {
assert.ifError(err);
assert.equal(typeof text, 'string');
assert.equal(text, 'Just some text.\n\n');
done();
});
});
test('extract from txt', done => {
tika.text('test/data/file.txt', (err, text) => {
assert.ifError(err);
assert.equal(typeof text, 'string');
assert.equal(text, 'Just some text.\n\n');
done();
});
});
test('extract meta from txt', function(done) {
tika.meta('test/data/file.txt', function(err, meta) {
assert.ifError(err);
assert.ok(meta);
assert.equal(typeof meta.resourceName[0], 'string');
assert.deepEqual(meta.resourceName, ['file.txt']);
assert.deepEqual(meta['Content-Type'], ['text/plain; charset=ISO-8859-1']);
assert.deepEqual(meta['Content-Encoding'], ['ISO-8859-1']);
done();
});
});
test('extract meta from txt', done => {
tika.meta('test/data/file.txt', (err, meta) => {
assert.ifError(err);
assert.ok(meta);
assert.equal(typeof meta.resourceName[0], 'string');
assert.deepEqual(meta.resourceName, ['file.txt']);
assert.deepEqual(meta['Content-Type'], ['text/plain; charset=ISO-8859-1']);
assert.deepEqual(meta['Content-Encoding'], ['ISO-8859-1']);
done();
});
});
test('extract meta and text from txt', function(done) {
tika.extract('test/data/file.txt', function(err, text, meta) {
assert.ifError(err);
assert.equal(typeof text, 'string');
assert.equal(text, 'Just some text.\n\n');
assert.ok(meta);
assert.equal(typeof meta.resourceName[0], 'string');
assert.deepEqual(meta.resourceName, ['file.txt']);
assert.deepEqual(meta['Content-Type'], ['text/plain; charset=ISO-8859-1']);
assert.deepEqual(meta['Content-Encoding'], ['ISO-8859-1']);
done();
});
});
test('extract meta and text from txt', done => {
tika.extract('test/data/file.txt', (err, text, meta) => {
assert.ifError(err);
assert.equal(typeof text, 'string');
assert.equal(text, 'Just some text.\n\n');
assert.ok(meta);
assert.equal(typeof meta.resourceName[0], 'string');
assert.deepEqual(meta.resourceName, ['file.txt']);
assert.deepEqual(meta['Content-Type'], ['text/plain; charset=ISO-8859-1']);
assert.deepEqual(meta['Content-Encoding'], ['ISO-8859-1']);
done();
});
});
test('extract from extensionless txt', function(done) {
tika.text('test/data/extensionless/txt', function(err, text) {
assert.ifError(err);
assert.equal(text, 'Just some text.\n\n');
done();
});
});
test('extract from extensionless txt', done => {
tika.text('test/data/extensionless/txt', (err, text) => {
assert.ifError(err);
assert.equal(text, 'Just some text.\n\n');
done();
});
});
test('extract from doc', function(done) {
tika.text('test/data/file.doc', function(err, text) {
assert.ifError(err);
assert.equal(text, 'Just some text.\n');
done();
});
});
test('extract from doc', done => {
tika.text('test/data/file.doc', (err, text) => {
assert.ifError(err);
assert.equal(text, 'Just some text.\n');
done();
});
});
test('extract meta from doc', function(done) {
tika.meta('test/data/file.doc', function(err, meta) {
assert.ifError(err);
assert.ok(meta);
assert.deepEqual(meta.resourceName, ['file.doc']);
assert.deepEqual(meta['Content-Type'], ['application/msword']);
assert.deepEqual(meta['dcterms:created'], ['2013-12-06T21:15:26Z']);
done();
});
});
test('extract meta from doc', done => {
tika.meta('test/data/file.doc', (err, meta) => {
assert.ifError(err);
assert.ok(meta);
assert.deepEqual(meta.resourceName, ['file.doc']);
assert.deepEqual(meta['Content-Type'], ['application/msword']);
assert.deepEqual(meta['dcterms:created'], ['2013-12-06T21:15:26Z']);
done();
});
});
test('extract from extensionless doc', function(done) {
tika.text('test/data/extensionless/doc', function(err, text) {
assert.ifError(err);
assert.equal(text, 'Just some text.\n');
done();
});
});
test('extract from extensionless doc', done => {
tika.text('test/data/extensionless/doc', (err, text) => {
assert.ifError(err);
assert.equal(text, 'Just some text.\n');
done();
});
});
test('extract from docx', function(done) {
tika.text('test/data/file.docx', function(err, text) {
assert.ifError(err);
assert.equal(text, 'Just some text.\n');
done();
});
});
test('extract from docx', done => {
tika.text('test/data/file.docx', (err, text) => {
assert.ifError(err);
assert.equal(text, 'Just some text.\n');
done();
});
});
test('extract meta from docx', function(done) {
tika.meta('test/data/file.docx', function(err, meta) {
assert.ifError(err);
assert.ok(meta);
assert.deepEqual(meta.resourceName, ['file.docx']);
assert.deepEqual(meta['Content-Type'], ['application/vnd.openxmlformats-officedocument.wordprocessingml.document']);
assert.deepEqual(meta['Application-Name'], ['LibreOffice/4.1.3.2$MacOSX_x86 LibreOffice_project/70feb7d99726f064edab4605a8ab840c50ec57a']);
done();
});
});
test('extract meta from docx', done => {
tika.meta('test/data/file.docx', (err, meta) => {
assert.ifError(err);
assert.ok(meta);
assert.deepEqual(meta.resourceName, ['file.docx']);
assert.deepEqual(meta['Content-Type'], ['application/vnd.openxmlformats-officedocument.wordprocessingml.document']);
assert.deepEqual(meta['Application-Name'], ['LibreOffice/4.1.3.2$MacOSX_x86 LibreOffice_project/70feb7d99726f064edab4605a8ab840c50ec57a']);
done();
});
});
test('extract from extensionless docx', function(done) {
tika.text('test/data/extensionless/docx', function(err, text) {
assert.ifError(err);
assert.equal(text, 'Just some text.\n');
done();
});
});
test('extract from extensionless docx', done => {
tika.text('test/data/extensionless/docx', (err, text) => {
assert.ifError(err);
assert.equal(text, 'Just some text.\n');
done();
});
});
test('extract meta from extensionless docx', function(done) {
tika.meta('test/data/extensionless/docx', function(err, meta) {
assert.ifError(err);
assert.ok(meta);
assert.deepEqual(meta.resourceName, ['docx']);
assert.deepEqual(meta['Content-Type'], ['application/vnd.openxmlformats-officedocument.wordprocessingml.document']);
assert.deepEqual(meta['Application-Name'], ['LibreOffice/4.1.3.2$MacOSX_x86 LibreOffice_project/70feb7d99726f064edab4605a8ab840c50ec57a']);
done();
});
});
test('extract meta from extensionless docx', done => {
tika.meta('test/data/extensionless/docx', (err, meta) => {
assert.ifError(err);
assert.ok(meta);
assert.deepEqual(meta.resourceName, ['docx']);
assert.deepEqual(meta['Content-Type'], ['application/vnd.openxmlformats-officedocument.wordprocessingml.document']);
assert.deepEqual(meta['Application-Name'], ['LibreOffice/4.1.3.2$MacOSX_x86 LibreOffice_project/70feb7d99726f064edab4605a8ab840c50ec57a']);
done();
});
});
test('extract from pdf', function(done) {
tika.text('test/data/file.pdf', function(err, text) {
assert.ifError(err);
assert.equal(text.trim(), 'Just some text.');
done();
});
});
test('extract from pdf', done => {
tika.text('test/data/file.pdf', (err, text) => {
assert.ifError(err);
assert.equal(text.trim(), 'Just some text.');
done();
});
});
test('detect content-type of pdf', function(done) {
tika.type('test/data/file.pdf', function(err, contentType) {
assert.ifError(err);
assert.equal(contentType, 'application/pdf');
done();
});
});
test('detect content-type of pdf', done => {
tika.type('test/data/file.pdf', (err, contentType) => {
assert.ifError(err);
assert.equal(contentType, 'application/pdf');
done();
});
});
test('extract meta from pdf', function(done) {
tika.meta('test/data/file.pdf', function(err, meta) {
assert.ifError(err);
assert.ok(meta);
assert.deepEqual(meta.resourceName, ['file.pdf']);
assert.deepEqual(meta['Content-Type'], ['application/pdf']);
assert.deepEqual(meta.producer, ['LibreOffice 4.1']);
done();
});
});
test('extract meta from pdf', done => {
tika.meta('test/data/file.pdf', (err, meta) => {
assert.ifError(err);
assert.ok(meta);
assert.deepEqual(meta.resourceName, ['file.pdf']);
assert.deepEqual(meta['Content-Type'], ['application/pdf']);
assert.deepEqual(meta.producer, ['LibreOffice 4.1']);
done();
});
});
test('extract from extensionless pdf', function(done) {
tika.text('test/data/extensionless/pdf', function(err, text) {
assert.ifError(err);
assert.equal(text.trim(), 'Just some text.');
done();
});
});
test('extract from extensionless pdf', done => {
tika.text('test/data/extensionless/pdf', (err, text) => {
assert.ifError(err);
assert.equal(text.trim(), 'Just some text.');
done();
});
});
test('extract meta from extensionless pdf', function(done) {
tika.meta('test/data/extensionless/pdf', function(err, meta) {
assert.ifError(err);
assert.ok(meta);
assert.deepEqual(meta.resourceName, ['pdf']);
assert.deepEqual(meta['Content-Type'], ['application/pdf']);
assert.deepEqual(meta.producer, ['LibreOffice 4.1']);
done();
});
});
test('extract meta from extensionless pdf', done => {
tika.meta('test/data/extensionless/pdf', (err, meta) => {
assert.ifError(err);
assert.ok(meta);
assert.deepEqual(meta.resourceName, ['pdf']);
assert.deepEqual(meta['Content-Type'], ['application/pdf']);
assert.deepEqual(meta.producer, ['LibreOffice 4.1']);
done();
});
});
test('extract from protected pdf', function(done) {
tika.text('test/data/protected/file.pdf', function(err, text) {
assert.ifError(err);
assert.equal(text.trim(), 'Just some text.');
done();
});
});
test('extract from protected pdf', done => {
tika.text('test/data/protected/file.pdf', (err, text) => {
assert.ifError(err);
assert.equal(text.trim(), 'Just some text.');
done();
});
});
test('extract meta from protected pdf', function(done) {
tika.meta('test/data/protected/file.pdf', function(err, meta) {
assert.ifError(err);
assert.ok(meta);
assert.deepEqual(meta.resourceName, ['file.pdf']);
assert.deepEqual(meta['Content-Type'], ['application/pdf']);
assert.deepEqual(meta.producer, ['LibreOffice 4.1']);
done();
});
});
test('extract meta from protected pdf', done => {
tika.meta('test/data/protected/file.pdf', (err, meta) => {
assert.ifError(err);
assert.ok(meta);
assert.deepEqual(meta.resourceName, ['file.pdf']);
assert.deepEqual(meta['Content-Type'], ['application/pdf']);
assert.deepEqual(meta.producer, ['LibreOffice 4.1']);
done();
});
});
});
suite('partial document extraction tests', function() {
test('extract from long txt', function(done) {
tika.text('test/data/big/file.txt', { maxLength: 10 }, function(err, text) {
assert.ifError(err);
assert.equal(text.length, 10);
assert.equal(text, 'Lorem ipsu');
done();
});
});
suite('partial document extraction tests', () => {
test('extract from long txt', done => {
tika.text('test/data/big/file.txt', { maxLength: 10 }, (err, text) => {
assert.ifError(err);
assert.equal(text.length, 10);
assert.equal(text, 'Lorem ipsu');
done();
});
});
test('extract from pdf', function(done) {
tika.text('test/data/file.pdf', { maxLength: 10 }, function(err, text) {
assert.ifError(err);
assert.equal(text.length, 10);
assert.equal(text.trim(), 'Just some');
done();
});
});
test('extract from pdf', done => {
tika.text('test/data/file.pdf', { maxLength: 10 }, (err, text) => {
assert.ifError(err);
assert.equal(text.length, 10);
assert.equal(text.trim(), 'Just some');
done();
});
});
});
suite('obscure document tests', function() {
test('extract from Word 2003 XML', function(done) {
tika.text('test/data/obscure/word2003.xml', function(err, text) {
assert.ifError(err);
assert.ok(-1 !== text.indexOf('Just some text.'));
assert.ok(-1 === text.indexOf('<?xml'));
done();
});
});
suite('obscure document tests', () => {
test('extract from Word 2003 XML', done => {
tika.text('test/data/obscure/word2003.xml', (err, text) => {
assert.ifError(err);
assert.ok(-1 !== text.indexOf('Just some text.'));
assert.ok(-1 === text.indexOf('<?xml'));
done();
});
});
});
suite('structured data tests', function() {
test('extract from plain XML', function(done) {
tika.text('test/data/structured/plain.xml', function(err, text) {
assert.ifError(err);
assert.ok(-1 !== text.indexOf('Just some text.'));
assert.ok(-1 === text.indexOf('<?xml'));
done();
});
});
suite('structured data tests', () => {
test('extract from plain XML', done => {
tika.text('test/data/structured/plain.xml', (err, text) => {
assert.ifError(err);
assert.ok(-1 !== text.indexOf('Just some text.'));
assert.ok(-1 === text.indexOf('<?xml'));
done();
});
});
});
suite('image tests', function() {
test('extract from png', function(done) {
tika.text('test/data/file.png', function(err, text) {
assert.ifError(err);
assert.equal(text, '');
done();
});
});
suite('image tests', () => {
test('extract from png', done => {
tika.text('test/data/file.png', (err, text) => {
assert.ifError(err);
assert.equal(text, '');
done();
});
});
test('extract from extensionless png', function(done) {
tika.text('test/data/extensionless/png', function(err, text) {
assert.ifError(err);
assert.equal(text, '');
done();
});
});
test('extract from extensionless png', done => {
tika.text('test/data/extensionless/png', (err, text) => {
assert.ifError(err);
assert.equal(text, '');
done();
});
});
test('extract from gif', function(done) {
tika.text('test/data/file.gif', function(err, text) {
assert.ifError(err);
assert.equal(text, '');
done();
});
});
test('extract from gif', done => {
tika.text('test/data/file.gif', (err, text) => {
assert.ifError(err);
assert.equal(text, '');
done();
});
});
test('extract meta from gif', function(done) {
tika.meta('test/data/file.gif', function(err, meta) {
assert.ifError(err);
assert.ok(meta);
assert.deepEqual(meta.resourceName, ['file.gif']);
assert.deepEqual(meta['Content-Type'], ['image/gif']);
assert.deepEqual(meta['Dimension ImageOrientation'], ['Normal']);
done();
});
});
test('extract meta from gif', done => {
tika.meta('test/data/file.gif', (err, meta) => {
assert.ifError(err);
assert.ok(meta);
assert.deepEqual(meta.resourceName, ['file.gif']);
assert.deepEqual(meta['Content-Type'], ['image/gif']);
assert.deepEqual(meta['Dimension ImageOrientation'], ['Normal']);
done();
});
});
test('extract from extensionless gif', function(done) {
tika.text('test/data/extensionless/gif', function(err, text) {
assert.ifError(err);
assert.equal(text, '');
done();
});
});
test('extract from extensionless gif', done => {
tika.text('test/data/extensionless/gif', (err, text) => {
assert.ifError(err);
assert.equal(text, '');
done();
});
});
test('extract meta from extensionless gif', function(done) {
tika.meta('test/data/extensionless/gif', function(err, meta) {
assert.ifError(err);
assert.ok(meta);
assert.deepEqual(meta.resourceName, ['gif']);
assert.deepEqual(meta['Content-Type'], ['image/gif']);
assert.deepEqual(meta['Dimension ImageOrientation'], ['Normal']);
done();
});
});
test('extract meta from extensionless gif', done => {
tika.meta('test/data/extensionless/gif', (err, meta) => {
assert.ifError(err);
assert.ok(meta);
assert.deepEqual(meta.resourceName, ['gif']);
assert.deepEqual(meta['Content-Type'], ['image/gif']);
assert.deepEqual(meta['Dimension ImageOrientation'], ['Normal']);
done();
});
});
});
suite('non-utf8 encoded document tests', function() {
test('extract Windows Latin 1 text', function(done) {
tika.text('test/data/nonutf8/windows-latin1.txt', function(err, text) {
assert.ifError(err);
assert.equal(text, 'Algún pequeño trozo de texto.\n\n');
done();
});
});
suite('non-utf8 encoded document tests', () => {
test('extract Windows Latin 1 text', done => {
tika.text('test/data/nonutf8/windows-latin1.txt', (err, text) => {
assert.ifError(err);
assert.equal(text, 'Algún pequeño trozo de texto.\n\n');
done();
});
});
test('detect Windows Latin 1 text charset', function(done) {
tika.charset('test/data/nonutf8/windows-latin1.txt', function(err, charset) {
assert.ifError(err);
assert.equal(typeof charset, 'string');
assert.equal(charset, 'ISO-8859-1');
done();
});
});
test('detect Windows Latin 1 text charset', done => {
tika.charset('test/data/nonutf8/windows-latin1.txt', (err, charset) => {
assert.ifError(err);
assert.equal(typeof charset, 'string');
assert.equal(charset, 'ISO-8859-1');
done();
});
});
test('detect Windows Latin 1 text content-type and charset', function(done) {
tika.typeAndCharset('test/data/nonutf8/windows-latin1.txt', function(err, contentType) {
assert.ifError(err);
assert.equal(contentType, 'text/plain; charset=ISO-8859-1');
done();
});
});
test('detect Windows Latin 1 text content-type and charset', done => {
tika.typeAndCharset('test/data/nonutf8/windows-latin1.txt', (err, contentType) => {
assert.ifError(err);
assert.equal(contentType, 'text/plain; charset=ISO-8859-1');
done();
});
});
test('extract UTF-16 English-language text', function(done) {
tika.text('test/data/nonutf8/utf16-english.txt', function(err, text) {
assert.ifError(err);
assert.equal(text, 'Just some text.\n\n');
done();
});
});
test('extract UTF-16 English-language text', done => {
tika.text('test/data/nonutf8/utf16-english.txt', (err, text) => {
assert.ifError(err);
assert.equal(text, 'Just some text.\n\n');
done();
});
});
test('detect UTF-16 English-language text charset', function(done) {
tika.charset('test/data/nonutf8/utf16-english.txt', function(err, charset) {
assert.ifError(err);
assert.equal(charset, 'UTF-16LE');
done();
});
});
test('detect UTF-16 English-language text charset', done => {
tika.charset('test/data/nonutf8/utf16-english.txt', (err, charset) => {
assert.ifError(err);
assert.equal(charset, 'UTF-16LE');
done();
});
});
test('detect UTF-16 English-language text content-type and charset', function(done) {
tika.typeAndCharset('test/data/nonutf8/utf16-english.txt', function(err, contentType) {
assert.ifError(err);
assert.equal(contentType, 'text/plain; charset=UTF-16LE');
done();
});
});
test('detect UTF-16 English-language text content-type and charset', done => {
tika.typeAndCharset('test/data/nonutf8/utf16-english.txt', (err, contentType) => {
assert.ifError(err);
assert.equal(contentType, 'text/plain; charset=UTF-16LE');
done();
});
});
test('extract UTF-16 Chinese (Simplified) text', function(done) {
tika.text('test/data/nonutf8/utf16-chinese.txt', function(err, text) {
assert.ifError(err);
assert.equal(text, '\u53ea\u662f\u4e00\u4e9b\u6587\u5b57\u3002\n\n');
done();
});
});
test('extract UTF-16 Chinese (Simplified) text', done => {
tika.text('test/data/nonutf8/utf16-chinese.txt', (err, text) => {
assert.ifError(err);
assert.equal(text, '\u53ea\u662f\u4e00\u4e9b\u6587\u5b57\u3002\n\n');
done();
});
});
test('detect UTF-16 Chinese (Simplified) text charset', function(done) {
tika.charset('test/data/nonutf8/utf16-chinese.txt', function(err, charset) {
assert.ifError(err);
assert.equal(charset, 'UTF-16LE');
done();
});
});
test('detect UTF-16 Chinese (Simplified) text charset', done => {
tika.charset('test/data/nonutf8/utf16-chinese.txt', (err, charset) => {
assert.ifError(err);
assert.equal(charset, 'UTF-16LE');
done();
});
});
test('detect UTF-16 Chinese (Simplified) text content-type and charset', function(done) {
tika.typeAndCharset('test/data/nonutf8/utf16-chinese.txt', function(err, contentType) {
assert.ifError(err);
assert.equal(contentType, 'text/plain; charset=UTF-16LE');
done();
});
});
test('detect UTF-16 Chinese (Simplified) text content-type and charset', done => {
tika.typeAndCharset('test/data/nonutf8/utf16-chinese.txt', (err, contentType) => {
assert.ifError(err);
assert.equal(contentType, 'text/plain; charset=UTF-16LE');
done();
});
});
});
suite('archive tests', function() {
test('extract from compressed archive', function(done) {
tika.text('test/data/archive/files.zip', function(err, text) {
assert.ifError(err);
assert.equal(text.trim(), 'file1.txt\nSome text 1.\n\n\n\n\nfile2.txt\nSome text 2.\n\n\n\n\nfile3.txt\nSome text 3.');
done();
});
});
suite('archive tests', () => {
test('extract from compressed archive', done => {
tika.text('test/data/archive/files.zip', (err, text) => {
assert.ifError(err);
assert.equal(text.trim(), 'file1.txt\nSome text 1.\n\n\n\n\nfile2.txt\nSome text 2.\n\n\n\n\nfile3.txt\nSome text 3.');
done();
});
});
test('extract from compressed zlib archive', function(done) {
tika.text('test/data/archive/files.zlib', function(err, text) {
assert.ifError(err);
assert.equal(text.trim(), 'files\nSome text 1.\nSome text 2.\nSome text 3.');
done();
});
});
test('extract from compressed zlib archive', done => {
tika.text('test/data/archive/files.zlib', (err, text) => {
assert.ifError(err);
assert.equal(text.trim(), 'files\nSome text 1.\nSome text 2.\nSome text 3.');
done();
});
});
test('detect compressed archive content-type', function(done) {
tika.type('test/data/archive/files.zip', function(err, contentType) {
assert.ifError(err);
assert.equal(contentType, 'application/zip');
done();
});
});
test('detect compressed archive content-type', done => {
tika.type('test/data/archive/files.zip', (err, contentType) => {
assert.ifError(err);
assert.equal(contentType, 'application/zip');
done();
});
});
test('extract from twice compressed archive', function(done) {
tika.text('test/data/archive/files-files.zip', function(err, text) {
assert.ifError(err);
assert.equal(text.trim(), 'file4.txt\nSome text 4.\n\n\n\n\nfile5.txt\nSome text 5.\n\n\n\n\nfile6.txt\nSome text 6.\n\n\n\n\nfiles.zip\n\n\nfile1.txt\n\nSome text 1.\n\n\n\n\n\n\n\nfile2.txt\n\nSome text 2.\n\n\n\n\n\n\n\nfile3.txt\n\nSome text 3.');
done();
});
});
test('extract from twice compressed archive', done => {
tika.text('test/data/archive/files-files.zip', (err, text) => {
assert.ifError(err);
assert.equal(text.trim(), 'file4.txt\nSome text 4.\n\n\n\n\nfile5.txt\nSome text 5.\n\n\n\n\nfile6.txt\nSome text 6.\n\n\n\n\nfiles.zip\n\n\nfile1.txt\n\nSome text 1.\n\n\n\n\n\n\n\nfile2.txt\n\nSome text 2.\n\n\n\n\n\n\n\nfile3.txt\n\nSome text 3.');
done();
});
});
});
suite('encrypted doc tests', function() {
test('detect encrypted pdf content-type', function(done) {
tika.type('test/data/encrypted/file.pdf', function(err, contentType) {
assert.ifError(err);
assert.equal(contentType, 'application/pdf');
done();
});
});
suite('encrypted doc tests', () => {
test('detect encrypted pdf content-type', done => {
tika.type('test/data/encrypted/file.pdf', (err, contentType) => {
assert.ifError(err);
assert.equal(contentType, 'application/pdf');
done();
});
});
test('detect encrypted doc content-type', function(done) {
tika.type('test/data/encrypted/file.doc', function(err, contentType) {
assert.ifError(err);
assert.equal(contentType, 'application/msword');
done();
});
});
test('detect encrypted doc content-type', done => {
tika.type('test/data/encrypted/file.doc', (err, contentType) => {
assert.ifError(err);
assert.equal(contentType, 'application/msword');
done();
});
});
test('specify password to decrypt document', function(done) {
tika.text('test/data/encrypted/file.pdf', {
password: 'password'
}, function(err, text) {
assert.ifError(err);
assert.equal(text.trim(), 'Just some text.');
done();
});
});
test('specify password to decrypt document', done => {
tika.text('test/data/encrypted/file.pdf', {
password: 'password'
}, (err, text) => {
assert.ifError(err);
assert.equal(text.trim(), 'Just some text.');
done();
});
});
});
suite('error handling tests', function() {
test('extract from encrypted doc', function(done) {
tika.text('test/data/encrypted/file.doc', function(err, text) {
assert.ok(err);
assert.ok(-1 !== err.toString().indexOf('EncryptedDocumentException: Unable to process: document is encrypted'));
done();
});
});
suite('error handling tests', () => {
test('extract from encrypted doc', done => {
tika.text('test/data/encrypted/file.doc', err => {
assert.ok(err);
assert.ok(-1 !== err.toString().indexOf('EncryptedDocumentException: Unable to process: document is encrypted'));
done();
});
});
test('extract from encrypted pdf', function(done) {
tika.text('test/data/encrypted/file.pdf', function(err, text) {
assert.ok(err);
assert.ok(-1 !== err.toString().indexOf('Unable to process: document is encrypted'));
done();
});
});
test('extract from encrypted pdf', done => {
tika.text('test/data/encrypted/file.pdf', err => {
assert.ok(err);
assert.ok(-1 !== err.toString().indexOf('Unable to process: document is encrypted'));
done();
});
});
});
suite('http extraction tests', function() {
test('extract from pdf over http', function(done) {
tika.text('https://tools.ietf.org/pdf/rfc2324.pdf', function(err, text) {
assert.ifError(err);
assert.ok(-1 !== text.indexOf('Hyper Text Coffee Pot Control Protocol'));
done();
});
});
suite('http extraction tests', () => {
test('extract from pdf over http', done => {
tika.text('https://tools.ietf.org/pdf/rfc2324.pdf', (err, text) => {
assert.ifError(err);
assert.ok(-1 !== text.indexOf('Hyper Text Coffee Pot Control Protocol'));
done();
});
});
});
suite('ftp extraction tests', function() {
test('extract from text file over ftp', function(done) {
this.timeout(10000);
tika.text('ftp://ftp.ietf.org/rfc/rfc959.txt', function(err, text) {
assert.ifError(err);
assert.ok(-1 !== text.indexOf('FILE TRANSFER PROTOCOL'));
done();
});
});
suite('ftp extraction tests', function () {
test('extract from text file over ftp', function (done) {
this.timeout(10000);
tika.text('ftp://ftp.ietf.org/rfc/rfc959.txt', (err, text) => {
assert.ifError(err);
assert.ok(-1 !== text.indexOf('FILE TRANSFER PROTOCOL'));
done();
});
});
});
suite('language detection tests', function() {
test('detect English text', function(done) {
tika.language('This just some text in English.', function(err, language, reasonablyCertain) {
assert.ifError(err);
assert.equal(typeof language, 'string');
assert.equal(typeof reasonablyCertain, 'boolean');
assert.equal(language, 'en');
done();
});
});
suite('language detection tests', () => {
test('detect English text', done => {
tika.language('This just some text in English.', (err, language, reasonablyCertain) => {
assert.ifError(err);
assert.equal(typeof language, 'string');
assert.equal(typeof reasonablyCertain, 'boolean');
assert.equal(language, 'en');
done();
});
});
});

@@ -12,3 +12,3 @@ /**

var java = require('java');
const java = require('java');

@@ -19,90 +19,102 @@ java.classpath.push(__dirname + '/jar/node-tika-1.19.jar');

var NodeTika = java.import('org.icij.nodetika.NodeTika');
const NodeTika = java.import('org.icij.nodetika.NodeTika');
exports.extract = function(uri, options, cb) {
if (arguments.length < 3) {
cb = options;
options = null;
}
const extract = function (uri, options, cb) {
if (arguments.length < 3) {
cb = options;
options = null;
}
exports.text(uri, options, function(err, text) {
if (err) {
return cb(err);
}
text(uri, options, function (err, text) {
if (err) {
return cb(err);
}
exports.meta(uri, options, function(err, meta) {
cb(err, text, meta);
});
});
meta(uri, options, function (err, meta) {
cb(err, text, meta);
});
});
};
exports.text = function(uri, options, cb) {
if (arguments.length < 3) {
cb = options;
options = null;
}
const text = function (uri, options, cb) {
if (arguments.length < 3) {
cb = options;
options = null;
}
NodeTika.extractText(uri, JSON.stringify(options), cb);
NodeTika.extractText(uri, JSON.stringify(options), cb);
};
exports.xhtml = function(uri, options, cb) {
if (arguments.length < 3) {
cb = options;
options = null;
}
const xhtml = function (uri, options, cb) {
if (arguments.length < 3) {
cb = options;
options = null;
}
NodeTika.extractXml(uri, 'html', JSON.stringify(options), cb);
NodeTika.extractXml(uri, 'html', JSON.stringify(options), cb);
};
exports.meta = function(uri, options, cb) {
var handler = function(err, meta) {
if (err) {
return cb(err);
}
const meta = function (uri, options, cb) {
const handler = function (err, meta) {
if (err) {
return cb(err);
}
cb(null, JSON.parse(meta));
};
cb(null, JSON.parse(meta));
};
if (arguments.length < 3) {
cb = options;
options = null;
}
if (arguments.length < 3) {
cb = options;
options = null;
}
if (options) {
NodeTika.extractMeta(uri, options.contentType, handler);
} else {
NodeTika.extractMeta(uri, handler);
}
if (options) {
NodeTika.extractMeta(uri, options.contentType, handler);
} else {
NodeTika.extractMeta(uri, handler);
}
};
exports.type = exports.contentType = function(uri, cb) {
NodeTika.detectContentType(uri, cb);
const type = function (uri, cb) {
NodeTika.detectContentType(uri, cb);
};
exports.charset = function(uri, options, cb) {
if (arguments.length < 3) {
cb = options;
options = null;
}
const charset = function (uri, options, cb) {
if (arguments.length < 3) {
cb = options;
options = null;
}
if (options) {
NodeTika.detectCharset(uri, options.contentType, cb);
} else {
NodeTika.detectCharset(uri, cb);
}
if (options) {
NodeTika.detectCharset(uri, options.contentType, cb);
} else {
NodeTika.detectCharset(uri, cb);
}
};
exports.typeAndCharset = function(uri, cb) {
NodeTika.detectContentTypeAndCharset(uri, cb);
const typeAndCharset = function (uri, cb) {
NodeTika.detectContentTypeAndCharset(uri, cb);
};
exports.language = function(text, cb) {
NodeTika.detectLanguage(text, function(err, language) {
if (err) {
cb(err);
} else {
language = JSON.parse(language);
cb(null, language.language, language.reasonablyCertain);
}
});
const language = function (text, cb) {
NodeTika.detectLanguage(text, function (err, language) {
if (err) {
cb(err);
} else {
language = JSON.parse(language);
cb(null, language.language, language.reasonablyCertain);
}
});
};
module.exports = {
extract,
charset,
typeAndCharset,
language,
meta,
contentType: type,
type,
text,
xhtml
};

@@ -1,2 +0,2 @@

var java = require('java');
const java = require('java');

@@ -6,5 +6,5 @@ java.classpath.push('commons-lang3-3.1.jar');

var System = java.import('java.lang.System');
const System = java.import('java.lang.System');
console.log('Java version: ' + System.getPropertySync('java.version'));
console.log('Java home: ' + System.getPropertySync('java.home'));
console.log(`Java version: ${System.getPropertySync('java.version')}`);
console.log(`Java home: ${System.getPropertySync('java.home')}`);
Socket | Socket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc