Socket
Socket
Sign inDemoInstall

html-to-text

Package Overview
Dependencies
Maintainers
1
Versions
55
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

html-to-text - npm Package Compare versions

Comparing version 3.1.0 to 3.2.0

.codeclimate.yml

38

bin/cli.js

@@ -7,8 +7,8 @@ #!/usr/bin/env node

// Parse the command-line flags. `tables` is always read as a string (it is
// interpreted later by interpretTables); the remaining flags get explicit
// defaults.
var argv = optimist
  .string('tables')
  .default('wordwrap', 80)
  .default('ignore-href', false)
  .default('ignore-image', false)
  .default('noLinkBrackets', false)
  .argv;

@@ -22,21 +22,21 @@ var text = '';

// Buffer everything that arrives on stdin; each chunk is appended exactly once.
process.stdin.on('data', function data(data) {
  text += data;
});

// When stdin is exhausted, convert the buffered HTML with the options taken
// from the command line and write the result, followed by a newline, to stdout.
process.stdin.on('end', function end() {
  text = htmlToText.fromString(text, {
    tables: interpretTables(argv.tables),
    wordwrap: argv.wordwrap,
    ignoreHref: argv['ignore-href'],
    ignoreImage: argv['ignore-image'],
    noLinkBrackets: argv['noLinkBrackets']
  });
  process.stdout.write(text + '\n', 'utf-8');
});
/**
 * Translate the raw `--tables` command-line value into the `tables` option.
 *
 * @param {string|undefined} tables Raw flag value.
 * @returns {boolean|string[]} `[]` when the flag is missing, empty or
 *   'false'; `true` when it is 'true'; otherwise the value split on commas.
 */
function interpretTables(tables) {
  // `!tables` already covers undefined and the empty string.
  if (!tables || tables === 'false') {
    return [];
  }
  return tables === 'true' || tables.split(',');
}
# Changelog
## Version 3.2.0
* Basic support for alpha ordered list types added #122
* This includes support for the `ol` type values `1`, `a` and `A`
## Version 3.1.0
* Support for the ordered list start attribute added #117
* Option to format paragraph with single new line #112
* `noLinkBrackets` option added #119
* Support for the ordered list start attribute added #117
* Option to format paragraph with single new line #112
* `noLinkBrackets` option added #119
## Version 3.0.0
* Switched from `htmlparser` to `htmlparser2` #113
* Treat non-numeric colspans as zero and handle them gracefully #105
* Switched from `htmlparser` to `htmlparser2` #113
* Treat non-numeric colspans as zero and handle them gracefully #105

@@ -20,40 +25,40 @@ ## Version 2.1.1

* New option to disable `uppercaseHeadings` added. #86
* Starting point of html to text conversion can now be defined in the options via the `baseElement` option. #83
* Support for long words added. The behaviour can be configured via the `longWordSplit` option. #83
* New option to disable `uppercaseHeadings` added. #86
* Starting point of html to text conversion can now be defined in the options via the `baseElement` option. #83
* Support for long words added. The behaviour can be configured via the `longWordSplit` option. #83
## Version 2.0.0
* Unicode support added. #81
* New option `decodeOptions` added.
* Dependencies updated.
* Unicode support added. #81
* New option `decodeOptions` added.
* Dependencies updated.
Breaking Changes:
* Minimum node version increased to >=0.10.0
* Minimum node version increased to >=0.10.0
## Version 1.6.2
* Fixed: correctly handle HTML entities for images #82
* Fixed: correctly handle HTML entities for images #82
## Version 1.6.1
* Fixed: using --tables=true doesn't produce the expected results. #80
* Fixed: using --tables=true doesn't produce the expected results. #80
## Version 1.6.0
* Preserve newlines in text feature added #75
* Preserve newlines in text feature added #75
## Version 1.5.1
* Support for h5 and h6 tags added #74
* Support for h5 and h6 tags added #74
## Version 1.5.0
* Entity regex is now less greedy #69 #70
* Entity regex is now less greedy #69 #70
## Version 1.4.0
* Uppercase tag processing added. Table center support added. #56
* Unused dependencies removed.
* Uppercase tag processing added. Table center support added. #56
* Unused dependencies removed.

@@ -60,0 +65,0 @@ ## Version 1.3.2

@@ -7,3 +7,3 @@ var path = require('path');

// Convert an inline HTML string, wrapping lines at 130 characters.
var text = htmlToText.fromString('<h1>Hello World</h1>', {
  wordwrap: 130
});

@@ -15,6 +15,6 @@ console.log(text);

// Convert a file next to this script. `tables` lists the id/class selectors
// of the tables that should be laid out as tables in the text output.
htmlToText.fromFile(path.join(__dirname, 'test.html'), {
  tables: ['#invoice', '.address']
}, function(err, text) {
  if (err) return console.error(err);
  console.log(text);
});

@@ -8,49 +8,49 @@ var _ = require('underscore');

/**
 * Render a text node.
 *
 * HTML entities in `elem.data` are decoded with `options.decodeOptions`.
 * When `options.isInPre` is set the decoded text is returned untouched;
 * otherwise it is word-wrapped, with leading whitespace removed first when
 * `elem.trimLeadingSpace` is set.
 *
 * @param {Object} elem Node with optional `data` and `trimLeadingSpace`.
 * @param {Object} options Conversion options.
 * @returns {string} The formatted text.
 */
function formatText(elem, options) {
  var text = he.decode(elem.data || "", options.decodeOptions);

  if (options.isInPre) {
    return text;
  }
  return helper.wordwrap(elem.trimLeadingSpace ? _s.lstrip(text) : text, options);
}
/**
 * Render an image as `alt [src]`.
 *
 * Returns an empty string when `options.ignoreImage` is set. Otherwise the
 * entity-decoded `alt` text comes first (if any), followed by the `src` in
 * square brackets (if any), separated by one space when both are present.
 *
 * @param {Object} elem Image node; `attribs` may be missing.
 * @param {Object} options Conversion options.
 * @returns {string} The formatted image text.
 */
function formatImage(elem, options) {
  if (options.ignoreImage) {
    return '';
  }

  var result = '';
  var attribs = elem.attribs || {};
  if (attribs.alt) {
    result += he.decode(attribs.alt, options.decodeOptions);
    if (attribs.src) {
      result += ' ';
    }
  }
  if (attribs.src) {
    result += '[' + attribs.src + ']';
  }
  return result;
}
/**
 * Render a line break: a newline followed by whatever `fn` produces for the
 * element's children.
 *
 * @param {Object} elem The node.
 * @param {Function} fn Callback that converts a list of child nodes to text.
 * @param {Object} options Conversion options, passed through to `fn`.
 * @returns {string} The formatted text.
 */
function formatLineBreak(elem, fn, options) {
  return '\n' + fn(elem.children, options);
}
/**
 * Render a paragraph: the converted children followed by one newline when
 * `options.singleNewLineParagraphs` is set, or by a blank line otherwise.
 *
 * @param {Object} elem The node.
 * @param {Function} fn Callback that converts a list of child nodes to text.
 * @param {Object} options Conversion options.
 * @returns {string} The formatted paragraph.
 */
function formatParagraph(elem, fn, options) {
  var paragraph = fn(elem.children, options);
  if (options.singleNewLineParagraphs) {
    return paragraph + '\n';
  }
  return paragraph + '\n\n';
}
/**
 * Render a heading: the converted children, upper-cased when
 * `options.uppercaseHeadings` is set, followed by a newline.
 *
 * @param {Object} elem The node.
 * @param {Function} fn Callback that converts a list of child nodes to text.
 * @param {Object} options Conversion options.
 * @returns {string} The formatted heading.
 */
function formatHeading(elem, fn, options) {
  var heading = fn(elem.children, options);
  if (options.uppercaseHeadings) {
    heading = heading.toUpperCase();
  }
  return heading + '\n';
}

@@ -64,52 +64,52 @@

/**
 * Render a link as its text followed by its target.
 *
 * The href is appended as ` [href]` (or ` href` when
 * `options.noLinkBrackets` is set) unless `options.ignoreHref` is set, the
 * element has no href, or `options.hideLinkHrefIfSameAsText` is set and the
 * href equals the link text with newlines removed. A leading `mailto:` is
 * dropped, and hrefs starting with '/' are prefixed with
 * `options.linkHrefBaseUrl` when that is set.
 *
 * @param {Object} elem The anchor node.
 * @param {Function} fn Callback that converts a list of child nodes to text.
 * @param {Object} options Conversion options; `lineCharCount` is restored to
 *   the value it had on entry before the final text is formatted.
 * @returns {string} The formatted link.
 */
function formatAnchor(elem, fn, options) {
  var href = '';
  // Remember the line position: the children are converted once here to
  // obtain the anchor text, and the combined text is formatted again below.
  var storedCharCount = options.lineCharCount;
  var text = fn(elem.children || [], options);
  if (!text) {
    text = '';
  }

  var result = elem.trimLeadingSpace ? _s.lstrip(text) : text;

  if (!options.ignoreHref) {
    // Get the href, if present
    if (elem.attribs && elem.attribs.href) {
      href = elem.attribs.href.replace(/^mailto:/, '');
    }
    if (href) {
      if (options.linkHrefBaseUrl && href.indexOf('/') === 0) {
        href = options.linkHrefBaseUrl + href;
      }
      if (!options.hideLinkHrefIfSameAsText || href !== _s.replaceAll(result, '\n', '')) {
        if (!options.noLinkBrackets) {
          result += ' [' + href + ']';
        } else {
          result += ' ' + href;
        }
      }
    }
  }

  options.lineCharCount = storedCharCount;

  return formatText({ data: result || href, trimLeadingSpace: elem.trimLeadingSpace }, options);
}
/**
 * Render a horizontal rule: a newline, a run of '-' characters whose length
 * is `options.wordwrap`, then a blank line.
 *
 * @param {Object} elem The node (unused).
 * @param {Function} fn Child converter (unused).
 * @param {Object} options Conversion options.
 * @returns {string} The formatted rule.
 */
function formatHorizontalLine(elem, fn, options) {
  return '\n' + _s.repeat('-', options.wordwrap) + '\n\n';
}
/**
 * Render one list item with the given prefix and a hanging indent.
 *
 * @param {string} prefix Bullet or number text placed before the item.
 * @param {Object} elem The list-item node.
 * @param {Function} fn Callback that converts a list of child nodes to text.
 * @param {Object} options Conversion options; cloned, so the caller's object
 *   is not modified by the width adjustment below.
 * @returns {string} The prefixed item text followed by a newline.
 */
function formatListItem(prefix, elem, fn, options) {
  options = _.clone(options);
  // Reduce the wordwrap for sub elements, once, by the width of the prefix.
  if (options.wordwrap) {
    options.wordwrap -= prefix.length;
  }
  // Process sub elements.
  var text = fn(elem.children, options);
  // Replace all line breaks with line break + prefix spacing.
  text = text.replace(/\n/g, '\n' + _s.repeat(' ', prefix.length));
  // Add first prefix and line break at the end.
  return prefix + text + '\n';
}

@@ -120,113 +120,122 @@

/**
 * Render an unordered list: every child that is not a text node matching
 * `whiteSpaceRegex` becomes an item prefixed with ' * '; the list is
 * followed by an extra newline.
 *
 * @param {Object} elem The list node.
 * @param {Function} fn Callback that converts a list of child nodes to text.
 * @param {Object} options Conversion options.
 * @returns {string} The formatted list.
 */
function formatUnorderedList(elem, fn, options) {
  var result = '';
  var nonWhiteSpaceChildren = (elem.children || []).filter(function(child) {
    return child.type !== 'text' || !whiteSpaceRegex.test(child.data);
  });
  _.each(nonWhiteSpaceChildren, function(item) {
    result += formatListItem(' * ', item, fn, options);
  });
  return result + '\n';
}
/**
 * Convert a 1-based ordinal to an alphabetic list label:
 * 1 -> a, 26 -> z, 27 -> aa, 28 -> ab, ...
 *
 * @param {number} value The ordinal.
 * @param {number} firstCharCode Char code of the first letter (97 for 'a',
 *   65 for 'A').
 * @returns {string} The label; ordinals below 1 have no alphabetic form and
 *   are returned as decimal text.
 */
function alphaListLabel(value, firstCharCode) {
  if (value < 1) {
    return String(value);
  }
  var label = '';
  while (value > 0) {
    // Shift to 0-based so that 26 maps to 'z' rather than rolling over.
    value -= 1;
    label = String.fromCharCode(firstCharCode + (value % 26)) + label;
    value = Math.floor(value / 26);
  }
  return label;
}

/**
 * Render an ordered list.
 *
 * Supports the `start` attribute and the `type` values `1`, `a` and `A`.
 * Any other or missing `type` is rendered as decimal, and a missing or
 * non-numeric `start` counts from 1. Decimal items are prefixed with a space
 * and padded so the item text lines up; alphabetic items are prefixed with
 * the bare label.
 *
 * @param {Object} elem The list node; `attribs` and `children` may be missing.
 * @param {Function} fn Callback that converts a list of child nodes to text.
 * @param {Object} options Conversion options.
 * @returns {string} The formatted list followed by an extra newline.
 */
function formatOrderedList(elem, fn, options) {
  var result = '';
  var attribs = elem.attribs || {};
  var nonWhiteSpaceChildren = (elem.children || []).filter(function(child) {
    return child.type !== 'text' || !whiteSpaceRegex.test(child.data);
  });
  // Return different functions for different OL types
  var typeFunctions = {
    1: function(start, i) { return i + 1 + start; },
    a: function(start, i) { return alphaListLabel(i + 1 + start, 97); },
    A: function(start, i) { return alphaListLabel(i + 1 + start, 65); }
  };
  // Determine type; anything without a formatter falls back to decimal
  // instead of failing on an undefined function.
  var olType = Object.prototype.hasOwnProperty.call(typeFunctions, attribs.type) ? attribs.type : '1';
  // Make sure there are list items present
  if (nonWhiteSpaceChildren.length) {
    // Calculate initial start from ol attribute
    var first = parseInt(attribs.start, 10);
    var start = (isNaN(first) ? 1 : first) - 1;
    // Calculate the maximum length to i.
    var maxLength = (nonWhiteSpaceChildren.length + start).toString().length;
    _.each(nonWhiteSpaceChildren, function(item, i) {
      // Use different function depending on type
      var index = typeFunctions[olType](start, i);
      // Calculate the needed spacing for nice indentation.
      var spacing = maxLength - index.toString().length;
      var prefix = (olType === '1') ? ' ' + index + '. ' + _s.repeat(' ', spacing) : index + '. ';
      result += formatListItem(prefix, item, fn, options);
    });
  }
  return result + '\n';
}
/**
 * Lay out a table (array of rows, each an array of cell strings) as text.
 *
 * Each column is as wide as its longest cell; every cell is stripped, padded
 * on the right to that width and followed by three spaces. Rows end with a
 * newline and the table ends with an extra newline.
 *
 * @param {string[][]} table Rows of cell strings.
 * @returns {string} The formatted table.
 */
function tableToString(table) {
  // Determine space width per column
  // Convert all rows to lengths
  var widths = _.map(table, function(row) {
    return _.map(row, function(col) {
      return col.length;
    });
  });
  // Invert rows with columns
  widths = helper.arrayZip(widths);
  // Determine the max values for each column
  widths = _.map(widths, function(col) {
    return _.max(col);
  });

  // Build the table
  var text = '';
  _.each(table, function(row) {
    var i = 0;
    _.each(row, function(col) {
      text += _s.rpad(_s.strip(col), widths[i++], ' ') + '   ';
    });
    text += '\n';
  });
  return text + '\n';
}
/**
 * Render a table element as aligned text columns.
 *
 * Rows are collected from `tr` elements found directly under the table or
 * under `thead`, `tbody`, `tfoot` and `center` wrappers. `th` cells are
 * formatted as headings, `td` cells through `fn`; a cell spanning several
 * text lines produces several output rows.
 *
 * @param {Object} elem The table node.
 * @param {Function} fn Callback that converts a list of child nodes to text.
 * @param {Object} options Conversion options.
 * @returns {string} The formatted table.
 */
function formatTable(elem, fn, options) {
  var table = [];
  _.each(elem.children, tryParseRows);
  return tableToString(table);

  // Collect the rows of `elem` (or of its section children) into `table`.
  function tryParseRows(elem) {
    if (elem.type !== 'tag') {
      return;
    }
    switch (elem.name.toLowerCase()) {
      case "thead":
      case "tbody":
      case "tfoot":
      case "center":
        _.each(elem.children, tryParseRows);
        return;

      case 'tr':
        // One entry per cell; each entry is that cell's list of text lines.
        var rows = [];
        _.each(elem.children, function(elem) {
          var tokens, times;
          if (elem.type === 'tag') {
            switch (elem.name.toLowerCase()) {
              case 'th':
                tokens = formatHeading(elem, fn, options).split('\n');
                rows.push(_.compact(tokens));
                break;

              case 'td':
                tokens = fn(elem.children, options).split('\n');
                rows.push(_.compact(tokens));
                // Fill colspans with empty values
                if (elem.attribs && elem.attribs.colspan) {
                  // A non-numeric colspan yields NaN here, which `|| 0`
                  // turns into "no extra cells".
                  times = elem.attribs.colspan - 1 || 0;
                  _.times(times, function() {
                    rows.push(['']);
                  });
                }
                break;
            }
          }
        });
        // Transpose cell-lines into output rows, filling gaps with ''.
        rows = helper.arrayZip(rows);
        _.each(rows, function(row) {
          row = _.map(row, function(col) {
            return col || '';
          });
          table.push(row);
        });
        break;
    }
  }
}

@@ -233,0 +242,0 @@

@@ -8,129 +8,127 @@ var _ = require('underscore');

/**
 * Break a word that is longer than the wrap width into several lines.
 *
 * The wrap characters in `options.longWordSplit.wrapCharacters` are tried in
 * order: the word is split after the last occurrence of the current
 * character within the first `options.wordwrap` characters. Once no wrap
 * character is left, the word is either cut at the width
 * (`forceWrapOnLimit`) or kept whole, in which case a newline is appended
 * unless `options.preserveNewlines` is set.
 *
 * @param {string} word The word to split.
 * @param {Object} options Conversion options (`wordwrap`, `longWordSplit`,
 *   `preserveNewlines`).
 * @returns {string} The pieces joined with newlines.
 */
function splitLongWord(word, options) {
  var wrapCharacters = options.longWordSplit.wrapCharacters || [];
  var forceWrapOnLimit = options.longWordSplit.forceWrapOnLimit || false;
  var max = options.wordwrap;

  var fuseWord = [];
  var idx = 0;
  while (word.length > max) {
    var firstLine = word.substr(0, max);
    var remainingChars = word.substr(max);

    // When no wrap character is left, skip the search: lastIndexOf(undefined)
    // would look for the literal text "undefined" inside the word.
    var wrapCharacter = wrapCharacters[idx];
    var splitIndex = wrapCharacter === undefined ? -1 : firstLine.lastIndexOf(wrapCharacter);

    if (splitIndex > -1) {
      // We've found a character to split on, store before the split then check if we
      // need to split again
      word = firstLine.substr(splitIndex + 1) + remainingChars;
      fuseWord.push(firstLine.substr(0, splitIndex + 1));
    } else {
      idx++;
      if (idx >= wrapCharacters.length) {
        // Cannot split on character, so either split at 'max' or preserve length.
        // Cutting needs a positive width, otherwise the word never gets shorter.
        if (forceWrapOnLimit && max > 0) {
          fuseWord.push(firstLine);
          word = remainingChars;
          if (word.length > max) {
            continue;
          }
        } else {
          word = firstLine + remainingChars;
          if (!options.preserveNewlines) {
            word += '\n';
          }
        }
        break;
      } else {
        word = firstLine + remainingChars;
      }
    }
  }
  fuseWord.push(word);

  return fuseWord.join('\n');
}
exports.wordwrap = function wordwrap(text, options) {
var max = options.wordwrap;
var preserveNewlines = options.preserveNewlines;
var length = options.lineCharCount;
var max = options.wordwrap;
var preserveNewlines = options.preserveNewlines;
var length = options.lineCharCount;
// Preserve leading space
var result = _s.startsWith(text, ' ') ? ' ' : '';
length += result.length;
var buffer = [];
// Split the text into words, decide to preserve new lines or not.
var words = preserveNewlines
? text.replace(/\n/g, '\n ').split(/\ +/)
: _s.words(text);
// Preserve leading space
var result = _s.startsWith(text, ' ') ? ' ' : '';
length += result.length;
var buffer = [];
// Split the text into words, decide to preserve new lines or not.
var words = preserveNewlines
? text.replace(/\n/g, '\n ').split(/\ +/)
: _s.words(text);
// Determine where to end line word by word.
_.each(words, function(word) {
// Add buffer to result if we can't fit any more words in the buffer.
if ((max || max === 0) && length > 0 &&
((length + word.length > max) || (length + word.indexOf('\n') > max)))
{
// Concat buffer and add it to the result
result += buffer.join(' ') + '\n';
// Reset buffer and length
buffer.length = length = 0;
}
// Determine where to end line word by word.
_.each(words, function(word) {
// Add buffer to result if we can't fit any more words in the buffer.
if ((max || max === 0) && length > 0 && ((length + word.length > max) || (length + word.indexOf('\n') > max))) {
// Concat buffer and add it to the result
result += buffer.join(' ') + '\n';
// Reset buffer and length
buffer.length = length = 0;
}
// Check if the current word is long enough to be wrapped
if ((max || max === 0) && (options.longWordSplit) && (word.length > max)) {
word = splitLongWord(word, options);
}
// Check if the current word is long enough to be wrapped
if ((max || max === 0) && (options.longWordSplit) && (word.length > max)) {
word = splitLongWord(word, options);
}
buffer.push(word);
buffer.push(word);
// If the word contains a newline then restart the count and add the buffer to the result
if (word.indexOf('\n') != -1) {
result += buffer.join(' ');
// If the word contains a newline then restart the count and add the buffer to the result
if (word.indexOf('\n') !== -1) {
result += buffer.join(' ');
// Reset the buffer, let the length include any characters after the last newline
buffer.length = 0;
length = word.length - (word.lastIndexOf('\n') + 1);
// If there are characters after the newline, add a space and increase the length by 1
if (length) {
result += ' ';
length++;
}
} else {
// Add word length + one whitespace
length += word.length + 1;
}
});
// Add the rest to the result.
result += buffer.join(' ');
// Preserve trailing space
if (!_s.endsWith(text, ' ')) {
result = _s.rtrim(result);
} else if (!_s.endsWith(result, ' ')) {
result = result + ' ';
// Reset the buffer, let the length include any characters after the last newline
buffer.length = 0;
length = word.length - (word.lastIndexOf('\n') + 1);
// If there are characters after the newline, add a space and increase the length by 1
if (length) {
result += ' ';
length++;
}
} else {
// Add word length + one whitespace
length += word.length + 1;
}
});
// Add the rest to the result.
result += buffer.join(' ');
return result;
// Preserve trailing space
if (!_s.endsWith(text, ' ')) {
result = _s.rtrim(result);
} else if (!_s.endsWith(result, ' ')) {
result = result + ' ';
}
return result;
};
exports.arrayZip = function arrayZip(array) {
return _.zip.apply(_, array);
return _.zip.apply(_, array);
};
exports.splitCssSearchTag = function splitCssSearchTag(tagString) {
function getParams(re, string) {
var captures = [], found;
while (found = re.exec(string)) {
captures.push(found[1]);
}
return captures;
function getParams(re, string) {
var captures = [], found;
while ((found = re.exec(string)) !== null) {
captures.push(found[1]);
}
return captures;
}
var splitTag = {};
var elementRe = /(^\w*)/g;
splitTag.element = elementRe.exec(tagString)[1];
splitTag.classes = getParams( /\.([\d\w-]*)/g, tagString);
splitTag.ids = getParams( /#([\d\w-]*)/g, tagString);
var splitTag = {};
var elementRe = /(^\w*)/g;
splitTag.element = elementRe.exec(tagString)[1];
splitTag.classes = getParams( /\.([\d\w-]*)/g, tagString);
splitTag.ids = getParams( /#([\d\w-]*)/g, tagString);
return splitTag;
return splitTag;
};

@@ -13,181 +13,181 @@ var fs = require('fs');

// Node types whose content is left out of the text output entirely.
var SKIP_TYPES = [
  'style',
  'script'
];
/**
 * Convert an HTML string to plain text.
 *
 * Missing options are filled in with the defaults listed below. Note that
 * this happens on the object passed in, which also receives a
 * `lineCharCount` property. Conversion starts at each element named by
 * `options.baseElement` (a selector or an array of selectors) and the
 * pieces are concatenated.
 *
 * @param {string} html The HTML source.
 * @param {Object} [options] Conversion options.
 * @returns {string} The text with surrounding whitespace stripped.
 */
function htmlToText(html, options) {
  options = options || {};
  _.defaults(options, {
    wordwrap: 80,
    tables: [],
    preserveNewlines: false,
    uppercaseHeadings: true,
    singleNewLineParagraphs: false,
    hideLinkHrefIfSameAsText: false,
    linkHrefBaseUrl: null,
    noLinkBrackets: false,
    baseElement: 'body',
    returnDomByDefault: true,
    decodeOptions: {
      isAttributeValue: false,
      strict: false
    },
    longWordSplit: {
      wrapCharacters: [],
      forceWrapOnLimit: false
    }
  });

  // The completion callback does nothing; the parsed tree is read from
  // `handler.dom` below.
  var handler = new htmlparser.DefaultHandler(function (error, dom) {

  }, {
    verbose: true
  });
  new htmlparser.Parser(handler).parseComplete(html);

  options.lineCharCount = 0;

  var result = '';
  var baseElements = Array.isArray(options.baseElement) ? options.baseElement : [options.baseElement];
  for (var idx = 0; idx < baseElements.length; ++idx) {
    result += walk(filterBody(handler.dom, options, baseElements[idx]), options);
  }
  return _s.strip(result);
}
/**
 * Find the first element in `dom` that matches the `baseElement` selector.
 *
 * An element matches when its name equals the selector's element name and
 * its `class` / `id` attributes (split on spaces) contain every class and
 * id named in the selector.
 *
 * @param {Object[]} dom The parsed nodes to search, depth first.
 * @param {Object} options Conversion options (`returnDomByDefault`).
 * @param {string} baseElement Selector such as `div.content#main`.
 * @returns {Object[]|null} A one-element array holding the match. Without a
 *   match: `dom` when `options.returnDomByDefault` is set, otherwise `null`.
 */
function filterBody(dom, options, baseElement) {
  var result = null;

  var splitTag = helper.splitCssSearchTag(baseElement);

  function walk(dom) {
    // Stop descending as soon as a match has been recorded.
    if (result) return;
    _.each(dom, function(elem) {
      if (result) return;
      if (elem.name === splitTag.element) {
        var documentClasses = elem.attribs && elem.attribs.class ? elem.attribs.class.split(" ") : [];
        var documentIds = elem.attribs && elem.attribs.id ? elem.attribs.id.split(" ") : [];

        if ((splitTag.classes.every(function (val) { return documentClasses.indexOf(val) >= 0; })) &&
          (splitTag.ids.every(function (val) { return documentIds.indexOf(val) >= 0; }))) {
          result = [elem];
          return;
        }
      }
      if (elem.children) walk(elem.children);
    });
  }
  walk(dom);

  return options.returnDomByDefault ? result || dom : result;
}
/**
 * Decide whether a table element should be laid out as a table.
 *
 * @param {Object|undefined} attr The element's attributes.
 * @param {boolean|string[]} tables `true` to accept every table, or a list
 *   of `.class` / `#id` selectors.
 * @returns {*} `true` when `tables` is `true`; otherwise a truthy value when
 *   the element's whole `class` attribute equals one of the listed classes
 *   or its `id` equals one of the listed ids, and a falsy value if not.
 */
function containsTable(attr, tables) {
  if (tables === true) return true;

  // Drop the leading '.' or '#'.
  function removePrefix(key) {
    return key.substr(1);
  }
  function checkPrefix(prefix) {
    return function(key) {
      return _s.startsWith(key, prefix);
    };
  }
  // Names of all selectors in `tables` that start with `prefix`.
  function filterByPrefix(tables, prefix) {
    return _(tables).chain()
      .filter(checkPrefix(prefix))
      .map(removePrefix)
      .value();
  }
  var classes = filterByPrefix(tables, '.');
  var ids = filterByPrefix(tables, '#');
  return attr && (_.include(classes, attr['class']) || _.include(ids, attr['id']));
}
/**
 * Convert a list of parsed nodes to text, dispatching on node type and tag.
 *
 * Tags with a dedicated formatter are appended to `result`. A table is
 * formatted as a table only when containsTable accepts it; otherwise, like
 * any other tag, its children are walked in place. Node types listed in
 * SKIP_TYPES are dropped. After every node `options.lineCharCount` is set to
 * the length of the last line of `result`.
 *
 * @param {Object[]} dom The nodes to convert.
 * @param {Object} options Conversion options; `lineCharCount` is updated.
 * @param {string} [result] Text produced so far; defaults to ''.
 * @returns {string} `result` extended with the text of `dom`.
 */
function walk(dom, options, result) {
  if (arguments.length < 3) {
    result = '';
  }
  var whiteSpaceRegex = /\s$/;
  _.each(dom, function(elem) {
    switch(elem.type) {
      case 'tag':
        switch(elem.name.toLowerCase()) {
          case 'img':
            result += format.image(elem, options);
            break;
          case 'a':
            // Inline element needs its leading space to be trimmed if `result`
            // currently ends with whitespace
            elem.trimLeadingSpace = whiteSpaceRegex.test(result);
            result += format.anchor(elem, walk, options);
            break;
          case 'p':
            result += format.paragraph(elem, walk, options);
            break;
          case 'h1':
          case 'h2':
          case 'h3':
          case 'h4':
          case 'h5':
          case 'h6':
            result += format.heading(elem, walk, options);
            break;
          case 'br':
            result += format.lineBreak(elem, walk, options);
            break;
          case 'hr':
            result += format.horizontalLine(elem, walk, options);
            break;
          case 'ul':
            result += format.unorderedList(elem, walk, options);
            break;
          case 'ol':
            result += format.orderedList(elem, walk, options);
            break;
          case 'pre':
            // Work on a copy so the flag does not leak to sibling nodes.
            var newOptions = _(options).clone();
            newOptions.isInPre = true;
            result += format.paragraph(elem, walk, newOptions);
            break;
          case 'table':
            result = containsTable(elem.attribs, options.tables)
              ? result + format.table(elem, walk, options)
              : walk(elem.children || [], options, result);
            break;
          default:
            result = walk(elem.children || [], options, result);
        }
        break;
      case 'text':
        if (elem.data !== '\r\n') {
          // Text needs its leading space to be trimmed if `result`
          // currently ends with whitespace
          elem.trimLeadingSpace = whiteSpaceRegex.test(result);
          result += format.text(elem, options);
        }
        break;
      default:
        if (!_.include(SKIP_TYPES, elem.type)) {
          result = walk(elem.children || [], options, result);
        }
    }

    options.lineCharCount = result.length - (result.lastIndexOf('\n') + 1);
  });
  return result;
}
exports.fromFile = function(file, options, callback) {
if (!callback) {
callback = options;
options = {};
}
fs.readFile(file, 'utf8', function(err, str) {
var result = htmlToText(str, options);
return callback(null, result);
});
if (!callback) {
callback = options;
options = {};
}
fs.readFile(file, 'utf8', function (err, str) {
if (err) return callback(err);
return callback(null, htmlToText(str, options));
});
};
exports.fromString = function(str, options) {
return htmlToText(str, options || {});
return htmlToText(str, options || {});
};
{
"name": "html-to-text",
"version": "3.1.0",
"version": "3.2.0",
"description": "Advanced html to plain text converter",
"main": "index.js",
"scripts": {
"test": "node_modules/.bin/mocha",
"example": "node ./example/html-to-text.js"
"test": "istanbul cover _mocha && eslint .",
"example": "node ./example/html-to-text.js",
"lint": "eslint ."
},

@@ -46,4 +47,6 @@ "author": {

"chai": "^3.5.0",
"eslint": "^3.14.1",
"istanbul": "^0.4.5",
"mocha": "^3.0.0"
}
}
# html-to-text
[![Build Status](https://travis-ci.org/werk85/node-html-to-text.svg?branch=master)](https://travis-ci.org/werk85/node-html-to-text)
[![Test Coverage](https://codeclimate.com/github/werk85/node-html-to-text/badges/coverage.svg)](https://codeclimate.com/github/werk85/node-html-to-text/coverage)

@@ -5,0 +6,0 @@ An advanced converter that parses HTML and returns beautiful text. It was mainly designed to transform HTML E-Mail templates to a text representation. So it is currently optimized for table layouts.

@@ -0,1 +1,3 @@

/* eslint max-len: "off" */
var expect = require('chai').expect;

@@ -233,2 +235,17 @@ var htmlToText = require('..');

// Decimal numbering: items are prefixed with ' <n>. '. The expected text has
// no space before '1.' because the converter strips the overall result
// (presumably trimming surrounding whitespace), so only ' 2.' keeps its pad.
it('should support the ordered list type="1" attribute', function() {
var testString = '<ol type="1"><li>foo</li><li>bar</li></ol>';
expect(htmlToText.fromString(testString)).to.equal('1. foo\n 2. bar');
});
// Lower-case alphabetic numbering: items are prefixed with '<letter>. ' and
// no leading pad space.
it('should support the ordered list type="a" attribute', function() {
var testString = '<ol type="a"><li>foo</li><li>bar</li></ol>';
expect(htmlToText.fromString(testString)).to.equal('a. foo\nb. bar');
});
// Upper-case alphabetic numbering, same layout as type="a".
it('should support the ordered list type="A" attribute', function() {
var testString = '<ol type="A"><li>foo</li><li>bar</li></ol>';
expect(htmlToText.fromString(testString)).to.equal('A. foo\nB. bar');
});
it('should support the ordered list start attribute', function() {

@@ -238,2 +255,17 @@ var testString = '<ol start="2"><li>foo</li><li>bar</li></ol>';

});
/*
* Currently failing tests for continuing to fill out the specification
* Spec: https://html.spec.whatwg.org/multipage/semantics.html#the-ol-element
*
it('should support the ordered list type="a" attribute past 26 characters', function() {
var testString = '<ol start="26" type="a"><li>foo</li><li>bar</li></ol>';
expect(htmlToText.fromString(testString)).to.equal('z. foo\naa. bar');
});
it('should support the ordered list type="A" attribute past 26 characters', function() {
var testString = '<ol start="26" type="A"><li>foo</li><li>bar</li></ol>';
expect(htmlToText.fromString(testString)).to.equal('Z. foo\nAA. bar');
});
*/
});

@@ -291,4 +323,4 @@

it('should retrieve and convert the entire document under `body` by default', function(done) {
var htmlFile = path.join(__dirname, 'test.html'),
txtFile = path.join(__dirname, 'test.txt');
var htmlFile = path.join(__dirname, 'test.html');
var txtFile = path.join(__dirname, 'test.txt');

@@ -307,4 +339,4 @@ var expectedTxt = fs.readFileSync(txtFile, 'utf8');

it('should only retrieve and convert content under the specified base element if found', function(done) {
var htmlFile = path.join(__dirname, 'test.html'),
txtFile = path.join(__dirname, 'test-address.txt');
var htmlFile = path.join(__dirname, 'test.html');
var txtFile = path.join(__dirname, 'test-address.txt');

@@ -324,4 +356,4 @@ var expectedTxt = fs.readFileSync(txtFile, 'utf8');

it('should retrieve and convert content under multiple base elements', function(done) {
var htmlFile = path.join(__dirname, 'test.html'),
txtFile = path.join(__dirname, 'test-address-dup.txt');
var htmlFile = path.join(__dirname, 'test.html');
var txtFile = path.join(__dirname, 'test-address-dup.txt');

@@ -341,4 +373,4 @@ var expectedTxt = fs.readFileSync(txtFile, 'utf8');

it('should retrieve and convert content under multiple base elements in any order', function(done) {
var htmlFile = path.join(__dirname, 'test.html'),
txtFile = path.join(__dirname, 'test-any-order.txt');
var htmlFile = path.join(__dirname, 'test.html');
var txtFile = path.join(__dirname, 'test-any-order.txt');

@@ -358,4 +390,4 @@ var expectedTxt = fs.readFileSync(txtFile, 'utf8');

it('should process the first base element found when multiple exist', function(done) {
var htmlFile = path.join(__dirname, 'test.html'),
txtFile = path.join(__dirname, 'test-first-element.txt');
var htmlFile = path.join(__dirname, 'test.html');
var txtFile = path.join(__dirname, 'test-first-element.txt');

@@ -375,4 +407,4 @@ var expectedTxt = fs.readFileSync(txtFile, 'utf8');

it('should retrieve and convert the entire document by default if no base element is found', function(done) {
var htmlFile = path.join(__dirname, 'test.html'),
txtFile = path.join(__dirname, 'test.txt');
var htmlFile = path.join(__dirname, 'test.html');
var txtFile = path.join(__dirname, 'test.txt');

@@ -379,0 +411,0 @@ var expectedTxt = fs.readFileSync(txtFile, 'utf8');

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc