html-to-text
Advanced tools
Comparing version 3.1.0 to 3.2.0
@@ -7,8 +7,8 @@ #!/usr/bin/env node | ||
var argv = optimist | ||
.string('tables') | ||
.default('wordwrap', 80) | ||
.default('ignore-href', false) | ||
.default('ignore-image', false) | ||
.default('noLinkBrackets', false) | ||
.argv; | ||
.string('tables') | ||
.default('wordwrap', 80) | ||
.default('ignore-href', false) | ||
.default('ignore-image', false) | ||
.default('noLinkBrackets', false) | ||
.argv; | ||
@@ -22,21 +22,21 @@ var text = ''; | ||
process.stdin.on('data', function data(data) { | ||
text += data; | ||
text += data; | ||
}); | ||
process.stdin.on('end', function end() { | ||
text = htmlToText.fromString(text, { | ||
tables: interpretTables(argv.tables), | ||
wordwrap: argv.wordwrap, | ||
ignoreHref: argv['ignore-href'], | ||
ignoreImage: argv['ignore-image'], | ||
noLinkBrackets: argv['noLinkBrackets'] | ||
}); | ||
process.stdout.write(text + '\n', 'utf-8'); | ||
text = htmlToText.fromString(text, { | ||
tables: interpretTables(argv.tables), | ||
wordwrap: argv.wordwrap, | ||
ignoreHref: argv['ignore-href'], | ||
ignoreImage: argv['ignore-image'], | ||
noLinkBrackets: argv['noLinkBrackets'] | ||
}); | ||
process.stdout.write(text + '\n', 'utf-8'); | ||
}); | ||
function interpretTables(tables) { | ||
if (!tables || tables === '' || tables === 'false') { | ||
return []; | ||
} | ||
return tables === 'true' || tables.split(','); | ||
if (!tables || tables === '' || tables === 'false') { | ||
return []; | ||
} | ||
return tables === 'true' || tables.split(','); | ||
} |
# Changelog | ||
## Version 3.2.0 | ||
* Basic support for alpha ordered list types added #122 | ||
* This includes support for the `ol` type values `1`, `a` and `A` | ||
## Version 3.1.0 | ||
* Support for the ordered list start attribute added #117 | ||
* Option to format paragraph with single new line #112 | ||
* `noLinkBrackets` option added #119 | ||
* Support for the ordered list start attribute added #117 | ||
* Option to format paragraph with single new line #112 | ||
* `noLinkBrackets` option added #119 | ||
## Version 3.0.0 | ||
* Switched from `htmlparser` to `htmlparser2` #113 | ||
* Treat non-numeric colspans as zero and handle them gracefully #105 | ||
* Switched from `htmlparser` to `htmlparser2` #113 | ||
* Treat non-numeric colspans as zero and handle them gracefully #105 | ||
@@ -20,40 +25,40 @@ ## Version 2.1.1 | ||
* New option to disable `uppercaseHeadings` added. #86 | ||
* Starting point of html to text conversion can now be defined in the options via the `baseElement` option. #83 | ||
* Support for long words added. The behaviour can be configured via the `longWordSplit` option. #83 | ||
* New option to disable `uppercaseHeadings` added. #86 | ||
* Starting point of html to text conversion can now be defined in the options via the `baseElement` option. #83 | ||
* Support for long words added. The behaviour can be configured via the `longWordSplit` option. #83 | ||
## Version 2.0.0 | ||
* Unicode support added. #81 | ||
* New option `decodeOptions` added. | ||
* Dependencies updated. | ||
* Unicode support added. #81 | ||
* New option `decodeOptions` added. | ||
* Dependencies updated. | ||
Breaking Changes: | ||
* Minimum node version increased to >=0.10.0 | ||
* Minimum node version increased to >=0.10.0 | ||
## Version 1.6.2 | ||
* Fixed: correctly handle HTML entities for images #82 | ||
* Fixed: correctly handle HTML entities for images #82 | ||
## Version 1.6.1 | ||
* Fixed: using --tables=true doesn't produce the expected results. #80 | ||
* Fixed: using --tables=true doesn't produce the expected results. #80 | ||
## Version 1.6.0 | ||
* Preserve newlines in text feature added #75 | ||
* Preserve newlines in text feature added #75 | ||
## Version 1.5.1 | ||
* Support for h5 and h6 tags added #74 | ||
* Support for h5 and h6 tags added #74 | ||
## Version 1.5.0 | ||
* Entity regex is now less greedy #69 #70 | ||
* Entity regex is now less greedy #69 #70 | ||
## Version 1.4.0 | ||
* Uppercase tag processing added. Table center support added. #56 | ||
* Unused dependencies removed. | ||
* Uppercase tag processing added. Table center support added. #56 | ||
* Unused dependencies removed. | ||
@@ -60,0 +65,0 @@ ## Version 1.3.2 |
@@ -7,3 +7,3 @@ var path = require('path'); | ||
var text = htmlToText.fromString('<h1>Hello World</h1>', { | ||
wordwrap: 130 | ||
wordwrap: 130 | ||
}); | ||
@@ -15,6 +15,6 @@ console.log(text); | ||
htmlToText.fromFile(path.join(__dirname, 'test.html'), { | ||
tables: ['#invoice', '.address'] | ||
tables: ['#invoice', '.address'] | ||
}, function(err, text) { | ||
if (err) return console.error(err); | ||
console.log(text); | ||
if (err) return console.error(err); | ||
console.log(text); | ||
}); |
@@ -8,49 +8,49 @@ var _ = require('underscore'); | ||
function formatText(elem, options) { | ||
var text = elem.data || ""; | ||
text = he.decode(text, options.decodeOptions); | ||
var text = elem.data || ""; | ||
text = he.decode(text, options.decodeOptions); | ||
if (options.isInPre) { | ||
return text; | ||
} else { | ||
return helper.wordwrap(elem.trimLeadingSpace ? _s.lstrip(text) : text, options); | ||
} | ||
if (options.isInPre) { | ||
return text; | ||
} else { | ||
return helper.wordwrap(elem.trimLeadingSpace ? _s.lstrip(text) : text, options); | ||
} | ||
} | ||
function formatImage(elem, options) { | ||
if (options.ignoreImage) { | ||
return ''; | ||
} | ||
if (options.ignoreImage) { | ||
return ''; | ||
} | ||
var result = '', attribs = elem.attribs || {}; | ||
if (attribs.alt) { | ||
result += he.decode(attribs.alt, options.decodeOptions); | ||
if (attribs.src) { | ||
result += ' '; | ||
} | ||
} | ||
if (attribs.src) { | ||
result += '[' + attribs.src + ']'; | ||
} | ||
return (result); | ||
var result = '', attribs = elem.attribs || {}; | ||
if (attribs.alt) { | ||
result += he.decode(attribs.alt, options.decodeOptions); | ||
if (attribs.src) { | ||
result += ' '; | ||
} | ||
} | ||
if (attribs.src) { | ||
result += '[' + attribs.src + ']'; | ||
} | ||
return (result); | ||
} | ||
function formatLineBreak(elem, fn, options) { | ||
return '\n' + fn(elem.children, options); | ||
return '\n' + fn(elem.children, options); | ||
} | ||
function formatParagraph(elem, fn, options) { | ||
var paragraph = fn(elem.children, options) | ||
if (options.singleNewLineParagraphs) { | ||
return paragraph + '\n' | ||
} else { | ||
return paragraph + '\n\n' | ||
} | ||
var paragraph = fn(elem.children, options) | ||
if (options.singleNewLineParagraphs) { | ||
return paragraph + '\n' | ||
} else { | ||
return paragraph + '\n\n' | ||
} | ||
} | ||
function formatHeading(elem, fn, options) { | ||
var heading = fn(elem.children, options); | ||
if (options.uppercaseHeadings) { | ||
heading = heading.toUpperCase(); | ||
} | ||
return heading + '\n'; | ||
var heading = fn(elem.children, options); | ||
if (options.uppercaseHeadings) { | ||
heading = heading.toUpperCase(); | ||
} | ||
return heading + '\n'; | ||
} | ||
@@ -64,52 +64,52 @@ | ||
function formatAnchor(elem, fn, options) { | ||
var href = ''; | ||
// Always get the anchor text | ||
var storedCharCount = options.lineCharCount; | ||
var text = fn(elem.children || [], options); | ||
if (!text) { | ||
text = ''; | ||
} | ||
var href = ''; | ||
// Always get the anchor text | ||
var storedCharCount = options.lineCharCount; | ||
var text = fn(elem.children || [], options); | ||
if (!text) { | ||
text = ''; | ||
} | ||
var result = elem.trimLeadingSpace ? _s.lstrip(text) : text; | ||
var result = elem.trimLeadingSpace ? _s.lstrip(text) : text; | ||
if (!options.ignoreHref) { | ||
// Get the href, if present | ||
if (elem.attribs && elem.attribs.href) { | ||
href = elem.attribs.href.replace(/^mailto\:/, ''); | ||
} | ||
if (href) { | ||
if (options.linkHrefBaseUrl && href.indexOf('/') == 0) { | ||
href = options.linkHrefBaseUrl + href; | ||
} | ||
if (!options.hideLinkHrefIfSameAsText || href != _s.replaceAll(result, '\n', '')) { | ||
if (!options.noLinkBrackets) { | ||
result += ' [' + href + ']'; | ||
} else { | ||
result += ' ' + href; | ||
} | ||
} | ||
} | ||
} | ||
if (!options.ignoreHref) { | ||
// Get the href, if present | ||
if (elem.attribs && elem.attribs.href) { | ||
href = elem.attribs.href.replace(/^mailto\:/, ''); | ||
} | ||
if (href) { | ||
if (options.linkHrefBaseUrl && href.indexOf('/') === 0) { | ||
href = options.linkHrefBaseUrl + href; | ||
} | ||
if (!options.hideLinkHrefIfSameAsText || href !== _s.replaceAll(result, '\n', '')) { | ||
if (!options.noLinkBrackets) { | ||
result += ' [' + href + ']'; | ||
} else { | ||
result += ' ' + href; | ||
} | ||
} | ||
} | ||
} | ||
options.lineCharCount = storedCharCount; | ||
options.lineCharCount = storedCharCount; | ||
return formatText({ data: result || href, trimLeadingSpace: elem.trimLeadingSpace }, options); | ||
return formatText({ data: result || href, trimLeadingSpace: elem.trimLeadingSpace }, options); | ||
} | ||
function formatHorizontalLine(elem, fn, options) { | ||
return '\n' + _s.repeat('-', options.wordwrap) + '\n\n'; | ||
return '\n' + _s.repeat('-', options.wordwrap) + '\n\n'; | ||
} | ||
function formatListItem(prefix, elem, fn, options) { | ||
options = _.clone(options); | ||
// Reduce the wordwrap for sub elements. | ||
options = _.clone(options); | ||
// Reduce the wordwrap for sub elements. | ||
if (options.wordwrap) { | ||
options.wordwrap -= prefix.length; | ||
options.wordwrap -= prefix.length; | ||
} | ||
// Process sub elements. | ||
var text = fn(elem.children, options); | ||
// Replace all line breaks with line break + prefix spacing. | ||
text = text.replace(/\n/g, '\n' + _s.repeat(' ', prefix.length)); | ||
// Add first prefix and line break at the end. | ||
return prefix + text + '\n'; | ||
// Process sub elements. | ||
var text = fn(elem.children, options); | ||
// Replace all line breaks with line break + prefix spacing. | ||
text = text.replace(/\n/g, '\n' + _s.repeat(' ', prefix.length)); | ||
// Add first prefix and line break at the end. | ||
return prefix + text + '\n'; | ||
} | ||
@@ -120,113 +120,122 @@ | ||
function formatUnorderedList(elem, fn, options) { | ||
var result = ''; | ||
var nonWhiteSpaceChildren = (elem.children || []).filter(function(child) { | ||
return child.type !== 'text' || !whiteSpaceRegex.test(child.data); | ||
}); | ||
_.each(nonWhiteSpaceChildren, function(elem) { | ||
result += formatListItem(' * ', elem, fn, options); | ||
}); | ||
return result + '\n'; | ||
var result = ''; | ||
var nonWhiteSpaceChildren = (elem.children || []).filter(function(child) { | ||
return child.type !== 'text' || !whiteSpaceRegex.test(child.data); | ||
}); | ||
_.each(nonWhiteSpaceChildren, function(elem) { | ||
result += formatListItem(' * ', elem, fn, options); | ||
}); | ||
return result + '\n'; | ||
} | ||
function formatOrderedList(elem, fn, options) { | ||
var result = ''; | ||
var nonWhiteSpaceChildren = (elem.children || []).filter(function(child) { | ||
return child.type !== 'text' || !whiteSpaceRegex.test(child.data); | ||
}); | ||
// Make sure there are list items present | ||
if (nonWhiteSpaceChildren.length) { | ||
// Calculate initial start from ol attribute | ||
var start = parseInt(elem.attribs.start || '1') - 1 | ||
// Calculate the maximum length to i. | ||
var maxLength = (nonWhiteSpaceChildren.length + start).toString().length; | ||
_.each(nonWhiteSpaceChildren, function(elem, i) { | ||
var index = i + 1 + start; | ||
// Calculate the needed spacing for nice indentation. | ||
var spacing = maxLength - index.toString().length; | ||
var prefix = ' ' + index + '. ' + _s.repeat(' ', spacing); | ||
result += formatListItem(prefix, elem, fn, options); | ||
}); | ||
} | ||
return result + '\n'; | ||
var result = ''; | ||
var nonWhiteSpaceChildren = (elem.children || []).filter(function(child) { | ||
return child.type !== 'text' || !whiteSpaceRegex.test(child.data); | ||
}); | ||
// Return different functions for different OL types | ||
var typeFunctions = { | ||
1: function(start, i) { return i + 1 + start}, | ||
a: function(start, i) { return String.fromCharCode(i + start + 97)}, | ||
A: function(start, i) { return String.fromCharCode(i + start + 65)} | ||
}; | ||
// Determine type | ||
var olType = elem.attribs.type || '1' | ||
// Make sure there are list items present | ||
if (nonWhiteSpaceChildren.length) { | ||
// Calculate initial start from ol attribute | ||
var start = Number(elem.attribs.start || '1') - 1 | ||
// Calculate the maximum length to i. | ||
var maxLength = (nonWhiteSpaceChildren.length + start).toString().length; | ||
_.each(nonWhiteSpaceChildren, function(elem, i) { | ||
// Use different function depending on type | ||
var index = typeFunctions[olType](start, i); | ||
// Calculate the needed spacing for nice indentation. | ||
var spacing = maxLength - index.toString().length; | ||
var prefix = (olType === '1') ? ' ' + index + '. ' + _s.repeat(' ', spacing) : index + '. '; | ||
result += formatListItem(prefix, elem, fn, options); | ||
}); | ||
} | ||
return result + '\n'; | ||
} | ||
function tableToString(table) { | ||
// Determine space width per column | ||
// Convert all rows to lengths | ||
var widths = _.map(table, function(row) { | ||
return _.map(row, function(col) { | ||
return col.length; | ||
}); | ||
}); | ||
// Invert rows with colums | ||
widths = helper.arrayZip(widths); | ||
// Determine the max values for each column | ||
widths = _.map(widths, function(col) { | ||
return _.max(col); | ||
}); | ||
// Determine space width per column | ||
// Convert all rows to lengths | ||
var widths = _.map(table, function(row) { | ||
return _.map(row, function(col) { | ||
return col.length; | ||
}); | ||
}); | ||
// Invert rows with colums | ||
widths = helper.arrayZip(widths); | ||
// Determine the max values for each column | ||
widths = _.map(widths, function(col) { | ||
return _.max(col); | ||
}); | ||
// Build the table | ||
var text = ''; | ||
_.each(table, function(row) { | ||
var i = 0; | ||
_.each(row, function(col) { | ||
text += _s.rpad(_s.strip(col), widths[i++], ' ') + ' '; | ||
}); | ||
text += '\n'; | ||
}); | ||
return text + '\n'; | ||
// Build the table | ||
var text = ''; | ||
_.each(table, function(row) { | ||
var i = 0; | ||
_.each(row, function(col) { | ||
text += _s.rpad(_s.strip(col), widths[i++], ' ') + ' '; | ||
}); | ||
text += '\n'; | ||
}); | ||
return text + '\n'; | ||
} | ||
function formatTable(elem, fn, options) { | ||
var table = []; | ||
_.each(elem.children, tryParseRows); | ||
return tableToString(table); | ||
var table = []; | ||
_.each(elem.children, tryParseRows); | ||
return tableToString(table); | ||
function tryParseRows(elem) { | ||
if (elem.type !== 'tag') { | ||
return; | ||
} | ||
switch (elem.name.toLowerCase()) { | ||
case "thead": | ||
case "tbody": | ||
case "tfoot": | ||
case "center": | ||
_.each(elem.children, tryParseRows); | ||
return; | ||
function tryParseRows(elem) { | ||
if (elem.type !== 'tag') { | ||
return; | ||
} | ||
switch (elem.name.toLowerCase()) { | ||
case "thead": | ||
case "tbody": | ||
case "tfoot": | ||
case "center": | ||
_.each(elem.children, tryParseRows); | ||
return; | ||
case 'tr': | ||
var rows = []; | ||
_.each(elem.children, function(elem) { | ||
var tokens, times; | ||
if (elem.type === 'tag') { | ||
switch (elem.name.toLowerCase()) { | ||
case 'th': | ||
tokens = formatHeading(elem, fn, options).split('\n'); | ||
rows.push(_.compact(tokens)); | ||
break; | ||
case 'tr': | ||
var rows = []; | ||
_.each(elem.children, function(elem) { | ||
var tokens, times; | ||
if (elem.type === 'tag') { | ||
switch (elem.name.toLowerCase()) { | ||
case 'th': | ||
tokens = formatHeading(elem, fn, options).split('\n'); | ||
rows.push(_.compact(tokens)); | ||
break; | ||
case 'td': | ||
tokens = fn(elem.children, options).split('\n'); | ||
rows.push(_.compact(tokens)); | ||
// Fill colspans with empty values | ||
if (elem.attribs && elem.attribs.colspan) { | ||
times = elem.attribs.colspan - 1 || 0; | ||
_.times(times, function() { | ||
rows.push(['']); | ||
}); | ||
} | ||
break; | ||
} | ||
} | ||
}); | ||
rows = helper.arrayZip(rows); | ||
_.each(rows, function(row) { | ||
row = _.map(row, function(col) { | ||
return col || ''; | ||
}); | ||
table.push(row); | ||
}); | ||
break; | ||
} | ||
} | ||
case 'td': | ||
tokens = fn(elem.children, options).split('\n'); | ||
rows.push(_.compact(tokens)); | ||
// Fill colspans with empty values | ||
if (elem.attribs && elem.attribs.colspan) { | ||
times = elem.attribs.colspan - 1 || 0; | ||
_.times(times, function() { | ||
rows.push(['']); | ||
}); | ||
} | ||
break; | ||
} | ||
} | ||
}); | ||
rows = helper.arrayZip(rows); | ||
_.each(rows, function(row) { | ||
row = _.map(row, function(col) { | ||
return col || ''; | ||
}); | ||
table.push(row); | ||
}); | ||
break; | ||
} | ||
} | ||
} | ||
@@ -233,0 +242,0 @@ |
@@ -8,129 +8,127 @@ var _ = require('underscore'); | ||
function splitLongWord(word, options) { | ||
var wrapCharacters = options.longWordSplit.wrapCharacters || []; | ||
var forceWrapOnLimit = options.longWordSplit.forceWrapOnLimit || false; | ||
var max = options.wordwrap; | ||
var wrapCharacters = options.longWordSplit.wrapCharacters || []; | ||
var forceWrapOnLimit = options.longWordSplit.forceWrapOnLimit || false; | ||
var max = options.wordwrap; | ||
var fuseWord = []; | ||
var idx = 0; | ||
while (word.length > max) { | ||
var firstLine = word.substr(0, max); | ||
var remainingChars = word.substr(max); | ||
var fuseWord = []; | ||
var idx = 0; | ||
while (word.length > max) { | ||
var firstLine = word.substr(0, max); | ||
var remainingChars = word.substr(max); | ||
var splitIndex = firstLine.lastIndexOf(wrapCharacters[idx]); | ||
var splitIndex = firstLine.lastIndexOf(wrapCharacters[idx]); | ||
if (splitIndex > -1) { | ||
// We've found a character to split on, store before the split then check if we | ||
// need to split again | ||
word = firstLine.substr(splitIndex + 1) + remainingChars; | ||
fuseWord.push(firstLine.substr(0, splitIndex + 1)); | ||
if (splitIndex > -1) { | ||
// We've found a character to split on, store before the split then check if we | ||
// need to split again | ||
word = firstLine.substr(splitIndex + 1) + remainingChars; | ||
fuseWord.push(firstLine.substr(0, splitIndex + 1)); | ||
} else { | ||
idx++; | ||
if (idx >= wrapCharacters.length) { | ||
// Cannot split on character, so either split at 'max' or preserve length | ||
if (forceWrapOnLimit) { | ||
fuseWord.push(firstLine); | ||
word = remainingChars; | ||
if (word.length > max) { | ||
continue; | ||
} | ||
} else { | ||
idx++; | ||
if (idx >= wrapCharacters.length) { | ||
// Cannot split on character, so either split at 'max' or preserve length | ||
if (forceWrapOnLimit) { | ||
fuseWord.push(firstLine); | ||
word = remainingChars; | ||
if (word.length > max) { | ||
continue; | ||
} | ||
} else { | ||
word = firstLine + remainingChars; | ||
if (!options.preserveNewlines) { | ||
word += '\n'; | ||
} | ||
} | ||
break; | ||
} else { | ||
word = firstLine + remainingChars; | ||
} | ||
word = firstLine + remainingChars; | ||
if (!options.preserveNewlines) { | ||
word += '\n'; | ||
} | ||
} | ||
break; | ||
} else { | ||
word = firstLine + remainingChars; | ||
} | ||
} | ||
fuseWord.push(word); | ||
} | ||
fuseWord.push(word); | ||
return fuseWord.join('\n'); | ||
return fuseWord.join('\n'); | ||
} | ||
exports.wordwrap = function wordwrap(text, options) { | ||
var max = options.wordwrap; | ||
var preserveNewlines = options.preserveNewlines; | ||
var length = options.lineCharCount; | ||
var max = options.wordwrap; | ||
var preserveNewlines = options.preserveNewlines; | ||
var length = options.lineCharCount; | ||
// Preserve leading space | ||
var result = _s.startsWith(text, ' ') ? ' ' : ''; | ||
length += result.length; | ||
var buffer = []; | ||
// Split the text into words, decide to preserve new lines or not. | ||
var words = preserveNewlines | ||
? text.replace(/\n/g, '\n ').split(/\ +/) | ||
: _s.words(text); | ||
// Preserve leading space | ||
var result = _s.startsWith(text, ' ') ? ' ' : ''; | ||
length += result.length; | ||
var buffer = []; | ||
// Split the text into words, decide to preserve new lines or not. | ||
var words = preserveNewlines | ||
? text.replace(/\n/g, '\n ').split(/\ +/) | ||
: _s.words(text); | ||
// Determine where to end line word by word. | ||
_.each(words, function(word) { | ||
// Add buffer to result if we can't fit any more words in the buffer. | ||
if ((max || max === 0) && length > 0 && | ||
((length + word.length > max) || (length + word.indexOf('\n') > max))) | ||
{ | ||
// Concat buffer and add it to the result | ||
result += buffer.join(' ') + '\n'; | ||
// Reset buffer and length | ||
buffer.length = length = 0; | ||
} | ||
// Determine where to end line word by word. | ||
_.each(words, function(word) { | ||
// Add buffer to result if we can't fit any more words in the buffer. | ||
if ((max || max === 0) && length > 0 && ((length + word.length > max) || (length + word.indexOf('\n') > max))) { | ||
// Concat buffer and add it to the result | ||
result += buffer.join(' ') + '\n'; | ||
// Reset buffer and length | ||
buffer.length = length = 0; | ||
} | ||
// Check if the current word is long enough to be wrapped | ||
if ((max || max === 0) && (options.longWordSplit) && (word.length > max)) { | ||
word = splitLongWord(word, options); | ||
} | ||
// Check if the current word is long enough to be wrapped | ||
if ((max || max === 0) && (options.longWordSplit) && (word.length > max)) { | ||
word = splitLongWord(word, options); | ||
} | ||
buffer.push(word); | ||
buffer.push(word); | ||
// If the word contains a newline then restart the count and add the buffer to the result | ||
if (word.indexOf('\n') != -1) { | ||
result += buffer.join(' '); | ||
// If the word contains a newline then restart the count and add the buffer to the result | ||
if (word.indexOf('\n') !== -1) { | ||
result += buffer.join(' '); | ||
// Reset the buffer, let the length include any characters after the last newline | ||
buffer.length = 0; | ||
length = word.length - (word.lastIndexOf('\n') + 1); | ||
// If there are characters after the newline, add a space and increase the length by 1 | ||
if (length) { | ||
result += ' '; | ||
length++; | ||
} | ||
} else { | ||
// Add word length + one whitespace | ||
length += word.length + 1; | ||
} | ||
}); | ||
// Add the rest to the result. | ||
result += buffer.join(' '); | ||
// Preserve trailing space | ||
if (!_s.endsWith(text, ' ')) { | ||
result = _s.rtrim(result); | ||
} else if (!_s.endsWith(result, ' ')) { | ||
result = result + ' '; | ||
// Reset the buffer, let the length include any characters after the last newline | ||
buffer.length = 0; | ||
length = word.length - (word.lastIndexOf('\n') + 1); | ||
// If there are characters after the newline, add a space and increase the length by 1 | ||
if (length) { | ||
result += ' '; | ||
length++; | ||
} | ||
} else { | ||
// Add word length + one whitespace | ||
length += word.length + 1; | ||
} | ||
}); | ||
// Add the rest to the result. | ||
result += buffer.join(' '); | ||
return result; | ||
// Preserve trailing space | ||
if (!_s.endsWith(text, ' ')) { | ||
result = _s.rtrim(result); | ||
} else if (!_s.endsWith(result, ' ')) { | ||
result = result + ' '; | ||
} | ||
return result; | ||
}; | ||
exports.arrayZip = function arrayZip(array) { | ||
return _.zip.apply(_, array); | ||
return _.zip.apply(_, array); | ||
}; | ||
exports.splitCssSearchTag = function splitCssSearchTag(tagString) { | ||
function getParams(re, string) { | ||
var captures = [], found; | ||
while (found = re.exec(string)) { | ||
captures.push(found[1]); | ||
} | ||
return captures; | ||
function getParams(re, string) { | ||
var captures = [], found; | ||
while ((found = re.exec(string)) !== null) { | ||
captures.push(found[1]); | ||
} | ||
return captures; | ||
} | ||
var splitTag = {}; | ||
var elementRe = /(^\w*)/g; | ||
splitTag.element = elementRe.exec(tagString)[1]; | ||
splitTag.classes = getParams( /\.([\d\w-]*)/g, tagString); | ||
splitTag.ids = getParams( /#([\d\w-]*)/g, tagString); | ||
var splitTag = {}; | ||
var elementRe = /(^\w*)/g; | ||
splitTag.element = elementRe.exec(tagString)[1]; | ||
splitTag.classes = getParams( /\.([\d\w-]*)/g, tagString); | ||
splitTag.ids = getParams( /#([\d\w-]*)/g, tagString); | ||
return splitTag; | ||
return splitTag; | ||
}; |
@@ -13,181 +13,181 @@ var fs = require('fs'); | ||
var SKIP_TYPES = [ | ||
'style', | ||
'script' | ||
'style', | ||
'script' | ||
]; | ||
function htmlToText(html, options) { | ||
options = options || {}; | ||
_.defaults(options, { | ||
wordwrap: 80, | ||
tables: [], | ||
preserveNewlines: false, | ||
uppercaseHeadings: true, | ||
singleNewLineParagraphs: false, | ||
hideLinkHrefIfSameAsText: false, | ||
linkHrefBaseUrl: null, | ||
noLinkBrackets: false, | ||
baseElement: 'body', | ||
returnDomByDefault: true, | ||
decodeOptions: { | ||
isAttributeValue: false, | ||
strict: false | ||
}, | ||
longWordSplit: { | ||
wrapCharacters: [], | ||
forceWrapOnLimit: false | ||
} | ||
}); | ||
options = options || {}; | ||
_.defaults(options, { | ||
wordwrap: 80, | ||
tables: [], | ||
preserveNewlines: false, | ||
uppercaseHeadings: true, | ||
singleNewLineParagraphs: false, | ||
hideLinkHrefIfSameAsText: false, | ||
linkHrefBaseUrl: null, | ||
noLinkBrackets: false, | ||
baseElement: 'body', | ||
returnDomByDefault: true, | ||
decodeOptions: { | ||
isAttributeValue: false, | ||
strict: false | ||
}, | ||
longWordSplit: { | ||
wrapCharacters: [], | ||
forceWrapOnLimit: false | ||
} | ||
}); | ||
var handler = new htmlparser.DefaultHandler(function (error, dom) { | ||
var handler = new htmlparser.DefaultHandler(function (error, dom) { | ||
}, { | ||
verbose: true | ||
}); | ||
new htmlparser.Parser(handler).parseComplete(html); | ||
}, { | ||
verbose: true | ||
}); | ||
new htmlparser.Parser(handler).parseComplete(html); | ||
options.lineCharCount = 0; | ||
options.lineCharCount = 0; | ||
var result = ''; | ||
var baseElements = Array.isArray(options.baseElement) ? options.baseElement : [options.baseElement]; | ||
for (var idx = 0; idx < baseElements.length; ++idx) { | ||
result += walk(filterBody(handler.dom, options, baseElements[idx]), options); | ||
} | ||
return _s.strip(result); | ||
var result = ''; | ||
var baseElements = Array.isArray(options.baseElement) ? options.baseElement : [options.baseElement]; | ||
for (var idx = 0; idx < baseElements.length; ++idx) { | ||
result += walk(filterBody(handler.dom, options, baseElements[idx]), options); | ||
} | ||
return _s.strip(result); | ||
} | ||
function filterBody(dom, options, baseElement) { | ||
var result = null; | ||
var result = null; | ||
var splitTag = helper.splitCssSearchTag(baseElement); | ||
var splitTag = helper.splitCssSearchTag(baseElement); | ||
function walk(dom) { | ||
if (result) return; | ||
_.each(dom, function(elem) { | ||
if (result) return; | ||
if (elem.name === splitTag.element) { | ||
var documentClasses = elem.attribs && elem.attribs.class ? elem.attribs.class.split(" ") : []; | ||
var documentIds = elem.attribs && elem.attribs.id ? elem.attribs.id.split(" ") : []; | ||
function walk(dom) { | ||
if (result) return; | ||
_.each(dom, function(elem) { | ||
if (result) return; | ||
if (elem.name === splitTag.element) { | ||
var documentClasses = elem.attribs && elem.attribs.class ? elem.attribs.class.split(" ") : []; | ||
var documentIds = elem.attribs && elem.attribs.id ? elem.attribs.id.split(" ") : []; | ||
if ((splitTag.classes.every(function (val) { return documentClasses.indexOf(val) >= 0 })) && | ||
(splitTag.ids.every(function (val) { return documentIds.indexOf(val) >= 0 }))) { | ||
result = [elem]; | ||
return; | ||
} | ||
} | ||
if (elem.children) walk(elem.children); | ||
}); | ||
} | ||
walk(dom); | ||
return options.returnDomByDefault ? result || dom : result; | ||
if ((splitTag.classes.every(function (val) { return documentClasses.indexOf(val) >= 0 })) && | ||
(splitTag.ids.every(function (val) { return documentIds.indexOf(val) >= 0 }))) { | ||
result = [elem]; | ||
return; | ||
} | ||
} | ||
if (elem.children) walk(elem.children); | ||
}); | ||
} | ||
walk(dom); | ||
return options.returnDomByDefault ? result || dom : result; | ||
} | ||
function containsTable(attr, tables) { | ||
if (tables === true) return true; | ||
if (tables === true) return true; | ||
function removePrefix(key) { | ||
return key.substr(1); | ||
} | ||
function checkPrefix(prefix) { | ||
return function(key) { | ||
return _s.startsWith(key, prefix); | ||
}; | ||
} | ||
function filterByPrefix(tables, prefix) { | ||
return _(tables).chain() | ||
.filter(checkPrefix(prefix)) | ||
.map(removePrefix) | ||
.value(); | ||
} | ||
var classes = filterByPrefix(tables, '.'); | ||
var ids = filterByPrefix(tables, '#'); | ||
return attr && (_.include(classes, attr['class']) || _.include(ids, attr['id'])); | ||
function removePrefix(key) { | ||
return key.substr(1); | ||
} | ||
function checkPrefix(prefix) { | ||
return function(key) { | ||
return _s.startsWith(key, prefix); | ||
}; | ||
} | ||
function filterByPrefix(tables, prefix) { | ||
return _(tables).chain() | ||
.filter(checkPrefix(prefix)) | ||
.map(removePrefix) | ||
.value(); | ||
} | ||
var classes = filterByPrefix(tables, '.'); | ||
var ids = filterByPrefix(tables, '#'); | ||
return attr && (_.include(classes, attr['class']) || _.include(ids, attr['id'])); | ||
} | ||
function walk(dom, options, result) { | ||
if (arguments.length < 3) { | ||
result = ''; | ||
} | ||
var whiteSpaceRegex = /\s$/; | ||
_.each(dom, function(elem) { | ||
switch(elem.type) { | ||
case 'tag': | ||
switch(elem.name.toLowerCase()) { | ||
case 'img': | ||
result += format.image(elem, options); | ||
break; | ||
case 'a': | ||
// Inline element needs its leading space to be trimmed if `result` | ||
// currently ends with whitespace | ||
elem.trimLeadingSpace = whiteSpaceRegex.test(result); | ||
result += format.anchor(elem, walk, options); | ||
break; | ||
case 'p': | ||
result += format.paragraph(elem, walk, options); | ||
break; | ||
case 'h1': | ||
case 'h2': | ||
case 'h3': | ||
case 'h4': | ||
case 'h5': | ||
case 'h6': | ||
result += format.heading(elem, walk, options); | ||
break; | ||
case 'br': | ||
result += format.lineBreak(elem, walk, options); | ||
break; | ||
case 'hr': | ||
result += format.horizontalLine(elem, walk, options); | ||
break; | ||
case 'ul': | ||
result += format.unorderedList(elem, walk, options); | ||
break; | ||
case 'ol': | ||
result += format.orderedList(elem, walk, options); | ||
break; | ||
case 'pre': | ||
var newOptions = _(options).clone(); | ||
newOptions.isInPre = true; | ||
result += format.paragraph(elem, walk, newOptions); | ||
break; | ||
case 'table': | ||
if (containsTable(elem.attribs, options.tables)) { | ||
result += format.table(elem, walk, options); | ||
break; | ||
} | ||
default: | ||
result = walk(elem.children || [], options, result); | ||
} | ||
break; | ||
case 'text': | ||
if (elem.data !== '\r\n') { | ||
// Text needs its leading space to be trimmed if `result` | ||
// currently ends with whitespace | ||
elem.trimLeadingSpace = whiteSpaceRegex.test(result); | ||
result += format.text(elem, options); | ||
} | ||
break; | ||
default: | ||
if (!_.include(SKIP_TYPES, elem.type)) { | ||
result = walk(elem.children || [], options, result); | ||
} | ||
} | ||
if (arguments.length < 3) { | ||
result = ''; | ||
} | ||
var whiteSpaceRegex = /\s$/; | ||
_.each(dom, function(elem) { | ||
switch(elem.type) { | ||
case 'tag': | ||
switch(elem.name.toLowerCase()) { | ||
case 'img': | ||
result += format.image(elem, options); | ||
break; | ||
case 'a': | ||
// Inline element needs its leading space to be trimmed if `result` | ||
// currently ends with whitespace | ||
elem.trimLeadingSpace = whiteSpaceRegex.test(result); | ||
result += format.anchor(elem, walk, options); | ||
break; | ||
case 'p': | ||
result += format.paragraph(elem, walk, options); | ||
break; | ||
case 'h1': | ||
case 'h2': | ||
case 'h3': | ||
case 'h4': | ||
case 'h5': | ||
case 'h6': | ||
result += format.heading(elem, walk, options); | ||
break; | ||
case 'br': | ||
result += format.lineBreak(elem, walk, options); | ||
break; | ||
case 'hr': | ||
result += format.horizontalLine(elem, walk, options); | ||
break; | ||
case 'ul': | ||
result += format.unorderedList(elem, walk, options); | ||
break; | ||
case 'ol': | ||
result += format.orderedList(elem, walk, options); | ||
break; | ||
case 'pre': | ||
var newOptions = _(options).clone(); | ||
newOptions.isInPre = true; | ||
result += format.paragraph(elem, walk, newOptions); | ||
break; | ||
case 'table': | ||
result = containsTable(elem.attribs, options.tables) | ||
? result + format.table(elem, walk, options) | ||
: walk(elem.children || [], options, result); | ||
break; | ||
default: | ||
result = walk(elem.children || [], options, result); | ||
} | ||
break; | ||
case 'text': | ||
if (elem.data !== '\r\n') { | ||
// Text needs its leading space to be trimmed if `result` | ||
// currently ends with whitespace | ||
elem.trimLeadingSpace = whiteSpaceRegex.test(result); | ||
result += format.text(elem, options); | ||
} | ||
break; | ||
default: | ||
if (!_.include(SKIP_TYPES, elem.type)) { | ||
result = walk(elem.children || [], options, result); | ||
} | ||
} | ||
options.lineCharCount = result.length - (result.lastIndexOf('\n') + 1); | ||
}); | ||
return result; | ||
options.lineCharCount = result.length - (result.lastIndexOf('\n') + 1); | ||
}); | ||
return result; | ||
} | ||
exports.fromFile = function(file, options, callback) { | ||
if (!callback) { | ||
callback = options; | ||
options = {}; | ||
} | ||
fs.readFile(file, 'utf8', function(err, str) { | ||
var result = htmlToText(str, options); | ||
return callback(null, result); | ||
}); | ||
if (!callback) { | ||
callback = options; | ||
options = {}; | ||
} | ||
fs.readFile(file, 'utf8', function (err, str) { | ||
if (err) return callback(err); | ||
return callback(null, htmlToText(str, options)); | ||
}); | ||
}; | ||
exports.fromString = function(str, options) { | ||
return htmlToText(str, options || {}); | ||
return htmlToText(str, options || {}); | ||
}; |
{ | ||
"name": "html-to-text", | ||
"version": "3.1.0", | ||
"version": "3.2.0", | ||
"description": "Advanced html to plain text converter", | ||
"main": "index.js", | ||
"scripts": { | ||
"test": "node_modules/.bin/mocha", | ||
"example": "node ./example/html-to-text.js" | ||
"test": "istanbul cover _mocha && eslint .", | ||
"example": "node ./example/html-to-text.js", | ||
"lint": "eslint ." | ||
}, | ||
@@ -46,4 +47,6 @@ "author": { | ||
"chai": "^3.5.0", | ||
"eslint": "^3.14.1", | ||
"istanbul": "^0.4.5", | ||
"mocha": "^3.0.0" | ||
} | ||
} |
# html-to-text | ||
[![Build Status](https://travis-ci.org/werk85/node-html-to-text.svg?branch=master)](https://travis-ci.org/werk85/node-html-to-text) | ||
[![Test Coverage](https://codeclimate.com/github/werk85/node-html-to-text/badges/coverage.svg)](https://codeclimate.com/github/werk85/node-html-to-text/coverage) | ||
@@ -5,0 +6,0 @@ An advanced converter that parses HTML and returns beautiful text. It was mainly designed to transform HTML E-Mail templates to a text representation. So it is currently optimized for table layouts. |
@@ -0,1 +1,3 @@ | ||
/* eslint max-len: "off" */ | ||
var expect = require('chai').expect; | ||
@@ -233,2 +235,17 @@ var htmlToText = require('..'); | ||
it('should support the ordered list type="1" attribute', function() { | ||
var testString = '<ol type="1"><li>foo</li><li>bar</li></ol>'; | ||
expect(htmlToText.fromString(testString)).to.equal('1. foo\n 2. bar'); | ||
}); | ||
it('should support the ordered list type="a" attribute', function() { | ||
var testString = '<ol type="a"><li>foo</li><li>bar</li></ol>'; | ||
expect(htmlToText.fromString(testString)).to.equal('a. foo\nb. bar'); | ||
}); | ||
it('should support the ordered list type="A" attribute', function() { | ||
var testString = '<ol type="A"><li>foo</li><li>bar</li></ol>'; | ||
expect(htmlToText.fromString(testString)).to.equal('A. foo\nB. bar'); | ||
}); | ||
it('should support the ordered list start attribute', function() { | ||
@@ -238,2 +255,17 @@ var testString = '<ol start="2"><li>foo</li><li>bar</li></ol>'; | ||
}); | ||
/* | ||
* Currently failing tests for continuing to fill out the specification | ||
* Spec: https://html.spec.whatwg.org/multipage/semantics.html#the-ol-element | ||
* | ||
it('should support the ordered list type="a" attribute past 26 characters', function() { | ||
var testString = '<ol start="26" type="a"><li>foo</li><li>bar</li></ol>'; | ||
expect(htmlToText.fromString(testString)).to.equal('z. foo\naa. bar'); | ||
}); | ||
it('should support the ordered list type="A" attribute past 26 characters', function() { | ||
var testString = '<ol start="26" type="A"><li>foo</li><li>bar</li></ol>'; | ||
expect(htmlToText.fromString(testString)).to.equal('Z. foo\nAA. bar'); | ||
}); | ||
*/ | ||
}); | ||
@@ -291,4 +323,4 @@ | ||
it('should retrieve and convert the entire document under `body` by default', function(done) { | ||
var htmlFile = path.join(__dirname, 'test.html'), | ||
txtFile = path.join(__dirname, 'test.txt'); | ||
var htmlFile = path.join(__dirname, 'test.html'); | ||
var txtFile = path.join(__dirname, 'test.txt'); | ||
@@ -307,4 +339,4 @@ var expectedTxt = fs.readFileSync(txtFile, 'utf8'); | ||
it('should only retrieve and convert content under the specified base element if found', function(done) { | ||
var htmlFile = path.join(__dirname, 'test.html'), | ||
txtFile = path.join(__dirname, 'test-address.txt'); | ||
var htmlFile = path.join(__dirname, 'test.html'); | ||
var txtFile = path.join(__dirname, 'test-address.txt'); | ||
@@ -324,4 +356,4 @@ var expectedTxt = fs.readFileSync(txtFile, 'utf8'); | ||
it('should retrieve and convert content under multiple base elements', function(done) { | ||
var htmlFile = path.join(__dirname, 'test.html'), | ||
txtFile = path.join(__dirname, 'test-address-dup.txt'); | ||
var htmlFile = path.join(__dirname, 'test.html'); | ||
var txtFile = path.join(__dirname, 'test-address-dup.txt'); | ||
@@ -341,4 +373,4 @@ var expectedTxt = fs.readFileSync(txtFile, 'utf8'); | ||
it('should retrieve and convert content under multiple base elements in any order', function(done) { | ||
var htmlFile = path.join(__dirname, 'test.html'), | ||
txtFile = path.join(__dirname, 'test-any-order.txt'); | ||
var htmlFile = path.join(__dirname, 'test.html'); | ||
var txtFile = path.join(__dirname, 'test-any-order.txt'); | ||
@@ -358,4 +390,4 @@ var expectedTxt = fs.readFileSync(txtFile, 'utf8'); | ||
it('should process the first base element found when multiple exist', function(done) { | ||
var htmlFile = path.join(__dirname, 'test.html'), | ||
txtFile = path.join(__dirname, 'test-first-element.txt'); | ||
var htmlFile = path.join(__dirname, 'test.html'); | ||
var txtFile = path.join(__dirname, 'test-first-element.txt'); | ||
@@ -375,4 +407,4 @@ var expectedTxt = fs.readFileSync(txtFile, 'utf8'); | ||
it('should retrieve and convert the entire document by default if no base element is found', function(done) { | ||
var htmlFile = path.join(__dirname, 'test.html'), | ||
txtFile = path.join(__dirname, 'test.txt'); | ||
var htmlFile = path.join(__dirname, 'test.html'); | ||
var txtFile = path.join(__dirname, 'test.txt'); | ||
@@ -379,0 +411,0 @@ var expectedTxt = fs.readFileSync(txtFile, 'utf8'); |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Deprecated
Maintenance: The maintainer of the package marked it as deprecated. This could indicate that a single version should not be used, or that the package is no longer maintained and any new vulnerabilities will not be fixed.
Found 1 instance in 1 package
80994
23
1027
0
310
4