html-to-text
Advanced tools
Comparing version 5.1.1 to 6.0.0
#!/usr/bin/env node | ||
var parseArgs = require('minimist'); | ||
const parseArgs = require('minimist'); | ||
var htmlToText = require('../lib/html-to-text'); | ||
const { htmlToText } = require('../lib/html-to-text'); | ||
var argv = parseArgs(process.argv.slice(2), { | ||
string: [ | ||
'tables' | ||
], | ||
const argv = parseArgs(process.argv.slice(2), { | ||
alias: { | ||
'ignore-href': 'ignoreHref', | ||
'ignore-image': 'ignoreImage' | ||
}, | ||
boolean: [ | ||
@@ -15,9 +17,4 @@ 'noLinkBrackets', | ||
], | ||
alias: { | ||
'ignore-href': 'ignoreHref', | ||
'ignore-image': 'ignoreImage' | ||
}, | ||
default: { | ||
'wordwrap': 80 | ||
} | ||
default: { 'wordwrap': 80 }, | ||
string: [ 'tables' ] | ||
}); | ||
@@ -27,3 +24,3 @@ | ||
var text = ''; | ||
let text = ''; | ||
@@ -34,12 +31,12 @@ process.title = 'html-to-text'; | ||
process.stdin.setEncoding('utf8'); | ||
process.stdin.on('data', function data(data) { | ||
process.stdin.on('data', function (data) { | ||
text += data; | ||
}); | ||
process.stdin.on('end', function end() { | ||
text = htmlToText.fromString(text, argv); | ||
process.stdin.on('end', function () { | ||
text = htmlToText(text, argv); | ||
process.stdout.write(text + '\n', 'utf-8'); | ||
}); | ||
function interpretTables(tables) { | ||
function interpretTables (tables) { | ||
if (!tables || tables === '' || tables === 'false') { | ||
@@ -46,0 +43,0 @@ return []; |
# Changelog | ||
## Version 6.0.0 | ||
This is a major update. No code left untouched. While the goal was to keep as much compatibility as possible, some client-facing changes were unavoidable. | ||
### fromString() is deprecated in favor of htmlToText() | ||
Since the library has only one exported function, it is now self-titled. | ||
### Inline and block-level tags, HTML whitespace | ||
Formatting code was rewritten almost entirely to make it aware of block-level tags and to handle HTML whitespace properly. One of the most popular requests was to support divs, and it is here now, after a lot of effort. | ||
### Options reorganized | ||
Options are reorganized to make room for some extra format options while making everything more structured. Now tag-specific options live within that tag configuration. | ||
For the majority of changed options there is a compatibility layer that will remain until next major release. But you are encouraged to explore new options since they provide a bit more flexibility. | ||
### Custom formatters are different now | ||
Because formatters are an integral part of the formatting code (as the name suggests), it wasn't possible to provide a compatibility layer. | ||
Please refer to the Readme to see how things are wired now, in case you were using them for anything other than dealing with the lack of block-level tags support. | ||
### Tables support was improved | ||
Cells can make use of extra space with colspan and rowspan attributes. Max column width is defined separately from global wordwrap limit. | ||
### Limits | ||
Multiple options to cut content in large HTML documents. | ||
By default, any input longer than 16 million characters will be truncated. | ||
### Node and dependencies | ||
Required Node version is now >=8.10.0. | ||
Dependency versions are bumped. | ||
### Repository is moved to its own organization | ||
[https://github.com/html-to-text/node-html-to-text](https://github.com/html-to-text/node-html-to-text) is the new home. | ||
GitHub should handle all redirects from the old url, so it shouldn't break anything, even if you have a local fork pointing at the old origin. But it is still a good idea to [update](https://docs.github.com/en/free-pro-team@latest/github/using-git/changing-a-remotes-url) the url. | ||
### And more | ||
Version 6 roadmap issue: [#200](https://github.com/html-to-text/node-html-to-text/issues/200) | ||
## Version 5.1.1 | ||
* `preserveNewLines` whitespace issue fixed [#162](https://github.com/werk85/node-html-to-text/pull/162) | ||
* `preserveNewLines` whitespace issue fixed [#162](https://github.com/html-to-text/node-html-to-text/pull/162) | ||
## Version 5.1.0 | ||
* Hard-coded CLI options removed [#173](https://github.com/werk85/node-html-to-text/pull/173) | ||
* Hard-coded CLI options removed [#173](https://github.com/html-to-text/node-html-to-text/pull/173) | ||
@@ -17,3 +67,3 @@ ## Version 5.0.0 | ||
The function `fromFile` is removed. It was the main reason `html-to-text` could not be used in the browser [#164](https://github.com/werk85/node-html-to-text/pull/164). | ||
The function `fromFile` is removed. It was the main reason `html-to-text` could not be used in the browser [#164](https://github.com/html-to-text/node-html-to-text/pull/164). | ||
@@ -46,5 +96,5 @@ You can get the `fromFile` functionality back by using the following code | ||
#### Supported NodeJS Versions | ||
Node versions < 6 are no longer supported. | ||
## Version 4.0.0 | ||
@@ -79,3 +129,3 @@ | ||
* Extra space problem fixed. #88 | ||
* Extra space problem fixed. #88 | ||
@@ -82,0 +132,0 @@ ## Version 2.1.0 |
@@ -1,18 +0,20 @@ | ||
var path = require('path'); | ||
const fs = require('fs'); | ||
const path = require('path'); | ||
var htmlToText = require('../lib/html-to-text'); | ||
const { htmlToText } = require('../lib/html-to-text'); | ||
console.log('fromString:'); | ||
var text = htmlToText.fromString('<h1>Hello World</h1>', { | ||
wordwrap: 130 | ||
}); | ||
console.log('From string:'); | ||
const text = htmlToText( | ||
'<h1>Hello World</h1>', | ||
{ wordwrap: 130 } | ||
); | ||
console.log(text); | ||
console.log(); | ||
console.log('fromFile:'); | ||
htmlToText.fromFile(path.join(__dirname, 'test.html'), { | ||
tables: ['#invoice', '.address'] | ||
}, function(err, text) { | ||
if (err) return console.error(err); | ||
console.log(text); | ||
}); | ||
console.log('From file:'); | ||
const filePath = path.join(__dirname, 'test.html'); | ||
/** @type { Options } */ | ||
const options = { tables: ['#invoice', '.address'] }; | ||
const text2 = htmlToText(fs.readFileSync(filePath, 'utf8'), options); | ||
console.log(text2); |
@@ -1,1 +0,1 @@ | ||
module.exports = require('./lib/html-to-text'); | ||
module.exports = require('./lib/html-to-text'); |
@@ -1,249 +0,363 @@ | ||
var max = require('lodash/max'); | ||
var compact = require('lodash/compact'); | ||
var times = require('lodash/times'); | ||
const he = require('he'); | ||
const get = require('lodash/get'); | ||
// eslint-disable-next-line you-dont-need-lodash-underscore/trim | ||
const trim = require('lodash/trim'); | ||
const trimStart = require('lodash/trimStart'); | ||
var trimStart = require('lodash/trimStart'); | ||
var padEnd = require('lodash/padEnd'); | ||
const { numberToLetterSequence, numberToRoman, splitClassesAndIds } = require('./helper'); | ||
var he = require('he'); | ||
// eslint-disable-next-line import/no-unassigned-import | ||
require('./typedefs'); | ||
var helper = require('./helper'); | ||
function formatText(elem, options) { | ||
var text = elem.data || ""; | ||
text = he.decode(text, options.decodeOptions); | ||
/**
 * Dummy formatter that discards the input and does nothing.
 *
 * None of the arguments are read: the element's children are not walked
 * and the builder is not touched, so the element contributes no output.
 *
 * @type { FormatCallback }
 */
function formatSkip (elem, walk, builder, formatOptions) {
  /* do nothing */
}
if (options.isInPre) { | ||
return text; | ||
} else { | ||
return helper.wordwrap(elem.trimLeadingSpace ? trimStart(text) : text, options); | ||
} | ||
/**
 * Process an inline-level element.
 *
 * Adds nothing of its own to the builder; it only hands the element's
 * children to the recursive `walk` callback.
 *
 * @type { FormatCallback }
 */
function formatInline (elem, walk, builder, formatOptions) {
  walk(elem.children, builder);
}
function formatImage(elem, options) { | ||
if (options.ignoreImage) { | ||
return ''; | ||
} | ||
/**
 * Process a block-level container.
 *
 * Opens a block on the builder, walks the element's children into it and
 * closes the block again. `formatOptions.leadingLineBreaks` and
 * `formatOptions.trailingLineBreaks` are passed to the builder as-is
 * (no default is applied here).
 *
 * @type { FormatCallback }
 */
function formatBlock (elem, walk, builder, formatOptions) {
  const { leadingLineBreaks, trailingLineBreaks } = formatOptions;
  builder.openBlock(leadingLineBreaks);
  walk(elem.children, builder);
  builder.closeBlock(trailingLineBreaks);
}
var result = '', attribs = elem.attribs || {}; | ||
if (attribs.alt) { | ||
result += he.decode(attribs.alt, options.decodeOptions); | ||
if (attribs.src) { | ||
result += ' '; | ||
} | ||
} | ||
if (attribs.src) { | ||
result += '[' + attribs.src + ']'; | ||
} | ||
return (result); | ||
/**
 * Process a line-break.
 *
 * Asks the builder to add a single line break. The element's children
 * are not walked.
 *
 * @type { FormatCallback }
 */
function formatLineBreak (elem, walk, builder, formatOptions) {
  builder.addLineBreak();
}
function formatLineBreak(elem, fn, options) { | ||
return '\n' + fn(elem.children, options); | ||
/**
 * Process a `wbr` tag (word break opportunity).
 *
 * Registers a word break opportunity with the builder. The element's
 * children are not walked.
 *
 * @type { FormatCallback }
 */
function formatWbr (elem, walk, builder, formatOptions) {
  builder.addWordBreakOpportunity();
}
function formatParagraph(elem, fn, options) { | ||
var paragraph = fn(elem.children, options); | ||
if (options.singleNewLineParagraphs) { | ||
return paragraph + '\n'; | ||
/**
 * Process a horizontal line.
 *
 * Emits a block containing a run of `-` characters. The run length is the
 * first truthy value of `formatOptions.length`, `builder.options.wordwrap`
 * and `40`. Leading and trailing line-break counts fall back to `2` when
 * the corresponding option is falsy (note that this includes `0`).
 *
 * @type { FormatCallback }
 */
function formatHorizontalLine (elem, walk, builder, formatOptions) {
  builder.openBlock(formatOptions.leadingLineBreaks || 2);
  const lineLength = formatOptions.length || builder.options.wordwrap || 40;
  builder.addInline('-'.repeat(lineLength));
  builder.closeBlock(formatOptions.trailingLineBreaks || 2);
}
/**
 * Process a paragraph.
 *
 * Opens a block, walks the element's children into it and closes the
 * block. Leading and trailing line-break counts come from `formatOptions`
 * and fall back to `2` when the option is falsy (note that this includes `0`).
 *
 * @type { FormatCallback }
 */
function formatParagraph (elem, walk, builder, formatOptions) {
  const leading = formatOptions.leadingLineBreaks || 2;
  const trailing = formatOptions.trailingLineBreaks || 2;
  builder.openBlock(leading);
  walk(elem.children, builder);
  builder.closeBlock(trailing);
}
/**
 * Process a preformatted content.
 *
 * Same shape as a paragraph (open block, walk children, close block, with
 * line-break counts falling back to `2` when falsy), except that the block
 * is opened with two extra arguments: `0` and `true`.
 * NOTE(review): these presumably mean "no reserved line length" and
 * "preserve whitespace" - confirm against the builder's `openBlock`.
 *
 * @type { FormatCallback }
 */
function formatPre (elem, walk, builder, formatOptions) {
  const leading = formatOptions.leadingLineBreaks || 2;
  const trailing = formatOptions.trailingLineBreaks || 2;
  builder.openBlock(leading, 0, true);
  walk(elem.children, builder);
  builder.closeBlock(trailing);
}
/** | ||
* Process a heading. | ||
* | ||
* @type { FormatCallback } | ||
*/ | ||
function formatHeading (elem, walk, builder, formatOptions) { | ||
builder.openBlock(formatOptions.leadingLineBreaks || 2); | ||
if (formatOptions.uppercase !== false) { | ||
builder.pushWordTransform(str => str.toUpperCase()); | ||
walk(elem.children, builder); | ||
builder.popWordTransform(); | ||
} else { | ||
return paragraph + '\n\n'; | ||
walk(elem.children, builder); | ||
} | ||
builder.closeBlock(formatOptions.trailingLineBreaks || 2); | ||
} | ||
function formatHeading(elem, fn, options) { | ||
var heading = fn(elem.children, options); | ||
if (options.uppercaseHeadings) { | ||
heading = heading.toUpperCase(); | ||
} | ||
return heading + '\n'; | ||
/**
 * Process a blockquote.
 *
 * Walks the children into a block and, when the block is closed, prefixes
 * every line of the block text with `> `. Unless
 * `formatOptions.trimEmptyLines` is exactly `false`, leading and trailing
 * newline characters are trimmed from the block text before prefixing.
 * Line-break counts fall back to `2` when the option is falsy.
 *
 * @type { FormatCallback }
 */
function formatBlockquote (elem, walk, builder, formatOptions) {
  // Applied by the builder to the finished block text.
  const quoteLines = (str) => {
    const content = (formatOptions.trimEmptyLines !== false) ? trim(str, '\n') : str;
    return content
      .split('\n')
      .map(line => '> ' + line)
      .join('\n');
  };

  // The second argument (2) equals the length of the '> ' prefix.
  // NOTE(review): presumably reserves that width for wrapping - confirm against the builder.
  builder.openBlock(formatOptions.leadingLineBreaks || 2, 2);
  walk(elem.children, builder);
  builder.closeBlock(formatOptions.trailingLineBreaks || 2, quoteLines);
}
// If we have both href and anchor text, format it in a useful manner: | ||
// - "anchor text [href]" | ||
// Otherwise if we have only anchor text or an href, we return the part we have: | ||
// - "anchor text" or | ||
// - "href" | ||
function formatAnchor(elem, fn, options) { | ||
var href = ''; | ||
// Always get the anchor text | ||
var storedCharCount = options.lineCharCount; | ||
var text = fn(elem.children || [], options); | ||
if (!text) { | ||
text = ''; | ||
} | ||
/** | ||
* Process an image. | ||
* | ||
* @type { FormatCallback } | ||
*/ | ||
function formatImage (elem, walk, builder, formatOptions) { | ||
const attribs = elem.attribs || {}; | ||
const alt = (attribs.alt) | ||
? he.decode(attribs.alt, builder.options.decodeOptions) | ||
: ''; | ||
const src = (!attribs.src) | ||
? '' | ||
: (formatOptions.baseUrl && attribs.src.indexOf('/') === 0) | ||
? formatOptions.baseUrl + attribs.src | ||
: attribs.src; | ||
const text = (!src) | ||
? alt | ||
: (!alt) | ||
? '[' + src + ']' | ||
: alt + ' [' + src + ']'; | ||
var result = elem.trimLeadingSpace ? trimStart(text) : text; | ||
builder.addInline(text); | ||
} | ||
if (!options.ignoreHref) { | ||
// Get the href, if present | ||
if (elem.attribs && elem.attribs.href) { | ||
href = elem.attribs.href.replace(/^mailto:/, ''); | ||
} | ||
if (href) { | ||
if ((!options.noAnchorUrl) || (options.noAnchorUrl && href[0] !== '#')) { | ||
if (options.linkHrefBaseUrl && href.indexOf('/') === 0) { | ||
href = options.linkHrefBaseUrl + href; | ||
} | ||
if (!options.hideLinkHrefIfSameAsText || href !== helper.replaceAll(result, '\n', '')) { | ||
if (!options.noLinkBrackets) { | ||
result += ' [' + href + ']'; | ||
} else { | ||
result += ' ' + href; | ||
} | ||
} | ||
/**
 * Process an anchor.
 *
 * The anchor's children are always walked. If a usable href is found it is
 * appended after the text as ` [href]` (or ` href` with
 * `formatOptions.noLinkBrackets`), or emitted alone when the anchor
 * produced no text. With `formatOptions.hideLinkHrefIfSameAsText` the href
 * is omitted when it equals the collected text.
 *
 * @type { FormatCallback }
 */
function formatAnchor (elem, walk, builder, formatOptions) {
  /**
   * Resolve the href to print, or '' when nothing should be printed:
   * `ignoreHref` is set, there is no href attribute, or `noAnchorUrl` is
   * set and the href starts with '#'.
   * A leading `mailto:` is stripped, `baseUrl` is prepended to hrefs that
   * start with '/', and the result is entity-decoded.
   */
  function getHref () {
    if (formatOptions.ignoreHref) { return ''; }
    if (!elem.attribs || !elem.attribs.href) { return ''; }
    let href = elem.attribs.href.replace(/^mailto:/, '');
    if (formatOptions.noAnchorUrl && href[0] === '#') { return ''; }
    if (formatOptions.baseUrl && href[0] === '/') {
      href = formatOptions.baseUrl + href;
    }
    return he.decode(href, builder.options.decodeOptions);
  }

  const href = getHref();
  if (!href) {
    walk(elem.children, builder);
    return;
  }

  // Collect the words produced by the children (passed through unchanged)
  // so they can be compared with the href afterwards. Words are
  // concatenated as received, with nothing inserted between them.
  let text = '';
  builder.pushWordTransform((str) => {
    if (str) { text += str; }
    return str;
  });
  walk(elem.children, builder);
  builder.popWordTransform();

  if (formatOptions.hideLinkHrefIfSameAsText && href === text) { return; }

  let suffix;
  if (!text) {
    suffix = href;
  } else if (formatOptions.noLinkBrackets) {
    suffix = ' ' + href;
  } else {
    suffix = ' [' + href + ']';
  }
  builder.addInline(suffix, true);
}
options.lineCharCount = storedCharCount; | ||
/** | ||
* @param { DomNode } elem List items with their prefixes. | ||
* @param { RecursiveCallback } walk Recursive callback to process child nodes. | ||
* @param { BlockTextBuilder } builder Passed around to accumulate output text. | ||
* @param { FormatOptions } formatOptions Options specific to a formatter. | ||
* @param { () => string } nextPrefixCallback Function that returns increasing index each time it is called. | ||
*/ | ||
function formatList (elem, walk, builder, formatOptions, nextPrefixCallback) { | ||
const isNestedList = get(elem, 'parent.name') === 'li'; | ||
return formatText({ data: result || href, trimLeadingSpace: elem.trimLeadingSpace }, options); | ||
// With Roman numbers, index length is not as straightforward as with Arabic numbers or letters, | ||
// so the dumb length comparison is the most robust way to get the correct value. | ||
let maxPrefixLength = 0; | ||
const listItems = (elem.children || []) | ||
// it might be more accurate to check only for html spaces here, but no significant benefit | ||
.filter(child => child.type !== 'text' || !/^\s*$/.test(child.data)) | ||
.map(function (child) { | ||
if (child.name !== 'li') { | ||
return { node: child, prefix: '' }; | ||
} | ||
const prefix = (isNestedList) | ||
? trimStart(nextPrefixCallback()) | ||
: nextPrefixCallback(); | ||
if (prefix.length > maxPrefixLength) { maxPrefixLength = prefix.length; } | ||
return { node: child, prefix: prefix }; | ||
}); | ||
if (!listItems.length) { return; } | ||
const reservedWidth = maxPrefixLength; | ||
const spacing = '\n' + ' '.repeat(reservedWidth); | ||
builder.openBlock(isNestedList ? 1 : (formatOptions.leadingLineBreaks || 2)); | ||
for (const { node, prefix } of listItems) { | ||
builder.openBlock(1, reservedWidth); | ||
walk([node], builder); | ||
builder.closeBlock( | ||
1, | ||
str => prefix + ' '.repeat(reservedWidth - prefix.length) + str.replace(/\n/g, spacing) | ||
); | ||
} | ||
builder.closeBlock(isNestedList ? 1 : (formatOptions.trailingLineBreaks || 2)); | ||
} | ||
function formatHorizontalLine(elem, fn, options) { | ||
return '\n' + '-'.repeat(options.wordwrap) + '\n\n'; | ||
/**
 * Process an unordered list.
 *
 * Delegates to `formatList` with a prefix callback that returns the same
 * marker for every item: `formatOptions.itemPrefix`, or `' * '` when that
 * option is falsy.
 *
 * @type { FormatCallback }
 */
function formatUnorderedList (elem, walk, builder, formatOptions) {
  const itemPrefix = formatOptions.itemPrefix || ' * ';
  const nextPrefixCallback = () => itemPrefix;
  return formatList(elem, walk, builder, formatOptions, nextPrefixCallback);
}
function formatListItem(prefix, elem, fn, options) { | ||
options = Object.assign({}, options); | ||
// Reduce the wordwrap for sub elements. | ||
if (options.wordwrap) { | ||
options.wordwrap -= prefix.length; | ||
/**
 * Process an ordered list.
 *
 * Delegates to `formatList` with a prefix callback that yields
 * `' <index>. '` for consecutive indices. Numbering starts at the list's
 * `start` attribute and is rendered according to its `type` attribute
 * (see `getOrderedListIndexFunction`).
 *
 * A missing, empty or non-integer `start` falls back to 1 instead of
 * producing `NaN` prefixes, and a node without `attribs` is tolerated
 * (as in the image and anchor formatters).
 *
 * @type { FormatCallback }
 */
function formatOrderedList (elem, walk, builder, formatOptions) {
  const attribs = elem.attribs || {};
  const parsedStart = Number(attribs.start || '1');
  let nextIndex = Number.isInteger(parsedStart) ? parsedStart : 1;
  const indexFunction = getOrderedListIndexFunction(attribs.type);
  const nextPrefixCallback = () => ' ' + indexFunction(nextIndex++) + '. ';
  return formatList(elem, walk, builder, formatOptions, nextPrefixCallback);
}
/** | ||
* Return a function that can be used to generate index markers of a specified format. | ||
* | ||
* @param { string } [olType='1'] Marker type. | ||
* @returns { (i: number) => string } | ||
*/ | ||
function getOrderedListIndexFunction (olType = '1') { | ||
switch (olType) { | ||
case 'a': return (i) => numberToLetterSequence(i, 'a'); | ||
case 'A': return (i) => numberToLetterSequence(i, 'A'); | ||
case 'i': return (i) => numberToRoman(i).toLowerCase(); | ||
case 'I': return (i) => numberToRoman(i); | ||
case '1': | ||
default: return (i) => (i).toString(); | ||
} | ||
// Process sub elements. | ||
var text = fn(elem.children, options); | ||
// Replace all line breaks with line break + prefix spacing. | ||
text = text.replace(/\n/g, '\n' + ' '.repeat(prefix.length)); | ||
// Add first prefix and line break at the end. | ||
return prefix + text + '\n'; | ||
} | ||
var whiteSpaceRegex = /^\s*$/; | ||
function isDataTable (attr, tables) { | ||
if (tables === true) { return true; } | ||
if (!attr) { return false; } | ||
function formatUnorderedList(elem, fn, options) { | ||
var result = ''; | ||
var prefix = options.unorderedListItemPrefix; | ||
var nonWhiteSpaceChildren = (elem.children || []).filter(function(child) { | ||
return child.type !== 'text' || !whiteSpaceRegex.test(child.data); | ||
}); | ||
nonWhiteSpaceChildren.forEach(function(elem) { | ||
result += formatListItem(prefix, elem, fn, options); | ||
}); | ||
return result + '\n'; | ||
const { classes, ids } = splitClassesAndIds(tables); | ||
const attrClasses = (attr['class'] || '').split(' '); | ||
const attrIds = (attr['id'] || '').split(' '); | ||
return attrClasses.some(x => classes.includes(x)) || attrIds.some(x => ids.includes(x)); | ||
} | ||
function formatOrderedList(elem, fn, options) { | ||
var result = ''; | ||
var nonWhiteSpaceChildren = (elem.children || []).filter(function(child) { | ||
return child.type !== 'text' || !whiteSpaceRegex.test(child.data); | ||
}); | ||
// Return different functions for different OL types | ||
var typeFunction = (function() { | ||
// Determine type | ||
var olType = elem.attribs.type || '1'; | ||
// TODO Imeplement the other valid types | ||
// Fallback to type '1' function for other valid types | ||
switch(olType) { | ||
case 'a': return function(start, i) { return String.fromCharCode(i + start + 97);}; | ||
case 'A': return function(start, i) { return String.fromCharCode(i + start + 65);}; | ||
case '1': | ||
default: return function(start, i) { return i + 1 + start;}; | ||
} | ||
}()); | ||
// Make sure there are list items present | ||
if (nonWhiteSpaceChildren.length) { | ||
// Calculate initial start from ol attribute | ||
var start = Number(elem.attribs.start || '1') - 1; | ||
// Calculate the maximum length to i. | ||
var maxLength = (nonWhiteSpaceChildren.length + start).toString().length; | ||
nonWhiteSpaceChildren.forEach(function(elem, i) { | ||
// Use different function depending on type | ||
var index = typeFunction(start, i); | ||
// Calculate the needed spacing for nice indentation. | ||
var spacing = maxLength - index.toString().length; | ||
var prefix = ' ' + index + '. ' + ' '.repeat(spacing); | ||
result += formatListItem(prefix, elem, fn, options); | ||
}); | ||
} | ||
return result + '\n'; | ||
/**
 * Process a table (either as a container or as a data table, depending on options).
 *
 * `isDataTable` decides, from the element's attributes and
 * `builder.options.tables`, whether the table is rendered with
 * `formatDataTable`; otherwise it is treated like any other block via
 * `formatBlock`.
 *
 * @type { FormatCallback }
 */
function formatTable (elem, walk, builder, formatOptions) {
  if (isDataTable(elem.attribs, builder.options.tables)) {
    return formatDataTable(elem, walk, builder, formatOptions);
  }
  return formatBlock(elem, walk, builder, formatOptions);
}
function tableToString(table) { | ||
// Determine space width per column | ||
// Convert all rows to lengths | ||
var widths = table.map(function(row) { | ||
return row.map(function(col) { | ||
return col.length; | ||
}); | ||
}); | ||
// Invert rows with colums | ||
widths = helper.arrayZip(widths); | ||
// Determine the max values for each column | ||
widths = widths.map(function(col) { | ||
return max(col); | ||
}); | ||
/** | ||
* Process a data table. | ||
* | ||
* @type { FormatCallback } | ||
*/ | ||
function formatDataTable (elem, walk, builder, formatOptions) { | ||
builder.openTable(); | ||
elem.children.forEach(walkTable); | ||
builder.closeTable( | ||
formatOptions.colSpacing, | ||
formatOptions.rowSpacing, | ||
formatOptions.leadingLineBreaks, | ||
formatOptions.trailingLineBreaks | ||
); | ||
// Build the table | ||
var text = ''; | ||
table.forEach(function(row) { | ||
var i = 0; | ||
row.forEach(function(col) { | ||
text += padEnd(col.trim(), widths[i++], ' ') + ' '; | ||
}); | ||
text += '\n'; | ||
}); | ||
return text + '\n'; | ||
} | ||
function formatCell (cellNode) { | ||
const colspan = +get(cellNode, 'attribs.colspan') || 1; | ||
const rowspan = +get(cellNode, 'attribs.rowspan') || 1; | ||
builder.openTableCell(formatOptions.maxColumnWidth); | ||
walk(cellNode.children, builder); | ||
builder.closeTableCell(colspan, rowspan); | ||
} | ||
function formatTable(elem, fn, options) { | ||
var table = []; | ||
elem.children.forEach(tryParseRows); | ||
return tableToString(table); | ||
function walkTable (elem) { | ||
if (elem.type !== 'tag') { return; } | ||
function tryParseRows(elem) { | ||
if (elem.type !== 'tag') { | ||
return; | ||
} | ||
switch (elem.name.toLowerCase()) { | ||
case "thead": | ||
case "tbody": | ||
case "tfoot": | ||
case "center": | ||
elem.children.forEach(tryParseRows); | ||
const formatHeaderCell = (formatOptions.uppercaseHeaderCells) | ||
? (cellNode) => { | ||
builder.pushWordTransform(str => str.toUpperCase()); | ||
formatCell(cellNode); | ||
builder.popWordTransform(); | ||
} | ||
: formatCell; | ||
switch (elem.name) { | ||
case 'thead': | ||
case 'tbody': | ||
case 'tfoot': | ||
case 'center': | ||
elem.children.forEach(walkTable); | ||
return; | ||
case 'tr': | ||
var rows = []; | ||
elem.children.forEach(function(elem) { | ||
var tokens, count; | ||
if (elem.type === 'tag') { | ||
switch (elem.name.toLowerCase()) { | ||
case 'th': | ||
tokens = formatHeading(elem, fn, options).split('\n'); | ||
rows.push(compact(tokens)); | ||
break; | ||
case 'td': | ||
tokens = fn(elem.children, options).split('\n'); | ||
rows.push(compact(tokens)); | ||
// Fill colspans with empty values | ||
if (elem.attribs && elem.attribs.colspan) { | ||
count = elem.attribs.colspan - 1 || 0; | ||
times(count, function() { | ||
rows.push(['']); | ||
}); | ||
} | ||
break; | ||
case 'tr': { | ||
builder.openTableRow(); | ||
for (const childOfTr of elem.children) { | ||
if (childOfTr.type !== 'tag') { continue; } | ||
switch (childOfTr.name) { | ||
case 'th': { | ||
formatHeaderCell(childOfTr); | ||
break; | ||
} | ||
case 'td': { | ||
formatCell(childOfTr); | ||
break; | ||
} | ||
default: | ||
// do nothing | ||
} | ||
}); | ||
rows = helper.arrayZip(rows); | ||
rows.forEach(function(row) { | ||
row = row.map(function(col) { | ||
return col || ''; | ||
}); | ||
table.push(row); | ||
}); | ||
} | ||
builder.closeTableRow(); | ||
break; | ||
} | ||
default: | ||
// do nothing | ||
} | ||
@@ -253,17 +367,19 @@ } | ||
function formatBlockquote(elem, fn, options) { | ||
return '> ' + fn(elem.children, options) + '\n'; | ||
} | ||
exports.text = formatText; | ||
exports.image = formatImage; | ||
exports.lineBreak = formatLineBreak; | ||
exports.paragraph = formatParagraph; | ||
exports.anchor = formatAnchor; | ||
exports.heading = formatHeading; | ||
exports.table = formatTable; | ||
exports.orderedList = formatOrderedList; | ||
exports.unorderedList = formatUnorderedList; | ||
exports.listItem = formatListItem; | ||
exports.horizontalLine = formatHorizontalLine; | ||
exports.blockquote = formatBlockquote; | ||
module.exports = { | ||
anchor: formatAnchor, | ||
block: formatBlock, | ||
blockquote: formatBlockquote, | ||
dataTable: formatDataTable, | ||
heading: formatHeading, | ||
horizontalLine: formatHorizontalLine, | ||
image: formatImage, | ||
inline: formatInline, | ||
lineBreak: formatLineBreak, | ||
orderedList: formatOrderedList, | ||
paragraph: formatParagraph, | ||
pre: formatPre, | ||
skip: formatSkip, | ||
table: formatTable, | ||
unorderedList: formatUnorderedList, | ||
wbr: formatWbr | ||
}; |
@@ -1,139 +0,218 @@ | ||
var zip = require('lodash/zip'); | ||
var trimEnd = require('lodash/trimEnd'); | ||
// Split a long word up to fit within the word wrap limit. Use either a | ||
// character to split looking back from the word wrap limit, or | ||
// truncate to the word wrap limit. | ||
function splitLongWord(word, options) { | ||
var wrapCharacters = options.longWordSplit.wrapCharacters || []; | ||
var forceWrapOnLimit = options.longWordSplit.forceWrapOnLimit || false; | ||
var max = options.wordwrap; | ||
/** | ||
* Split given tag selector into its components. | ||
* Only element name, class names and ID names are supported. | ||
* | ||
* @param { string } selector Tag selector ("tag.class#id" etc). | ||
* @returns { { classes: string[], element: string, ids: string[] } } | ||
*/ | ||
function splitSelector (selector) { | ||
function getParams (re, string) { | ||
const captures = []; | ||
let found; | ||
while ((found = re.exec(string)) !== null) { | ||
captures.push(found[1]); | ||
} | ||
return captures; | ||
} | ||
var fuseWord = []; | ||
var idx = 0; | ||
while (word.length > max) { | ||
var firstLine = word.substr(0, max); | ||
var remainingChars = word.substr(max); | ||
return { | ||
classes: getParams(/\.([\d\w-]*)/g, selector), | ||
element: /(^\w*)/g.exec(selector)[1], | ||
ids: getParams(/#([\d\w-]*)/g, selector) | ||
}; | ||
} | ||
var splitIndex = firstLine.lastIndexOf(wrapCharacters[idx]); | ||
if (splitIndex > -1) { | ||
// We've found a character to split on, store before the split then check if we | ||
// need to split again | ||
word = firstLine.substr(splitIndex + 1) + remainingChars; | ||
fuseWord.push(firstLine.substr(0, splitIndex + 1)); | ||
} else { | ||
idx++; | ||
if (idx >= wrapCharacters.length) { | ||
// Cannot split on character, so either split at 'max' or preserve length | ||
if (forceWrapOnLimit) { | ||
fuseWord.push(firstLine); | ||
word = remainingChars; | ||
if (word.length > max) { | ||
continue; | ||
} | ||
} else { | ||
word = firstLine + remainingChars; | ||
if (!options.preserveNewlines) { | ||
word += '\n'; | ||
} | ||
} | ||
break; | ||
} else { | ||
word = firstLine + remainingChars; | ||
} | ||
/** | ||
* Given a list of class and ID selectors (prefixed with '.' and '#'), | ||
* return them as separate lists of names without prefixes. | ||
* | ||
* @param { string[] } selectors Class and ID selectors (`[".class", "#id"]` etc). | ||
* @returns { { classes: string[], ids: string[] } } | ||
*/ | ||
function splitClassesAndIds (selectors) { | ||
const classes = []; | ||
const ids = []; | ||
for (const selector of selectors) { | ||
if (selector.startsWith('.')) { | ||
classes.push(selector.substring(1)); | ||
} else if (selector.startsWith('#')) { | ||
ids.push(selector.substring(1)); | ||
} | ||
} | ||
fuseWord.push(word); | ||
return { classes: classes, ids: ids }; | ||
} | ||
return fuseWord.join('\n'); | ||
/**
 * Make a recursive function that will only run to a given depth
 * and switches to an alternative function at that depth.
 * No limitation if `n` is `undefined` (Just wraps `f` in that case).
 *
 * The returned function calls `f(recurse, ...args)`, where `recurse` is
 * the function to use for the next level. Each level gets a `recurse`
 * built with `n - 1`; once `n` drops below zero, `recurse` is `g` itself.
 * With `n = 0`, `f` therefore runs once and any recursive call goes to `g`.
 *
 * @param { number | undefined } n Allowed depth of recursion. `undefined` for no limitation.
 * @param { Function } f Function that accepts recursive callback as the first argument.
 * @param { Function } [g] Function to run instead, when maximum depth was reached. Do nothing by default.
 * @returns { Function }
 */
function limitedDepthRecursive (n, f, g = () => undefined) {
  if (n === undefined) {
    // Unlimited: the wrapper passes itself as the recursive callback.
    const unlimited = function (...args) { return f(unlimited, ...args); };
    return unlimited;
  }
  if (n >= 0) {
    // The next level is built lazily, on each call.
    return function (...args) { return f(limitedDepthRecursive(n - 1, f, g), ...args); };
  }
  return g;
}
exports.wordwrap = function wordwrap(text, options) { | ||
var max = options.wordwrap; | ||
var preserveNewlines = options.preserveNewlines; | ||
var length = options.lineCharCount; | ||
/**
 * Convert a number into alphabetic sequence representation (Sequence without zeroes).
 *
 * For example: `a, ..., z, aa, ..., zz, aaa, ...`.
 *
 * @param { number } num Number to convert. Must be >= 1.
 * @param { string } [baseChar = 'a'] Character for 1 in the sequence.
 * @param { number } [base = 26] Number of characters in the sequence.
 * @returns { string }
 */
function numberToLetterSequence (num, baseChar = 'a', base = 26) {
  const digits = [];
  let rest = num;
  do {
    // There is no zero digit in this sequence, so shift by one before taking each digit.
    rest -= 1;
    digits.push(rest % base);
    // Math.floor rather than `>> 0`: the shift truncates to a 32-bit integer
    // and gives wrong results for quotients of 2^31 and above.
    rest = Math.floor(rest / base);
  } while (rest > 0);
  const baseCode = baseChar.charCodeAt(0);
  // Digits were collected least-significant first.
  return digits
    .reverse()
    .map(n => String.fromCharCode(baseCode + n))
    .join('');
}
// Preserve leading space | ||
var result = text.startsWith(' ') ? ' ' : ''; | ||
length += result.length; | ||
var buffer = []; | ||
// Split the text into words, decide to preserve new lines or not. | ||
var words = preserveNewlines | ||
? text.trim().replace(/\n/g, '\n ').split(/\ +/) | ||
: text.trim().split(/\s+/); | ||
const I = ['I', 'X', 'C', 'M']; | ||
const V = ['V', 'L', 'D']; | ||
// Determine where to end line word by word. | ||
words.forEach(function(word) { | ||
// Add buffer to result if we can't fit any more words in the buffer. | ||
if ((max || max === 0) && length > 0 && ((length + word.length > max) || (length + word.indexOf('\n') > max))) { | ||
// Concat buffer and add it to the result | ||
result += buffer.join(' ') + '\n'; | ||
// Reset buffer and length | ||
buffer.length = length = 0; | ||
} | ||
/**
 * Convert a number to its Roman representation. No large numbers extension.
 *
 * Works digit by digit on the decimal representation, using the
 * module-level `I` and `V` symbol tables indexed by decimal position
 * (ones, tens, hundreds, thousands).
 *
 * @param { number } num Number to convert. `0 < num <= 3999`.
 * @returns { string }
 */
function numberToRoman (num) {
  return [...(num) + '']
    .map(n => +n)
    .reverse() // least significant digit first, so the index is the decimal position
    .map((v, i) => ((v % 5 < 4)
      // 0-3 and 5-8: optional "five" symbol followed by up to three "one" symbols
      ? (v < 5 ? '' : V[i]) + I[i].repeat(v % 5)
      // 4 and 9: subtractive form - "one" symbol before the "five" or the next "one" symbol
      : I[i] + (v < 5 ? V[i] : I[i + 1])))
    .reverse()
    .join('');
}
// Check if the current word is long enough to be wrapped | ||
if ((max || max === 0) && (options.longWordSplit) && (word.length > max)) { | ||
word = splitLongWord(word, options); | ||
} | ||
buffer.push(word); | ||
/**
 * Return row `j` of the matrix, creating an empty row there first
 * when the slot holds nothing truthy.
 *
 * @param { any[][] } matrix  Array of rows; may be sparse.
 * @param { number } j  Row index.
 * @returns { any[] }  The row stored at `matrix[j]`.
 */
function getRow (matrix, j) {
  const existing = matrix[j];
  if (existing) { return existing; }
  const created = [];
  matrix[j] = created;
  return created;
}
// If the word contains a newline then restart the count and add the buffer to the result | ||
if (word.indexOf('\n') !== -1) { | ||
result += buffer.join(' '); | ||
/**
 * Find the first index at or after `x` whose slot in `row` is falsy.
 *
 * @param { any[] } row  Row to scan.
 * @param { number } [x = 0]  Index to start scanning from.
 * @returns { number }  Index of the first vacant slot.
 */
function findFirstVacantIndex (row, x = 0) {
  let index = x;
  for (; row[index]; index++) { /* occupied slot - keep going */ }
  return index;
}
// Reset the buffer, let the length include any characters after the last newline | ||
buffer.length = 0; | ||
length = word.length - (word.lastIndexOf('\n') + 1); | ||
// If there are characters after the newline, add a space and increase the length by 1 | ||
if (length) { | ||
result += ' '; | ||
length++; | ||
} | ||
} else { | ||
// Add word length + one whitespace | ||
length += word.length + 1; | ||
function transposeInPlace (matrix, maxSize) { | ||
for (let i = 0; i < maxSize; i++) { | ||
const rowI = getRow(matrix, i); | ||
for (let j = 0; j < i; j++) { | ||
const rowJ = getRow(matrix, j); | ||
const temp = rowI[j]; | ||
rowI[j] = rowJ[i]; | ||
rowJ[i] = temp; | ||
} | ||
}); | ||
// Add the rest to the result. | ||
result += buffer.join(' '); | ||
} | ||
} | ||
// Preserve trailing space | ||
if (!text.endsWith(' ')) { | ||
result = trimEnd(result); | ||
} else if (!result.endsWith(' ')) { | ||
result = result + ' '; | ||
/**
 * Write the same cell object into every layout slot it covers:
 * `cell.rowspan` rows starting at `baseRow`, `cell.colspan` columns
 * starting at `baseCol`. Missing layout rows are created via `getRow`.
 *
 * @param { { colspan: number, rowspan: number } } cell  Cell to place.
 * @param { any[][] } layout  Layout matrix, modified in place.
 * @param { number } baseRow  Row index of the cell's top-left corner.
 * @param { number } baseCol  Column index of the cell's top-left corner.
 */
function putCellIntoLayout (cell, layout, baseRow, baseCol) {
  let rowOffset = 0;
  while (rowOffset < cell.rowspan) {
    const targetRow = getRow(layout, baseRow + rowOffset);
    let colOffset = 0;
    while (colOffset < cell.colspan) {
      targetRow[baseCol + colOffset] = cell;
      colOffset++;
    }
    rowOffset++;
  }
}
return result; | ||
}; | ||
/**
 * Raise the offset stored at `base + span` so that it is at least
 * `offsets[base] + value`. A missing (falsy) stored offset counts as 0.
 *
 * @param { number[] } offsets  Offsets array, modified in place.
 * @param { number } base  Index of the offset the new value is measured from.
 * @param { number } span  Distance from `base` to the offset being updated.
 * @param { number } value  Size to add to `offsets[base]`.
 */
function updateOffset (offsets, base, span, value) {
  const target = base + span;
  const current = offsets[target] || 0;
  const candidate = offsets[base] + value;
  offsets[target] = Math.max(current, candidate);
}
exports.arrayZip = function arrayZip(array) { | ||
return zip.apply(null, array); | ||
}; | ||
/** | ||
* Render a table into string. | ||
* Cells can contain multiline text and span across multiple rows and columns. | ||
* | ||
* Modifies cells to add lines array. | ||
* | ||
* @param { { colspan: number, rowspan: number, text: string }[][] } tableRows Table to render. | ||
* @param { number } rowSpacing Number of spaces between columns. | ||
* @param { number } colSpacing Number of empty lines between rows. | ||
* @returns { string } | ||
*/ | ||
function tableToString (tableRows, rowSpacing, colSpacing) { | ||
const layout = []; | ||
let colNumber = 0; | ||
const rowNumber = tableRows.length; | ||
const rowOffsets = [0]; | ||
// Fill the layout table and row offsets row-by-row. | ||
for (let j = 0; j < rowNumber; j++) { | ||
const layoutRow = getRow(layout, j); | ||
const cells = tableRows[j]; | ||
let x = 0; | ||
for (let i = 0; i < cells.length; i++) { | ||
const cell = cells[i]; | ||
x = findFirstVacantIndex(layoutRow, x); | ||
putCellIntoLayout(cell, layout, j, x); | ||
x += cell.colspan; | ||
cell.lines = cell.text.split('\n'); | ||
const cellHeight = cell.lines.length; | ||
updateOffset(rowOffsets, j, cell.rowspan, cellHeight + rowSpacing); | ||
} | ||
colNumber = (layoutRow.length > colNumber) ? layoutRow.length : colNumber; | ||
} | ||
exports.splitCssSearchTag = function splitCssSearchTag(tagString) { | ||
function getParams(re, string) { | ||
var captures = [], found; | ||
while ((found = re.exec(string)) !== null) { | ||
captures.push(found[1]); | ||
transposeInPlace(layout, (rowNumber > colNumber) ? rowNumber : colNumber); | ||
const outputLines = []; | ||
const colOffsets = [0]; | ||
// Fill column offsets and output lines column-by-column. | ||
for (let x = 0; x < colNumber; x++) { | ||
let y = 0; | ||
let cell; | ||
while (y < rowNumber && (cell = layout[x][y])) { | ||
if (!cell.rendered) { | ||
let cellWidth = 0; | ||
for (let j = 0; j < cell.lines.length; j++) { | ||
const line = cell.lines[j]; | ||
const lineOffset = rowOffsets[y] + j; | ||
outputLines[lineOffset] = (outputLines[lineOffset] || '').padEnd(colOffsets[x]) + line; | ||
cellWidth = (line.length > cellWidth) ? line.length : cellWidth; | ||
} | ||
updateOffset(colOffsets, x, cell.colspan, cellWidth + colSpacing); | ||
cell.rendered = true; | ||
} | ||
y += cell.rowspan; | ||
} | ||
return captures; | ||
} | ||
var splitTag = {}; | ||
var elementRe = /(^\w*)/g; | ||
splitTag.element = elementRe.exec(tagString)[1]; | ||
splitTag.classes = getParams( /\.([\d\w-]*)/g, tagString); | ||
splitTag.ids = getParams( /#([\d\w-]*)/g, tagString); | ||
return outputLines.join('\n'); | ||
} | ||
return splitTag; | ||
}; | ||
exports.replaceAll = function replaceAll(str, find, replace) { | ||
var reg = new RegExp(find, 'g'); | ||
return str.replace(reg, replace); | ||
module.exports = { | ||
limitedDepthRecursive: limitedDepthRecursive, | ||
numberToLetterSequence: numberToLetterSequence, | ||
numberToRoman: numberToRoman, | ||
splitClassesAndIds: splitClassesAndIds, | ||
splitSelector: splitSelector, | ||
tableToString: tableToString | ||
}; |
@@ -1,71 +0,219 @@ | ||
var includes = require('lodash/includes'); | ||
var trimEnd = require('lodash/trimEnd'); | ||
var htmlparser = require('htmlparser2'); | ||
const merge = require('deepmerge'); | ||
const he = require('he'); | ||
const htmlparser = require('htmlparser2'); | ||
const set = require('lodash/set'); | ||
var helper = require('./helper'); | ||
var defaultFormat = require('./formatter'); | ||
const { BlockTextBuilder } = require('./block-text-builder'); | ||
const defaultFormatters = require('./formatter'); | ||
const { limitedDepthRecursive, splitSelector } = require('./helper'); | ||
// Which type of tags should not be parsed | ||
var SKIP_TYPES = [ | ||
'style', | ||
'script' | ||
]; | ||
// eslint-disable-next-line import/no-unassigned-import | ||
require('./typedefs'); | ||
function htmlToText(html, options) { | ||
options = Object.assign({ | ||
wordwrap: 80, | ||
tables: [], | ||
preserveNewlines: false, | ||
uppercaseHeadings: true, | ||
singleNewLineParagraphs: false, | ||
hideLinkHrefIfSameAsText: false, | ||
linkHrefBaseUrl: null, | ||
noLinkBrackets: false, | ||
noAnchorUrl: true, | ||
baseElement: 'body', | ||
returnDomByDefault: true, | ||
format: {}, | ||
decodeOptions: { | ||
isAttributeValue: false, | ||
strict: false | ||
/** | ||
* Default options. | ||
* | ||
* @constant | ||
* @type { Options } | ||
* @default | ||
* @private | ||
*/ | ||
const DEFAULT_OPTIONS = { | ||
baseElement: 'body', | ||
decodeOptions: { | ||
isAttributeValue: false, | ||
strict: false | ||
}, | ||
formatters: {}, | ||
limits: { | ||
ellipsis: '...', | ||
maxChildNodes: undefined, | ||
maxDepth: undefined, | ||
maxInputLength: (1 << 24) // 16_777_216 | ||
}, | ||
longWordSplit: { | ||
forceWrapOnLimit: false, | ||
wrapCharacters: [] | ||
}, | ||
preserveNewlines: false, | ||
returnDomByDefault: true, | ||
tables: [], | ||
tags: { | ||
'': { format: 'inline' }, // defaults for any other tag name | ||
'a': { | ||
format: 'anchor', | ||
options: { baseUrl: null, hideLinkHrefIfSameAsText: false, ignoreHref: false, noAnchorUrl: true, noLinkBrackets: false } | ||
}, | ||
longWordSplit: { | ||
wrapCharacters: [], | ||
forceWrapOnLimit: false | ||
'article': { format: 'block' }, | ||
'aside': { format: 'block' }, | ||
'blockquote': { | ||
format: 'blockquote', | ||
options: { leadingLineBreaks: 2, trailingLineBreaks: 2, trimEmptyLines: true } | ||
}, | ||
unorderedListItemPrefix: ' * ' | ||
}, options || {}); | ||
'br': { format: 'lineBreak' }, | ||
'div': { format: 'block' }, | ||
'footer': { format: 'block' }, | ||
'form': { format: 'block' }, | ||
'h1': { format: 'heading', options: { leadingLineBreaks: 3, trailingLineBreaks: 2, uppercase: true } }, | ||
'h2': { format: 'heading', options: { leadingLineBreaks: 3, trailingLineBreaks: 2, uppercase: true } }, | ||
'h3': { format: 'heading', options: { leadingLineBreaks: 3, trailingLineBreaks: 2, uppercase: true } }, | ||
'h4': { format: 'heading', options: { leadingLineBreaks: 2, trailingLineBreaks: 2, uppercase: true } }, | ||
'h5': { format: 'heading', options: { leadingLineBreaks: 2, trailingLineBreaks: 2, uppercase: true } }, | ||
'h6': { format: 'heading', options: { leadingLineBreaks: 2, trailingLineBreaks: 2, uppercase: true } }, | ||
'header': { format: 'block' }, | ||
'hr': { format: 'horizontalLine', options: { leadingLineBreaks: 2, length: undefined, trailingLineBreaks: 2 } }, | ||
'img': { format: 'image', options: { baseUrl: null } }, | ||
'main': { format: 'block' }, | ||
'nav': { format: 'block' }, | ||
'ol': { format: 'orderedList', options: { leadingLineBreaks: 2, trailingLineBreaks: 2 } }, | ||
'p': { format: 'paragraph', options: { leadingLineBreaks: 2, trailingLineBreaks: 2 } }, | ||
'pre': { format: 'pre', options: { leadingLineBreaks: 2, trailingLineBreaks: 2 } }, | ||
'section': { format: 'block' }, | ||
'table': { | ||
format: 'table', | ||
options: { | ||
colSpacing: 3, | ||
leadingLineBreaks: 2, | ||
maxColumnWidth: 60, | ||
rowSpacing: 0, | ||
trailingLineBreaks: 2, | ||
uppercaseHeaderCells: true | ||
} | ||
}, | ||
'ul': { | ||
format: 'unorderedList', | ||
options: { itemPrefix: ' * ', leadingLineBreaks: 2, trailingLineBreaks: 2 } | ||
}, | ||
'wbr': { format: 'wbr' } | ||
}, | ||
whitespaceCharacters: ' \t\r\n\f\u200b', | ||
wordwrap: 80 | ||
}; | ||
var handler = new htmlparser.DefaultHandler(function (error, dom) { | ||
/** | ||
* Convert given HTML content to plain text string. | ||
* | ||
* @param { string } html HTML content to convert. | ||
* @param { Options } [options = {}] HtmlToText options. | ||
* @returns { string } Plain text string. | ||
* @static | ||
* | ||
* @example | ||
* const { htmlToText } = require('html-to-text'); | ||
* const text = htmlToText('<h1>Hello World</h1>', { | ||
* wordwrap: 130 | ||
* }); | ||
* console.log(text); // HELLO WORLD | ||
*/ | ||
function htmlToText (html, options = {}) { | ||
options = merge( | ||
DEFAULT_OPTIONS, | ||
options, | ||
{ arrayMerge: (destinationArray, sourceArray, mergeOptions) => sourceArray } | ||
); | ||
options.formatters = Object.assign({}, defaultFormatters, options.formatters); | ||
}, { | ||
verbose: true | ||
}); | ||
new htmlparser.Parser(handler).parseComplete(html); | ||
handleDeprecatedOptions(options); | ||
options.lineCharCount = 0; | ||
const maxInputLength = options.limits.maxInputLength; | ||
if (maxInputLength && html && html.length > maxInputLength) { | ||
console.warn( | ||
`Input lenght ${html.length} is above allowed limit of ${maxInputLength}. Truncating without ellipsis.` | ||
); | ||
html = html.substring(0, maxInputLength); | ||
} | ||
var result = ''; | ||
var baseElements = Array.isArray(options.baseElement) ? options.baseElement : [options.baseElement]; | ||
for (var idx = 0; idx < baseElements.length; ++idx) { | ||
result += walk(filterBody(handler.dom, options, baseElements[idx]), options); | ||
const handler = new htmlparser.DefaultHandler(); | ||
new htmlparser.Parser(handler, { lowerCaseTags: true }).parseComplete(html); | ||
const limitedWalk = limitedDepthRecursive( | ||
options.limits.maxDepth, | ||
recursiveWalk, | ||
function (dom, builder) { | ||
builder.addInline(options.limits.ellipsis || ''); | ||
} | ||
); | ||
const baseElements = Array.isArray(options.baseElement) | ||
? options.baseElement | ||
: [options.baseElement]; | ||
const bases = baseElements | ||
.map(be => findBase(handler.dom, options, be)) | ||
.filter(b => b) | ||
.reduce((acc, b) => acc.concat(b), []); | ||
const builder = new BlockTextBuilder(options); | ||
limitedWalk(bases, builder); | ||
return builder.toString(); | ||
} | ||
/**
 * Map previously existing and now deprecated options to the new options layout.
 * This is a subject for cleanup in major releases.
 *
 * Works by mutating the tag definitions found in `options.tags`:
 * every definition that uses the affected formatter receives the value
 * of the deprecated top-level option. Deprecated options that are
 * `undefined` leave the tag definitions untouched.
 *
 * @param { Options } options HtmlToText options.
 */
function handleDeprecatedOptions (options) {
  const tagDefinitions = Object.values(options.tags);

  // Copy top-level option `source` into `options[target]` of every tag
  // definition that uses the given formatter name.
  function copyFormatterOption (source, format, target) {
    if (options[source] === undefined) { return; }
    for (const tagDefinition of tagDefinitions) {
      if (tagDefinition.format === format) {
        set(tagDefinition, ['options', target], options[source]);
      }
    }
  }

  copyFormatterOption('hideLinkHrefIfSameAsText', 'anchor', 'hideLinkHrefIfSameAsText');
  copyFormatterOption('ignoreHref', 'anchor', 'ignoreHref');
  copyFormatterOption('linkHrefBaseUrl', 'anchor', 'baseUrl');
  copyFormatterOption('noAnchorUrl', 'anchor', 'noAnchorUrl');
  copyFormatterOption('noLinkBrackets', 'anchor', 'noLinkBrackets');

  copyFormatterOption('linkHrefBaseUrl', 'image', 'baseUrl');

  copyFormatterOption('unorderedListItemPrefix', 'unorderedList', 'itemPrefix');

  copyFormatterOption('uppercaseHeadings', 'heading', 'uppercase');
  // Table formatters are configured through `uppercaseHeaderCells`
  // (see the default `table` tag options), not `uppercaseHeadings`.
  copyFormatterOption('uppercaseHeadings', 'table', 'uppercaseHeaderCells');
  copyFormatterOption('uppercaseHeadings', 'dataTable', 'uppercaseHeaderCells');

  // `ignoreImage` has no per-formatter option: images are switched to
  // the `skip` formatter instead.
  if (options['ignoreImage']) {
    for (const tagDefinition of tagDefinitions) {
      if (tagDefinition.format === 'image') {
        tagDefinition.format = 'skip';
      }
    }
  }

  // `singleNewLineParagraphs` becomes a single leading and trailing
  // line break for paragraph-like blocks.
  if (options['singleNewLineParagraphs']) {
    for (const tagDefinition of tagDefinitions) {
      if (tagDefinition.format === 'paragraph' || tagDefinition.format === 'pre') {
        set(tagDefinition, ['options', 'leadingLineBreaks'], 1);
        set(tagDefinition, ['options', 'trailingLineBreaks'], 1);
      }
    }
  }
}
function filterBody(dom, options, baseElement) { | ||
var result = null; | ||
function findBase (dom, options, baseElement) { | ||
let result = null; | ||
var splitTag = helper.splitCssSearchTag(baseElement); | ||
const splitTag = splitSelector(baseElement); | ||
function walk(dom) { | ||
if (result) return; | ||
dom.forEach(function(elem) { | ||
if (result) return; | ||
function recursiveWalk (walk, /** @type { DomNode[] } */ dom) { | ||
if (result) { return; } | ||
dom = dom.slice(0, options.limits.maxChildNodes); | ||
for (const elem of dom) { | ||
if (result) { return; } | ||
if (elem.name === splitTag.element) { | ||
var documentClasses = elem.attribs && elem.attribs.class ? elem.attribs.class.split(" ") : []; | ||
var documentIds = elem.attribs && elem.attribs.id ? elem.attribs.id.split(" ") : []; | ||
const documentClasses = elem.attribs && elem.attribs.class ? elem.attribs.class.split(' ') : []; | ||
const documentIds = elem.attribs && elem.attribs.id ? elem.attribs.id.split(' ') : []; | ||
if ((splitTag.classes.every(function (val) { return documentClasses.indexOf(val) >= 0; })) && | ||
(splitTag.ids.every(function (val) { return documentIds.indexOf(val) >= 0; }))) { | ||
if ( | ||
splitTag.classes.every(function (val) { return documentClasses.indexOf(val) >= 0; }) && | ||
splitTag.ids.every(function (val) { return documentIds.indexOf(val) >= 0; }) | ||
) { | ||
result = [elem]; | ||
@@ -75,115 +223,73 @@ return; | ||
} | ||
if (elem.children) walk(elem.children); | ||
}); | ||
if (elem.children) { walk(elem.children); } | ||
} | ||
} | ||
walk(dom); | ||
const limitedWalk = limitedDepthRecursive( | ||
options.limits.maxDepth, | ||
recursiveWalk | ||
); | ||
limitedWalk(dom); | ||
return options.returnDomByDefault ? result || dom : result; | ||
} | ||
function containsTable(attr, tables) { | ||
if (tables === true) return true; | ||
/** | ||
* Function to walk through DOM nodes and accumulate their string representations. | ||
* | ||
* @param { RecursiveCallback } walk Recursive callback. | ||
* @param { DomNode[] } [dom] Nodes array to process. | ||
* @param { BlockTextBuilder } builder Passed around to accumulate output text. | ||
* @private | ||
*/ | ||
function recursiveWalk (walk, dom, builder) { | ||
if (!dom) { return; } | ||
function removePrefix(key) { | ||
return key.substr(1); | ||
} | ||
function checkPrefix(prefix) { | ||
return function(key) { | ||
return key.startsWith(prefix); | ||
}; | ||
} | ||
function filterByPrefix(tables, prefix) { | ||
return tables | ||
.filter(checkPrefix(prefix)) | ||
.map(removePrefix); | ||
} | ||
var classes = filterByPrefix(tables, '.'); | ||
var ids = filterByPrefix(tables, '#'); | ||
return attr && (includes(classes, attr['class']) || includes(ids, attr['id'])); | ||
} | ||
const options = builder.options; | ||
function walk(dom, options, result) { | ||
if (arguments.length < 3) { | ||
result = ''; | ||
const tooManyChildNodes = dom.length > options.limits.maxChildNodes; | ||
if (tooManyChildNodes) { | ||
dom = dom.slice(0, options.limits.maxChildNodes); | ||
dom.push({ | ||
data: options.limits.ellipsis, | ||
type: 'text' | ||
}); | ||
} | ||
var whiteSpaceRegex = /\s$/; | ||
var format = Object.assign({}, defaultFormat, options.format); | ||
if (!dom) { | ||
return result; | ||
} | ||
dom.forEach(function(elem) { | ||
switch(elem.type) { | ||
case 'tag': | ||
switch(elem.name.toLowerCase()) { | ||
case 'img': | ||
result += format.image(elem, options); | ||
break; | ||
case 'a': | ||
// Inline element needs its leading space to be trimmed if `result` | ||
// currently ends with whitespace | ||
elem.trimLeadingSpace = whiteSpaceRegex.test(result); | ||
result += format.anchor(elem, walk, options); | ||
break; | ||
case 'p': | ||
result += format.paragraph(elem, walk, options); | ||
break; | ||
case 'h1': | ||
case 'h2': | ||
case 'h3': | ||
case 'h4': | ||
case 'h5': | ||
case 'h6': | ||
result += format.heading(elem, walk, options); | ||
break; | ||
case 'br': | ||
result += format.lineBreak(elem, walk, options); | ||
break; | ||
case 'hr': | ||
result += format.horizontalLine(elem, walk, options); | ||
break; | ||
case 'ul': | ||
result += format.unorderedList(elem, walk, options); | ||
break; | ||
case 'ol': | ||
result += format.orderedList(elem, walk, options); | ||
break; | ||
case 'pre': | ||
var newOptions = Object.assign({}, options); | ||
newOptions.isInPre = true; | ||
result += format.paragraph(elem, walk, newOptions); | ||
break; | ||
case 'table': | ||
result = containsTable(elem.attribs, options.tables) | ||
? result + format.table(elem, walk, options) | ||
: walk(elem.children || [], options, result); | ||
break; | ||
case 'blockquote': | ||
result += format.blockquote(elem, walk, options); | ||
break; | ||
default: | ||
result = walk(elem.children || [], options, result); | ||
} | ||
for (const elem of dom) { | ||
switch (elem.type) { | ||
case 'text': { | ||
builder.addInline(he.decode(elem.data, options.decodeOptions)); | ||
break; | ||
case 'text': | ||
if (elem.data !== '\r\n') { | ||
// Text needs its leading space to be trimmed if `result` | ||
// currently ends with whitespace | ||
elem.trimLeadingSpace = whiteSpaceRegex.test(result); | ||
result += format.text(elem, options); | ||
} | ||
} | ||
case 'tag': { | ||
const tags = options.tags; | ||
const tagDefinition = tags[elem.name] || tags['']; | ||
const format = options.formatters[tagDefinition.format]; | ||
format(elem, walk, builder, tagDefinition.options || {}); | ||
break; | ||
} | ||
default: | ||
if (!includes(SKIP_TYPES, elem.type)) { | ||
result = walk(elem.children || [], options, result); | ||
} | ||
/* do nothing */ | ||
break; | ||
} | ||
} | ||
options.lineCharCount = result.length - (result.lastIndexOf('\n') + 1); | ||
}); | ||
return result; | ||
return; | ||
} | ||
exports.fromString = function(str, options) { | ||
return htmlToText(str, options || {}); | ||
/**
 * Legacy entry point kept for backward compatibility.
 * It forwards both arguments to `htmlToText` and returns its result.
 *
 * @deprecated Import/require `{ htmlToText }` function instead!
 * @see htmlToText
 *
 * @param { string } html  HTML content to convert.
 * @param { Options } [options = {}]  HtmlToText options.
 * @returns { string }  Plain text string.
 * @static
 */
const fromString = function (html, options = {}) {
  return htmlToText(html, options);
};
module.exports = { | ||
htmlToText: htmlToText, | ||
fromString: fromString | ||
}; |
{ | ||
"name": "html-to-text", | ||
"version": "5.1.1", | ||
"version": "6.0.0", | ||
"description": "Advanced html to plain text converter", | ||
"main": "index.js", | ||
"scripts": { | ||
"test": "istanbul cover _mocha && eslint .", | ||
"test": "mocha && eslint .", | ||
"example": "node ./example/html-to-text.js", | ||
"lint": "eslint .", | ||
"prepublish": "npm test" | ||
"cover": "nyc --reporter=lcov --reporter=text-summary npm run test", | ||
"prepublishOnly": "npm run cover" | ||
}, | ||
@@ -16,16 +17,17 @@ "author": { | ||
}, | ||
"homepage": "https://github.com/werk85/node-html-to-text", | ||
"homepage": "https://github.com/html-to-text/node-html-to-text", | ||
"license": "MIT", | ||
"repository": { | ||
"type": "git", | ||
"url": "git://github.com/werk85/node-html-to-text.git" | ||
"url": "git://github.com/html-to-text/node-html-to-text.git" | ||
}, | ||
"bugs": { | ||
"url": "https://github.com/werk85/node-html-to-text/issues" | ||
"url": "https://github.com/html-to-text/node-html-to-text/issues" | ||
}, | ||
"dependencies": { | ||
"deepmerge": "^4.2.2", | ||
"he": "^1.2.0", | ||
"htmlparser2": "^3.10.1", | ||
"minimist": "^1.2.0", | ||
"lodash": "^4.17.11" | ||
"htmlparser2": "^4.1.0", | ||
"lodash": "^4.17.20", | ||
"minimist": "^1.2.5" | ||
}, | ||
@@ -41,3 +43,3 @@ "keywords": [ | ||
"engines": { | ||
"node": ">= 4.0.0" | ||
"node": ">=8.10.0" | ||
}, | ||
@@ -49,6 +51,11 @@ "bin": { | ||
"chai": "^4.2.0", | ||
"eslint": "^5.15.3", | ||
"istanbul": "^0.4.5", | ||
"mocha": "^6.0.2" | ||
"eslint": "^6.8.0", | ||
"eslint-plugin-filenames": "^1.3.2", | ||
"eslint-plugin-import": "^2.20.2", | ||
"eslint-plugin-jsdoc": "^27.0.4", | ||
"eslint-plugin-mocha": "^6.3.0", | ||
"eslint-plugin-you-dont-need-lodash-underscore": "^6.10.0", | ||
"mocha": "^7.2.0", | ||
"nyc": "^15.1.0" | ||
} | ||
} |
426
README.md
# html-to-text | ||
[![Build Status](https://travis-ci.org/werk85/node-html-to-text.svg?branch=master)](https://travis-ci.org/werk85/node-html-to-text) | ||
[![Test Coverage](https://codeclimate.com/github/werk85/node-html-to-text/badges/coverage.svg)](https://codeclimate.com/github/werk85/node-html-to-text/coverage) | ||
[![Build Status](https://travis-ci.org/html-to-text/node-html-to-text.svg?branch=master)](https://travis-ci.org/html-to-text/node-html-to-text) | ||
[![Test Coverage](https://codeclimate.com/github/html-to-text/node-html-to-text/badges/coverage.svg)](https://codeclimate.com/github/html-to-text/node-html-to-text/coverage) | ||
[![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/html-to-text/node-html-to-text/blob/master/LICENSE-MIT) | ||
[![npm](https://img.shields.io/npm/v/html-to-text?logo=npm)](https://www.npmjs.com/package/html-to-text) | ||
[![npm](https://img.shields.io/npm/dw/html-to-text?color=informational&logo=npm)](https://www.npmjs.com/package/html-to-text) | ||
An advanced converter that parses HTML and returns beautiful text. It was mainly designed to transform HTML E-Mail templates to a text representation. So it is currently optimized for table layouts. | ||
Advanced converter that parses HTML and returns beautiful text. | ||
### Features: | ||
## Features | ||
* Transform headlines to uppercase text. | ||
* Convert tables to an appropiate text representation with rows and columns. | ||
* Word wrapping for paragraphs (default 80 chars). | ||
* Automatic extraction of href information from links. | ||
* `<br>` conversion to `\n`. | ||
* Unicode support. | ||
* Runs in browser and server environments. | ||
* Inline and block-level tags. | ||
* Tables with colspans and rowspans. | ||
* Links with both text and href. | ||
* Word wrapping. | ||
* Unicode support. | ||
* Plenty of customization options. | ||
## Changelog | ||
Available here: [CHANGELOG.md](https://github.com/html-to-text/node-html-to-text/blob/master/CHANGELOG.md) | ||
Version 6 contains a ton of changes, so it is worth taking a look. | |
## Installation | ||
@@ -33,5 +41,5 @@ | ||
```js | ||
const htmlToText = require('html-to-text'); | ||
const { htmlToText } = require('html-to-text'); | ||
const text = htmlToText.fromString('<h1>Hello World</h1>', { | ||
const text = htmlToText('<h1>Hello World</h1>', { | ||
wordwrap: 130 | ||
@@ -42,280 +50,192 @@ }); | ||
### Options: | ||
### Options | ||
You can configure the behaviour of html-to-text with the following options: | ||
#### General options | ||
* `tables` allows to select certain tables by the `class` or `id` attribute from the HTML document. This is necessary because the majority of HTML E-Mails uses a table based layout. Prefix your table selectors with an `.` for the `class` and with a `#` for the `id` attribute. All other tables are ignored. You can assign `true` to this attribute to select all tables. Default: `[]` | ||
* `wordwrap` defines after how many chars a line break should follow in `p` elements. Set to `null` or `false` to disable word-wrapping. Default: `80` | ||
* `linkHrefBaseUrl` allows you to specify the server host for href attributes, where the links start at the root (`/`). For example, `linkHrefBaseUrl = 'http://asdf.com'` and `<a href='/dir/subdir'>...</a>` the link in the text will be `http://asdf.com/dir/subdir`. Keep in mind that `linkHrefBaseUrl` shouldn't end with a `/`. | ||
* `hideLinkHrefIfSameAsText` by default links are translated the following `<a href='link'>text</a>` => becomes => `text [link]`. If this option is set to true and `link` and `text` are the same, `[link]` will be hidden and only `text` visible. | ||
* `noLinkBrackets` dont print brackets around the link if `true`. | ||
* `ignoreHref` ignore all document links if `true`. | ||
* `ignoreImage` ignore all document images if `true`. | ||
* `preserveNewlines` by default, any newlines `\n` in a block of text will be removed. If `true`, these newlines will not be removed. | ||
* `decodeOptions` defines the text decoding options given to `he.decode`. For more informations see the [he](https://github.com/mathiasbynens/he) module. | ||
* `uppercaseHeadings` by default, headings (`<h1>`, `<h2>`, etc) are uppercased. Set to `false` to leave headings as they are. | ||
* `singleNewLineParagraphs` by default, paragraphs are converted with two newlines (`\n\n`). Set to `true` to convert to a single newline. | ||
* `baseElement` defines the tags whose text content will be captured from the html. All content will be captured below the baseElement tags and added to the resulting text output. This option allows the user to specify an array of elements as base elements using a single tag with css class and id parameters e.g. [`p.class1.class2#id1#id2`, `p.class1.class2#id1#id2`] . Default: `body` | ||
* `returnDomByDefault` convert the entire document if we don't find the tag we're looking for if `true`. | ||
* `longWordSplit` describes how to wrap long words, has the following parameters: | ||
* `wrapCharacters` is an array containing the characters that may be wrapped on, these are used in order | ||
* `forceWrapOnLimit` defines whether to break long words on the limit if `true`. | ||
* `format` pass an object to enable custom formatting for specific elements (see below) | ||
* `unorderedListItemPrefix` defines the string that is used as item prefix for unordered lists `<ol>`. Default: `' * '` | ||
Option | Default | Description | ||
----------------------- | ------------ | ----------- | ||
`baseElement` | `'body'` | The tag(s) whose text content will be captured from the html and added to the resulting text output.<br/>Single element or an array of elements can be specified, each as a single tag name with optional css class and id parameters e.g. `['p.class1.class2#id1#id2', 'p.class1.class2#id1#id2']`. | ||
`decodeOptions` | `{ isAttributeValue: false, strict: false }` | Text decoding options given to `he.decode`. For more information, see the [he](https://github.com/mathiasbynens/he) module. | |
`formatters` | `{}` | An object with custom formatting functions for specific elements (see "Override formatting" section below). | ||
`limits` | | Describes how to limit the output text in case of large HTML documents. | ||
`limits.ellipsis` | `'...'` | A string to insert in place of skipped content. | ||
`limits.maxChildNodes` | `undefined` | Maximum number of child nodes of a single node to be added to the output. Unlimited if undefined. | ||
`limits.maxDepth` | `undefined` | Stop looking for nodes to add to the output below this depth in the DOM tree. Unlimited if undefined. | ||
`limits.maxInputLength` | `16_777_216` | If the input string is longer than this value - it will be truncated and a message will be sent to `stderr`. Ellipsis is not used in this case. Unlimited if undefined. | ||
`longWordSplit` | | Describes how to wrap long words. | ||
`longWordSplit.wrapCharacters` | `[]` | An array containing the characters that may be wrapped on. Checked in order, search stops once line length requirement can be met. | ||
`longWordSplit.forceWrapOnLimit` | `false` | Break long words at the line length limit in case no better wrap opportunities found. | ||
`preserveNewlines` | `false` | By default, any newlines `\n` in a block of text will be removed. If `true`, these newlines will not be removed. | ||
`returnDomByDefault` | `true` | Convert the entire document if we don't find the tag defined in `baseElement`. | ||
`tables` | `[]` | Allows selecting certain tables by the `class` or `id` attribute from the HTML document. This is necessary because the majority of HTML e-mails use a table-based layout. Prefix your table selectors with a `.` for the `class` and with a `#` for the `id` attribute. All other tables are ignored.<br/>You can assign `true` to this attribute to select all tables. | |
`tags` | | Describes how different tags should be formatted. See "Tags" section below. | ||
`whitespaceCharacters` | `' \t\r\n\f\u200b'` | A string of characters that are recognized as HTML whitespace. Default value uses the set of characters defined in [HTML4 standard](https://www.w3.org/TR/html4/struct/text.html#h-9.1). (It includes Zero-width space compared to [living standard](https://infra.spec.whatwg.org#ascii-whitespace).) | ||
`wordwrap` | `80` | After how many chars a line break should follow.<br/>Set to `null` or `false` to disable word-wrapping. | ||
### Override formatting for specific elements | ||
#### Options deprecated in version 6 | ||
By using the `format` option, you can specify formatting for these elements: `text`, `image`, `lineBreak`, `paragraph`, `anchor`, `heading`, `table`, `orderedList`, `unorderedList`, `listItem`, `horizontalLine`. | ||
Old option | Instead use | ||
-------------------------- | ----------- | ||
`hideLinkHrefIfSameAsText` | `hideLinkHrefIfSameAsText` option for tags with `anchor` formatter. | ||
`ignoreHref` | `ignoreHref` option for tags with `anchor` formatter. | ||
`ignoreImage` | Set format to `skip` for `img` tags. | ||
`linkHrefBaseUrl` | `baseUrl` option for tags with `anchor` and `image` formatters. | ||
`noAnchorUrl` | `noAnchorUrl` option for tags with `anchor` formatter. | ||
`noLinkBrackets` | `noLinkBrackets` option for tags with `anchor` formatter. | ||
`singleNewLineParagraphs` | Set `leadingLineBreaks` and `trailingLineBreaks` options to `1` for `p` and `pre` tags. | ||
`unorderedListItemPrefix` | `itemPrefix` option for tags with `unorderedList` formatter. | ||
`uppercaseHeadings` | `uppercase` option for tags with `heading` formatter, `uppercaseHeaderCells` option for `table` or `dataTable` formatters. | ||
Each key must be a function which eventually receive `elem` (the current elem), `fn` (the next formatting function) and `options` (the options passed to html-to-text). | ||
Deprecated options will be removed in a future major version update. | |
```js | ||
var htmlToText = require('html-to-text'); | ||
#### Options removed in version 6 | ||
var text = htmlToText.fromString('<h1>Hello World</h1>', { | ||
format: { | ||
heading: function (elem, fn, options) { | ||
var h = fn(elem.children, options); | ||
return '====\n' + h.toUpperCase() + '\n===='; | ||
} | ||
} | ||
}); | ||
Old option | Description | ||
--------------- | ----------- | ||
`format` | The way formatters are written has changed completely. New formatters have to be added to the `formatters` option, old ones can not be reused without rewrite. See new instructions below. | ||
console.log(text); | ||
``` | ||
#### Tags | ||
## Command Line Interface | ||
By default, the following tag-to-formatter assignments are used: | ||
It is possible to use html-to-text as a command line interface. This allows easy validation of your generated text and integration into other systems that do not run on Node.js. | ||
Tag name | Default format | Notes | ||
------------- | ------------------- | ----- | ||
`''` | `inline` | Catch-all default for unknown tags. | ||
`a` | `anchor` | | ||
`article` | `block` | | ||
`aside` | `block` | | ||
`blockquote` | `blockquote` | | ||
`br` | `lineBreak` | | ||
`div` | `block` | | ||
`footer` | `block` | | ||
`form` | `block` | | ||
`h1` | `heading` | | ||
`h2` | `heading` | | ||
`h3` | `heading` | | ||
`h4` | `heading` | | ||
`h5` | `heading` | | ||
`h6` | `heading` | | ||
`header` | `block` | | ||
`hr` | `horizontalLine` | | ||
`img` | `image` | | ||
`main` | `block` | | ||
`nav` | `block` | | ||
`ol` | `orderedList` | | ||
`p` | `paragraph` | | ||
`pre` | `pre` | | ||
`table` | `table` | there is also `dataTable` formatter. Using it will be equivalent to setting `tables` to `true`. `tables` option might be deprecated in the future. | ||
`ul` | `unorderedList` | | ||
`wbr` | `wbr` | | ||
`html-to-text` uses `stdin` and `stdout` for data in and output. So you can use `html-to-text` the following way: | ||
More formatters are also available for use: | ||
``` | ||
cat example/test.html | html-to-text > test.txt | ||
``` | ||
* `skip` - as the name implies, it skips the given tag with its contents without printing anything. | ||
All options described above are also available. You can use them like this: | ||
Format options are specified for each tag independently: | ||
Option | Default | Applies to | Description | ||
------------------- | ----------- | ------------------ | ----------- | ||
`leadingLineBreaks` | `1`, `2` or `3` | all block-level formatters | Number of line breaks to separate previous block from this one.<br/>Note that N+1 line breaks are needed to make N empty lines. | ||
`trailingLineBreaks` | `1` or `2` | all block-level formatters | Number of line breaks to separate this block from the next one.<br/>Note that N+1 line breaks are needed to make N empty lines. | ||
`baseUrl` | null | `anchor`, `image` | Server host for link `href` attributes and image `src` attributes relative to the root (the ones that start with `/`).<br/>For example, with `baseUrl = 'http://asdf.com'` and `<a href='/dir/subdir'>...</a>` the link in the text will be `http://asdf.com/dir/subdir`.<br/>Keep in mind that `baseUrl` should not end with a `/`. | ||
`hideLinkHrefIfSameAsText` | `false` | `anchor` | By default links are translated in the following way:<br/>`<a href='link'>text</a>` => becomes => `text [link]`.<br/>If this option is set to `true` and `link` and `text` are the same, `[link]` will be omitted and only `text` will be present. | ||
`ignoreHref` | `false` | `anchor` | Ignore all links. Only process internal text of anchor tags. | ||
`noAnchorUrl` | `true` | `anchor` | Ignore anchor links (where `href='#...'`). | ||
`noLinkBrackets` | `false` | `anchor` | Don't print brackets around links. | ||
`itemPrefix` | `' * '` | `unorderedList` | String prefix for each list item. | ||
`uppercase` | `true` | `heading` | By default, headings (`<h1>`, `<h2>`, etc) are uppercased.<br/>Set this to `false` to leave headings as they are. | ||
`length` | `undefined` | `horizontalLine` | Length of the line. If undefined then `wordwrap` value is used. Falls back to 40 if that's also disabled. | ||
`trimEmptyLines` | `true` | `blockquote` | Trim empty lines from blockquote.<br/>While empty lines should be preserved in HTML, space-saving behavior is chosen as default for convenience. | ||
`uppercaseHeaderCells` | `true` | `table`, `dataTable` | By default, heading cells (`<th>`) are uppercased.<br/>Set this to `false` to leave heading cells as they are. | ||
`maxColumnWidth` | `60` | `table`, `dataTable` | Data table cell content will be wrapped to fit this width instead of global `wordwrap` limit.<br/>Set to `undefined` in order to fall back to `wordwrap` limit. | ||
`colSpacing` | `3` | `table`, `dataTable` | Number of spaces between data table columns. | ||
`rowSpacing` | `0` | `table`, `dataTable` | Number of empty lines between data table rows. | ||
How to set a specific format option, example: | ||
```javascript | ||
var { htmlToText } = require('html-to-text'); | ||
var text = htmlToText('<a href="/page.html">Page</a>', { | ||
tags: { 'a': { options: { baseUrl: 'https://example.com' } } } | ||
}); | ||
console.log(text); // Page [https://example.com/page.html] | ||
``` | ||
cat example/test.html | html-to-text --tables=#invoice,.address --wordwrap=100 > test.txt | ||
``` | ||
The `tables` option has to be declared as a comma-separated list without whitespace. | ||
### Override formatting | ||
## Example | ||
This is significantly changed in version 6. | ||
```html | ||
<html> | ||
<head> | ||
<meta charset="utf-8"> | ||
</head> | ||
`formatters` option is an object that holds formatting functions. They can be assigned to format different tags by key in the `tags` option. | ||
<body> | ||
<table cellpadding="0" cellspacing="0" border="0"> | ||
<tr> | ||
<td> | ||
<h2>Paragraphs</h2> | ||
<p class="normal-space">At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. <a href="www.github.com">Github</a> | ||
</p> | ||
<p class="normal-space">At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. | ||
</p> | ||
</td> | ||
<td></td> | ||
</tr> | ||
<tr> | ||
<td> | ||
<hr/> | ||
<h2>Pretty printed table</h2> | ||
<table id="invoice"> | ||
<thead> | ||
<tr> | ||
<th>Article</th> | ||
<th>Price</th> | ||
<th>Taxes</th> | ||
<th>Amount</th> | ||
<th>Total</th> | ||
</tr> | ||
</thead> | ||
<tbody> | ||
<tr> | ||
<td> | ||
<p> | ||
Product 1<br /> | ||
<span style="font-size:0.8em">Contains: 1x Product 1</span> | ||
</p> | ||
</td> | ||
<td align="right" valign="top">6,99€</td> | ||
<td align="right" valign="top">7%</td> | ||
<td align="right" valign="top">1</td> | ||
<td align="right" valign="top">6,99€</td> | ||
</tr> | ||
<tr> | ||
<td>Shipment costs</td> | ||
<td align="right">3,25€</td> | ||
<td align="right">7%</td> | ||
<td align="right">1</td> | ||
<td align="right">3,25€</td> | ||
</tr> | ||
</tbody> | ||
<tfoot> | ||
<tr> | ||
<td> </td> | ||
<td> </td> | ||
<td colspan="3">to pay: 10,24€</td> | ||
</tr> | ||
<tr> | ||
<td></td> | ||
<td></td> | ||
<td colspan="3">Taxes 7%: 0,72€</td> | ||
</tr> | ||
</tfoot> | ||
</table> | ||
Each formatter is a function of four arguments that returns nothing. Arguments are: | ||
</td> | ||
<td></td> | ||
</tr> | ||
<tr> | ||
<td> | ||
<hr/> | ||
<h2>Lists</h2> | ||
<ul> | ||
<li>At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</li> | ||
<li>At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</li> | ||
</ul> | ||
<ol> | ||
<li>At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</li> | ||
<li>At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</li> | ||
</ol> | ||
</td> | ||
</tr> | ||
<tr> | ||
<td> | ||
<hr /> | ||
<h2>Column Layout with tables</h2> | ||
<table class="address"> | ||
<tr> | ||
<th align="left">Invoice Address</th> | ||
<th align="left">Shipment Address</th> | ||
</tr> | ||
<tr> | ||
<td align="left"> | ||
<p> | ||
Mr.<br/> | ||
John Doe<br/> | ||
Featherstone Street 49<br/> | ||
28199 Bremen<br/> | ||
</p> | ||
</td> | ||
<td align="left"> | ||
<p> | ||
Mr.<br/> | ||
John Doe<br/> | ||
Featherstone Street 49<br/> | ||
28199 Bremen<br/> | ||
</p> | ||
</td> | ||
</tr> | ||
</table> | ||
</td> | ||
<td></td> | ||
</tr> | ||
<tr> | ||
<td> | ||
<hr/> | ||
<h2>Mailto formating</h2> | ||
<p class="normal-space small"> | ||
Some Company<br /> | ||
Some Street 42<br /> | ||
Somewhere<br /> | ||
E-Mail: <a href="mailto:test@example.com">Click here</a> | ||
</p> | ||
</td> | ||
</tr> | ||
</table> | ||
</body> | ||
</html> | ||
``` | ||
* `elem` - the HTML element to be processed by this formatter; | ||
* `walk` - recursive function to process the children of this element. Called as `walk(elem.children, builder)`; | ||
* `builder` - [BlockTextBuilder](https://github.com/html-to-text/node-html-to-text/blob/master/lib/block-text-builder.js) object. Manipulate this object state to build the output text; | ||
* `formatOptions` - options that are specified for a tag, along with this formatter (Note: if you need global html-to-text options - they are accessible via `builder.options`). | ||
Gets converted to: | ||
Custom formatter example: | ||
``` | ||
PARAGRAPHS | ||
At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd | ||
gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum | ||
dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor | ||
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos | ||
et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea | ||
takimata sanctus est Lorem ipsum dolor sit amet. Github [www.github.com] | ||
```javascript | ||
var { htmlToText } = require('html-to-text'); | ||
At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd | ||
gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum | ||
dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor | ||
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos | ||
et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea | ||
takimata sanctus est Lorem ipsum dolor sit amet. | ||
var text = htmlToText('<foo>Hello World</foo>', { | ||
formatters: { | ||
// Create a formatter. | ||
'fooBlockFormatter': function (elem, walk, builder, formatOptions) { | ||
builder.openBlock(formatOptions.leadingLineBreaks || 1); | ||
walk(elem.children, builder); | ||
builder.addInline('!'); | ||
builder.closeBlock(formatOptions.trailingLineBreaks || 1); | ||
} | ||
}, | ||
tags: { | ||
// Assign it to `foo` tags. | ||
'foo': { | ||
format: 'fooBlockFormatter', | ||
options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } | ||
} | ||
} | ||
}); | ||
-------------------------------------------------------------------------------- | ||
console.log(text); // Hello World! | ||
``` | ||
PRETTY PRINTED TABLE | ||
ARTICLE PRICE TAXES AMOUNT TOTAL | ||
Product 1 6,99€ 7% 1 6,99€ | ||
Contains: 1x Product 1 | ||
Shipment costs 3,25€ 7% 1 3,25€ | ||
to pay: 10,24€ | ||
Taxes 7%: 0,72€ | ||
Refer to [built-in formatters](https://github.com/html-to-text/node-html-to-text/blob/master/lib/formatter.js) for more examples. | ||
-------------------------------------------------------------------------------- | ||
Refer to [BlockTextBuilder](https://github.com/html-to-text/node-html-to-text/blob/master/lib/block-text-builder.js) for available functions and arguments. | ||
LISTS | ||
* At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd | ||
gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. | ||
* At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd | ||
gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. | ||
## Command Line Interface | ||
1. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd | ||
gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. | ||
2. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd | ||
gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. | ||
It is possible to use html-to-text as a command line interface. This allows easy validation of your generated text and integration into other systems that do not run on Node.js. | ||
-------------------------------------------------------------------------------- | ||
`html-to-text` uses `stdin` and `stdout` for data in and output. So you can use `html-to-text` the following way: | ||
COLUMN LAYOUT WITH TABLES | ||
INVOICE ADDRESS SHIPMENT ADDRESS | ||
Mr. Mr. | ||
John Doe John Doe | ||
Featherstone Street 49 Featherstone Street 49 | ||
28199 Bremen 28199 Bremen | ||
``` | ||
cat example/test.html | html-to-text > test.txt | ||
``` | ||
-------------------------------------------------------------------------------- | ||
All options described above are also available. You can use them like this: | ||
MAILTO FORMATING | ||
Some Company | ||
Some Street 42 | ||
Somewhere | ||
E-Mail: Click here [test@example.com] | ||
``` | ||
cat example/test.html | html-to-text --tables=#invoice,.address --wordwrap=100 > test.txt | ||
``` | ||
## License | ||
The `tables` option has to be declared as a comma-separated list without whitespace. | ||
(The MIT License) | ||
## Example | ||
Copyright (c) 2019 werk85 <malte@werk85.de> | ||
* Input text: [test.html](https://github.com/html-to-text/node-html-to-text/blob/master/test/test.html) | ||
* Output text: [test.txt](https://github.com/html-to-text/node-html-to-text/blob/master/test/test.txt) | ||
Permission is hereby granted, free of charge, to any person obtaining | ||
a copy of this software and associated documentation files (the | ||
'Software'), to deal in the Software without restriction, including | ||
without limitation the rights to use, copy, modify, merge, publish, | ||
distribute, sublicense, and/or sell copies of the Software, and to | ||
permit persons to whom the Software is furnished to do so, subject to | ||
the following conditions: | ||
## Contributors | ||
The above copyright notice and this permission notice shall be | ||
included in all copies or substantial portions of the Software. | ||
* [@mlegenhausen](https://github.com/mlegenhausen) - creator; | ||
* [@KillyMXI](https://github.com/KillyMXI) - maintainer since 2020; | ||
* Everyone else who [added something](https://github.com/html-to-text/node-html-to-text/graphs/contributors) to the tool or helped us shaping it via [issues](https://github.com/html-to-text/node-html-to-text/issues) and [PRs](https://github.com/html-to-text/node-html-to-text/pulls). | ||
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, | ||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | ||
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY | ||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | ||
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | ||
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||
## License | ||
[MIT License](https://github.com/html-to-text/node-html-to-text/blob/master/LICENSE-MIT) |
@@ -1,142 +0,165 @@ | ||
/* eslint max-len: "off" */ | ||
const fs = require('fs'); | ||
const path = require('path'); | ||
var expect = require('chai').expect; | ||
var htmlToText = require('..'); | ||
var path = require('path'); | ||
var fs = require('fs'); | ||
const { expect } = require('chai'); | ||
describe('html-to-text', function() { | ||
describe('.fromString()', function() { | ||
describe('wordwrap option', function() { | ||
const { htmlToText } = require('..'); | ||
var longStr; | ||
beforeEach(function() { | ||
describe('html-to-text', function () { | ||
describe('Smoke test', function () { | ||
it('should return empty input unchanged', function () { | ||
expect(htmlToText('')).to.equal(''); | ||
}); | ||
it('should return empty result if input undefined', function () { | ||
expect(htmlToText()).to.equal(''); | ||
}); | ||
it('should return plain text (no line breaks) unchanged', function () { | ||
expect(htmlToText('Hello world!')).to.equal('Hello world!'); | ||
}); | ||
}); | ||
describe('.htmlToText()', function () { | ||
describe('wordwrap option', function () { | ||
let longStr; | ||
beforeEach(function () { | ||
longStr = '111111111 222222222 333333333 444444444 555555555 666666666 777777777 888888888 999999999'; | ||
}); | ||
it('should wordwrap at 80 characters by default', function() { | ||
expect(htmlToText.fromString(longStr)).to.equal('111111111 222222222 333333333 444444444 555555555 666666666 777777777 888888888\n999999999'); | ||
it('should wordwrap at 80 characters by default', function () { | ||
expect(htmlToText(longStr)).to.equal('111111111 222222222 333333333 444444444 555555555 666666666 777777777 888888888\n999999999'); | ||
}); | ||
it('should wordwrap at given amount of characters when give a number', function() { | ||
it('should wordwrap at given amount of characters when give a number', function () { | ||
expect(htmlToText.fromString(longStr, { wordwrap: 20 })).to.equal('111111111 222222222\n333333333 444444444\n555555555 666666666\n777777777 888888888\n999999999'); | ||
expect(htmlToText(longStr, { wordwrap: 20 })).to.equal('111111111 222222222\n333333333 444444444\n555555555 666666666\n777777777 888888888\n999999999'); | ||
expect(htmlToText.fromString(longStr, { wordwrap: 50 })).to.equal('111111111 222222222 333333333 444444444 555555555\n666666666 777777777 888888888 999999999'); | ||
expect(htmlToText(longStr, { wordwrap: 50 })).to.equal('111111111 222222222 333333333 444444444 555555555\n666666666 777777777 888888888 999999999'); | ||
expect(htmlToText.fromString(longStr, { wordwrap: 70 })).to.equal('111111111 222222222 333333333 444444444 555555555 666666666 777777777\n888888888 999999999'); | ||
expect(htmlToText(longStr, { wordwrap: 70 })).to.equal('111111111 222222222 333333333 444444444 555555555 666666666 777777777\n888888888 999999999'); | ||
}); | ||
it('should not wordwrap when given null', function() { | ||
expect(htmlToText.fromString(longStr, { wordwrap: null })).to.equal(longStr); | ||
it('should not wordwrap when given null', function () { | ||
expect(htmlToText(longStr, { wordwrap: null })).to.equal(longStr); | ||
}); | ||
it('should not wordwrap when given false', function() { | ||
expect(htmlToText.fromString(longStr, { wordwrap: false })).to.equal(longStr); | ||
it('should not wordwrap when given false', function () { | ||
expect(htmlToText(longStr, { wordwrap: false })).to.equal(longStr); | ||
}); | ||
it('should not exceed the line width when processing embedded format tags', function() { | ||
var testString = '<p><strong>This text isn\'t counted</strong> when calculating where to break a string for 80 character line lengths.</p>'; | ||
expect(htmlToText.fromString(testString, {} )).to.equal('This text isn\'t counted when calculating where to break a string for 80\ncharacter line lengths.'); | ||
it('should not exceed the line width when processing embedded format tags', function () { | ||
const testString = '<p><strong>This text isn\'t counted</strong> when calculating where to break a string for 80 character line lengths.</p>'; | ||
expect(htmlToText(testString, {})).to.equal('This text isn\'t counted when calculating where to break a string for 80\ncharacter line lengths.'); | ||
}); | ||
it('should work with a long string containing line feeds', function() { | ||
var testString = '<p>If a word with a line feed exists over the line feed boundary then\nyou\nmust\nrespect it.</p>'; | ||
expect(htmlToText.fromString(testString, {} )).to.equal('If a word with a line feed exists over the line feed boundary then you must\nrespect it.'); | ||
it('should work with a long string containing line feeds', function () { | ||
const testString = '<p>If a word with a line feed exists over the line feed boundary then\nyou\nmust\nrespect it.</p>'; | ||
expect(htmlToText(testString, {})).to.equal('If a word with a line feed exists over the line feed boundary then you must\nrespect it.'); | ||
}); | ||
it('should not wrongly truncate lines when processing embedded format tags', function() { | ||
var testString = '<p><strong>This text isn\'t counted</strong> when calculating where to break a string for 80 character line lengths. However it can affect where the next line breaks and this could lead to having an early line break</p>'; | ||
expect(htmlToText.fromString(testString, {} )).to.equal('This text isn\'t counted when calculating where to break a string for 80\ncharacter line lengths. However it can affect where the next line breaks and\nthis could lead to having an early line break'); | ||
it('should not wrongly truncate lines when processing embedded format tags', function () { | ||
const testString = '<p><strong>This text isn\'t counted</strong> when calculating where to break a string for 80 character line lengths. However it can affect where the next line breaks and this could lead to having an early line break</p>'; | ||
expect(htmlToText(testString, {})).to.equal('This text isn\'t counted when calculating where to break a string for 80\ncharacter line lengths. However it can affect where the next line breaks and\nthis could lead to having an early line break'); | ||
}); | ||
it('should not exceed the line width when processing anchor tags', function() { | ||
var testString = "<p>We appreciate your business. And we hope you'll check out our <a href=\"http://example.com/\">new products</a>!</p>"; | ||
expect(htmlToText.fromString(testString, {} )).to.equal('We appreciate your business. And we hope you\'ll check out our new products\n[http://example.com/]!'); | ||
it('should not exceed the line width when processing anchor tags', function () { | ||
const testString = "<p>We appreciate your business. And we hope you'll check out our <a href=\"http://example.com/\">new products</a>!</p>"; | ||
expect(htmlToText(testString, {})).to.equal('We appreciate your business. And we hope you\'ll check out our new products\n[http://example.com/]!'); | ||
}); | ||
it('should honour line feeds from a long word across the wrap, where the line feed is before the wrap', function() { | ||
var testString = '<p>This string is meant to test if a string is split properly across a\nnewlineandlongword with following text.</p>'; | ||
expect(htmlToText.fromString(testString, {} )) | ||
.to.equal('This string is meant to test if a string is split properly across a\nnewlineandlongword with following text.'); | ||
it('should honour line feeds from a long word across the wrap, where the line feed is before the wrap', function () { | ||
const testString = '<p>This string is meant to test if a string is split properly across a\nnewlineandlongword with following text.</p>'; | ||
expect(htmlToText(testString, {})) | ||
.to.equal('This string is meant to test if a string is split properly across a\nnewlineandlongword with following text.'); | ||
}); | ||
it('should remove line feeds from a long word across the wrap, where the line feed is after the wrap', function() { | ||
var testString = '<p>This string is meant to test if a string is split properly across anewlineandlong\nword with following text.</p>'; | ||
expect(htmlToText.fromString(testString, {} )) | ||
.to.equal('This string is meant to test if a string is split properly across\nanewlineandlong word with following text.'); | ||
it('should remove line feeds from a long word across the wrap, where the line feed is after the wrap', function () { | ||
const testString = '<p>This string is meant to test if a string is split properly across anewlineandlong\nword with following text.</p>'; | ||
expect(htmlToText(testString, {})) | ||
.to.equal('This string is meant to test if a string is split properly across\nanewlineandlong word with following text.'); | ||
}); | ||
}); | ||
describe('preserveNewlines option', function() { | ||
describe('preserveNewlines option', function () { | ||
var newlineStr; | ||
let newlineStr; | ||
beforeEach(function() { | ||
newlineStr = '<p\n>One\nTwo\nThree</p>'; | ||
beforeEach(function () { | ||
newlineStr = '<p\n>One\nTwo\nThree</p>'; // newline inside a tag is intentional | ||
}); | ||
it('should not preserve newlines by default', function() { | ||
expect(htmlToText.fromString(newlineStr)).to.not.contain('\n'); | ||
it('should not preserve newlines by default', function () { | ||
expect(htmlToText(newlineStr)).to.equal('One Two Three'); | ||
}); | ||
it('should preserve newlines when provided with a truthy value', function() { | ||
expect(htmlToText.fromString(newlineStr, { preserveNewlines: true })).to.contain('\n'); | ||
it('should preserve newlines when provided with a truthy value', function () { | ||
expect(htmlToText(newlineStr, { preserveNewlines: true })).to.equal('One\nTwo\nThree'); | ||
}); | ||
it('should not preserve newlines in the tags themselves', function() { | ||
var output_text = htmlToText.fromString(newlineStr, { preserveNewlines: true }); | ||
expect(output_text.slice(0,1)).to.equal("O"); | ||
it('should preserve line feeds in a long wrapping string containing line feeds', function () { | ||
const testString = '<p>If a word with a line feed exists over the line feed boundary then\nyou\nmust\nrespect it.</p>'; | ||
expect(htmlToText(testString, { preserveNewlines: true })) | ||
.to.equal('If a word with a line feed exists over the line feed boundary then\nyou\nmust\nrespect it.'); | ||
}); | ||
it('should preserve line feeds in a long wrapping string containing line feeds', function() { | ||
var testString = '<p>If a word with a line feed exists over the line feed boundary then\nyou\nmust\nrespect it.</p>'; | ||
expect(htmlToText.fromString(testString, { preserveNewlines: true } )) | ||
.to.equal('If a word with a line feed exists over the line feed boundary then\nyou\nmust\nrespect it.'); | ||
it('should preserve line feeds in a long string containing line feeds across the wrap', function () { | ||
const testString = '<p>If a word with a line feed exists over the line feed boundary then\nyou must respect it.</p>'; | ||
expect(htmlToText(testString, { preserveNewlines: true })) | ||
.to.equal('If a word with a line feed exists over the line feed boundary then\nyou must respect it.'); | ||
}); | ||
it('should preserve line feeds in a long string containing line feeds across the wrap', function() { | ||
var testString = '<p>If a word with a line feed exists over the line feed boundary then\nyou must respect it.</p>'; | ||
expect(htmlToText.fromString(testString, { preserveNewlines: true } )) | ||
.to.equal('If a word with a line feed exists over the line feed boundary then\nyou must respect it.'); | ||
it('should preserve line feeds in a long string containing line feeds across the wrap with a line feed before 80 chars', function () { | ||
const testString = '<p>This string is meant to test if a string is split properly across a\nnewlineandlongword with following text.</p>'; | ||
expect(htmlToText(testString, { preserveNewlines: true })) | ||
.to.equal('This string is meant to test if a string is split properly across a\nnewlineandlongword with following text.'); | ||
}); | ||
it('should preserve line feeds in a long string containing line feeds across the wrap with a line feed before 80 chars', function() { | ||
var testString = '<p>This string is meant to test if a string is split properly across a\nnewlineandlongword with following text.</p>'; | ||
expect(htmlToText.fromString(testString, { preserveNewlines: true } )) | ||
.to.equal('This string is meant to test if a string is split properly across a\nnewlineandlongword with following text.'); | ||
it('should preserve line feeds in a long string containing line feeds across the wrap with a line feed after 80 chars', function () { | ||
const testString = '<p>This string is meant to test if a string is split properly across anewlineandlong\nword with following text.</p>'; | ||
expect(htmlToText(testString, { preserveNewlines: true })) | ||
.to.equal('This string is meant to test if a string is split properly across\nanewlineandlong\nword with following text.'); | ||
}); | ||
it('should preserve line feeds in a long string containing line feeds across the wrap with a line feed after 80 chars', function() { | ||
var testString = '<p>This string is meant to test if a string is split properly across anewlineandlong\nword with following text.</p>'; | ||
expect(htmlToText.fromString(testString, { preserveNewlines: true } )) | ||
.to.equal('This string is meant to test if a string is split properly across\nanewlineandlong\nword with following text.'); | ||
it('should split long lines', function () { | ||
const testString = '<p>If a word with a line feed exists over the line feed boundary then you must respect it.</p>'; | ||
expect(htmlToText(testString, { preserveNewlines: true })) | ||
.to.equal('If a word with a line feed exists over the line feed boundary then you must\nrespect it.'); | ||
}); | ||
it('should split long lines', function() { | ||
var testString = '<p>If a word with a line feed exists over the line feed boundary then you must respect it.</p>'; | ||
expect(htmlToText.fromString(testString, { preserveNewlines: true } )) | ||
.to.equal('If a word with a line feed exists over the line feed boundary then you must\nrespect it.'); | ||
it('should remove spaces if they occur around line feed', function () { | ||
const testString = '<p>A string of text\nwith \nmultiple\n spaces \n that \n \n can be safely removed.</p>'; | ||
expect(htmlToText(testString, { preserveNewlines: true })) | ||
.to.equal('A string of text\nwith\nmultiple\nspaces\nthat\n\ncan be safely removed.'); | ||
}); | ||
it('should remove spaces if they occur around line feed 2', function () { | ||
const testString = 'multiple\n spaces'; | ||
expect(htmlToText(testString, { preserveNewlines: true })) | ||
.to.equal('multiple\nspaces'); | ||
}); | ||
}); | ||
describe('single line paragraph option', function() { | ||
describe('single line paragraph option', function () { | ||
var paragraphsString; | ||
let paragraphsString; | ||
beforeEach(function() { | ||
beforeEach(function () { | ||
paragraphsString = '<p>First</p><p>Second</p>'; | ||
}); | ||
it('should not use single new line when given null', function() { | ||
expect(htmlToText.fromString(paragraphsString, { singleNewLineParagraphs: null } )).to.equal('First\n\nSecond'); | ||
it('should not use single new line when given null', function () { | ||
expect(htmlToText(paragraphsString, { singleNewLineParagraphs: null })).to.equal('First\n\nSecond'); | ||
}); | ||
it('should not use single new line when given false', function() { | ||
expect(htmlToText.fromString(paragraphsString, { singleNewLineParagraphs: false } )).to.equal('First\n\nSecond'); | ||
it('should not use single new line when given false', function () { | ||
expect(htmlToText(paragraphsString, { singleNewLineParagraphs: false })).to.equal('First\n\nSecond'); | ||
}); | ||
it('should use single new line when given true', function() { | ||
expect(htmlToText.fromString(paragraphsString, { singleNewLineParagraphs: true } )).to.equal('First\nSecond'); | ||
it('should use single new line when given true', function () { | ||
expect(htmlToText(paragraphsString, { singleNewLineParagraphs: true })).to.equal('First\nSecond'); | ||
}); | ||
@@ -146,36 +169,216 @@ }); | ||
describe('block-level elements', function () { | ||
it('should render common block-level elements on separate lines with default line breaks number', function () { | ||
const testString = | ||
'a<article>article</article>b<aside>aside</aside>c<div>div</div>d<footer>footer</footer>' + | ||
'e<form>form</form>f<header>header</header>g<main>main</main>h<nav>nav</nav>i<section>section</section>j'; | ||
const expectedResult = 'a\narticle\nb\naside\nc\ndiv\nd\nfooter\ne\nform\nf\nheader\ng\nmain\nh\nnav\ni\nsection\nj'; | ||
expect(htmlToText(testString)).to.equal(expectedResult); | ||
}); | ||
}); | ||
describe('tables', function () { | ||
it('does not process tables with uppercase tags / does not process tables with center tag', function () { | ||
var html = 'Good morning Jacob, \ | ||
<TABLE> \ | ||
<CENTER> \ | ||
<TBODY> \ | ||
<TR> \ | ||
<TD>Lorem ipsum dolor sit amet.</TD> \ | ||
</TR> \ | ||
</CENTER> \ | ||
</TBODY> \ | ||
</TABLE> \ | ||
'; | ||
var resultExpected = 'Good morning Jacob, Lorem ipsum dolor sit amet.'; | ||
var result = htmlToText.fromString(html, { tables: true }); | ||
it('should handle center tag in tables', function () { | ||
const html = `Good morning Jacob, \ | ||
<TABLE> | ||
<CENTER> | ||
<TBODY> | ||
<TR> | ||
<TD>Lorem ipsum dolor sit amet.</TD> | ||
</TR> | ||
</CENTER> | ||
</TBODY> | ||
</TABLE> | ||
`; | ||
const resultExpected = 'Good morning Jacob,\n\nLorem ipsum dolor sit amet.'; | ||
const result = htmlToText(html, { tables: true }); | ||
expect(result).to.equal(resultExpected); | ||
}); | ||
it('does handle non-integer colspan on td element gracefully', function () { | ||
var html = 'Good morning Jacob, \ | ||
<TABLE> \ | ||
<CENTER> \ | ||
<TBODY> \ | ||
<TR> \ | ||
<TD colspan="abc">Lorem ipsum dolor sit amet.</TD> \ | ||
</TR> \ | ||
</CENTER> \ | ||
</TBODY> \ | ||
</TABLE> \ | ||
'; | ||
var resultExpected = 'Good morning Jacob, Lorem ipsum dolor sit amet.'; | ||
var result = htmlToText.fromString(html, { tables: true }); | ||
it('should handle non-integer colspan on td element gracefully', function () { | ||
const html = `Good morning Jacob, | ||
<table> | ||
<tbody> | ||
<tr> | ||
<td colspan="abc">Lorem ipsum dolor sit amet.</td> | ||
</tr> | ||
</tbody> | ||
</table> | ||
`; | ||
const resultExpected = 'Good morning Jacob,\n\nLorem ipsum dolor sit amet.'; | ||
const result = htmlToText(html, { tables: true }); | ||
expect(result).to.equal(resultExpected); | ||
}); | ||
it('should handle tables with colspans and rowspans', function () { | ||
const html = /*html*/` | ||
<table> | ||
<tr> | ||
<td colspan="2" rowspan="3">aa<br/>aa<br/>aa</td> | ||
<td colspan="1" rowspan="3">b<br/>b<br/>b</td> | ||
<td colspan="4" rowspan="2">cccc<br/>cccc</td> | ||
<td colspan="1" rowspan="4">d<br/>d<br/>d<br/>d</td> | ||
</tr> | ||
<tr></tr> | ||
<tr> | ||
<td colspan="2" rowspan="3">ee<br/>ee<br/>ee</td> | ||
<td colspan="2" rowspan="2">ff<br/>ff</td> | ||
</tr> | ||
<tr> | ||
<td colspan="3" rowspan="1">ggg</td> | ||
</tr> | ||
<tr> | ||
<td colspan="1" rowspan="2">h<br/>h</td> | ||
<td colspan="2" rowspan="2">ii<br/>ii</td> | ||
<td colspan="3" rowspan="1">jjj</td> | ||
</tr> | ||
<tr> | ||
<td colspan="1" rowspan="2">k<br/>k</td> | ||
<td colspan="2" rowspan="2">ll<br/>ll</td> | ||
<td colspan="2" rowspan="1">mm</td> | ||
</tr> | ||
<tr> | ||
<td colspan="2" rowspan="2">nn<br/>nn</td> | ||
<td colspan="1" rowspan="1">o</td> | ||
<td colspan="2" rowspan="2">pp<br/>pp</td> | ||
</tr> | ||
<tr> | ||
<td colspan="4" rowspan="1">qqqq</td> | ||
</tr> | ||
</table>`; | ||
const resultExpected = | ||
'aa b cccc d\n' + | ||
'aa b cccc d\n' + | ||
'aa b ee ff d\n' + | ||
'ggg ee ff d\n' + | ||
'h ii ee jjj\n' + | ||
'h ii k ll mm\n' + | ||
'nn o k ll pp\n' + | ||
'nn qqqq pp'; | ||
const result = htmlToText(html, { tables: true }); | ||
expect(result).to.equal(resultExpected); | ||
}); | ||
it('should support custom spacing for tables', function () { | ||
const html = /*html*/` | ||
<table> | ||
<tr> | ||
<td colspan="2" rowspan="2">aa<br/>aa</td> | ||
<td>b</td> | ||
</tr> | ||
<tr> | ||
<td>c</td> | ||
</tr> | ||
<tr> | ||
<td>d</td> | ||
<td>e</td> | ||
<td>f</td> | ||
</tr> | ||
</table>`; | ||
const resultExpected = | ||
'aa b\n' + | ||
'aa\n' + | ||
' c\n' + | ||
'\n' + | ||
'd e f'; | ||
const result = htmlToText(html, { tables: true, tags: { 'table': { options: { colSpacing: 1, rowSpacing: 1 } } } }); | ||
expect(result).to.equal(resultExpected); | ||
}); | ||
it('should properly align columns in tables with thead, tfoot', function () { | ||
const html = /*html*/` | ||
<table> | ||
<thead> | ||
<tr> | ||
<td>aaaaaaaaa</td> | ||
<td colspan="2">b</td> | ||
</tr> | ||
</thead> | ||
<tbody> | ||
<tr> | ||
<td>ccc</td> | ||
<td>ddd</td> | ||
<td>eee</td> | ||
</tr> | ||
</tbody> | ||
<tfoot> | ||
<tr> | ||
<td colspan="2">f</td> | ||
<td>ggggggggg</td> | ||
</tr> | ||
</tfoot> | ||
</table>`; | ||
const resultExpected = | ||
'aaaaaaaaa b\n' + | ||
'ccc ddd eee\n' + | ||
'f ggggggggg'; | ||
const result = htmlToText(html, { tables: true }); | ||
expect(result).to.equal(resultExpected); | ||
}); | ||
it('should render block-level elements inside table cells properly', function () { | ||
const html = /*html*/` | ||
<table> | ||
<tr> | ||
<td><h1>hEaDeR</h1></td> | ||
<td><blockquote>A quote<br/>from somewhere.</blockquote></td> | ||
</tr> | ||
<tr> | ||
<td> | ||
<pre> preformatted... ...text </pre> | ||
</td> | ||
<td> | ||
<ol> | ||
<li>list item one</li> | ||
<li>list item two</li> | ||
</ol> | ||
</td> | ||
</tr> | ||
</table>`; | ||
const resultExpected = | ||
'HEADER > A quote\n' + | ||
' > from somewhere.\n' + | ||
' preformatted... ...text 1. list item one\n' + | ||
' 2. list item two'; | ||
const result = htmlToText(html, { tables: true }); | ||
expect(result).to.equal(resultExpected); | ||
}); | ||
it('should wrap table contents to custom max column width', function () { | ||
const html = /*html*/` | ||
<table> | ||
<tr> | ||
<td>short</td> | ||
<td>short</td> | ||
<td>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.</td> | ||
</tr> | ||
<tr> | ||
<td>short</td> | ||
<td>Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</td> | ||
<td>short</td> | ||
</tr> | ||
</table>`; | ||
const resultExpected = | ||
'short short Lorem ipsum dolor sit amet,\n' + | ||
' consectetur adipiscing elit,\n' + | ||
' sed do eiusmod tempor\n' + | ||
' incididunt ut labore et dolore\n' + | ||
' magna aliqua. Ut enim ad minim\n' + | ||
' veniam, quis nostrud\n' + | ||
' exercitation ullamco laboris\n' + | ||
' nisi ut aliquip ex ea commodo\n' + | ||
' consequat.\n' + | ||
'short Duis aute irure dolor in short\n' + | ||
' reprehenderit in voluptate\n' + | ||
' velit esse cillum dolore eu\n' + | ||
' fugiat nulla pariatur.\n' + | ||
' Excepteur sint occaecat\n' + | ||
' cupidatat non proident, sunt\n' + | ||
' in culpa qui officia deserunt\n' + | ||
' mollit anim id est laborum.'; | ||
const result = htmlToText(html, { tables: true, tags: { 'table': { options: { maxColumnWidth: 30 } } } }); | ||
expect(result).to.equal(resultExpected); | ||
}); | ||
}); | ||
@@ -185,3 +388,3 @@ | ||
it('should decode html attribute entities from href', function () { | ||
var result = htmlToText.fromString('<a href="/foo?a=b">test</a>'); | ||
const result = htmlToText('<a href="/foo?a=b">test</a>'); | ||
expect(result).to.equal('test [/foo?a=b]'); | ||
@@ -191,3 +394,3 @@ }); | ||
it('should strip mailto: from email links', function () { | ||
var result = htmlToText.fromString('<a href="mailto:foo@example.com">email me</a>'); | ||
const result = htmlToText('<a href="mailto:foo@example.com">email me</a>'); | ||
expect(result).to.equal('email me [foo@example.com]'); | ||
@@ -197,3 +400,3 @@ }); | ||
it('should return link with brackets', function () { | ||
var result = htmlToText.fromString('<a href="http://my.link">test</a>'); | ||
const result = htmlToText('<a href="http://my.link">test</a>'); | ||
expect(result).to.equal('test [http://my.link]'); | ||
@@ -203,5 +406,6 @@ }); | ||
it('should return link without brackets', function () { | ||
var result = htmlToText.fromString('<a href="http://my.link">test</a>', { | ||
noLinkBrackets: true | ||
}); | ||
const result = htmlToText( | ||
'<a href="http://my.link">test</a>', | ||
{ noLinkBrackets: true } | ||
); | ||
expect(result).to.equal('test http://my.link'); | ||
@@ -211,5 +415,6 @@ }); | ||
it('should not return link for anchor if noAnchorUrl is set to true', function () { | ||
var result = htmlToText.fromString('<a href="#link">test</a>', { | ||
noAnchorUrl: true | ||
}); | ||
const result = htmlToText( | ||
'<a href="#link">test</a>', | ||
{ noAnchorUrl: true } | ||
); | ||
expect(result).to.equal('test'); | ||
@@ -219,5 +424,6 @@ }); | ||
it('should return link for anchor if noAnchorUrl is set to false', function () { | ||
var result = htmlToText.fromString('<a href="#link">test</a>', { | ||
noAnchorUrl: false | ||
}); | ||
const result = htmlToText( | ||
'<a href="#link">test</a>', | ||
{ noAnchorUrl: false } | ||
); | ||
expect(result).to.equal('test [#link]'); | ||
@@ -227,106 +433,195 @@ }); | ||
describe('lists', function() { | ||
describe('ul', function() { | ||
it('should handle empty unordered lists', function() { | ||
var testString = '<ul></ul>'; | ||
expect(htmlToText.fromString(testString)).to.equal(''); | ||
describe('lists', function () { | ||
describe('ul', function () { | ||
it('should handle empty unordered lists', function () { | ||
const testString = '<ul></ul>'; | ||
expect(htmlToText(testString)).to.equal(''); | ||
}); | ||
it('should handle an unordered list with multiple elements', function() { | ||
var testString = '<ul><li>foo</li><li>bar</li></ul>'; | ||
expect(htmlToText.fromString(testString)).to.equal(' * foo\n * bar'); | ||
it('should handle an unordered list with multiple elements', function () { | ||
const testString = '<ul><li>foo</li><li>bar</li></ul>'; | ||
expect(htmlToText(testString)).to.equal(' * foo\n * bar'); | ||
}); | ||
it('should handle an unordered list prefix option', function() { | ||
var testString = '<ul><li>foo</li><li>bar</li></ul>'; | ||
var options = {unorderedListItemPrefix: ' test '}; | ||
expect(htmlToText.fromString(testString, options)).to.equal(' test foo\n test bar'); | ||
it('should handle an unordered list prefix option', function () { | ||
const testString = '<ul><li>foo</li><li>bar</li></ul>'; | ||
const options = { unorderedListItemPrefix: ' test ' }; | ||
expect(htmlToText(testString, options)).to.equal(' test foo\n test bar'); | ||
}); | ||
it('should handle nested ul correctly', function () { | ||
const testString = '<ul><li>foo<ul><li>bar<ul><li>baz.1</li><li>baz.2</li></ul></li></ul></li></ul>'; | ||
expect(htmlToText(testString)).to.equal(' * foo\n * bar\n * baz.1\n * baz.2'); | ||
}); | ||
it('should handle long nested ul correctly', function () { | ||
const testString = /*html*/`<ul> | ||
<li>At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.</li> | ||
<li>At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.</li> | ||
<li>Inner: | ||
<ul> | ||
<li>At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.</li> | ||
<li>At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.</li> | ||
</ul> | ||
</li> | ||
</ul>`; | ||
const expected = | ||
' * At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d g\n' + | ||
' u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.\n' + | ||
' * At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d g\n' + | ||
' u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.\n' + | ||
' * Inner:\n' + | ||
' * At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d\n' + | ||
' g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.\n' + | ||
' * At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d\n' + | ||
' g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.'; | ||
expect(htmlToText(testString)).to.equal(expected); | ||
}); | ||
}); | ||
describe('ol', function() { | ||
it('should handle empty ordered lists', function() { | ||
var testString = '<ol></ol>'; | ||
expect(htmlToText.fromString(testString)).to.equal(''); | ||
describe('ol', function () { | ||
it('should handle empty ordered lists', function () { | ||
const testString = '<ol></ol>'; | ||
expect(htmlToText(testString)).to.equal(''); | ||
}); | ||
it('should handle an ordered list with multiple elements', function() { | ||
var testString = '<ol><li>foo</li><li>bar</li></ol>'; | ||
expect(htmlToText.fromString(testString)).to.equal(' 1. foo\n 2. bar'); | ||
it('should handle an ordered list with multiple elements', function () { | ||
const testString = '<ol><li>foo</li><li>bar</li></ol>'; | ||
expect(htmlToText(testString)).to.equal(' 1. foo\n 2. bar'); | ||
}); | ||
it('should support the ordered list type="1" attribute', function() { | ||
var testString = '<ol type="1"><li>foo</li><li>bar</li></ol>'; | ||
expect(htmlToText.fromString(testString)).to.equal(' 1. foo\n 2. bar'); | ||
it('should support the ordered list type="1" attribute', function () { | ||
const testString = '<ol type="1"><li>foo</li><li>bar</li></ol>'; | ||
expect(htmlToText(testString)).to.equal(' 1. foo\n 2. bar'); | ||
}); | ||
it('should fallback to type="!" behavior if type attribute is invalid', function() { | ||
var testString = '<ol type="1"><li>foo</li><li>bar</li></ol>'; | ||
expect(htmlToText.fromString(testString)).to.equal(' 1. foo\n 2. bar'); | ||
it('should fallback to type="1" behavior if type attribute is invalid', function () { | ||
const testString = '<ol type="whatever"><li>foo</li><li>bar</li></ol>'; | ||
expect(htmlToText(testString)).to.equal(' 1. foo\n 2. bar'); | ||
}); | ||
it('should support the ordered list type="a" attribute', function() { | ||
var testString = '<ol type="a"><li>foo</li><li>bar</li></ol>'; | ||
expect(htmlToText.fromString(testString)).to.equal(' a. foo\n b. bar'); | ||
it('should support the ordered list type="a" attribute', function () { | ||
const testString = '<ol type="a"><li>foo</li><li>bar</li></ol>'; | ||
expect(htmlToText(testString)).to.equal(' a. foo\n b. bar'); | ||
}); | ||
it('should support the ordered list type="A" attribute', function() { | ||
var testString = '<ol type="A"><li>foo</li><li>bar</li></ol>'; | ||
expect(htmlToText.fromString(testString)).to.equal(' A. foo\n B. bar'); | ||
it('should support the ordered list type="A" attribute', function () { | ||
const testString = '<ol type="A"><li>foo</li><li>bar</li></ol>'; | ||
expect(htmlToText(testString)).to.equal(' A. foo\n B. bar'); | ||
}); | ||
it('should support the ordered list type="i" attribute by falling back to type="1"', function() { | ||
var testString = '<ol type="i"><li>foo</li><li>bar</li></ol>'; | ||
// TODO Implement lowercase roman numerals | ||
// expect(htmlToText.fromString(testString)).to.equal('i. foo\nii. bar'); | ||
expect(htmlToText.fromString(testString)).to.equal(' 1. foo\n 2. bar'); | ||
it('should support the ordered list type="i" attribute', function () { | ||
const testString1 = '<ol type="i"><li>foo</li><li>bar</li></ol>'; | ||
const testString2 = '<ol start="8" type="i"><li>foo</li><li>bar</li></ol>'; | ||
expect(htmlToText(testString1)).to.equal(' i. foo\n ii. bar'); | ||
expect(htmlToText(testString2)).to.equal(' viii. foo\n ix. bar'); | ||
}); | ||
it('should support the ordered list type="I" attribute by falling back to type="1"', function() { | ||
var testString = '<ol type="I"><li>foo</li><li>bar</li></ol>'; | ||
// TODO Implement uppercase roman numerals | ||
// expect(htmlToText.fromString(testString)).to.equal('I. foo\nII. bar'); | ||
expect(htmlToText.fromString(testString)).to.equal(' 1. foo\n 2. bar'); | ||
it('should support the ordered list type="I" attribute', function () { | ||
const testString1 = '<ol type="I"><li>foo</li><li>bar</li></ol>'; | ||
const testString2 = '<ol start="8" type="I"><li>foo</li><li>bar</li></ol>'; | ||
expect(htmlToText(testString1)).to.equal(' I. foo\n II. bar'); | ||
expect(htmlToText(testString2)).to.equal(' VIII. foo\n IX. bar'); | ||
}); | ||
it('should support the ordered list start attribute', function() { | ||
var testString = '<ol start="2"><li>foo</li><li>bar</li></ol>'; | ||
expect(htmlToText.fromString(testString)).to.equal(' 2. foo\n 3. bar'); | ||
it('should support the ordered list start attribute', function () { | ||
const testString = '<ol start="100"><li>foo</li><li>bar</li></ol>'; | ||
expect(htmlToText(testString)).to.equal(' 100. foo\n 101. bar'); | ||
}); | ||
/* | ||
* Currently failing tests for continuing to fill out the specification | ||
* Spec: https://html.spec.whatwg.org/multipage/semantics.html#the-ol-element | ||
* | ||
it('should support the ordered list type="a" attribute past 26 characters', function() { | ||
var testString = '<ol start="26" type="a"><li>foo</li><li>bar</li></ol>'; | ||
expect(htmlToText.fromString(testString)).to.equal('z. foo\naa. bar'); | ||
it('should handle nested ol correctly', function () { | ||
const testString = '<ol><li>foo<ol><li>bar<ol><li>baz</li><li>baz</li></ol></li></ol></li></ol>'; | ||
expect(htmlToText(testString)).to.equal(' 1. foo\n 1. bar\n 1. baz\n 2. baz'); | ||
}); | ||
it('should support the ordered list type="A" attribute past 26 characters', function() { | ||
var testString = '<ol start="26" type="A"><li>foo</li><li>bar</li></ol>'; | ||
expect(htmlToText.fromString(testString)).to.equal('Z. foo\nAA. bar'); | ||
it('should handle long nested ol correctly', function () { | ||
const testString = /*html*/`<ol> | ||
<li>At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.</li> | ||
<li>At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.</li> | ||
<li>Inner: | ||
<ol> | ||
<li>At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.</li> | ||
<li>At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.</li> | ||
</ol> | ||
</li> | ||
</ol>`; | ||
const expected = | ||
' 1. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d\n' + | ||
' g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.\n' + | ||
' 2. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d\n' + | ||
' g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.\n' + | ||
' 3. Inner:\n' + | ||
' 1. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s\n' + | ||
' d g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit\n' + | ||
' amet.\n' + | ||
' 2. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s\n' + | ||
' d g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit\n' + | ||
' amet.'; | ||
expect(htmlToText(testString)).to.equal(expected); | ||
}); | ||
*/ | ||
it('should support the ordered list type="a" attribute past 26 characters', function () { | ||
const testString1 = '<ol start="26" type="a"><li>foo</li><li>bar</li></ol>'; | ||
const testString2 = '<ol start="702" type="a"><li>foo</li><li>bar</li></ol>'; | ||
expect(htmlToText(testString1)).to.equal(' z. foo\n aa. bar'); | ||
expect(htmlToText(testString2)).to.equal(' zz. foo\n aaa. bar'); | ||
}); | ||
it('should support the ordered list type="A" attribute past 26 characters', function () { | ||
const testString1 = '<ol start="26" type="A"><li>foo</li><li>bar</li></ol>'; | ||
const testString2 = '<ol start="702" type="A"><li>foo</li><li>bar</li></ol>'; | ||
expect(htmlToText(testString1)).to.equal(' Z. foo\n AA. bar'); | ||
expect(htmlToText(testString2)).to.equal(' ZZ. foo\n AAA. bar'); | ||
}); | ||
// HTML standard defines vinculum extension for large numbers. | ||
// But that doesn't seem to have any significance for practical purposes. | ||
// it('should support the ordered list type="i" attribute past 3999', function () { | ||
// const testString = '<ol start="3999" type="i"><li>foo</li><li>bar</li></ol>'; | ||
// expect(htmlToText(testString)).to.equal(' mmmcmxcix. foo\n i̅v̅. bar'); | ||
// }); | ||
// it('should support the ordered list type="I" attribute past 3999', function () { | ||
// const testString = '<ol start="3999" type="I"><li>foo</li><li>bar</li></ol>'; | ||
// expect(htmlToText(testString)).to.equal(' MMMCMXCIX. foo\n I̅V̅. bar'); | ||
// }); | ||
}); | ||
it('doesnt wrap li if wordwrap isnt', function () { | ||
var html = 'Good morning Jacob, \ | ||
<p>Lorem ipsum dolor sit amet</p> \ | ||
<p><strong>Lorem ipsum dolor sit amet.</strong></p> \ | ||
<ul> \ | ||
<li>run in the park <span style="color:#888888;">(in progress)</span></li> \ | ||
</ul> \ | ||
'; | ||
var resultExpected = 'Good morning Jacob, Lorem ipsum dolor sit amet\n\nLorem ipsum dolor sit amet.\n\n * run in the park (in progress)'; | ||
var result = htmlToText.fromString(html, { wordwrap: false }); | ||
it('should not wrap li when wordwrap is disabled', function () { | ||
const html = `Good morning Jacob, | ||
<p>Lorem ipsum dolor sit amet</p> | ||
<p><strong>Lorem ipsum dolor sit amet.</strong></p> | ||
<ul> | ||
<li>run in the park <span style="color:#888888;">(in progress)</span></li> | ||
</ul> | ||
`; | ||
const resultExpected = 'Good morning Jacob,\n\nLorem ipsum dolor sit amet\n\nLorem ipsum dolor sit amet.\n\n * run in the park (in progress)'; | ||
const result = htmlToText(html, { wordwrap: false }); | ||
expect(result).to.equal(resultExpected); | ||
}); | ||
it('should handle non-li elements inside a list gracefully', function () { | ||
const html = /*html*/` | ||
<ul> | ||
<li>list item</li> | ||
plain text | ||
<li>list item</li> | ||
<div>div</div> | ||
<li>list item</li> | ||
<p>paragraph</p> | ||
<li>list item</li> | ||
</ul> | ||
`; | ||
const resultExpected = ' * list item\n plain text\n * list item\n div\n * list item\n\n paragraph\n\n * list item'; | ||
const result = htmlToText(html); | ||
expect(result).to.equal(resultExpected); | ||
}); | ||
}); | ||
describe('entities', function () { | ||
it('does not insert null bytes', function () { | ||
var html = '<a href="some-url?a=b&b=c">Testing & Done</a>'; | ||
it('should not insert null bytes', function () { | ||
const html = '<a href="some-url?a=b&b=c">Testing & Done</a>'; | ||
var result = htmlToText.fromString(html); | ||
const result = htmlToText(html); | ||
expect(result).to.equal('Testing & Done [some-url?a=b&b=c]'); | ||
@@ -336,7 +631,19 @@ }); | ||
it('should replace entities inside `alt` attributes of images', function () { | ||
var html = '<img src="test.png" alt=""Awesome"">'; | ||
const html = '<img src="test.png" alt=""Awesome"">'; | ||
var result = htmlToText.fromString(html); | ||
const result = htmlToText(html); | ||
expect(result).to.equal('"Awesome" [test.png]'); | ||
}); | ||
it('should update relatively sourced entities with linkHrefBaseUrl', function () { | ||
const html1 = '<img src="/test.png">'; | ||
const html2 = '<a href="/test.html">test</a>'; | ||
const options = { linkHrefBaseUrl: 'http://www.domain.com' }; | ||
const result1 = htmlToText(html1, options); | ||
expect(result1).to.equal('[http://www.domain.com/test.png]'); | ||
const result2 = htmlToText(html2, options); | ||
expect(result2).to.equal('test [http://www.domain.com/test.html]'); | ||
}); | ||
}); | ||
@@ -346,3 +653,3 @@ | ||
it('should decode 😂 to 😂', function () { | ||
var result = htmlToText.fromString('😂'); | ||
const result = htmlToText('😂'); | ||
expect(result).to.equal('😂'); | ||
@@ -353,108 +660,177 @@ }); | ||
describe('disable uppercaseHeadings', function () { | ||
[1, 2, 3, 4, 5, 6].forEach(function (i) { | ||
for (const i of [1, 2, 3, 4, 5, 6]) { | ||
it('should return h' + i + ' in lowercase', function () { | ||
var result = htmlToText.fromString('<h' + i + '>test</h' + i + '>', { | ||
uppercaseHeadings: false | ||
}); | ||
const result = htmlToText( | ||
'<h' + i + '>test</h' + i + '>', | ||
{ uppercaseHeadings: false } | ||
); | ||
expect(result).to.equal('test'); | ||
}); | ||
}); | ||
} | ||
}); | ||
describe('custom formatting', function () { | ||
it('should allow to pass custom formatting functions', function () { | ||
var result = htmlToText.fromString('<h1>TeSt</h1>', { | ||
format: { | ||
heading: function (elem, fn, options) { | ||
var h = fn(elem.children, options); | ||
return '====\n' + h.toLowerCase() + '\n===='; | ||
it('should allow to override formatting of existing tags', function () { | ||
const result = htmlToText('<h1>TeSt</h1><h1>mOrE tEsT</h1>', { | ||
formatters: { | ||
heading: function (elem, walk, builder, formatOptions) { | ||
builder.openBlock(2); | ||
builder.pushWordTransform(str => str.toLowerCase()); | ||
walk(elem.children, builder); | ||
builder.popWordTransform(); | ||
builder.closeBlock(2, str => { | ||
const line = '='.repeat(str.length); | ||
return `${line}\n${str}\n${line}`; | ||
}); | ||
} | ||
} | ||
}); | ||
expect(result).to.equal('====\ntest\n===='); | ||
expect(result).to.equal('====\ntest\n====\n\n=========\nmore test\n========='); | ||
}); | ||
it('should allow to skip tags with dummy formatting function', function () { | ||
const input = '<ruby>漢<rt>かん</rt>字<rt>じ</rt></ruby>'; | ||
const expected = '漢字'; | ||
const result = htmlToText( | ||
input, | ||
{ tags: { 'rt': { format: 'skip' } } } | ||
); | ||
expect(result).to.equal(expected); | ||
}); | ||
it('should allow to define basic support for inline tags', function () { | ||
const input = /*html*/`<p>a <span>b </span>c<span> d </span>e</p>`; | ||
const expected = 'a b c d e'; | ||
const result = htmlToText( | ||
input, | ||
{ tags: { 'span': { format: 'inline' } } } | ||
); | ||
expect(result).to.equal(expected); | ||
}); | ||
it('should allow to define basic support for block-level tags', function () { | ||
const input = /*html*/`<widget><gadget>a</gadget><fidget>b</fidget></widget>c<budget>d</budget>e`; | ||
const expected = 'a\nb\nc\nd\ne'; | ||
const result = htmlToText( | ||
input, | ||
{ | ||
tags: { | ||
'budget': { format: 'block' }, | ||
'fidget': { format: 'block' }, | ||
'gadget': { format: 'block' }, | ||
'widget': { format: 'block' }, | ||
} | ||
} | ||
); | ||
expect(result).to.equal(expected); | ||
}); | ||
it('should allow to add support for different tags', function () { | ||
const input = '<div><foo>foo<br/>content</foo><bar src="bar.src" /></div>'; | ||
const expected = '[FOO]foo\ncontent[/FOO]\n[BAR src="bar.src"]'; | ||
const result = htmlToText( | ||
input, | ||
{ | ||
formatters: { | ||
'formatFoo': function (elem, walk, builder, formatOptions) { | ||
builder.openBlock(1); | ||
walk(elem.children, builder); | ||
builder.closeBlock(1, str => `[FOO]${str}[/FOO]`); | ||
}, | ||
'formatBar': function (elem, walk, builder, formatOptions) { | ||
// attribute availability check is left out for brevity | ||
builder.addInline(`[BAR src="${elem.attribs.src}"]`, true); | ||
} | ||
}, | ||
tags: { | ||
'foo': { format: 'formatFoo' }, | ||
'bar': { format: 'formatBar' } | ||
} | ||
} | ||
); | ||
expect(result).to.equal(expected); | ||
}); | ||
}); | ||
describe('Base element', function () { | ||
it('should retrieve and convert the entire document under `body` by default', function() { | ||
var htmlFile = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8'); | ||
var txtFile = fs.readFileSync(path.join(__dirname, 'test.txt'), 'utf8'); | ||
it('should retrieve and convert the entire document under `body` by default', function () { | ||
const htmlFile = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8'); | ||
const txtFile = fs.readFileSync(path.join(__dirname, 'test.txt'), 'utf8'); | ||
var options = { | ||
tables: ['#invoice', '.address'] | ||
}; | ||
var text = htmlToText.fromString(htmlFile, options); | ||
const options = { tables: ['#invoice', '.address'] }; | ||
const text = htmlToText(htmlFile, options); | ||
expect(text).to.equal(txtFile); | ||
}); | ||
it('should only retrieve and convert content under the specified base element if found', function() { | ||
var htmlFile = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8'); | ||
var txtFile = fs.readFileSync(path.join(__dirname, 'test-address.txt'), 'utf8'); | ||
it('should only retrieve and convert content under the specified base element if found', function () { | ||
const htmlFile = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8'); | ||
const txtFile = fs.readFileSync(path.join(__dirname, 'test-address.txt'), 'utf8'); | ||
var options = { | ||
const options = { | ||
tables: ['.address'], | ||
baseElement: 'table.address' | ||
}; | ||
var text = htmlToText.fromString(htmlFile, options); | ||
const text = htmlToText(htmlFile, options); | ||
expect(text).to.equal(txtFile); | ||
}); | ||
it('should retrieve and convert content under multiple base elements', function() { | ||
var htmlFile = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8'); | ||
var txtFile = fs.readFileSync(path.join(__dirname, 'test-address-dup.txt'), 'utf8'); | ||
it('should retrieve and convert content under multiple base elements', function () { | ||
const htmlFile = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8'); | ||
const txtFile = fs.readFileSync(path.join(__dirname, 'test-address-dup.txt'), 'utf8'); | ||
var options = { | ||
const options = { | ||
tables: ['.address'], | ||
baseElement: ['table.address', 'table.address'] | ||
}; | ||
var text = htmlToText.fromString(htmlFile, options); | ||
const text = htmlToText(htmlFile, options); | ||
expect(text).to.equal(txtFile); | ||
}); | ||
it('should retrieve and convert content under multiple base elements in any order', function() { | ||
var htmlFile = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8'); | ||
var txtFile = fs.readFileSync(path.join(__dirname, 'test-any-order.txt'), 'utf8'); | ||
it('should retrieve and convert content under multiple base elements in any order', function () { | ||
const htmlFile = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8'); | ||
const txtFile = fs.readFileSync(path.join(__dirname, 'test-any-order.txt'), 'utf8'); | ||
var options = { | ||
const options = { | ||
tables: ['.address'], | ||
baseElement: ['table.address', 'p.normal-space', 'table.address'] | ||
}; | ||
var text = htmlToText.fromString(htmlFile, options); | ||
const text = htmlToText(htmlFile, options); | ||
expect(text).to.equal(txtFile); | ||
}); | ||
it('should process the first base element found when multiple exist', function() { | ||
var htmlFile = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8'); | ||
var txtFile = fs.readFileSync(path.join(__dirname, 'test-first-element.txt'), 'utf8'); | ||
it('should process the first base element found when multiple exist', function () { | ||
const htmlFile = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8'); | ||
const txtFile = fs.readFileSync(path.join(__dirname, 'test-first-element.txt'), 'utf8'); | ||
var options = { | ||
const options = { | ||
tables: ['.address'], | ||
baseElement: 'p.normal-space' | ||
}; | ||
var text = htmlToText.fromString(htmlFile, options); | ||
const text = htmlToText(htmlFile, options); | ||
expect(text).to.equal(txtFile); | ||
}); | ||
it('should retrieve and convert the entire document by default if no base element is found', function() { | ||
var htmlFile = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8'); | ||
var txtFile = fs.readFileSync(path.join(__dirname, 'test.txt'), 'utf8'); | ||
it('should retrieve and convert the entire document by default if no base element is found', function () { | ||
const htmlFile = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8'); | ||
const txtFile = fs.readFileSync(path.join(__dirname, 'test.txt'), 'utf8'); | ||
var options = { | ||
const options = { | ||
tables: ['#invoice', '.address'], | ||
baseElement: 'table.notthere' | ||
}; | ||
var text = htmlToText.fromString(htmlFile, options); | ||
const text = htmlToText(htmlFile, options); | ||
expect(text).to.equal(txtFile); | ||
}); | ||
it('should return null if the base element isn\'t found and we\'re not returning the DOM by default', function() { | ||
var htmlFile = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8'); | ||
it('should return null if the base element isn\'t found and we\'re not returning the DOM by default', function () { | ||
const htmlFile = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8'); | ||
var expectedTxt = ''; | ||
var options = { | ||
tables: ['#invoice', '.address'], | ||
const expectedTxt = ''; | ||
const options = { | ||
baseElement: 'table.notthere', | ||
returnDomByDefault: false | ||
returnDomByDefault: false, | ||
tables: ['#invoice', '.address'] | ||
}; | ||
var text = htmlToText.fromString(htmlFile, options); | ||
const text = htmlToText(htmlFile, options); | ||
expect(text).to.equal(expectedTxt); | ||
@@ -464,137 +840,359 @@ }); | ||
describe('Long words', function() { | ||
it('should split long words if forceWrapOnLimit is set, existing linefeeds converted to space', function() { | ||
var testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>'; | ||
expect(htmlToText.fromString(testString, { longWordSplit: { wrapCharacters: ['/'], forceWrapOnLimit: true }} )) | ||
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlo\nng word_with_following_text.'); | ||
describe('Long words', function () { | ||
it('should split long words if forceWrapOnLimit is set, existing linefeeds converted to space', function () { | ||
const testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>'; | ||
expect(htmlToText(testString, { longWordSplit: { wrapCharacters: ['/'], forceWrapOnLimit: true } })) | ||
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlo\nng word_with_following_text.'); | ||
}); | ||
it('should not wrap a string if longWordSplit is not set', function() { | ||
var testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlongword_with_following_text.</p>'; | ||
expect(htmlToText.fromString(testString, {} )) | ||
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlongword_with_following_text.'); | ||
it('should not wrap a string if longWordSplit is not set', function () { | ||
const testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlongword_with_following_text.</p>'; | ||
expect(htmlToText(testString, {})) | ||
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlongword_with_following_text.'); | ||
}); | ||
it('should not wrap a string if not wrapCharacters are found and forceWrapOnLimit is not set', function() { | ||
var testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>'; | ||
expect(htmlToText.fromString(testString, { longWordSplit: { wrapCharacters: ['/'], forceWrapOnLimit: false }} )) | ||
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.'); | ||
it('should not wrap a string if wrapCharacters are set but not found and forceWrapOnLimit is not set', function () { | ||
const testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>'; | ||
expect(htmlToText(testString, { longWordSplit: { wrapCharacters: ['/'], forceWrapOnLimit: false } })) | ||
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.'); | ||
}); | ||
it('should not wrap a string if no wrapCharacters are set and forceWrapOnLimit is not set', function() { | ||
var testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>'; | ||
expect(htmlToText.fromString(testString, { longWordSplit: { wrapCharacters: [], forceWrapOnLimit: false }} )) | ||
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.'); | ||
it('should not wrap a string if wrapCharacters are not set and forceWrapOnLimit is not set', function () { | ||
const testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>'; | ||
expect(htmlToText(testString, { longWordSplit: { wrapCharacters: [], forceWrapOnLimit: false } })) | ||
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.'); | ||
}); | ||
it('should wrap on the last instance of a wrap character before the wordwrap limit.', function() { | ||
var testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>'; | ||
expect(htmlToText.fromString(testString, { longWordSplit: { wrapCharacters: ['/', '_'], forceWrapOnLimit: false }} )) | ||
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split_properly_across_\nanewlineandlong word_with_following_text.'); | ||
it('should wrap on the last instance of a wrap character before the wordwrap limit.', function () { | ||
const testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>'; | ||
expect(htmlToText(testString, { longWordSplit: { wrapCharacters: ['/', '_'], forceWrapOnLimit: false } })) | ||
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split_properly_across_\nanewlineandlong word_with_following_text.'); | ||
}); | ||
it('should wrap on the last instance of a wrap character before the wordwrap limit. Content of wrapCharacters shouldn\'t matter.', function() { | ||
var testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>'; | ||
expect(htmlToText.fromString(testString, { longWordSplit: { wrapCharacters: ['/','-', '_'], forceWrapOnLimit: false }} )) | ||
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split_properly_across_\nanewlineandlong word_with_following_text.'); | ||
it('should wrap on the last instance of a wrap character before the wordwrap limit. Content of wrapCharacters shouldn\'t matter.', function () { | ||
const testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>'; | ||
expect(htmlToText(testString, { longWordSplit: { wrapCharacters: ['/', '-', '_'], forceWrapOnLimit: false } })) | ||
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split_properly_across_\nanewlineandlong word_with_following_text.'); | ||
}); | ||
it('should wrap on the last instance of a wrap character before the wordwrap limit. Order of wrapCharacters shouldn\'t matter.', function() { | ||
var testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>'; | ||
expect(htmlToText.fromString(testString, { longWordSplit: { wrapCharacters: ['_', '/'], forceWrapOnLimit: false }} )) | ||
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split_properly_across_\nanewlineandlong word_with_following_text.'); | ||
it('should wrap on the last instance of a wrap character before the wordwrap limit. Order of wrapCharacters shouldn\'t matter.', function () { | ||
const testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>'; | ||
expect(htmlToText(testString, { longWordSplit: { wrapCharacters: ['_', '/'], forceWrapOnLimit: false } })) | ||
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split_properly_across_\nanewlineandlong word_with_following_text.'); | ||
}); | ||
it('should wrap on the last instance of a wrap character before the wordwrap limit. Should preference wrapCharacters in order', function() { | ||
var testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split-properly_across_anewlineandlong\nword_with_following_text.</p>'; | ||
expect(htmlToText.fromString(testString, { longWordSplit: { wrapCharacters: ['-', '_', '/'], forceWrapOnLimit: false }} )) | ||
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split-\nproperly_across_anewlineandlong word_with_following_text.'); | ||
it('should wrap on the last instance of a wrap character before the wordwrap limit. Should preference wrapCharacters in order', function () { | ||
const testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split-properly_across_anewlineandlong\nword_with_following_text.</p>'; | ||
expect(htmlToText(testString, { longWordSplit: { wrapCharacters: ['-', '_', '/'], forceWrapOnLimit: false } })) | ||
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split-\nproperly_across_anewlineandlong word_with_following_text.'); | ||
}); | ||
it('should not wrap a string that is too short', function() { | ||
var testString = '<p>https://github.com/werk85/node-html-to-text/blob/master/lib/html-to-text.js</p>'; | ||
expect(htmlToText.fromString(testString, { longWordSplit: { wrapCharacters: ['/', '-'], forceWrapOnLimit: false }} )) | ||
.to.equal('https://github.com/werk85/node-html-to-text/blob/master/lib/html-to-text.js'); | ||
it('should not wrap a string that is too short', function () { | ||
const testString = '<p>https://github.com/werk85/node-html-to-text/blob/master/lib/html-to-text.js</p>'; | ||
expect(htmlToText(testString, { longWordSplit: { wrapCharacters: ['/', '-'], forceWrapOnLimit: false } })) | ||
.to.equal('https://github.com/werk85/node-html-to-text/blob/master/lib/html-to-text.js'); | ||
}); | ||
it('should wrap a url string using \'/\'', function() { | ||
var testString = '<p>https://github.com/AndrewFinlay/node-html-to-text/commit/64836a5bd97294a672b24c26cb8a3ccdace41001</p>'; | ||
expect(htmlToText.fromString(testString, { longWordSplit: { wrapCharacters: ['/', '-'], forceWrapOnLimit: false }} )) | ||
.to.equal('https://github.com/AndrewFinlay/node-html-to-text/commit/\n64836a5bd97294a672b24c26cb8a3ccdace41001'); | ||
it('should wrap a url string using \'/\'', function () { | ||
const testString = '<p>https://github.com/AndrewFinlay/node-html-to-text/commit/64836a5bd97294a672b24c26cb8a3ccdace41001</p>'; | ||
expect(htmlToText(testString, { longWordSplit: { wrapCharacters: ['/', '-'], forceWrapOnLimit: false } })) | ||
.to.equal('https://github.com/AndrewFinlay/node-html-to-text/commit/\n64836a5bd97294a672b24c26cb8a3ccdace41001'); | ||
}); | ||
it('should wrap very long url strings using \'/\'', function() { | ||
var testString = '<p>https://github.com/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/html-to-text.js</p>'; | ||
expect(htmlToText.fromString(testString, { longWordSplit: { wrapCharacters: ['/', '-'], forceWrapOnLimit: false }} )) | ||
.to.equal('https://github.com/werk85/node-html-to-text/blob/master/lib/werk85/\nnode-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/\nwerk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/\nlib/html-to-text.js'); | ||
it('should wrap very long url strings using \'/\'', function () { | ||
const testString = '<p>https://github.com/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/html-to-text.js</p>'; | ||
expect(htmlToText(testString, { longWordSplit: { wrapCharacters: ['/', '-'], forceWrapOnLimit: false } })) | ||
.to.equal('https://github.com/werk85/node-html-to-text/blob/master/lib/werk85/\nnode-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/\nwerk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/\nlib/html-to-text.js'); | ||
}); | ||
it('should wrap very long url strings using limit', function() { | ||
var testString = '<p>https://github.com/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/html-to-text.js</p>'; | ||
expect(htmlToText.fromString(testString, { longWordSplit: { wrapCharacters: [], forceWrapOnLimit: true }} )) | ||
.to.equal('https://github.com/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-\ntext/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-t\no-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/html-to-text.js'); | ||
it('should wrap very long url strings using limit', function () { | ||
const testString = '<p>https://github.com/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/html-to-text.js</p>'; | ||
expect(htmlToText(testString, { longWordSplit: { wrapCharacters: [], forceWrapOnLimit: true } })) | ||
.to.equal('https://github.com/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-\ntext/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-t\no-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/html-to-text.js'); | ||
}); | ||
it('should honour preserveNewlines and split long words', function() { | ||
var testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>'; | ||
expect(htmlToText.fromString(testString, { preserveNewlines: true, longWordSplit: { wrapCharacters: ['/', '_'], forceWrapOnLimit: false }} )) | ||
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split_properly_across_\nanewlineandlong\nword_with_following_text.'); | ||
it('should honour preserveNewlines and split long words', function () { | ||
const testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>'; | ||
expect(htmlToText(testString, { preserveNewlines: true, longWordSplit: { wrapCharacters: ['/', '_'], forceWrapOnLimit: false } })) | ||
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split_properly_across_\nanewlineandlong\nword_with_following_text.'); | ||
}); | ||
it('should not put in extra linefeeds if the end of the untouched long string coincides with a preserved line feed', function() { | ||
var testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>'; | ||
expect(htmlToText.fromString(testString, { preserveNewlines: true } )) | ||
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.'); | ||
it('should not put in extra linefeeds if the end of the untouched long string coincides with a preserved line feed', function () { | ||
const testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>'; | ||
expect(htmlToText(testString, { preserveNewlines: true })) | ||
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.'); | ||
}); | ||
it('should split long strings buried in links and hide the href', function() { | ||
var testString = '<a href="http://images.fb.com/2015/12/21/ivete-sangalo-launches-360-music-video-on-facebook/">http://images.fb.com/2015/12/21/ivete-sangalo-launches-360-music-video-on-facebook/</a>'; | ||
expect(htmlToText.fromString(testString, { hideLinkHrefIfSameAsText: true, longWordSplit: { wrapCharacters: ['/', '_'], forceWrapOnLimit: false }} )) | ||
.to.equal('http://images.fb.com/2015/12/21/\nivete-sangalo-launches-360-music-video-on-facebook/'); | ||
it('should split long strings buried in links and hide the href', function () { | ||
const testString = '<a href="http://images.fb.com/2015/12/21/ivete-sangalo-launches-360-music-video-on-facebook/">http://images.fb.com/2015/12/21/ivete-sangalo-launches-360-music-video-on-facebook/</a>'; | ||
expect(htmlToText(testString, { hideLinkHrefIfSameAsText: true, longWordSplit: { wrapCharacters: ['/', '_'], forceWrapOnLimit: false } })) | ||
.to.equal('http://images.fb.com/2015/12/21/\nivete-sangalo-launches-360-music-video-on-facebook/'); | ||
}); | ||
it('should split long strings buried in links and show the href', function() { | ||
var testString = '<a href="http://images.fb.com/2015/12/21/ivete-sangalo-launches-360-music-video-on-facebook/">http://images.fb.com/2015/12/21/ivete-sangalo-launches-360-music-video-on-facebook/</a>'; | ||
expect(htmlToText.fromString(testString, { hideLinkHrefIfSameAsText: false, longWordSplit: { wrapCharacters: ['/', '_'], forceWrapOnLimit: false }} )) | ||
.to.equal('http://images.fb.com/2015/12/21/\nivete-sangalo-launches-360-music-video-on-facebook/\n[http://images.fb.com/2015/12/21/\nivete-sangalo-launches-360-music-video-on-facebook/]'); | ||
it('should split long strings buried in links and show the href', function () { | ||
const testString = '<a href="http://images.fb.com/2015/12/21/ivete-sangalo-launches-360-music-video-on-facebook/">http://images.fb.com/2015/12/21/ivete-sangalo-launches-360-music-video-on-facebook/</a>'; | ||
expect(htmlToText(testString, { hideLinkHrefIfSameAsText: false, longWordSplit: { wrapCharacters: ['/', '_'], forceWrapOnLimit: false } })) | ||
.to.equal('http://images.fb.com/2015/12/21/\nivete-sangalo-launches-360-music-video-on-facebook/\n[http://images.fb.com/2015/12/21/\nivete-sangalo-launches-360-music-video-on-facebook/]'); | ||
}); | ||
}); | ||
describe('whitespace', function() { | ||
it('should not be ignored inside a whitespace-only node', function() { | ||
var testString = 'foo<span> </span>bar'; | ||
expect(htmlToText.fromString(testString)).to.equal('foo bar'); | ||
describe('whitespace', function () { | ||
it('should not be ignored inside a whitespace-only node', function () { | ||
const testString = 'foo<span> </span>bar'; | ||
expect(htmlToText(testString)).to.equal('foo bar'); | ||
}); | ||
it('should not add additional whitespace after <sup>', function() { | ||
var testString = '<p>This text contains <sup>superscript</sup> text.</p>'; | ||
var options = { preserveNewlines: true }; | ||
it('should not add additional whitespace after <sup>', function () { | ||
const testString = '<p>This text contains <sup>superscript</sup> text.</p>'; | ||
const options = { preserveNewlines: true }; | ||
expect(htmlToText.fromString(testString, options)).to.equal('This text contains superscript text.'); | ||
expect(htmlToText(testString, options)).to.equal('This text contains superscript text.'); | ||
}); | ||
it('should handle custom whitespace characters', function () { | ||
// No-Break Space - decimal 160, hex \u00a0. | ||
const testString = '<span>first span\u00a0</span>\u00a0<span>\u00a0last span</span>'; | ||
const expectedDefault = 'first span\u00a0\u00a0\u00a0last span'; | ||
const expectedCustom = 'first span last span'; | ||
const options = { whitespaceCharacters: ' \t\r\n\f\u200b\u00a0' }; | ||
expect(htmlToText(testString)).to.equal(expectedDefault); | ||
expect(htmlToText(testString, options)).to.equal(expectedCustom); | ||
}); | ||
it('should handle space and newline combination - keep space when and only when needed', function () { | ||
const testString = '<span>foo</span> \n<span>bar</span>\n <span>baz</span>'; | ||
const defaultResult = htmlToText(testString); | ||
const resultWithNewLine = htmlToText(testString, { preserveNewlines: true }); | ||
expect(defaultResult).to.equal('foo bar baz'); | ||
expect(resultWithNewLine).to.equal('foo\nbar\nbaz'); | ||
}); | ||
it('should not have extra spaces at the beginning for space-indented html', function () { | ||
const html = /*html*/`<html> | ||
<body> | ||
<p>foo</p> | ||
<p>bar</p> | ||
</body> | ||
</html>`; | ||
const text = htmlToText(html); | ||
expect(text).to.equal('foo\n\nbar'); | ||
}); | ||
it('should not have extra spaces at the beginning for space-indented html with explicitly block-level tags', function () { | ||
const html = /*html*/`<html> | ||
<body> | ||
<div>foo</div> | ||
<div>bar</div> | ||
</body> | ||
</html>`; | ||
expect(htmlToText(html, { tags: { 'div': { format: 'block', level: 'block' } } })).to.equal('foo\nbar'); | ||
}); | ||
}); | ||
describe('wbr', function() { | ||
it('should handle a large number of wbr tags w/o stack overflow', function() { | ||
var testString = "<!DOCTYPE html><html><head></head><body>\n"; | ||
var expectedResult = ""; | ||
for (var i = 0; i < 1000; i++){ | ||
describe('lots of tags, limits', function () { | ||
it('should handle a large number of wbr tags w/o stack overflow', function () { | ||
let testString = '<!DOCTYPE html><html><head></head><body>\n'; | ||
let expectedResult = ''; | ||
for (let i = 0; i < 10000; i++) { | ||
if (i !== 0 && i % 80 === 0) { | ||
expectedResult += "\n"; | ||
expectedResult += '\n'; | ||
} | ||
expectedResult += "n"; | ||
testString += "<wbr>n"; | ||
expectedResult += 'n'; | ||
testString += '<wbr>n'; | ||
} | ||
testString += "</body></html>"; | ||
expect(htmlToText.fromString(testString)).to.equal(expectedResult); | ||
testString += '</body></html>'; | ||
expect(htmlToText(testString)).to.equal(expectedResult); | ||
}); | ||
it('should handle a very large number of wbr tags with limits', function () { | ||
let testString = '<!DOCTYPE html><html><head></head><body>'; | ||
for (let i = 0; i < 70000; i++) { | ||
testString += '<wbr>n'; | ||
} | ||
testString += '</body></html>'; | ||
const options = { | ||
limits: { | ||
maxChildNodes: 10, | ||
ellipsis: '(...)' | ||
} | ||
}; | ||
const expectedResult = 'nnnnn(...)'; | ||
expect(htmlToText(testString, options)).to.equal(expectedResult); | ||
}); | ||
it('should respect maxDepth limit', function () { | ||
const testString = /*html*/`<!DOCTYPE html><html><head></head><body><span>a<span>b<span>c<span>d</span>e</span>f</span>g<span>h<span>i<span>j</span>k</span>l</span>m</span></body></html>`; | ||
const options = { | ||
limits: { | ||
maxDepth: 2, | ||
ellipsis: '(...)' | ||
} | ||
}; | ||
const expectedResult = 'a(...)g(...)m'; | ||
expect(htmlToText(testString, options)).to.equal(expectedResult); | ||
}); | ||
it('should respect maxChildNodes limit', function () { | ||
const testString = /*html*/`<!DOCTYPE html><html><head></head><body><p>a</p><p>b</p><p>c</p><p>d</p><p>e</p><p>f</p><p>g</p><p>h</p><p>i</p><p>j</p></body></html>`; | ||
const options = { | ||
singleNewLineParagraphs: true, | ||
limits: { | ||
maxChildNodes: 6, | ||
ellipsis: '(skipped the rest)' | ||
} | ||
}; | ||
const expectedResult = 'a\nb\nc\nd\ne\nf\n(skipped the rest)'; | ||
expect(htmlToText(testString, options)).to.equal(expectedResult); | ||
}); | ||
it('should not add ellipsis when maxChildNodes limit is exact match', function () { | ||
const testString = /*html*/`<!DOCTYPE html><html><head></head><body><p>a</p><p>b</p><p>c</p><p>d</p><p>e</p><p>f</p><p>g</p><p>h</p><p>i</p><p>j</p></body></html>`; | ||
const options = { | ||
singleNewLineParagraphs: true, | ||
limits: { | ||
maxChildNodes: 10, | ||
ellipsis: 'can\'t see me' | ||
} | ||
}; | ||
const expectedResult = 'a\nb\nc\nd\ne\nf\ng\nh\ni\nj'; | ||
expect(htmlToText(testString, options)).to.equal(expectedResult); | ||
}); | ||
it('should use default ellipsis value if none provided', function () { | ||
const testString = /*html*/`<!DOCTYPE html><html><head></head><body><p>a</p><p>b</p><p>c</p><p>d</p><p>e</p><p>f</p><p>g</p><p>h</p><p>i</p><p>j</p></body></html>`; | ||
const options = { | ||
singleNewLineParagraphs: true, | ||
limits: { maxChildNodes: 6 } | ||
}; | ||
const expectedResult = 'a\nb\nc\nd\ne\nf\n...'; | ||
expect(htmlToText(testString, options)).to.equal(expectedResult); | ||
}); | ||
}); | ||
describe('blockquote', function() { | ||
it('should handle format blockquote', function() { | ||
var testString = 'foo<blockquote>test</blockquote>bar'; | ||
var expectedResult = 'foo> test\nbar'; | ||
expect(htmlToText.fromString(testString)).to.equal(expectedResult); | ||
describe('limits.maxInputLength', function () { | ||
const processStderrWrite = process.stderr.write; | ||
let processStderrWriteBuffer; | ||
const overwriteProcessStderrWrite = () => { | ||
processStderrWriteBuffer = ''; | ||
process.stderr.write = (text) => { processStderrWriteBuffer += text; }; | ||
}; | ||
const getProcessStderrBuffer = () => processStderrWriteBuffer; | ||
const resetProcessStderrWrite = () => { process.stderr.write = processStderrWrite; }; | ||
beforeEach(function () { overwriteProcessStderrWrite(); }); | ||
afterEach(function () { resetProcessStderrWrite(); }); | ||
it('should respect default limit of maxInputLength', function () { | ||
const testString = '0123456789'.repeat(2000000); | ||
const options = { wordwrap: false }; | ||
expect(htmlToText(testString, options).length).to.equal(1 << 24); | ||
expect(getProcessStderrBuffer()).to.equal('Input lenght 20000000 is above allowed limit of 16777216. Truncating without ellipsis.\n'); | ||
}); | ||
it('should respect custom maxInputLength', function () { | ||
const testString = '0123456789'.repeat(2000000); | ||
const options = { limits: { maxInputLength: 42 } }; | ||
expect(htmlToText(testString, options).length).to.equal(42); | ||
expect(getProcessStderrBuffer()).to.equal('Input lenght 20000000 is above allowed limit of 42. Truncating without ellipsis.\n'); | ||
}); | ||
}); | ||
describe('blockquote', function () { | ||
it('should handle format single-line blockquote', function () { | ||
const testString = 'foo<blockquote>test</blockquote>bar'; | ||
const expectedResult = 'foo\n\n> test\n\nbar'; | ||
expect(htmlToText(testString)).to.equal(expectedResult); | ||
}); | ||
it('should format multi-line blockquote', function () { | ||
const testString = '<blockquote>a<br/>b</blockquote>'; | ||
const expectedResult = '> a\n> b'; | ||
expect(htmlToText(testString)).to.equal(expectedResult); | ||
}); | ||
it('should trim newlines unless disabled', function () { | ||
const testString = '<blockquote><br/>a<br/><br/><br/></blockquote>'; | ||
const expectedDefaultResult = '> a'; | ||
const expectedCustomResult = '> \n> a\n> \n> \n> '; | ||
expect(htmlToText(testString)).to.equal(expectedDefaultResult); | ||
expect(htmlToText(testString, { tags: { 'blockquote': { options: { trimEmptyLines: false } } } })).to.equal(expectedCustomResult); | ||
}); | ||
}); | ||
describe('pre', function () { | ||
it('should support simple preformatted text', function () { | ||
const testString = '<P>Code fragment:</P><PRE> body {\n color: red;\n }</PRE>'; | ||
const expectedResult = 'Code fragment:\n\n body {\n color: red;\n }'; | ||
expect(htmlToText(testString)).to.equal(expectedResult); | ||
}); | ||
it('should support preformatted text with inner tags', function () { | ||
const testString = /*html*/`<p>Code fragment:</p> | ||
<pre><code> var total = 0; | ||
<em style="color: green;">// Add 1 to total and display in a paragraph</em> | ||
<strong style="color: blue;">document.write('<p>Sum: ' + (total + 1) + '</p>');</strong></code></pre>`; | ||
const expectedResult = `Code fragment:\n\n var total = 0;\n\n // Add 1 to total and display in a paragraph\n document.write('<p>Sum: ' + (total + 1) + '</p>');`; | ||
expect(htmlToText(testString)).to.equal(expectedResult); | ||
}); | ||
it('should support preformatted text with line break tags', function () { | ||
const testString = '<pre> line 1 <br/> line 2 </pre>'; | ||
const expectedResult = ' line 1 \n line 2 '; | ||
expect(htmlToText(testString)).to.equal(expectedResult); | ||
}); | ||
it('should support preformatted text with a table', function () { | ||
const testString = /*html*/` | ||
<pre><table> | ||
<tr> | ||
<td>[a    | ||
</td> | ||
<td> b   | ||
</td> | ||
<td> c] | ||
</td> | ||
</tr> | ||
<tr> | ||
<td>      | ||
d]</td> | ||
<td>      | ||
e </td> | ||
<td>      | ||
[f </td> | ||
</tr> | ||
</table></pre>`; | ||
const expectedResult = | ||
'[a b c]\n' + | ||
' \n' + | ||
' \n' + | ||
' d] e [f '; | ||
expect(htmlToText(testString, { tables: true })).to.equal(expectedResult); | ||
}); | ||
}); | ||
describe('hr', function () { | ||
it('should output horizontal line of default length', function () { | ||
const testString = '<div>foo</div><hr/><div>bar</div>'; | ||
const expectedResult = 'foo\n\n--------------------------------------------------------------------------------\n\nbar'; | ||
expect(htmlToText(testString)).to.equal(expectedResult); | ||
}); | ||
it('should output horizontal line of specific length', function () { | ||
const testString = '<div>foo</div><hr/><div>bar</div>'; | ||
const expectedResult = 'foo\n\n------------------------------\n\nbar'; | ||
expect(htmlToText(testString, { tags: { 'hr': { options: { length: 30 } } } })).to.equal(expectedResult); | ||
}); | ||
it('should output horizontal line of length 40 when wordwrap is disabled', function () { | ||
const testString = '<div>foo</div><hr/><div>bar</div>'; | ||
const expectedResult = 'foo\n\n----------------------------------------\n\nbar'; | ||
expect(htmlToText(testString, { wordwrap: false })).to.equal(expectedResult); | ||
}); | ||
}); | ||
}); |
@@ -1,11 +0,11 @@ | ||
INVOICE ADDRESS SHIPMENT ADDRESS | ||
Mr. Mr. | ||
John Doe John Doe | ||
Featherstone Street 49 Featherstone Street 49 | ||
28199 Bremen 28199 Bremen | ||
INVOICE ADDRESS SHIPMENT ADDRESS | ||
Mr. Mr. | ||
John Doe John Doe | ||
Featherstone Street 49 Featherstone Street 49 | ||
28199 Bremen 28199 Bremen | ||
INVOICE ADDRESS SHIPMENT ADDRESS | ||
Mr. Mr. | ||
John Doe John Doe | ||
Featherstone Street 49 Featherstone Street 49 | ||
INVOICE ADDRESS SHIPMENT ADDRESS | ||
Mr. Mr. | ||
John Doe John Doe | ||
Featherstone Street 49 Featherstone Street 49 | ||
28199 Bremen 28199 Bremen |
@@ -1,5 +0,5 @@ | ||
INVOICE ADDRESS SHIPMENT ADDRESS | ||
Mr. Mr. | ||
John Doe John Doe | ||
Featherstone Street 49 Featherstone Street 49 | ||
INVOICE ADDRESS SHIPMENT ADDRESS | ||
Mr. Mr. | ||
John Doe John Doe | ||
Featherstone Street 49 Featherstone Street 49 | ||
28199 Bremen 28199 Bremen |
@@ -1,6 +0,6 @@ | ||
INVOICE ADDRESS SHIPMENT ADDRESS | ||
Mr. Mr. | ||
John Doe John Doe | ||
Featherstone Street 49 Featherstone Street 49 | ||
28199 Bremen 28199 Bremen | ||
INVOICE ADDRESS SHIPMENT ADDRESS | ||
Mr. Mr. | ||
John Doe John Doe | ||
Featherstone Street 49 Featherstone Street 49 | ||
28199 Bremen 28199 Bremen | ||
@@ -14,6 +14,6 @@ At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd | ||
INVOICE ADDRESS SHIPMENT ADDRESS | ||
Mr. Mr. | ||
John Doe John Doe | ||
Featherstone Street 49 Featherstone Street 49 | ||
INVOICE ADDRESS SHIPMENT ADDRESS | ||
Mr. Mr. | ||
John Doe John Doe | ||
Featherstone Street 49 Featherstone Street 49 | ||
28199 Bremen 28199 Bremen |
@@ -1,8 +0,14 @@ | ||
var expect = require('chai').expect; | ||
var fs = require('fs'); | ||
const { exec } = require('child_process'); | ||
const fs = require('fs'); | ||
var exec = require('child_process').exec; | ||
const { expect } = require('chai'); | ||
function runWithInputAndExpect(input, args, expectedOutput, done) { | ||
exec('echo "' + input.replace(/"/g, '\\"') + '" | node bin/cli.js ' + args, function callback(error, stdout, stderr) { | ||
const isWin = process.platform === 'win32'; | ||
function runWithInputAndExpect (input, args, expectedOutput, done) { | ||
const command = isWin | ||
? 'echo.' + input.replace(/[<>]/g, '^^^$&') + ' | node bin/cli.js ' + args | ||
: 'echo "' + input.replace(/"/g, '\\"') + '" | node bin/cli.js ' + args; | ||
exec(command, function callback (error, stdout, stderr) { | ||
expect(error).to.be.a('null'); | ||
@@ -15,4 +21,7 @@ expect(stderr).to.equal(''); | ||
describe('cli arguments', function() { | ||
it('should output nothing with empty input', function(done) { | ||
describe('cli arguments', function () { | ||
this.timeout(5000); | ||
it('should output nothing with empty input', function (done) { | ||
runWithInputAndExpect('', '', '', done); | ||
@@ -26,3 +35,4 @@ }); | ||
'Hello alt text [http://my.img/here.jpg]!', | ||
done); | ||
done | ||
); | ||
}); | ||
@@ -35,3 +45,4 @@ | ||
'Hello !', | ||
done); | ||
done | ||
); | ||
}); | ||
@@ -44,3 +55,4 @@ | ||
'test [http://my.link]', | ||
done); | ||
done | ||
); | ||
}); | ||
@@ -53,3 +65,4 @@ | ||
'test', | ||
done); | ||
done | ||
); | ||
}); | ||
@@ -61,4 +74,5 @@ | ||
'', | ||
' 123456789 123456789 123456789 123456789 123456789 123456789 123456789 123456789\n123456789', | ||
done); | ||
'123456789 123456789 123456789 123456789 123456789 123456789 123456789 123456789\n123456789', | ||
done | ||
); | ||
}); | ||
@@ -70,4 +84,5 @@ | ||
'--wordwrap=40', | ||
' 123456789 123456789 123456789 123456789\n123456789 123456789 123456789 123456789\n123456789', | ||
done); | ||
'123456789 123456789 123456789 123456789\n123456789 123456789 123456789 123456789\n123456789', | ||
done | ||
); | ||
}); | ||
@@ -80,3 +95,4 @@ | ||
'test [http://my.link]', | ||
done); | ||
done | ||
); | ||
}); | ||
@@ -89,13 +105,17 @@ | ||
'test http://my.link', | ||
done); | ||
done | ||
); | ||
}); | ||
it('should support --tables definitions with commas', function(done) { | ||
var expectedTxt = fs.readFileSync('test/test.txt', 'utf8'); | ||
it('should support --tables definitions with commas', function (done) { | ||
const expectedTxt = fs.readFileSync('test/test.txt', 'utf8'); | ||
function runWithArgs(args, callback) { | ||
exec('cat test/test.html | node bin/cli.js ' + args, callback); | ||
function runWithArgs (args, callback) { | ||
const command = isWin | ||
? 'node bin/cli.js ' + args + ' < test/test.html' | ||
: 'cat test/test.html | node bin/cli.js ' + args; | ||
exec(command, callback); | ||
} | ||
runWithArgs('--tables=#invoice,.address', function callback(error, stdout, stderr) { | ||
runWithArgs('--tables=#invoice,.address', function callback (error, stdout, stderr) { | ||
expect(error).to.be.a('null'); | ||
@@ -107,2 +127,2 @@ expect(stderr).to.equal(''); | ||
}); | ||
}); | ||
}); |
PARAGRAPHS | ||
At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd | ||
@@ -16,17 +17,19 @@ gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum | ||
-------------------------------------------------------------------------------- | ||
PRETTY PRINTED TABLE | ||
ARTICLE PRICE TAXES AMOUNT TOTAL | ||
Product 1 6,99€ 7% 1 6,99€ | ||
Contains: 1x Product 1 | ||
Shipment costs 3,25€ 7% 1 3,25€ | ||
to pay: 10,24€ | ||
Taxes 7%: 0,72€ | ||
ARTICLE PRICE TAXES AMOUNT TOTAL | ||
Product 1 6,99€ 7% 1 6,99€ | ||
Contains: 1x Product 1 | ||
Shipment costs 3,25€ 7% 1 3,25€ | ||
to pay: 10,24€ | ||
Taxes 7%: 0,72€ | ||
-------------------------------------------------------------------------------- | ||
LISTS | ||
* At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd | ||
@@ -42,16 +45,18 @@ gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. | ||
-------------------------------------------------------------------------------- | ||
COLUMN LAYOUT WITH TABLES | ||
INVOICE ADDRESS SHIPMENT ADDRESS | ||
Mr. Mr. | ||
John Doe John Doe | ||
Featherstone Street 49 Featherstone Street 49 | ||
28199 Bremen 28199 Bremen | ||
INVOICE ADDRESS SHIPMENT ADDRESS | ||
Mr. Mr. | ||
John Doe John Doe | ||
Featherstone Street 49 Featherstone Street 49 | ||
28199 Bremen 28199 Bremen | ||
-------------------------------------------------------------------------------- | ||
MAILTO FORMATING | ||
Some Company | ||
@@ -58,0 +63,0 @@ Some Street 42 |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Major refactor
Supply chain risk: The package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
New author
Supply chain risk: A new npm collaborator published a version of the package for the first time. New collaborators are usually benign additions to a project, but do indicate a change to the security surface area of a package.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
159073
28
2990
5
9
240
3
3
+ Added deepmerge@^4.2.2
+ Added deepmerge@4.3.1 (transitive)
+ Added dom-serializer@1.4.1 (transitive)
+ Added domhandler@3.3.0, domhandler@4.3.1 (transitive)
+ Added domutils@2.8.0 (transitive)
+ Added htmlparser2@4.1.0 (transitive)
- Removed dom-serializer@0.2.2 (transitive)
- Removed domelementtype@1.3.1 (transitive)
- Removed domhandler@2.4.2 (transitive)
- Removed domutils@1.7.0 (transitive)
- Removed entities@1.1.2 (transitive)
- Removed htmlparser2@3.10.1 (transitive)
- Removed inherits@2.0.4 (transitive)
- Removed readable-stream@3.6.2 (transitive)
- Removed safe-buffer@5.2.1 (transitive)
- Removed string_decoder@1.3.0 (transitive)
- Removed util-deprecate@1.0.2 (transitive)
Updated htmlparser2@^4.1.0
Updated lodash@^4.17.20
Updated minimist@^1.2.5