Huge News!Announcing our $40M Series B led by Abstract Ventures.Learn More
Socket
Sign inDemoInstall
Socket

html-to-text

Package Overview
Dependencies
Maintainers
2
Versions
55
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

html-to-text - npm Package Compare versions

Comparing version 6.0.0 to 7.0.0

.github/ISSUE_TEMPLATE/bug_report.md

6

.eslintrc.js

@@ -8,4 +8,3 @@ module.exports = {

'plugin:jsdoc/recommended',
'plugin:mocha/recommended',
'plugin:you-dont-need-lodash-underscore/compatible'
'plugin:mocha/recommended'
],

@@ -18,2 +17,3 @@ parserOptions: {},

globals: {},
settings: { 'jsdoc': { mode: 'typescript' } }, // allow compact callback types in particular
rules: {

@@ -77,3 +77,3 @@ // Best practices

'function-call-argument-newline': ['error', 'consistent'],
'function-paren-newline': ['error', "consistent"],
'function-paren-newline': ['error', 'consistent'],
'indent': ['error', 2, { 'SwitchCase': 1, 'flatTernaryExpressions': true }],

@@ -80,0 +80,0 @@ 'key-spacing': ['error'],

# Changelog
## Version 7.0.0
### Node version
Required Node version is now >=10.23.2.
### Dependency updates
* `lodash` dependency is removed;
* `htmlparser2` updated from 4.1.0 to 6.0.0 ([Release notes](https://github.com/fb55/htmlparser2/releases), also [domhandler](https://github.com/fb55/domhandler/releases/tag/v4.0.0)). There is a slim chance you can run into some differences in case you're relying on it heavily in your custom formatters;
* dev dependencies are bumped.
### Custom formatters API change
[BlockTextBuilder](https://github.com/html-to-text/node-html-to-text/blob/master/lib/block-text-builder.js) methods now accept option objects for optional arguments. This improves client code readability and makes it easy to introduce extra options. It will see some use in future updates.
Positional arguments introduced in version 6.0.0 are now deprecated. Formatters written for the version 6.0.0 should keep working for now but the compatibility layer is rather inconvenient and will be removed with the next major version.
See the commit [f50f10f](https://github.com/html-to-text/node-html-to-text/commit/f50f10f54cf814efb2f7633d9d377ba7eadeaf1e). Changes in `lib/formatter.js` file are illustrative for how to migrate to the new API.
### And more
* Bunch of documentation and test updates.
All commits: [6.0.0...7.0.0](https://github.com/html-to-text/node-html-to-text/compare/6.0.0...7.0.0)
Version 7 roadmap issue: [#222](https://github.com/html-to-text/node-html-to-text/issues/222)
## Version 6.0.0

@@ -4,0 +32,0 @@

@@ -1,8 +0,7 @@

// eslint-disable-next-line you-dont-need-lodash-underscore/trim
const trim = require('lodash/trim');
const { tableToString } = require('./helper');
const { trimCharacter } = require('./helper');
// eslint-disable-next-line no-unused-vars
const { StackItem, BlockStackItem, TableCellStackItem, TableRowStackItem, TableStackItem, TransformerStackItem }
= require('./stack-item');
const { tableToString } = require('./table-printer');
const { WhitespaceProcessor } = require('./whitespace-processor');

@@ -103,6 +102,22 @@

*
* @param { string } str Text content of a node to add.
* @param { boolean } [ noWordTransform = false ] Ignore word transformers if there are any.
* @param { string } str
* Text content of a node to add.
*
* @param { object | boolean } [ optionsObjectOrNoWordTransform ]
* Object holding the parameters of the operation.
*
* Boolean value is deprecated.
*
* @param { boolean } [ optionsObjectOrNoWordTransform.noWordTransform = false ]
* Ignore word transformers if there are any.
*/
addInline (str, noWordTransform = false) {
addInline (str, optionsObjectOrNoWordTransform = {}) {
if (typeof optionsObjectOrNoWordTransform === 'object') {
this._addInline(str, optionsObjectOrNoWordTransform);
} else {
this._addInline(str, { noWordTransform: optionsObjectOrNoWordTransform });
}
}
_addInline (str, { noWordTransform = false } = {}) {
if (!(

@@ -137,12 +152,35 @@ this._stackItem instanceof BlockStackItem

*
* @param { number } [leadingLineBreaks = 1]
* @param { object | number } [optionsObjectOrLeadingLineBreaks]
* Object holding the parameters of the block.
*
* Number value is deprecated.
*
* @param { number } [optionsObjectOrLeadingLineBreaks.leadingLineBreaks = 1]
* This block should have at least this number of line breaks to separate it from any preceding block.
*
* @param { number } [reservedLineLength = 0]
* @param { number } [optionsObjectOrLeadingLineBreaks.reservedLineLength = 0]
* Reserve this number of characters on each line for block markup.
*
* @param { boolean } [isPre = false]
* @param { boolean } [optionsObjectOrLeadingLineBreaks.isPre = false]
* Should HTML whitespace be preserved inside this block.
*
* @param { number } [reservedLineLength]
* Deprecated.
*
* @param { boolean } [isPre]
* Deprecated.
*/
openBlock (leadingLineBreaks = 1, reservedLineLength = 0, isPre = false) {
openBlock (optionsObjectOrLeadingLineBreaks = {}, reservedLineLength = undefined, isPre = undefined) {
  // Treat only genuine objects as the options object. `typeof null` is also 'object',
  // but `_openBlock` destructures its argument and destructuring `null` throws a TypeError.
  const isOptionsObject = optionsObjectOrLeadingLineBreaks !== null
    && typeof optionsObjectOrLeadingLineBreaks === 'object';
  if (isOptionsObject) {
    this._openBlock(optionsObjectOrLeadingLineBreaks);
  } else {
    // Deprecated positional form: (leadingLineBreaks, reservedLineLength, isPre).
    // Arguments left `undefined` fall back to the defaults declared by `_openBlock`.
    this._openBlock({
      isPre: isPre,
      leadingLineBreaks: optionsObjectOrLeadingLineBreaks,
      reservedLineLength: reservedLineLength,
    });
  }
}
_openBlock ({ leadingLineBreaks = 1, reservedLineLength = 0, isPre = false } = {}) {
const maxLineLength = Math.max(20, this._stackItem.inlineTextBuilder.maxLineLength - reservedLineLength);

@@ -161,6 +199,11 @@ this._stackItem = new BlockStackItem(

*
* @param { number } [trailingLineBreaks = 1]
* @param { object | number } [optionsObjectOrTrailingLineBreaks]
* Object holding the parameters of the block.
*
* Number value is deprecated.
*
* @param { number } [optionsObjectOrTrailingLineBreaks.trailingLineBreaks = 1]
* This block should have at least this number of line breaks to separate it from any following block.
*
* @param { (str: string) => string } [blockTransform = undefined]
* @param { (str: string) => string } [optionsObjectOrTrailingLineBreaks.blockTransform = undefined]
* A function to transform the block text before adding to the parent block.

@@ -170,4 +213,18 @@ * This happens after word wrap and should be used in combination with reserved line length

* Used for whole block markup.
*
* @param { (str: string) => string } [blockTransform]
* Deprecated.
*/
closeBlock (trailingLineBreaks = 1, blockTransform = undefined) {
closeBlock (optionsObjectOrTrailingLineBreaks = {}, blockTransform = undefined) {
  // Treat only genuine objects as the options object. `typeof null` is also 'object',
  // but `_closeBlock` destructures its argument and destructuring `null` throws a TypeError.
  const isOptionsObject = optionsObjectOrTrailingLineBreaks !== null
    && typeof optionsObjectOrTrailingLineBreaks === 'object';
  if (isOptionsObject) {
    this._closeBlock(optionsObjectOrTrailingLineBreaks);
  } else {
    // Deprecated positional form: (trailingLineBreaks, blockTransform).
    // Arguments left `undefined` fall back to the defaults declared by `_closeBlock`.
    this._closeBlock({
      trailingLineBreaks: optionsObjectOrTrailingLineBreaks,
      blockTransform: blockTransform,
    });
  }
}
_closeBlock ({ trailingLineBreaks = 1, blockTransform = undefined } = {}) {
const block = this._popStackItem();

@@ -198,5 +255,19 @@ const blockText = (blockTransform) ? blockTransform(getText(block)) : getText(block);

*
* @param { number } [maxColumnWidth = undefined] Wrap cell content to this width instead of global wordwrap value.
* @param { object | number } [optionsObjectOrMaxColumnWidth = undefined]
* Object holding the parameters of the cell.
*
* Number value is deprecated.
*
* @param { number } [optionsObjectOrMaxColumnWidth.maxColumnWidth = undefined]
* Wrap cell content to this width. Fall back to global wordwrap value if undefined.
*/
openTableCell (maxColumnWidth = undefined) {
openTableCell (optionsObjectOrMaxColumnWidth = {}) {
if (typeof optionsObjectOrMaxColumnWidth === 'object') {
this._openTableCell(optionsObjectOrMaxColumnWidth);
} else {
this._openTableCell({ maxColumnWidth: optionsObjectOrMaxColumnWidth });
}
}
_openTableCell ({ maxColumnWidth = undefined } = {}) {
if (!(this._stackItem instanceof TableRowStackItem)) {

@@ -211,8 +282,26 @@ throw new Error('Can\'t add table cell to something that is not a table row! Check the formatter.');

*
* @param { number } [colspan = 1] How many columns this cell should occupy.
* @param { number } [rowspan = 1] How many rows this cell should occupy.
* @param { object | number } [optionsObjectOrColspan]
* Object holding the parameters of the cell.
*
* Number value is deprecated.
*
* @param { number } [optionsObjectOrColspan.colspan = 1] How many columns this cell should occupy.
* @param { number } [optionsObjectOrColspan.rowspan = 1] How many rows this cell should occupy.
*
* @param { number } [rowspan] Deprecated.
*/
closeTableCell (colspan = 1, rowspan = 1) {
closeTableCell (optionsObjectOrColspan = {}, rowspan = undefined) {
  // Treat only genuine objects as the options object. `typeof null` is also 'object',
  // but `_closeTableCell` destructures its argument and destructuring `null` throws a TypeError.
  const isOptionsObject = optionsObjectOrColspan !== null
    && typeof optionsObjectOrColspan === 'object';
  if (isOptionsObject) {
    this._closeTableCell(optionsObjectOrColspan);
  } else {
    // Deprecated positional form: (colspan, rowspan).
    // Arguments left `undefined` fall back to the defaults declared by `_closeTableCell`.
    this._closeTableCell({
      colspan: optionsObjectOrColspan,
      rowspan: rowspan,
    });
  }
}
_closeTableCell ({ colspan = 1, rowspan = 1 } = {}) {
const cell = this._popStackItem();
const text = trim(getText(cell), '\n');
const text = trimCharacter(getText(cell), '\n');
cell.next.cells.push({ colspan: colspan, rowspan: rowspan, text: text });

@@ -232,15 +321,47 @@ }

*
* @param { number } [colSpacing = 3]
* @param { object | number } [optionsObjectOrColSpacing]
* Object holding the parameters of the table.
*
* Number value is deprecated.
*
* @param { number } [optionsObjectOrColSpacing.colSpacing = 3]
* Number of spaces between table columns.
*
* @param { number } [rowSpacing = 0]
* @param { number } [optionsObjectOrColSpacing.rowSpacing = 0]
* Number of empty lines between table rows.
*
* @param { number } [leadingLineBreaks = 2]
* @param { number } [optionsObjectOrColSpacing.leadingLineBreaks = 2]
* This table should have at least this number of line breaks to separate it from any preceding block.
*
* @param { number } [trailingLineBreaks = 2]
* @param { number } [optionsObjectOrColSpacing.trailingLineBreaks = 2]
* This table should have at least this number of line breaks to separate it from any following block.
*
* @param { number } [rowSpacing]
* Deprecated.
*
* @param { number } [leadingLineBreaks]
* Deprecated.
*
* @param { number } [trailingLineBreaks]
* Deprecated.
*/
closeTable (colSpacing = 3, rowSpacing = 0, leadingLineBreaks = 2, trailingLineBreaks = 2) {
closeTable (
optionsObjectOrColSpacing = {},
rowSpacing = undefined,
leadingLineBreaks = undefined,
trailingLineBreaks = undefined
) {
if (typeof optionsObjectOrColSpacing === 'object') {
this._closeTable(optionsObjectOrColSpacing);
} else {
this._closeTable({
colSpacing: optionsObjectOrColSpacing,
leadingLineBreaks: leadingLineBreaks,
rowSpacing: rowSpacing,
trailingLineBreaks: trailingLineBreaks
});
}
}
_closeTable ({ colSpacing = 3, rowSpacing = 0, leadingLineBreaks = 2, trailingLineBreaks = 2 } = {}) {
const table = this._popStackItem();

@@ -247,0 +368,0 @@ const output = tableToString(table.rows, rowSpacing, colSpacing);

const he = require('he');
const get = require('lodash/get');
// eslint-disable-next-line you-dont-need-lodash-underscore/trim
const trim = require('lodash/trim');
const trimStart = require('lodash/trimStart');
const { numberToLetterSequence, numberToRoman, splitClassesAndIds } = require('./helper');
const { get, numberToLetterSequence, numberToRoman, splitClassesAndIds, trimCharacter } = require('./helper');

@@ -37,5 +33,5 @@ // eslint-disable-next-line import/no-unassigned-import

function formatBlock (elem, walk, builder, formatOptions) {
builder.openBlock(formatOptions.leadingLineBreaks);
builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks });
walk(elem.children, builder);
builder.closeBlock(formatOptions.trailingLineBreaks);
builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks });
}

@@ -67,5 +63,5 @@

function formatHorizontalLine (elem, walk, builder, formatOptions) {
builder.openBlock(formatOptions.leadingLineBreaks || 2);
builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 2 });
builder.addInline('-'.repeat(formatOptions.length || builder.options.wordwrap || 40));
builder.closeBlock(formatOptions.trailingLineBreaks || 2);
builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 });
}

@@ -79,5 +75,5 @@

function formatParagraph (elem, walk, builder, formatOptions) {
builder.openBlock(formatOptions.leadingLineBreaks || 2);
builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 2 });
walk(elem.children, builder);
builder.closeBlock(formatOptions.trailingLineBreaks || 2);
builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 });
}

@@ -91,5 +87,8 @@

function formatPre (elem, walk, builder, formatOptions) {
builder.openBlock(formatOptions.leadingLineBreaks || 2, 0, true);
builder.openBlock({
isPre: true,
leadingLineBreaks: formatOptions.leadingLineBreaks || 2
});
walk(elem.children, builder);
builder.closeBlock(formatOptions.trailingLineBreaks || 2);
builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 });
}

@@ -103,3 +102,3 @@

function formatHeading (elem, walk, builder, formatOptions) {
builder.openBlock(formatOptions.leadingLineBreaks || 2);
builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 2 });
if (formatOptions.uppercase !== false) {

@@ -112,3 +111,3 @@ builder.pushWordTransform(str => str.toUpperCase());

}
builder.closeBlock(formatOptions.trailingLineBreaks || 2);
builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 });
}

@@ -122,11 +121,14 @@

function formatBlockquote (elem, walk, builder, formatOptions) {
builder.openBlock(formatOptions.leadingLineBreaks || 2, 2);
builder.openBlock({
leadingLineBreaks: formatOptions.leadingLineBreaks || 2,
reservedLineLength: 2
});
walk(elem.children, builder);
builder.closeBlock(
formatOptions.trailingLineBreaks || 2,
str => ((formatOptions.trimEmptyLines !== false) ? trim(str, '\n') : str)
builder.closeBlock({
trailingLineBreaks: formatOptions.trailingLineBreaks || 2,
blockTransform: str => ((formatOptions.trimEmptyLines !== false) ? trimCharacter(str, '\n') : str)
.split('\n')
.map(line => '> ' + line)
.join('\n')
);
});
}

@@ -196,3 +198,3 @@

: ' [' + href + ']',
true
{ noWordTransform: true }
);

@@ -211,3 +213,3 @@ }

function formatList (elem, walk, builder, formatOptions, nextPrefixCallback) {
const isNestedList = get(elem, 'parent.name') === 'li';
const isNestedList = get(elem, ['parent', 'name']) === 'li';

@@ -225,3 +227,3 @@ // With Roman numbers, index length is not as straightforward as with Arabic numbers or letters,

const prefix = (isNestedList)
? trimStart(nextPrefixCallback())
? nextPrefixCallback().trimStart()
: nextPrefixCallback();

@@ -233,14 +235,17 @@ if (prefix.length > maxPrefixLength) { maxPrefixLength = prefix.length; }

const reservedWidth = maxPrefixLength;
const spacing = '\n' + ' '.repeat(reservedWidth);
builder.openBlock(isNestedList ? 1 : (formatOptions.leadingLineBreaks || 2));
const reservedLineLength = maxPrefixLength;
const spacing = '\n' + ' '.repeat(reservedLineLength);
builder.openBlock({ leadingLineBreaks: isNestedList ? 1 : (formatOptions.leadingLineBreaks || 2) });
for (const { node, prefix } of listItems) {
builder.openBlock(1, reservedWidth);
builder.openBlock({
leadingLineBreaks: 1,
reservedLineLength: reservedLineLength
});
walk([node], builder);
builder.closeBlock(
1,
str => prefix + ' '.repeat(reservedWidth - prefix.length) + str.replace(/\n/g, spacing)
);
builder.closeBlock({
trailingLineBreaks: 1,
blockTransform: str => prefix + ' '.repeat(reservedLineLength - prefix.length) + str.replace(/\n/g, spacing)
});
}
builder.closeBlock(isNestedList ? 1 : (formatOptions.trailingLineBreaks || 2));
builder.closeBlock({ trailingLineBreaks: isNestedList ? 1 : (formatOptions.trailingLineBreaks || 2) });
}

@@ -317,15 +322,15 @@

elem.children.forEach(walkTable);
builder.closeTable(
formatOptions.colSpacing,
formatOptions.rowSpacing,
formatOptions.leadingLineBreaks,
formatOptions.trailingLineBreaks
);
builder.closeTable({
colSpacing: formatOptions.colSpacing,
leadingLineBreaks: formatOptions.leadingLineBreaks,
rowSpacing: formatOptions.rowSpacing,
trailingLineBreaks: formatOptions.trailingLineBreaks
});
function formatCell (cellNode) {
const colspan = +get(cellNode, 'attribs.colspan') || 1;
const rowspan = +get(cellNode, 'attribs.rowspan') || 1;
builder.openTableCell(formatOptions.maxColumnWidth);
const colspan = +get(cellNode, ['attribs', 'colspan']) || 1;
const rowspan = +get(cellNode, ['attribs', 'rowspan']) || 1;
builder.openTableCell({ maxColumnWidth: formatOptions.maxColumnWidth });
walk(cellNode.children, builder);
builder.closeTableCell(colspan, rowspan);
builder.closeTableCell({ colspan: colspan, rowspan: rowspan });
}

@@ -332,0 +337,0 @@

@@ -111,109 +111,63 @@

function getRow (matrix, j) {
if (!matrix[j]) { matrix[j] = []; }
return matrix[j];
/**
 * Strip every leading and trailing occurrence of a single character from a string.
 *
 * The input string itself is returned when there is nothing to strip.
 *
 * @param { string } str A string to trim.
 * @param { string } char A character to be trimmed.
 * @returns { string }
 */
function trimCharacter (str, char) {
  const length = str.length;

  // Walk forward past the leading run of `char`.
  let left = 0;
  for (; left < length; left++) {
    if (str[left] !== char) { break; }
  }

  // Walk backward past the trailing run of `char`, never crossing `left`.
  let right = length;
  for (; right > left; right--) {
    if (str[right - 1] !== char) { break; }
  }

  if (left === 0 && right === length) { return str; }
  return str.substring(left, right);
}
/**
 * Find the first index, starting from `x`, at which the row holds a falsy value.
 *
 * @param { any[] } row The row to scan.
 * @param { number } [x = 0] Index to start scanning from.
 * @returns { number }
 */
function findFirstVacantIndex (row, x = 0) {
  let index = x;
  // Skip over every occupied (truthy) slot.
  for (; row[index]; index++) { /* keep scanning */ }
  return index;
}
function transposeInPlace (matrix, maxSize) {
for (let i = 0; i < maxSize; i++) {
const rowI = getRow(matrix, i);
for (let j = 0; j < i; j++) {
const rowJ = getRow(matrix, j);
const temp = rowI[j];
rowI[j] = rowJ[i];
rowJ[i] = temp;
}
/**
 * Read a nested property by following a list of keys.
 *
 * The walk stops and yields `undefined` as soon as a falsy value
 * is met while there are still keys left to follow.
 *
 * @param { object } obj The object to query for the value.
 * @param { string[] } path The path to the property.
 * @returns { any }
 */
function get (obj, path) {
  let current = obj;
  for (const segment of path) {
    if (current) {
      current = current[segment];
    } else {
      return undefined;
    }
  }
  return current;
}
/**
 * Write the cell into every layout slot it covers:
 * `cell.rowspan` rows starting at `baseRow`, `cell.colspan` columns starting at `baseCol`.
 *
 * Layout rows are obtained through `getRow`.
 *
 * @param { { colspan: number, rowspan: number } } cell The cell to place.
 * @param { any[][] } layout The layout matrix to fill.
 * @param { number } baseRow Index of the first row the cell occupies.
 * @param { number } baseCol Index of the first column the cell occupies.
 */
function putCellIntoLayout (cell, layout, baseRow, baseCol) {
  let rowShift = 0;
  while (rowShift < cell.rowspan) {
    const targetRow = getRow(layout, baseRow + rowShift);
    let colShift = 0;
    while (colShift < cell.colspan) {
      targetRow[baseCol + colShift] = cell;
      colShift++;
    }
    rowShift++;
  }
}
/**
 * Raise the offset stored at `base + span` so that it is at least `offsets[base] + value`.
 *
 * A missing (falsy) offset at the target index is treated as 0.
 *
 * @param { number[] } offsets The offsets array to update in place.
 * @param { number } base Index of the offset to measure from.
 * @param { number } span Distance from `base` to the offset being updated.
 * @param { number } value Size to add to the base offset.
 */
function updateOffset (offsets, base, span, value) {
  const target = base + span;
  const existing = offsets[target] || 0;
  const required = offsets[base] + value;
  offsets[target] = Math.max(existing, required);
}
/**
* Render a table into string.
* Cells can contain multiline text and span across multiple rows and columns.
* Set a nested property of an object.
*
* Modifies cells to add lines array.
*
* @param { { colspan: number, rowspan: number, text: string }[][] } tableRows Table to render.
* @param { number } rowSpacing Number of empty lines between rows.
* @param { number } colSpacing Number of spaces between columns.
* @returns { string }
* @param { object } obj The object to modify.
* @param { string[] } path The path to the property.
* @param { any } value The value to set.
*/
function tableToString (tableRows, rowSpacing, colSpacing) {
const layout = [];
let colNumber = 0;
const rowNumber = tableRows.length;
const rowOffsets = [0];
// Fill the layout table and row offsets row-by-row.
for (let j = 0; j < rowNumber; j++) {
const layoutRow = getRow(layout, j);
const cells = tableRows[j];
let x = 0;
for (let i = 0; i < cells.length; i++) {
const cell = cells[i];
x = findFirstVacantIndex(layoutRow, x);
putCellIntoLayout(cell, layout, j, x);
x += cell.colspan;
cell.lines = cell.text.split('\n');
const cellHeight = cell.lines.length;
updateOffset(rowOffsets, j, cell.rowspan, cellHeight + rowSpacing);
function set (obj, path, value) {
const valueKey = path.pop();
for (const key of path) {
let nested = obj[key];
if (!nested) {
nested = {};
obj[key] = nested;
}
colNumber = (layoutRow.length > colNumber) ? layoutRow.length : colNumber;
obj = nested;
}
transposeInPlace(layout, (rowNumber > colNumber) ? rowNumber : colNumber);
const outputLines = [];
const colOffsets = [0];
// Fill column offsets and output lines column-by-column.
for (let x = 0; x < colNumber; x++) {
let y = 0;
let cell;
while (y < rowNumber && (cell = layout[x][y])) {
if (!cell.rendered) {
let cellWidth = 0;
for (let j = 0; j < cell.lines.length; j++) {
const line = cell.lines[j];
const lineOffset = rowOffsets[y] + j;
outputLines[lineOffset] = (outputLines[lineOffset] || '').padEnd(colOffsets[x]) + line;
cellWidth = (line.length > cellWidth) ? line.length : cellWidth;
}
updateOffset(colOffsets, x, cell.colspan, cellWidth + colSpacing);
cell.rendered = true;
}
y += cell.rowspan;
}
}
return outputLines.join('\n');
obj[valueKey] = value;
}
module.exports = {
get: get,
limitedDepthRecursive: limitedDepthRecursive,
numberToLetterSequence: numberToLetterSequence,
numberToRoman: numberToRoman,
set: set,
splitClassesAndIds: splitClassesAndIds,
splitSelector: splitSelector,
tableToString: tableToString
trimCharacter: trimCharacter
};
const merge = require('deepmerge');
const he = require('he');
const htmlparser = require('htmlparser2');
const set = require('lodash/set');
const { BlockTextBuilder } = require('./block-text-builder');
const defaultFormatters = require('./formatter');
const { limitedDepthRecursive, splitSelector } = require('./helper');
const { limitedDepthRecursive, set, splitSelector } = require('./helper');

@@ -122,3 +121,3 @@ // eslint-disable-next-line import/no-unassigned-import

console.warn(
`Input lenght ${html.length} is above allowed limit of ${maxInputLength}. Truncating without ellipsis.`
`Input length ${html.length} is above allowed limit of ${maxInputLength}. Truncating without ellipsis.`
);

@@ -129,3 +128,3 @@ html = html.substring(0, maxInputLength);

const handler = new htmlparser.DefaultHandler();
new htmlparser.Parser(handler, { lowerCaseTags: true }).parseComplete(html);
new htmlparser.Parser(handler, { decodeEntities: false }).parseComplete(html);

@@ -132,0 +131,0 @@ const limitedWalk = limitedDepthRecursive(

@@ -0,0 +0,0 @@ // eslint-disable-next-line import/no-unassigned-import

@@ -0,0 +0,0 @@ /* eslint-disable max-classes-per-file */

@@ -30,5 +30,5 @@

* @property { boolean } [preserveNewlines = false]
* By default, any newlines `\n` from the input HTML are dropped.
*
* By default, any newlines `\n` from the input HTML are collapsed into space as any other HTML whitespace characters.
* If `true`, these newlines will be preserved in the output.
* This is only useful when input HTML carries some plain text formatting instead of proper tags.
*

@@ -198,3 +198,3 @@ * @property { boolean } [returnDomByDefault = true]

*
* Set to `undefined` in order to fall back to `wordwrap` limit.
* Set this to `undefined` in order to fall back to `wordwrap` limit.
*

@@ -201,0 +201,0 @@ * @property { number } [colSpacing = 3]

{
"name": "html-to-text",
"version": "6.0.0",
"version": "7.0.0",
"description": "Advanced html to plain text converter",

@@ -29,4 +29,3 @@ "main": "index.js",

"he": "^1.2.0",
"htmlparser2": "^4.1.0",
"lodash": "^4.17.20",
"htmlparser2": "^6.0.0",
"minimist": "^1.2.5"

@@ -43,3 +42,3 @@ },

"engines": {
"node": ">=8.10.0"
"node": ">=10.23.2"
},

@@ -50,12 +49,11 @@ "bin": {

"devDependencies": {
"chai": "^4.2.0",
"eslint": "^6.8.0",
"chai": "^4.3.0",
"eslint": "^7.19.0",
"eslint-plugin-filenames": "^1.3.2",
"eslint-plugin-import": "^2.20.2",
"eslint-plugin-jsdoc": "^27.0.4",
"eslint-plugin-mocha": "^6.3.0",
"eslint-plugin-you-dont-need-lodash-underscore": "^6.10.0",
"mocha": "^7.2.0",
"eslint-plugin-import": "^2.22.1",
"eslint-plugin-jsdoc": "^31.6.1",
"eslint-plugin-mocha": "^8.0.0",
"mocha": "^8.2.1",
"nyc": "^15.1.0"
}
}

@@ -26,2 +26,4 @@ # html-to-text

Version 7 contains an important change for custom formatters.
## Installation

@@ -44,3 +46,4 @@

const text = htmlToText('<h1>Hello World</h1>', {
const html = '<h1>Hello World</h1>';
const text = htmlToText(html, {
wordwrap: 130

@@ -59,3 +62,3 @@ });

`decodeOptions` | `{ isAttributeValue: false, strict: false }` | Text decoding options given to `he.decode`. For more information see the [he](https://github.com/mathiasbynens/he) module.
`formatters` | `{}` | An object with custom formatting functions for specific elements (see "Override formatting" section below).
`formatters` | `{}` | An object with custom formatting functions for specific elements (see [Override formatting](#override-formatting) section below).
`limits` | | Describes how to limit the output text in case of large HTML documents.

@@ -69,6 +72,6 @@ `limits.ellipsis` | `'...'` | A string to insert in place of skipped content.

`longWordSplit.forceWrapOnLimit` | `false` | Break long words at the line length limit in case no better wrap opportunities found.
`preserveNewlines` | `false` | By default, any newlines `\n` in a block of text will be removed. If `true`, these newlines will not be removed.
`preserveNewlines` | `false` | By default, any newlines `\n` from the input HTML are collapsed into space as any other HTML whitespace characters. If `true`, these newlines will be preserved in the output. This is only useful when input HTML carries some plain text formatting instead of proper tags.
`returnDomByDefault` | `true` | Convert the entire document if we don't find the tag defined in `baseElement`.
`tables` | `[]` | Allows selecting certain tables by the `class` or `id` attribute from the HTML document. This is necessary because the majority of HTML e-mails use a table-based layout. Prefix your table selectors with a `.` for the `class` and with a `#` for the `id` attribute. All other tables are ignored.<br/>You can assign `true` to this attribute to select all tables.
`tags` | | Describes how different tags should be formatted. See "Tags" section below.
`tags` | | Describes how different tags should be formatted. See [Tags](#tags) section below.
`whitespaceCharacters` | `' \t\r\n\f\u200b'` | A string of characters that are recognized as HTML whitespace. Default value uses the set of characters defined in [HTML4 standard](https://www.w3.org/TR/html4/struct/text.html#h-9.1). (It includes Zero-width space compared to [living standard](https://infra.spec.whatwg.org#ascii-whitespace).)

@@ -81,11 +84,11 @@ `wordwrap` | `80` | After how many chars a line break should follow.<br/>Set to `null` or `false` to disable word-wrapping.

-------------------------- | -----------
`hideLinkHrefIfSameAsText` | `hideLinkHrefIfSameAsText` option for tags with `anchor` formatter.
`ignoreHref` | `ignoreHref` option for tags with `anchor` formatter.
`ignoreImage` | Set format to `skip` for `img` tags.
`linkHrefBaseUrl` | `baseUrl` option for tags with `anchor` and `image` formatters.
`noAnchorUrl` | `noAnchorUrl` option for tags with `anchor` formatter.
`noLinkBrackets` | `noLinkBrackets` option for tags with `anchor` formatter.
`singleNewLineParagraphs` | Set `leadingLineBreaks` and `trailingLineBreaks` options to `1` for `p` and `pre` tags.
`unorderedListItemPrefix` | `itemPrefix` option for tags with `unorderedList` formatter.
`uppercaseHeadings` | `uppercase` option for tags with `heading` formatter, `uppercaseHeaderCells` option for `table` or `dataTable` formatters.
`hideLinkHrefIfSameAsText` | `tags: { 'a': { options: { hideLinkHrefIfSameAsText: true } } }`
`ignoreHref` | `tags: { 'a': { options: { ignoreHref: true } } }`
`ignoreImage` | `tags: { 'img': { format: 'skip' } }`
`linkHrefBaseUrl` | `tags: {`<br/>`'a': { options: { baseUrl: 'https://example.com' } },`<br/>`'img': { options: { baseUrl: 'https://example.com' } }`<br/>`}`
`noAnchorUrl` | `tags: { 'a': { options: { noAnchorUrl: true } } }`
`noLinkBrackets` | `tags: { 'a': { options: { noLinkBrackets: true } } }`
`singleNewLineParagraphs` | `tags: {`<br/>`'p': { options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } },`<br/>`'pre': { options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } }`<br/>`}`
`unorderedListItemPrefix` | `tags: { 'ul': { options: { itemPrefix: ' * ' } } }`
`uppercaseHeadings` | `tags: {`<br/>`'h1': { options: { uppercase: false } },`<br/>`...`<br/>`'table': { options: { uppercaseHeaderCells: false } }`<br/>`}`

@@ -98,8 +101,25 @@ Deprecated options will be removed with future major version update.

--------------- | -----------
`format` | The way formatters are written has changed completely. New formatters have to be added to the `formatters` option, old ones can not be reused without rewrite. See new instructions below.
`format` | The way formatters are written has changed completely. New formatters have to be added to the `formatters` option, old ones can not be reused without rewrite. See [new instructions](#override-formatting) below.
#### Tags
By default there are following tag to formatter assignments:
Example for tag-specific options:
```javascript
const { htmlToText } = require('html-to-text');
const html = '<a href="/page.html">Page</a>';
const text = htmlToText(html, {
tags: {
'a': { options: { baseUrl: 'https://example.com' } },
'figure': { format: 'block' }
}
});
console.log(text); // Page [https://example.com/page.html]
```
For new tags you have to specify the `format` value. For tags listed below you can skip it and only provide `options`. (Valid options listed in the next table.)
By default, the following tag-to-format assignments are used:
Tag&nbsp;name | Default&nbsp;format | Notes

@@ -130,7 +150,7 @@ ------------- | ------------------- | -----

`pre` | `pre` |
`table` | `table` | there is also `dataTable` formatter. Using it will be equivalent to setting `tables` to `true`. `tables` option might be deprecated in the future.
`table` | `table` | there is also `dataTable` format. Using it will be equivalent to setting `tables` to `true`. `tables` option might be deprecated in the future.
`ul` | `unorderedList` |
`wbr` | `wbr` |
More formatters also available for use:
More formats are also available for use:

@@ -145,3 +165,3 @@ * `skip` - as the name implies it skips the given tag with it's contents without printing anything.

`trailingLineBreaks` | `1` or `2` | all block-level formatters | Number of line breaks to separate this block from the next one.<br/>Note that N+1 line breaks are needed to make N empty lines.
`baseUrl` | null | `anchor`, `image` | Server host for link `href` attributes and image `src` attributes relative to the root (the ones that start with `/`).<br/>For example, with `baseUrl = 'http://asdf.com'` and `<a href='/dir/subdir'>...</a>` the link in the text will be `http://asdf.com/dir/subdir`.<br/>Keep in mind that `baseUrl` should not end with a `/`.
`baseUrl` | `null` | `anchor`, `image` | Server host for link `href` attributes and image `src` attributes relative to the root (the ones that start with `/`).<br/>For example, with `baseUrl = 'http://asdf.com'` and `<a href='/dir/subdir'>...</a>` the link in the text will be `http://asdf.com/dir/subdir`.<br/>Keep in mind that `baseUrl` should not end with a `/`.
`hideLinkHrefIfSameAsText` | `false` | `anchor` | By default links are translated in the following way:<br/>`<a href='link'>text</a>` => becomes => `text [link]`.<br/>If this option is set to `true` and `link` and `text` are the same, `[link]` will be omitted and only `text` will be present.

@@ -156,18 +176,6 @@ `ignoreHref` | `false` | `anchor` | Ignore all links. Only process internal text of anchor tags.

`uppercaseHeaderCells` | `true` | `table`, `dataTable` | By default, heading cells (`<th>`) are uppercased.<br/>Set this to `false` to leave heading cells as they are.
`maxColumnWidth` | `60` | `table`, `dataTable` | Data table cell content will be wrapped to fit this width instead of global `wordwrap` limit.<br/>Set to `undefined` in order to fall back to `wordwrap` limit.
`maxColumnWidth` | `60` | `table`, `dataTable` | Data table cell content will be wrapped to fit this width instead of global `wordwrap` limit.<br/>Set this to `undefined` in order to fall back to `wordwrap` limit.
`colSpacing` | `3` | `table`, `dataTable` | Number of spaces between data table columns.
`rowSpacing` | `0` | `table`, `dataTable` | Number of empty lines between data table rows.
How to set a specific format option, example:
```javascript
var { htmlToText } = require('html-to-text');
var text = htmlToText('<a href="/page.html">Page</a>', {
tags: { 'a': { options: { baseUrl: 'https://example.com' } } }
});
console.log(text); // Page [https://example.com/page.html]
```
### Override formatting

@@ -184,3 +192,3 @@

* `builder` - [BlockTextBuilder](https://github.com/html-to-text/node-html-to-text/blob/master/lib/block-text-builder.js) object. Manipulate this object state to build the output text;
* `formatOptions` - options that are specified for a tag, along with this formatter (Note: if you need global html-to-text options - they are accessible via `builder.options`).
* `formatOptions` - options that are specified for a tag, along with this formatter. (Note: if you need the general html-to-text [options](#general-options), they are accessible via `builder.options`.)

@@ -190,12 +198,13 @@ Custom formatter example:

```javascript
var { htmlToText } = require('html-to-text');
const { htmlToText } = require('html-to-text');
var text = htmlToText('<foo>Hello World</foo>', {
const html = '<foo>Hello World</foo>';
const text = htmlToText(html, {
formatters: {
// Create a formatter.
'fooBlockFormatter': function (elem, walk, builder, formatOptions) {
builder.openBlock(formatOptions.leadingLineBreaks || 1);
builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 1 });
walk(elem.children, builder);
builder.addInline('!');
builder.closeBlock(formatOptions.trailingLineBreaks || 1);
builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 1 });
}

@@ -211,3 +220,2 @@ },

});
console.log(text); // Hello World!

@@ -220,2 +228,4 @@ ```

Note: `BlockTextBuilder` got some important [changes](https://github.com/html-to-text/node-html-to-text/commit/f50f10f54cf814efb2f7633d9d377ba7eadeaf1e) in version 7. Positional arguments are deprecated, and formatters written for version 6 have to be updated accordingly in order to keep working after the next major update.
## Command Line Interface

@@ -222,0 +232,0 @@

@@ -11,3 +11,4 @@ const fs = require('fs');

describe('Smoke test', function () {
describe('smoke test', function () {
it('should return empty input unchanged', function () {

@@ -24,738 +25,187 @@ expect(htmlToText('')).to.equal('');

});
});
describe('.htmlToText()', function () {
describe('wordwrap option', function () {
describe('skipped html content', function () {
let longStr;
it('should ignore html comments', function () {
const html = /*html*/`
<!--[^-]*-->
<!-- <h1>Hello World</h1> -->
text
`;
expect(htmlToText(html)).to.equal('text');
});
beforeEach(function () {
longStr = '111111111 222222222 333333333 444444444 555555555 666666666 777777777 888888888 999999999';
});
it('should wordwrap at 80 characters by default', function () {
expect(htmlToText(longStr)).to.equal('111111111 222222222 333333333 444444444 555555555 666666666 777777777 888888888\n999999999');
});
it('should wordwrap at given amount of characters when give a number', function () {
expect(htmlToText(longStr, { wordwrap: 20 })).to.equal('111111111 222222222\n333333333 444444444\n555555555 666666666\n777777777 888888888\n999999999');
expect(htmlToText(longStr, { wordwrap: 50 })).to.equal('111111111 222222222 333333333 444444444 555555555\n666666666 777777777 888888888 999999999');
expect(htmlToText(longStr, { wordwrap: 70 })).to.equal('111111111 222222222 333333333 444444444 555555555 666666666 777777777\n888888888 999999999');
});
it('should not wordwrap when given null', function () {
expect(htmlToText(longStr, { wordwrap: null })).to.equal(longStr);
});
it('should not wordwrap when given false', function () {
expect(htmlToText(longStr, { wordwrap: false })).to.equal(longStr);
});
it('should not exceed the line width when processing embedded format tags', function () {
const testString = '<p><strong>This text isn\'t counted</strong> when calculating where to break a string for 80 character line lengths.</p>';
expect(htmlToText(testString, {})).to.equal('This text isn\'t counted when calculating where to break a string for 80\ncharacter line lengths.');
});
it('should work with a long string containing line feeds', function () {
const testString = '<p>If a word with a line feed exists over the line feed boundary then\nyou\nmust\nrespect it.</p>';
expect(htmlToText(testString, {})).to.equal('If a word with a line feed exists over the line feed boundary then you must\nrespect it.');
});
it('should not wrongly truncate lines when processing embedded format tags', function () {
const testString = '<p><strong>This text isn\'t counted</strong> when calculating where to break a string for 80 character line lengths. However it can affect where the next line breaks and this could lead to having an early line break</p>';
expect(htmlToText(testString, {})).to.equal('This text isn\'t counted when calculating where to break a string for 80\ncharacter line lengths. However it can affect where the next line breaks and\nthis could lead to having an early line break');
});
it('should not exceed the line width when processing anchor tags', function () {
const testString = "<p>We appreciate your business. And we hope you'll check out our <a href=\"http://example.com/\">new products</a>!</p>";
expect(htmlToText(testString, {})).to.equal('We appreciate your business. And we hope you\'ll check out our new products\n[http://example.com/]!');
});
it('should honour line feeds from a long word across the wrap, where the line feed is before the wrap', function () {
const testString = '<p>This string is meant to test if a string is split properly across a\nnewlineandlongword with following text.</p>';
expect(htmlToText(testString, {}))
.to.equal('This string is meant to test if a string is split properly across a\nnewlineandlongword with following text.');
});
it('should remove line feeds from a long word across the wrap, where the line feed is after the wrap', function () {
const testString = '<p>This string is meant to test if a string is split properly across anewlineandlong\nword with following text.</p>';
expect(htmlToText(testString, {}))
.to.equal('This string is meant to test if a string is split properly across\nanewlineandlong word with following text.');
});
it('should ignore scripts', function () {
const html = /*html*/`
<script src="javascript.js"></script>
<script>
console.log("Hello World!");
</script>
<script id="data" type="application/json">{"userId":1234,"userName":"John Doe","memberSince":"2000-01-01T00:00:00.000Z"}</script>
text
`;
expect(htmlToText(html)).to.equal('text');
});
describe('preserveNewlines option', function () {
let newlineStr;
beforeEach(function () {
newlineStr = '<p\n>One\nTwo\nThree</p>'; // newline inside a tag is intentional
});
it('should not preserve newlines by default', function () {
expect(htmlToText(newlineStr)).to.equal('One Two Three');
});
it('should preserve newlines when provided with a truthy value', function () {
expect(htmlToText(newlineStr, { preserveNewlines: true })).to.equal('One\nTwo\nThree');
});
it('should preserve line feeds in a long wrapping string containing line feeds', function () {
const testString = '<p>If a word with a line feed exists over the line feed boundary then\nyou\nmust\nrespect it.</p>';
expect(htmlToText(testString, { preserveNewlines: true }))
.to.equal('If a word with a line feed exists over the line feed boundary then\nyou\nmust\nrespect it.');
});
it('should preserve line feeds in a long string containing line feeds across the wrap', function () {
const testString = '<p>If a word with a line feed exists over the line feed boundary then\nyou must respect it.</p>';
expect(htmlToText(testString, { preserveNewlines: true }))
.to.equal('If a word with a line feed exists over the line feed boundary then\nyou must respect it.');
});
it('should preserve line feeds in a long string containing line feeds across the wrap with a line feed before 80 chars', function () {
const testString = '<p>This string is meant to test if a string is split properly across a\nnewlineandlongword with following text.</p>';
expect(htmlToText(testString, { preserveNewlines: true }))
.to.equal('This string is meant to test if a string is split properly across a\nnewlineandlongword with following text.');
});
it('should preserve line feeds in a long string containing line feeds across the wrap with a line feed after 80 chars', function () {
const testString = '<p>This string is meant to test if a string is split properly across anewlineandlong\nword with following text.</p>';
expect(htmlToText(testString, { preserveNewlines: true }))
.to.equal('This string is meant to test if a string is split properly across\nanewlineandlong\nword with following text.');
});
it('should split long lines', function () {
const testString = '<p>If a word with a line feed exists over the line feed boundary then you must respect it.</p>';
expect(htmlToText(testString, { preserveNewlines: true }))
.to.equal('If a word with a line feed exists over the line feed boundary then you must\nrespect it.');
});
it('should remove spaces if they occur around line feed', function () {
const testString = '<p>A string of text\nwith \nmultiple\n spaces \n that \n \n can be safely removed.</p>';
expect(htmlToText(testString, { preserveNewlines: true }))
.to.equal('A string of text\nwith\nmultiple\nspaces\nthat\n\ncan be safely removed.');
});
it('should remove spaces if they occur around line feed 2', function () {
const testString = 'multiple\n spaces';
expect(htmlToText(testString, { preserveNewlines: true }))
.to.equal('multiple\nspaces');
});
it('should ignore styles', function () {
const html = /*html*/`
<link href="main.css" rel="stylesheet">
<style type="text/css" media="all and (max-width: 500px)">
p { color: #26b72b; }
</style>
text
`;
expect(htmlToText(html)).to.equal('text');
});
describe('single line paragraph option', function () {
let paragraphsString;
beforeEach(function () {
paragraphsString = '<p>First</p><p>Second</p>';
});
it('should not use single new line when given null', function () {
expect(htmlToText(paragraphsString, { singleNewLineParagraphs: null })).to.equal('First\n\nSecond');
});
it('should not use single new line when given false', function () {
expect(htmlToText(paragraphsString, { singleNewLineParagraphs: false })).to.equal('First\n\nSecond');
});
it('should use single new line when given true', function () {
expect(htmlToText(paragraphsString, { singleNewLineParagraphs: true })).to.equal('First\nSecond');
});
});
});
describe('block-level elements', function () {
describe('wordwrap option', function () {
it('should render common block-level elements on separate lines with default line breaks number', function () {
const testString =
'a<article>article</article>b<aside>aside</aside>c<div>div</div>d<footer>footer</footer>' +
'e<form>form</form>f<header>header</header>g<main>main</main>h<nav>nav</nav>i<section>section</section>j';
const expectedResult = 'a\narticle\nb\naside\nc\ndiv\nd\nfooter\ne\nform\nf\nheader\ng\nmain\nh\nnav\ni\nsection\nj';
expect(htmlToText(testString)).to.equal(expectedResult);
});
let longStr;
});
describe('tables', function () {
it('should handle center tag in tables', function () {
const html = `Good morning Jacob, \
<TABLE>
<CENTER>
<TBODY>
<TR>
<TD>Lorem ipsum dolor sit amet.</TD>
</TR>
</CENTER>
</TBODY>
</TABLE>
`;
const resultExpected = 'Good morning Jacob,\n\nLorem ipsum dolor sit amet.';
const result = htmlToText(html, { tables: true });
expect(result).to.equal(resultExpected);
beforeEach(function () {
longStr = '111111111 222222222 333333333 444444444 555555555 666666666 777777777 888888888 999999999';
});
it('should handle non-integer colspan on td element gracefully', function () {
const html = `Good morning Jacob,
<table>
<tbody>
<tr>
<td colspan="abc">Lorem ipsum dolor sit amet.</td>
</tr>
</tbody>
</table>
`;
const resultExpected = 'Good morning Jacob,\n\nLorem ipsum dolor sit amet.';
const result = htmlToText(html, { tables: true });
expect(result).to.equal(resultExpected);
it('should wordwrap at 80 characters by default', function () {
expect(htmlToText(longStr)).to.equal('111111111 222222222 333333333 444444444 555555555 666666666 777777777 888888888\n999999999');
});
it('should handle tables with colspans and rowspans', function () {
const html = /*html*/`
<table>
<tr>
<td colspan="2" rowspan="3">aa<br/>aa<br/>aa</td>
<td colspan="1" rowspan="3">b<br/>b<br/>b</td>
<td colspan="4" rowspan="2">cccc<br/>cccc</td>
<td colspan="1" rowspan="4">d<br/>d<br/>d<br/>d</td>
</tr>
<tr></tr>
<tr>
<td colspan="2" rowspan="3">ee<br/>ee<br/>ee</td>
<td colspan="2" rowspan="2">ff<br/>ff</td>
</tr>
<tr>
<td colspan="3" rowspan="1">ggg</td>
</tr>
<tr>
<td colspan="1" rowspan="2">h<br/>h</td>
<td colspan="2" rowspan="2">ii<br/>ii</td>
<td colspan="3" rowspan="1">jjj</td>
</tr>
<tr>
<td colspan="1" rowspan="2">k<br/>k</td>
<td colspan="2" rowspan="2">ll<br/>ll</td>
<td colspan="2" rowspan="1">mm</td>
</tr>
<tr>
<td colspan="2" rowspan="2">nn<br/>nn</td>
<td colspan="1" rowspan="1">o</td>
<td colspan="2" rowspan="2">pp<br/>pp</td>
</tr>
<tr>
<td colspan="4" rowspan="1">qqqq</td>
</tr>
</table>`;
const resultExpected =
'aa b cccc d\n' +
'aa b cccc d\n' +
'aa b ee ff d\n' +
'ggg ee ff d\n' +
'h ii ee jjj\n' +
'h ii k ll mm\n' +
'nn o k ll pp\n' +
'nn qqqq pp';
const result = htmlToText(html, { tables: true });
expect(result).to.equal(resultExpected);
it('should wordwrap at given amount of characters when give a number', function () {
expect(htmlToText(longStr, { wordwrap: 20 })).to.equal('111111111 222222222\n333333333 444444444\n555555555 666666666\n777777777 888888888\n999999999');
expect(htmlToText(longStr, { wordwrap: 50 })).to.equal('111111111 222222222 333333333 444444444 555555555\n666666666 777777777 888888888 999999999');
expect(htmlToText(longStr, { wordwrap: 70 })).to.equal('111111111 222222222 333333333 444444444 555555555 666666666 777777777\n888888888 999999999');
});
it('should support custom spacing for tables', function () {
const html = /*html*/`
<table>
<tr>
<td colspan="2" rowspan="2">aa<br/>aa</td>
<td>b</td>
</tr>
<tr>
<td>c</td>
</tr>
<tr>
<td>d</td>
<td>e</td>
<td>f</td>
</tr>
</table>`;
const resultExpected =
'aa b\n' +
'aa\n' +
' c\n' +
'\n' +
'd e f';
const result = htmlToText(html, { tables: true, tags: { 'table': { options: { colSpacing: 1, rowSpacing: 1 } } } });
expect(result).to.equal(resultExpected);
it('should not wordwrap when given null', function () {
expect(htmlToText(longStr, { wordwrap: null })).to.equal(longStr);
});
it('should properly align columns in tables with thead, tfoot', function () {
const html = /*html*/`
<table>
<thead>
<tr>
<td>aaaaaaaaa</td>
<td colspan="2">b</td>
</tr>
</thead>
<tbody>
<tr>
<td>ccc</td>
<td>ddd</td>
<td>eee</td>
</tr>
</tbody>
<tfoot>
<tr>
<td colspan="2">f</td>
<td>ggggggggg</td>
</tr>
</tfoot>
</table>`;
const resultExpected =
'aaaaaaaaa b\n' +
'ccc ddd eee\n' +
'f ggggggggg';
const result = htmlToText(html, { tables: true });
expect(result).to.equal(resultExpected);
it('should not wordwrap when given false', function () {
expect(htmlToText(longStr, { wordwrap: false })).to.equal(longStr);
});
it('should render block-level elements inside table cells properly', function () {
const html = /*html*/`
<table>
<tr>
<td><h1>hEaDeR</h1></td>
<td><blockquote>A quote<br/>from somewhere.</blockquote></td>
</tr>
<tr>
<td>
<pre> preformatted... ...text </pre>
</td>
<td>
<ol>
<li>list item one</li>
<li>list item two</li>
</ol>
</td>
</tr>
</table>`;
const resultExpected =
'HEADER > A quote\n' +
' > from somewhere.\n' +
' preformatted... ...text 1. list item one\n' +
' 2. list item two';
const result = htmlToText(html, { tables: true });
expect(result).to.equal(resultExpected);
it('should not exceed the line width when processing embedded format tags', function () {
const html = '<p><strong>This text isn\'t counted</strong> when calculating where to break a string for 80 character line lengths.</p>';
const expected = 'This text isn\'t counted when calculating where to break a string for 80\ncharacter line lengths.';
expect(htmlToText(html, {})).to.equal(expected);
});
it('should wrap table contents to custom max column width', function () {
const html = /*html*/`
<table>
<tr>
<td>short</td>
<td>short</td>
<td>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.</td>
</tr>
<tr>
<td>short</td>
<td>Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</td>
<td>short</td>
</tr>
</table>`;
const resultExpected =
'short short Lorem ipsum dolor sit amet,\n' +
' consectetur adipiscing elit,\n' +
' sed do eiusmod tempor\n' +
' incididunt ut labore et dolore\n' +
' magna aliqua. Ut enim ad minim\n' +
' veniam, quis nostrud\n' +
' exercitation ullamco laboris\n' +
' nisi ut aliquip ex ea commodo\n' +
' consequat.\n' +
'short Duis aute irure dolor in short\n' +
' reprehenderit in voluptate\n' +
' velit esse cillum dolore eu\n' +
' fugiat nulla pariatur.\n' +
' Excepteur sint occaecat\n' +
' cupidatat non proident, sunt\n' +
' in culpa qui officia deserunt\n' +
' mollit anim id est laborum.';
const result = htmlToText(html, { tables: true, tags: { 'table': { options: { maxColumnWidth: 30 } } } });
expect(result).to.equal(resultExpected);
it('should work with a long string containing line feeds', function () {
const html = '<p>If a word with a line feed exists over the line feed boundary then\nyou\nmust\nrespect it.</p>';
const expected = 'If a word with a line feed exists over the line feed boundary then you must\nrespect it.';
expect(htmlToText(html, {})).to.equal(expected);
});
});
describe('a', function () {
it('should decode html attribute entities from href', function () {
const result = htmlToText('<a href="/foo?a&#x3D;b">test</a>');
expect(result).to.equal('test [/foo?a=b]');
it('should not wrongly truncate lines when processing embedded format tags', function () {
const html = '<p><strong>This text isn\'t counted</strong> when calculating where to break a string for 80 character line lengths. However it can affect where the next line breaks and this could lead to having an early line break</p>';
const expected = 'This text isn\'t counted when calculating where to break a string for 80\ncharacter line lengths. However it can affect where the next line breaks and\nthis could lead to having an early line break';
expect(htmlToText(html, {})).to.equal(expected);
});
it('should strip mailto: from email links', function () {
const result = htmlToText('<a href="mailto:foo@example.com">email me</a>');
expect(result).to.equal('email me [foo@example.com]');
it('should not exceed the line width when processing anchor tags', function () {
const html = "<p>We appreciate your business. And we hope you'll check out our <a href=\"http://example.com/\">new products</a>!</p>";
const expected = 'We appreciate your business. And we hope you\'ll check out our new products\n[http://example.com/]!';
expect(htmlToText(html, {})).to.equal(expected);
});
it('should return link with brackets', function () {
const result = htmlToText('<a href="http://my.link">test</a>');
expect(result).to.equal('test [http://my.link]');
it('should honour line feeds from a long word across the wrap, where the line feed is before the wrap', function () {
const html = '<p>This string is meant to test if a string is split properly across a\nnewlineandlongword with following text.</p>';
const expected = 'This string is meant to test if a string is split properly across a\nnewlineandlongword with following text.';
expect(htmlToText(html, {})).to.equal(expected);
});
it('should return link without brackets', function () {
const result = htmlToText(
'<a href="http://my.link">test</a>',
{ noLinkBrackets: true }
);
expect(result).to.equal('test http://my.link');
it('should remove line feeds from a long word across the wrap, where the line feed is after the wrap', function () {
const html = '<p>This string is meant to test if a string is split properly across anewlineandlong\nword with following text.</p>';
const expected = 'This string is meant to test if a string is split properly across\nanewlineandlong word with following text.';
expect(htmlToText(html, {})).to.equal(expected);
});
it('should not return link for anchor if noAnchorUrl is set to true', function () {
const result = htmlToText(
'<a href="#link">test</a>',
{ noAnchorUrl: true }
);
expect(result).to.equal('test');
});
it('should return link for anchor if noAnchorUrl is set to false', function () {
const result = htmlToText(
'<a href="#link">test</a>',
{ noAnchorUrl: false }
);
expect(result).to.equal('test [#link]');
});
});
describe('lists', function () {
describe('ul', function () {
it('should handle empty unordered lists', function () {
const testString = '<ul></ul>';
expect(htmlToText(testString)).to.equal('');
});
describe('preserveNewlines option', function () {
it('should handle an unordered list with multiple elements', function () {
const testString = '<ul><li>foo</li><li>bar</li></ul>';
expect(htmlToText(testString)).to.equal(' * foo\n * bar');
});
it('should not preserve newlines by default', function () {
const html = '<p\n>One\nTwo\nThree</p>';
const expected = 'One Two Three';
expect(htmlToText(html)).to.equal(expected);
});
it('should handle an unordered list prefix option', function () {
const testString = '<ul><li>foo</li><li>bar</li></ul>';
const options = { unorderedListItemPrefix: ' test ' };
expect(htmlToText(testString, options)).to.equal(' test foo\n test bar');
});
it('should handle nested ul correctly', function () {
const testString = '<ul><li>foo<ul><li>bar<ul><li>baz.1</li><li>baz.2</li></ul></li></ul></li></ul>';
expect(htmlToText(testString)).to.equal(' * foo\n * bar\n * baz.1\n * baz.2');
});
it('should handle long nested ul correctly', function () {
const testString = /*html*/`<ul>
<li>At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.</li>
<li>At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.</li>
<li>Inner:
<ul>
<li>At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.</li>
<li>At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.</li>
</ul>
</li>
</ul>`;
const expected =
' * At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d g\n' +
' u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.\n' +
' * At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d g\n' +
' u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.\n' +
' * Inner:\n' +
' * At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d\n' +
' g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.\n' +
' * At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d\n' +
' g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.';
expect(htmlToText(testString)).to.equal(expected);
});
it('should preserve newlines when provided with a truthy value', function () {
const html = '<p\n>One\nTwo\nThree</p>';
const expected = 'One\nTwo\nThree';
expect(htmlToText(html, { preserveNewlines: true })).to.equal(expected);
});
describe('ol', function () {
it('should handle empty ordered lists', function () {
const testString = '<ol></ol>';
expect(htmlToText(testString)).to.equal('');
});
it('should handle an ordered list with multiple elements', function () {
const testString = '<ol><li>foo</li><li>bar</li></ol>';
expect(htmlToText(testString)).to.equal(' 1. foo\n 2. bar');
});
it('should support the ordered list type="1" attribute', function () {
const testString = '<ol type="1"><li>foo</li><li>bar</li></ol>';
expect(htmlToText(testString)).to.equal(' 1. foo\n 2. bar');
});
it('should fallback to type="1" behavior if type attribute is invalid', function () {
const testString = '<ol type="whatever"><li>foo</li><li>bar</li></ol>';
expect(htmlToText(testString)).to.equal(' 1. foo\n 2. bar');
});
it('should support the ordered list type="a" attribute', function () {
const testString = '<ol type="a"><li>foo</li><li>bar</li></ol>';
expect(htmlToText(testString)).to.equal(' a. foo\n b. bar');
});
it('should support the ordered list type="A" attribute', function () {
const testString = '<ol type="A"><li>foo</li><li>bar</li></ol>';
expect(htmlToText(testString)).to.equal(' A. foo\n B. bar');
});
it('should support the ordered list type="i" attribute', function () {
const testString1 = '<ol type="i"><li>foo</li><li>bar</li></ol>';
const testString2 = '<ol start="8" type="i"><li>foo</li><li>bar</li></ol>';
expect(htmlToText(testString1)).to.equal(' i. foo\n ii. bar');
expect(htmlToText(testString2)).to.equal(' viii. foo\n ix. bar');
});
it('should support the ordered list type="I" attribute', function () {
const testString1 = '<ol type="I"><li>foo</li><li>bar</li></ol>';
const testString2 = '<ol start="8" type="I"><li>foo</li><li>bar</li></ol>';
expect(htmlToText(testString1)).to.equal(' I. foo\n II. bar');
expect(htmlToText(testString2)).to.equal(' VIII. foo\n IX. bar');
});
it('should support the ordered list start attribute', function () {
const testString = '<ol start="100"><li>foo</li><li>bar</li></ol>';
expect(htmlToText(testString)).to.equal(' 100. foo\n 101. bar');
});
it('should handle nested ol correctly', function () {
const testString = '<ol><li>foo<ol><li>bar<ol><li>baz</li><li>baz</li></ol></li></ol></li></ol>';
expect(htmlToText(testString)).to.equal(' 1. foo\n 1. bar\n 1. baz\n 2. baz');
});
it('should handle long nested ol correctly', function () {
const testString = /*html*/`<ol>
<li>At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.</li>
<li>At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.</li>
<li>Inner:
<ol>
<li>At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.</li>
<li>At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.</li>
</ol>
</li>
</ol>`;
const expected =
' 1. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d\n' +
' g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.\n' +
' 2. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s d\n' +
' g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit amet.\n' +
' 3. Inner:\n' +
' 1. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s\n' +
' d g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit\n' +
' amet.\n' +
' 2. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita k a s\n' +
' d g u b e r g r e n, no sea takimata sanctus est Lorem ipsum dolor sit\n' +
' amet.';
expect(htmlToText(testString)).to.equal(expected);
});
it('should support the ordered list type="a" attribute past 26 characters', function () {
const testString1 = '<ol start="26" type="a"><li>foo</li><li>bar</li></ol>';
const testString2 = '<ol start="702" type="a"><li>foo</li><li>bar</li></ol>';
expect(htmlToText(testString1)).to.equal(' z. foo\n aa. bar');
expect(htmlToText(testString2)).to.equal(' zz. foo\n aaa. bar');
});
it('should support the ordered list type="A" attribute past 26 characters', function () {
const testString1 = '<ol start="26" type="A"><li>foo</li><li>bar</li></ol>';
const testString2 = '<ol start="702" type="A"><li>foo</li><li>bar</li></ol>';
expect(htmlToText(testString1)).to.equal(' Z. foo\n AA. bar');
expect(htmlToText(testString2)).to.equal(' ZZ. foo\n AAA. bar');
});
// HTML standard defines vinculum extension for large numbers.
// But that doesn't seem to have any significance for practical purposes.
// it('should support the ordered list type="i" attribute past 3999', function () {
// const testString = '<ol start="3999" type="i"><li>foo</li><li>bar</li></ol>';
// expect(htmlToText(testString)).to.equal(' mmmcmxcix. foo\n i̅v̅. bar');
// });
// it('should support the ordered list type="I" attribute past 3999', function () {
// const testString = '<ol start="3999" type="I"><li>foo</li><li>bar</li></ol>';
// expect(htmlToText(testString)).to.equal(' MMMCMXCIX. foo\n I̅V̅. bar');
// });
it('should preserve line feeds in a long wrapping string containing line feeds', function () {
const html = '<p>If a word with a line feed exists over the line feed boundary then\nyou\nmust\nrespect it.</p>';
const expected = 'If a word with a line feed exists over the line feed boundary then\nyou\nmust\nrespect it.';
expect(htmlToText(html, { preserveNewlines: true })).to.equal(expected);
});
it('should not wrap li when wordwrap is disabled', function () {
const html = `Good morning Jacob,
<p>Lorem ipsum dolor sit amet</p>
<p><strong>Lorem ipsum dolor sit amet.</strong></p>
<ul>
<li>run in the park <span style="color:#888888;">(in progress)</span></li>
</ul>
`;
const resultExpected = 'Good morning Jacob,\n\nLorem ipsum dolor sit amet\n\nLorem ipsum dolor sit amet.\n\n * run in the park (in progress)';
const result = htmlToText(html, { wordwrap: false });
expect(result).to.equal(resultExpected);
it('should preserve line feeds in a long string containing line feeds across the wrap', function () {
const html = '<p>If a word with a line feed exists over the line feed boundary then\nyou must respect it.</p>';
const expected = 'If a word with a line feed exists over the line feed boundary then\nyou must respect it.';
expect(htmlToText(html, { preserveNewlines: true })).to.equal(expected);
});
it('should handle non-li elements inside a list gracefully', function () {
const html = /*html*/`
<ul>
<li>list item</li>
plain text
<li>list item</li>
<div>div</div>
<li>list item</li>
<p>paragraph</p>
<li>list item</li>
</ul>
`;
const resultExpected = ' * list item\n plain text\n * list item\n div\n * list item\n\n paragraph\n\n * list item';
const result = htmlToText(html);
expect(result).to.equal(resultExpected);
it('should preserve line feeds in a long string containing line feeds across the wrap with a line feed before 80 chars', function () {
const html = '<p>This string is meant to test if a string is split properly across a\nnewlineandlongword with following text.</p>';
const expected = 'This string is meant to test if a string is split properly across a\nnewlineandlongword with following text.';
expect(htmlToText(html, { preserveNewlines: true })).to.equal(expected);
});
});
describe('entities', function () {
it('should not insert null bytes', function () {
const html = '<a href="some-url?a=b&amp;b=c">Testing &amp; Done</a>';
const result = htmlToText(html);
expect(result).to.equal('Testing & Done [some-url?a=b&b=c]');
it('should preserve line feeds in a long string containing line feeds across the wrap with a line feed after 80 chars', function () {
const html = '<p>This string is meant to test if a string is split properly across anewlineandlong\nword with following text.</p>';
const expected = 'This string is meant to test if a string is split properly across\nanewlineandlong\nword with following text.';
expect(htmlToText(html, { preserveNewlines: true })).to.equal(expected);
});
it('should replace entities inside `alt` attributes of images', function () {
const html = '<img src="test.png" alt="&quot;Awesome&quot;">';
const result = htmlToText(html);
expect(result).to.equal('"Awesome" [test.png]');
it('should split long lines', function () {
const html = '<p>If a word with a line feed exists over the line feed boundary then you must respect it.</p>';
const expected = 'If a word with a line feed exists over the line feed boundary then you must\nrespect it.';
expect(htmlToText(html, { preserveNewlines: true })).to.equal(expected);
});
it('should update relatively sourced entities with linkHrefBaseUrl', function () {
const html1 = '<img src="/test.png">';
const html2 = '<a href="/test.html">test</a>';
const options = { linkHrefBaseUrl: 'http://www.domain.com' };
const result1 = htmlToText(html1, options);
expect(result1).to.equal('[http://www.domain.com/test.png]');
const result2 = htmlToText(html2, options);
expect(result2).to.equal('test [http://www.domain.com/test.html]');
it('should remove spaces if they occur around line feed', function () {
const html = '<p>A string of text\nwith \nmultiple\n spaces \n that \n \n can be safely removed.</p>';
const expected = 'A string of text\nwith\nmultiple\nspaces\nthat\n\ncan be safely removed.';
expect(htmlToText(html, { preserveNewlines: true })).to.equal(expected);
});
});
describe('unicode support', function () {
it('should decode &#128514; to 😂', function () {
const result = htmlToText('&#128514;');
expect(result).to.equal('😂');
it('should remove spaces if they occur around line feed 2', function () {
const html = 'multiple\n spaces';
const expected = 'multiple\nspaces';
expect(htmlToText(html, { preserveNewlines: true })).to.equal(expected);
});
});
// Generates one test per heading level (h1 through h6). Each test converts
// `<hN>test</hN>` with `uppercaseHeadings: false` and expects the text to
// come back as the unchanged, lowercase `test`.
describe('disable uppercaseHeadings', function () {
  for (const level of [1, 2, 3, 4, 5, 6]) {
    it(`should return h${level} in lowercase`, function () {
      const html = `<h${level}>test</h${level}>`;
      const options = { uppercaseHeadings: false };
      expect(htmlToText(html, options)).to.equal('test');
    });
  }
});
describe('custom formatting', function () {
// Replaces the `heading` formatter with a custom one that lowercases every
// word and frames the finished block between two rules of `=` characters,
// each as long as the block text.
// NOTE(review): the numeric arguments to openBlock/closeBlock are positional;
// presumably they are line-break counts - confirm against BlockTextBuilder.
it('should allow to override formatting of existing tags', function () {
  const lowerCaseWord = word => word.toLowerCase();
  const frameWithRules = text => {
    const rule = '='.repeat(text.length);
    return `${rule}\n${text}\n${rule}`;
  };
  const options = {
    formatters: {
      heading(elem, walk, builder, formatOptions) {
        builder.openBlock(2);
        // The transform is only active while the heading's children are walked.
        builder.pushWordTransform(lowerCaseWord);
        walk(elem.children, builder);
        builder.popWordTransform();
        builder.closeBlock(2, frameWithRules);
      }
    }
  };
  const actual = htmlToText('<h1>TeSt</h1><h1>mOrE tEsT</h1>', options);
  expect(actual).to.equal('====\ntest\n====\n\n=========\nmore test\n=========');
});
describe('unicode and html entities', function () {
it('should allow to skip tags with dummy formatting function', function () {
const input = '<ruby>漢<rt>かん</rt>字<rt>じ</rt></ruby>';
const expected = '漢字';
const result = htmlToText(
input,
{ tags: { 'rt': { format: 'skip' } } }
);
expect(result).to.equal(expected);
it('should decode &#128514; to 😂', function () {
expect(htmlToText('&#128514;')).to.equal('😂');
});
it('should allow to define basic support for inline tags', function () {
const input = /*html*/`<p>a <span>b </span>c<span> d </span>e</p>`;
const expected = 'a b c d e';
const result = htmlToText(
input,
{ tags: { 'span': { format: 'inline' } } }
);
expect(result).to.equal(expected);
it('should decode &lt;&gt; to <>', function () {
expect(htmlToText('<span>span</span>, &lt;not a span&gt;')).to.equal('span, <not a span>');
});
// Maps four otherwise unknown tags to the `block` format and expects the
// content of each one to end up on a line of its own.
it('should allow to define basic support for block-level tags', function () {
  const html = /*html*/`<widget><gadget>a</gadget><fidget>b</fidget></widget>c<budget>d</budget>e`;
  const options = {
    tags: {
      'budget': { format: 'block' },
      'fidget': { format: 'block' },
      'gadget': { format: 'block' },
      'widget': { format: 'block' }
    }
  };
  const actual = htmlToText(html, options);
  expect(actual).to.equal('a\nb\nc\nd\ne');
});
// Registers two custom formatters and binds them to the made-up tags
// `<foo>` and `<bar>` through the `tags` option.
it('should allow to add support for different tags', function () {
  // Renders the children of the element as a block wrapped in [FOO]...[/FOO].
  function formatFoo(elem, walk, builder, formatOptions) {
    builder.openBlock(1);
    walk(elem.children, builder);
    builder.closeBlock(1, text => `[FOO]${text}[/FOO]`);
  }

  // Emits an inline marker that echoes the element's `src` attribute.
  function formatBar(elem, walk, builder, formatOptions) {
    // attribute availability check is left out for brevity
    const source = elem.attribs.src;
    builder.addInline(`[BAR src="${source}"]`, true);
  }

  const options = {
    formatters: { 'formatFoo': formatFoo, 'formatBar': formatBar },
    tags: {
      'foo': { format: 'formatFoo' },
      'bar': { format: 'formatBar' }
    }
  };
  const html = '<div><foo>foo<br/>content</foo><bar src="bar.src" /></div>';
  const actual = htmlToText(html, options);
  expect(actual).to.equal('[FOO]foo\ncontent[/FOO]\n[BAR src="bar.src"]');
});
});
describe('Base element', function () {
describe('base element', function () {
it('should retrieve and convert the entire document under `body` by default', function () {
const htmlFile = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8');
const txtFile = fs.readFileSync(path.join(__dirname, 'test.txt'), 'utf8');
const html = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8');
const expected = fs.readFileSync(path.join(__dirname, 'test.txt'), 'utf8');
const options = { tables: ['#invoice', '.address'] };
const text = htmlToText(htmlFile, options);
expect(text).to.equal(txtFile);
expect(htmlToText(html, options)).to.equal(expected);
});
it('should only retrieve and convert content under the specified base element if found', function () {
const htmlFile = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8');
const txtFile = fs.readFileSync(path.join(__dirname, 'test-address.txt'), 'utf8');
const html = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8');
const expected = fs.readFileSync(path.join(__dirname, 'test-address.txt'), 'utf8');
const options = {

@@ -765,10 +215,8 @@ tables: ['.address'],

};
const text = htmlToText(htmlFile, options);
expect(text).to.equal(txtFile);
expect(htmlToText(html, options)).to.equal(expected);
});
it('should retrieve and convert content under multiple base elements', function () {
const htmlFile = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8');
const txtFile = fs.readFileSync(path.join(__dirname, 'test-address-dup.txt'), 'utf8');
const html = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8');
const expected = fs.readFileSync(path.join(__dirname, 'test-address-dup.txt'), 'utf8');
const options = {

@@ -778,10 +226,8 @@ tables: ['.address'],

};
const text = htmlToText(htmlFile, options);
expect(text).to.equal(txtFile);
expect(htmlToText(html, options)).to.equal(expected);
});
it('should retrieve and convert content under multiple base elements in any order', function () {
const htmlFile = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8');
const txtFile = fs.readFileSync(path.join(__dirname, 'test-any-order.txt'), 'utf8');
const html = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8');
const expected = fs.readFileSync(path.join(__dirname, 'test-any-order.txt'), 'utf8');
const options = {

@@ -791,10 +237,8 @@ tables: ['.address'],

};
const text = htmlToText(htmlFile, options);
expect(text).to.equal(txtFile);
expect(htmlToText(html, options)).to.equal(expected);
});
it('should process the first base element found when multiple exist', function () {
const htmlFile = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8');
const txtFile = fs.readFileSync(path.join(__dirname, 'test-first-element.txt'), 'utf8');
const html = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8');
const expected = fs.readFileSync(path.join(__dirname, 'test-first-element.txt'), 'utf8');
const options = {

@@ -804,10 +248,8 @@ tables: ['.address'],

};
const text = htmlToText(htmlFile, options);
expect(text).to.equal(txtFile);
expect(htmlToText(html, options)).to.equal(expected);
});
it('should retrieve and convert the entire document by default if no base element is found', function () {
const htmlFile = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8');
const txtFile = fs.readFileSync(path.join(__dirname, 'test.txt'), 'utf8');
const html = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8');
const expected = fs.readFileSync(path.join(__dirname, 'test.txt'), 'utf8');
const options = {

@@ -817,10 +259,8 @@ tables: ['#invoice', '.address'],

};
const text = htmlToText(htmlFile, options);
expect(text).to.equal(txtFile);
expect(htmlToText(html, options)).to.equal(expected);
});
it('should return null if the base element isn\'t found and we\'re not returning the DOM by default', function () {
const htmlFile = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8');
const expectedTxt = '';
const html = fs.readFileSync(path.join(__dirname, 'test.html'), 'utf8');
const expected = '';
const options = {

@@ -831,116 +271,141 @@ baseElement: 'table.notthere',

};
const text = htmlToText(htmlFile, options);
expect(text).to.equal(expectedTxt);
expect(htmlToText(html, options)).to.equal(expected);
});
});
describe('Long words', function () {
describe('long words', function () {
it('should split long words if forceWrapOnLimit is set, existing linefeeds converted to space', function () {
const testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>';
expect(htmlToText(testString, { longWordSplit: { wrapCharacters: ['/'], forceWrapOnLimit: true } }))
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlo\nng word_with_following_text.');
const html = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>';
const options = { longWordSplit: { wrapCharacters: ['/'], forceWrapOnLimit: true } };
const expected = '_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlo\nng word_with_following_text.';
expect(htmlToText(html, options)).to.equal(expected);
});
it('should not wrap a string if longWordSplit is not set', function () {
const testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlongword_with_following_text.</p>';
expect(htmlToText(testString, {}))
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlongword_with_following_text.');
const html = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlongword_with_following_text.</p>';
const expected = '_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlongword_with_following_text.';
expect(htmlToText(html, {})).to.equal(expected);
});
it('should not wrap a string if wrapCharacters are set but not found and forceWrapOnLimit is not set', function () {
const testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>';
expect(htmlToText(testString, { longWordSplit: { wrapCharacters: ['/'], forceWrapOnLimit: false } }))
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.');
const html = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>';
const options = { longWordSplit: { wrapCharacters: ['/'], forceWrapOnLimit: false } };
const expected = '_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.';
expect(htmlToText(html, options)).to.equal(expected);
});
it('should not wrap a string if wrapCharacters are not set and forceWrapOnLimit is not set', function () {
const testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>';
expect(htmlToText(testString, { longWordSplit: { wrapCharacters: [], forceWrapOnLimit: false } }))
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.');
const html = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>';
const options = { longWordSplit: { wrapCharacters: [], forceWrapOnLimit: false } };
const expected = '_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.';
expect(htmlToText(html, options)).to.equal(expected);
});
it('should wrap on the last instance of a wrap character before the wordwrap limit.', function () {
const testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>';
expect(htmlToText(testString, { longWordSplit: { wrapCharacters: ['/', '_'], forceWrapOnLimit: false } }))
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split_properly_across_\nanewlineandlong word_with_following_text.');
const html = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>';
const options = { longWordSplit: { wrapCharacters: ['/', '_'], forceWrapOnLimit: false } };
const expected = '_This_string_is_meant_to_test_if_a_string_is_split_properly_across_\nanewlineandlong word_with_following_text.';
expect(htmlToText(html, options)).to.equal(expected);
});
it('should wrap on the last instance of a wrap character before the wordwrap limit. Content of wrapCharacters shouldn\'t matter.', function () {
const testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>';
expect(htmlToText(testString, { longWordSplit: { wrapCharacters: ['/', '-', '_'], forceWrapOnLimit: false } }))
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split_properly_across_\nanewlineandlong word_with_following_text.');
const html = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>';
const options = { longWordSplit: { wrapCharacters: ['/', '-', '_'], forceWrapOnLimit: false } };
const expected = '_This_string_is_meant_to_test_if_a_string_is_split_properly_across_\nanewlineandlong word_with_following_text.';
expect(htmlToText(html, options)).to.equal(expected);
});
it('should wrap on the last instance of a wrap character before the wordwrap limit. Order of wrapCharacters shouldn\'t matter.', function () {
const testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>';
expect(htmlToText(testString, { longWordSplit: { wrapCharacters: ['_', '/'], forceWrapOnLimit: false } }))
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split_properly_across_\nanewlineandlong word_with_following_text.');
const html = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>';
const options = { longWordSplit: { wrapCharacters: ['_', '/'], forceWrapOnLimit: false } };
const expected = '_This_string_is_meant_to_test_if_a_string_is_split_properly_across_\nanewlineandlong word_with_following_text.';
expect(htmlToText(html, options)).to.equal(expected);
});
it('should wrap on the last instance of a wrap character before the wordwrap limit. Should preference wrapCharacters in order', function () {
const testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split-properly_across_anewlineandlong\nword_with_following_text.</p>';
expect(htmlToText(testString, { longWordSplit: { wrapCharacters: ['-', '_', '/'], forceWrapOnLimit: false } }))
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split-\nproperly_across_anewlineandlong word_with_following_text.');
const html = '<p>_This_string_is_meant_to_test_if_a_string_is_split-properly_across_anewlineandlong\nword_with_following_text.</p>';
const options = { longWordSplit: { wrapCharacters: ['-', '_', '/'], forceWrapOnLimit: false } };
const expected = '_This_string_is_meant_to_test_if_a_string_is_split-\nproperly_across_anewlineandlong word_with_following_text.';
expect(htmlToText(html, options)).to.equal(expected);
});
it('should not wrap a string that is too short', function () {
const testString = '<p>https://github.com/werk85/node-html-to-text/blob/master/lib/html-to-text.js</p>';
expect(htmlToText(testString, { longWordSplit: { wrapCharacters: ['/', '-'], forceWrapOnLimit: false } }))
.to.equal('https://github.com/werk85/node-html-to-text/blob/master/lib/html-to-text.js');
const html = '<p>https://github.com/werk85/node-html-to-text/blob/master/lib/html-to-text.js</p>';
const options = { longWordSplit: { wrapCharacters: ['/', '-'], forceWrapOnLimit: false } };
const expected = 'https://github.com/werk85/node-html-to-text/blob/master/lib/html-to-text.js';
expect(htmlToText(html, options)).to.equal(expected);
});
it('should wrap a url string using \'/\'', function () {
const testString = '<p>https://github.com/AndrewFinlay/node-html-to-text/commit/64836a5bd97294a672b24c26cb8a3ccdace41001</p>';
expect(htmlToText(testString, { longWordSplit: { wrapCharacters: ['/', '-'], forceWrapOnLimit: false } }))
.to.equal('https://github.com/AndrewFinlay/node-html-to-text/commit/\n64836a5bd97294a672b24c26cb8a3ccdace41001');
const html = '<p>https://github.com/AndrewFinlay/node-html-to-text/commit/64836a5bd97294a672b24c26cb8a3ccdace41001</p>';
const options = { longWordSplit: { wrapCharacters: ['/', '-'], forceWrapOnLimit: false } };
const expected = 'https://github.com/AndrewFinlay/node-html-to-text/commit/\n64836a5bd97294a672b24c26cb8a3ccdace41001';
expect(htmlToText(html, options)).to.equal(expected);
});
it('should wrap very long url strings using \'/\'', function () {
const testString = '<p>https://github.com/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/html-to-text.js</p>';
expect(htmlToText(testString, { longWordSplit: { wrapCharacters: ['/', '-'], forceWrapOnLimit: false } }))
.to.equal('https://github.com/werk85/node-html-to-text/blob/master/lib/werk85/\nnode-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/\nwerk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/\nlib/html-to-text.js');
const html = '<p>https://github.com/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/html-to-text.js</p>';
const options = { longWordSplit: { wrapCharacters: ['/', '-'], forceWrapOnLimit: false } };
const expected = 'https://github.com/werk85/node-html-to-text/blob/master/lib/werk85/\nnode-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/\nwerk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/\nlib/html-to-text.js';
expect(htmlToText(html, options)).to.equal(expected);
});
it('should wrap very long url strings using limit', function () {
const testString = '<p>https://github.com/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/html-to-text.js</p>';
expect(htmlToText(testString, { longWordSplit: { wrapCharacters: [], forceWrapOnLimit: true } }))
.to.equal('https://github.com/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-\ntext/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-t\no-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/html-to-text.js');
const html = '<p>https://github.com/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/html-to-text.js</p>';
const options = { longWordSplit: { wrapCharacters: [], forceWrapOnLimit: true } };
const expected = 'https://github.com/werk85/node-html-to-text/blob/master/lib/werk85/node-html-to-\ntext/blob/master/lib/werk85/node-html-to-text/blob/master/lib/werk85/node-html-t\no-text/blob/master/lib/werk85/node-html-to-text/blob/master/lib/html-to-text.js';
expect(htmlToText(html, options)).to.equal(expected);
});
it('should honour preserveNewlines and split long words', function () {
const testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>';
expect(htmlToText(testString, { preserveNewlines: true, longWordSplit: { wrapCharacters: ['/', '_'], forceWrapOnLimit: false } }))
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split_properly_across_\nanewlineandlong\nword_with_following_text.');
const html = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>';
const options = { preserveNewlines: true, longWordSplit: { wrapCharacters: ['/', '_'], forceWrapOnLimit: false } };
const expected = '_This_string_is_meant_to_test_if_a_string_is_split_properly_across_\nanewlineandlong\nword_with_following_text.';
expect(htmlToText(html, options)).to.equal(expected);
});
it('should not put in extra linefeeds if the end of the untouched long string coincides with a preserved line feed', function () {
const testString = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>';
expect(htmlToText(testString, { preserveNewlines: true }))
.to.equal('_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.');
const html = '<p>_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.</p>';
const expected = '_This_string_is_meant_to_test_if_a_string_is_split_properly_across_anewlineandlong\nword_with_following_text.';
expect(htmlToText(html, { preserveNewlines: true })).to.equal(expected);
});
it('should split long strings buried in links and hide the href', function () {
const testString = '<a href="http://images.fb.com/2015/12/21/ivete-sangalo-launches-360-music-video-on-facebook/">http://images.fb.com/2015/12/21/ivete-sangalo-launches-360-music-video-on-facebook/</a>';
expect(htmlToText(testString, { hideLinkHrefIfSameAsText: true, longWordSplit: { wrapCharacters: ['/', '_'], forceWrapOnLimit: false } }))
.to.equal('http://images.fb.com/2015/12/21/\nivete-sangalo-launches-360-music-video-on-facebook/');
const html = '<a href="http://images.fb.com/2015/12/21/ivete-sangalo-launches-360-music-video-on-facebook/">http://images.fb.com/2015/12/21/ivete-sangalo-launches-360-music-video-on-facebook/</a>';
const options = { longWordSplit: { wrapCharacters: ['/', '_'], forceWrapOnLimit: false }, tags: { 'a': { options: { hideLinkHrefIfSameAsText: true } } } };
const expected = 'http://images.fb.com/2015/12/21/\nivete-sangalo-launches-360-music-video-on-facebook/';
expect(htmlToText(html, options)).to.equal(expected);
});
it('should split long strings buried in links and show the href', function () {
const testString = '<a href="http://images.fb.com/2015/12/21/ivete-sangalo-launches-360-music-video-on-facebook/">http://images.fb.com/2015/12/21/ivete-sangalo-launches-360-music-video-on-facebook/</a>';
expect(htmlToText(testString, { hideLinkHrefIfSameAsText: false, longWordSplit: { wrapCharacters: ['/', '_'], forceWrapOnLimit: false } }))
.to.equal('http://images.fb.com/2015/12/21/\nivete-sangalo-launches-360-music-video-on-facebook/\n[http://images.fb.com/2015/12/21/\nivete-sangalo-launches-360-music-video-on-facebook/]');
const html = '<a href="http://images.fb.com/2015/12/21/ivete-sangalo-launches-360-music-video-on-facebook/">http://images.fb.com/2015/12/21/ivete-sangalo-launches-360-music-video-on-facebook/</a>';
const options = { longWordSplit: { wrapCharacters: ['/', '_'], forceWrapOnLimit: false } };
const expected = 'http://images.fb.com/2015/12/21/\nivete-sangalo-launches-360-music-video-on-facebook/\n[http://images.fb.com/2015/12/21/\nivete-sangalo-launches-360-music-video-on-facebook/]';
expect(htmlToText(html, options)).to.equal(expected);
});
});
describe('whitespace', function () {
it('should not be ignored inside a whitespace-only node', function () {
const testString = 'foo<span> </span>bar';
expect(htmlToText(testString)).to.equal('foo bar');
const html = 'foo<span> </span>bar';
const expected = 'foo bar';
expect(htmlToText(html)).to.equal(expected);
});
// Each span holds a single whitespace character written as an entity
// (&#x0020;, &Tab;, &NewLine;, &#10;); every one of them is expected to
// show up as one plain space between the surrounding letters.
it('should handle html character entities for html whitespace characters', function () {
  const input = /*html*/`a<span>&#x0020;</span>b<span>&Tab;</span>c<span>&NewLine;</span>d<span>&#10;</span>e`;
  expect(htmlToText(input)).to.equal('a b c d e');
});
it('should not add additional whitespace after <sup>', function () {
const testString = '<p>This text contains <sup>superscript</sup> text.</p>';
const html = '<p>This text contains <sup>superscript</sup> text.</p>';
const options = { preserveNewlines: true };
expect(htmlToText(testString, options)).to.equal('This text contains superscript text.');
const expected = 'This text contains superscript text.';
expect(htmlToText(html, options)).to.equal(expected);
});

@@ -950,16 +415,20 @@

// No-Break Space - decimal 160, hex \u00a0.
const testString = '<span>first span\u00a0</span>\u00a0<span>\u00a0last span</span>';
const html = /*html*/`<span>first span\u00a0</span>&nbsp;<span>&#160;last span</span>`;
const expectedDefault = 'first span\u00a0\u00a0\u00a0last span';
expect(htmlToText(html)).to.equal(expectedDefault);
const options = { whitespaceCharacters: ' \t\r\n\f\u200b\u00a0' };
const expectedCustom = 'first span last span';
const options = { whitespaceCharacters: ' \t\r\n\f\u200b\u00a0' };
expect(htmlToText(testString)).to.equal(expectedDefault);
expect(htmlToText(testString, options)).to.equal(expectedCustom);
expect(htmlToText(html, options)).to.equal(expectedCustom);
});
it('should handle space and newline combination - keep space when and only when needed', function () {
const testString = '<span>foo</span> \n<span>bar</span>\n <span>baz</span>';
const defaultResult = htmlToText(testString);
const resultWithNewLine = htmlToText(testString, { preserveNewlines: true });
expect(defaultResult).to.equal('foo bar baz');
expect(resultWithNewLine).to.equal('foo\nbar\nbaz');
const html = '<span>foo</span> \n<span>bar</span>\n <span>baz</span>';
const expectedDefault = 'foo bar baz';
expect(htmlToText(html)).to.equal(expectedDefault);
const expectedCustom = 'foo\nbar\nbaz';
expect(htmlToText(html, { preserveNewlines: true })).to.equal(expectedCustom);
});

@@ -974,39 +443,30 @@

</html>`;
const text = htmlToText(html);
expect(text).to.equal('foo\n\nbar');
const expected = 'foo\n\nbar';
expect(htmlToText(html)).to.equal(expected);
});
// Declares `div` explicitly as a block-level tag and expects the two divs
// to be rendered on consecutive lines with no leading spaces.
// NOTE(review): the title mentions space-indented html, but the literal as
// visible here carries no indentation - it may have been lost in rendering.
it('should not have extra spaces at the beginning for space-indented html with explicitly block-level tags', function () {
  const options = { tags: { 'div': { format: 'block', level: 'block' } } };
  const document = /*html*/`<html>
<body>
<div>foo</div>
<div>bar</div>
</body>
</html>`;
  const actual = htmlToText(document, options);
  expect(actual).to.equal('foo\nbar');
});
});
describe('lots of tags, limits', function () {
it('should handle a large number of wbr tags w/o stack overflow', function () {
let testString = '<!DOCTYPE html><html><head></head><body>\n';
let expectedResult = '';
let html = '<!DOCTYPE html><html><head></head><body>\n';
let expected = '';
for (let i = 0; i < 10000; i++) {
if (i !== 0 && i % 80 === 0) {
expectedResult += '\n';
expected += '\n';
}
expectedResult += 'n';
testString += '<wbr>n';
expected += 'n';
html += '<wbr>n';
}
testString += '</body></html>';
expect(htmlToText(testString)).to.equal(expectedResult);
html += '</body></html>';
expect(htmlToText(html)).to.equal(expected);
});
it('should handle a very large number of wbr tags with limits', function () {
let testString = '<!DOCTYPE html><html><head></head><body>';
let html = '<!DOCTYPE html><html><head></head><body>';
for (let i = 0; i < 70000; i++) {
testString += '<wbr>n';
html += '<wbr>n';
}
testString += '</body></html>';
html += '</body></html>';
const options = {

@@ -1018,8 +478,8 @@ limits: {

};
const expectedResult = 'nnnnn(...)';
expect(htmlToText(testString, options)).to.equal(expectedResult);
const expected = 'nnnnn(...)';
expect(htmlToText(html, options)).to.equal(expected);
});
it('should respect maxDepth limit', function () {
const testString = /*html*/`<!DOCTYPE html><html><head></head><body><span>a<span>b<span>c<span>d</span>e</span>f</span>g<span>h<span>i<span>j</span>k</span>l</span>m</span></body></html>`;
const html = /*html*/`<!DOCTYPE html><html><head></head><body><span>a<span>b<span>c<span>d</span>e</span>f</span>g<span>h<span>i<span>j</span>k</span>l</span>m</span></body></html>`;
const options = {

@@ -1031,40 +491,40 @@ limits: {

};
const expectedResult = 'a(...)g(...)m';
expect(htmlToText(testString, options)).to.equal(expectedResult);
const expected = 'a(...)g(...)m';
expect(htmlToText(html, options)).to.equal(expected);
});
it('should respect maxChildNodes limit', function () {
const testString = /*html*/`<!DOCTYPE html><html><head></head><body><p>a</p><p>b</p><p>c</p><p>d</p><p>e</p><p>f</p><p>g</p><p>h</p><p>i</p><p>j</p></body></html>`;
const html = /*html*/`<!DOCTYPE html><html><head></head><body><p>a</p><p>b</p><p>c</p><p>d</p><p>e</p><p>f</p><p>g</p><p>h</p><p>i</p><p>j</p></body></html>`;
const options = {
singleNewLineParagraphs: true,
limits: {
maxChildNodes: 6,
ellipsis: '(skipped the rest)'
}
},
tags: { 'p': { options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } } }
};
const expectedResult = 'a\nb\nc\nd\ne\nf\n(skipped the rest)';
expect(htmlToText(testString, options)).to.equal(expectedResult);
const expected = 'a\nb\nc\nd\ne\nf\n(skipped the rest)';
expect(htmlToText(html, options)).to.equal(expected);
});
it('should not add ellipsis when maxChildNodes limit is exact match', function () {
const testString = /*html*/`<!DOCTYPE html><html><head></head><body><p>a</p><p>b</p><p>c</p><p>d</p><p>e</p><p>f</p><p>g</p><p>h</p><p>i</p><p>j</p></body></html>`;
const html = /*html*/`<!DOCTYPE html><html><head></head><body><p>a</p><p>b</p><p>c</p><p>d</p><p>e</p><p>f</p><p>g</p><p>h</p><p>i</p><p>j</p></body></html>`;
const options = {
singleNewLineParagraphs: true,
limits: {
maxChildNodes: 10,
ellipsis: 'can\'t see me'
}
},
tags: { 'p': { options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } } }
};
const expectedResult = 'a\nb\nc\nd\ne\nf\ng\nh\ni\nj';
expect(htmlToText(testString, options)).to.equal(expectedResult);
const expected = 'a\nb\nc\nd\ne\nf\ng\nh\ni\nj';
expect(htmlToText(html, options)).to.equal(expected);
});
it('should use default ellipsis value if none provided', function () {
const testString = /*html*/`<!DOCTYPE html><html><head></head><body><p>a</p><p>b</p><p>c</p><p>d</p><p>e</p><p>f</p><p>g</p><p>h</p><p>i</p><p>j</p></body></html>`;
const html = /*html*/`<!DOCTYPE html><html><head></head><body><p>a</p><p>b</p><p>c</p><p>d</p><p>e</p><p>f</p><p>g</p><p>h</p><p>i</p><p>j</p></body></html>`;
const options = {
singleNewLineParagraphs: true,
limits: { maxChildNodes: 6 }
limits: { maxChildNodes: 6 },
tags: { 'p': { options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } } }
};
const expectedResult = 'a\nb\nc\nd\ne\nf\n...';
expect(htmlToText(testString, options)).to.equal(expectedResult);
const expected = 'a\nb\nc\nd\ne\nf\n...';
expect(htmlToText(html, options)).to.equal(expected);
});

@@ -1090,110 +550,19 @@

// Feeds 20,000,000 characters with no explicit limit and expects the output to
// be cut to 1 << 24 (16777216) characters, with a single warning on stderr.
it('should respect default limit of maxInputLength', function () {
  const html = '0123456789'.repeat(2000000);
  const options = { wordwrap: false };
  // Convert exactly once: the stderr buffer is compared against a single warning line.
  expect(htmlToText(html, options).length).to.equal(1 << 24);
  const expectedStderrBuffer = 'Input length 20000000 is above allowed limit of 16777216. Truncating without ellipsis.\n';
  expect(getProcessStderrBuffer()).to.equal(expectedStderrBuffer);
});
// Feeds 20,000,000 characters with `limits.maxInputLength` set to 42 and expects
// a 42-character result, with a single warning on stderr.
it('should respect custom maxInputLength', function () {
  const html = '0123456789'.repeat(2000000);
  const options = { limits: { maxInputLength: 42 } };
  // Convert exactly once: the stderr buffer is compared against a single warning line.
  expect(htmlToText(html, options).length).to.equal(42);
  const expectedStderrBuffer = 'Input length 20000000 is above allowed limit of 42. Truncating without ellipsis.\n';
  expect(getProcessStderrBuffer()).to.equal(expectedStderrBuffer);
});
});
// Expectations for how <blockquote> content is rendered as '> '-prefixed lines.
describe('blockquote', function () {

  // A quote between two text runs: prefixed with '> ' and set apart by blank lines.
  it('should handle format single-line blockquote', function () {
    const html = 'foo<blockquote>test</blockquote>bar';
    expect(htmlToText(html)).to.equal('foo\n\n> test\n\nbar');
  });

  // A <br/> inside the quote starts a new prefixed line.
  it('should format multi-line blockquote', function () {
    const html = '<blockquote>a<br/>b</blockquote>';
    expect(htmlToText(html)).to.equal('> a\n> b');
  });

  // Leading/trailing empty lines are dropped by default and kept
  // when `trimEmptyLines` is switched off for the tag.
  it('should trim newlines unless disabled', function () {
    const html = '<blockquote><br/>a<br/><br/><br/></blockquote>';
    const keepEmptyLines = { tags: { 'blockquote': { options: { trimEmptyLines: false } } } };

    const trimmed = htmlToText(html);
    expect(trimmed).to.equal('> a');

    const untrimmed = htmlToText(html, keepEmptyLines);
    expect(untrimmed).to.equal('> \n> a\n> \n> \n> ');
  });

});
// Expectations for how <pre> (preformatted) content is converted to text.
// NOTE(review): whitespace inside the fixtures and expected strings below looks
// collapsed in this copy (single spaces where runs would be expected) —
// verify against the upstream test file before relying on exact values.
describe('pre', function () {
// Upper-case tag names (<P>, <PRE>) are used in the input; the line breaks
// inside the <PRE> body are expected to appear in the output unchanged.
it('should support simple preformatted text', function () {
const testString = '<P>Code fragment:</P><PRE> body {\n color: red;\n }</PRE>';
const expectedResult = 'Code fragment:\n\n body {\n color: red;\n }';
expect(htmlToText(testString)).to.equal(expectedResult);
});
// Inline tags (<code>, <em>, <strong>) nested in <pre>: only their text is expected
// in the output, and the &lt; / &gt; entities are expected decoded to '<' / '>'.
it('should support preformatted text with inner tags', function () {
const testString = /*html*/`<p>Code fragment:</p>
<pre><code> var total = 0;
<em style="color: green;">// Add 1 to total and display in a paragraph</em>
<strong style="color: blue;">document.write('&lt;p&gt;Sum: ' + (total + 1) + '&lt;/p&gt;');</strong></code></pre>`;
const expectedResult = `Code fragment:\n\n var total = 0;\n\n // Add 1 to total and display in a paragraph\n document.write('<p>Sum: ' + (total + 1) + '</p>');`;
expect(htmlToText(testString)).to.equal(expectedResult);
});
// A <br/> inside <pre> is expected to become a newline while the spaces
// around the text on each line are kept.
it('should support preformatted text with line break tags', function () {
const testString = '<pre> line 1 <br/> line 2 </pre>';
const expectedResult = ' line 1 \n line 2 ';
expect(htmlToText(testString)).to.equal(expectedResult);
});
// A table nested in <pre>, converted with the `tables: true` option.
// Cells contain &#32; (numeric character reference for a space) and raw line breaks.
it('should support preformatted text with a table', function () {
const testString = /*html*/`
<pre><table>
<tr>
<td>[a&#32;&#32;&#32;
</td>
<td> b&#32;&#32;
</td>
<td> c]
</td>
</tr>
<tr>
<td>&#32;&#32;&#32;&#32;&#32;
d]</td>
<td>&#32;&#32;&#32;&#32;&#32;
e </td>
<td>&#32;&#32;&#32;&#32;&#32;
[f </td>
</tr>
</table></pre>`;
const expectedResult =
'[a b c]\n' +
' \n' +
' \n' +
' d] e [f ';
expect(htmlToText(testString, { tables: true })).to.equal(expectedResult);
});
});
// Expectations for how <hr/> is rendered as a line of dashes between blocks.
describe('hr', function () {

  // Every case converts the same markup: a rule between two <div> blocks.
  const html = '<div>foo</div><hr/><div>bar</div>';

  // No options: the rule uses the default length.
  it('should output horizontal line of default length', function () {
    const text = htmlToText(html);
    expect(text).to.equal('foo\n\n--------------------------------------------------------------------------------\n\nbar');
  });

  // An explicit `length` option on the 'hr' tag sets the number of dashes.
  it('should output horizontal line of specific length', function () {
    const shortRule = { tags: { 'hr': { options: { length: 30 } } } };
    const text = htmlToText(html, shortRule);
    expect(text).to.equal('foo\n\n------------------------------\n\nbar');
  });

  // With word wrapping turned off the rule is expected to be 40 dashes long.
  it('should output horizontal line of length 40 when wordwrap is disabled', function () {
    const noWrap = { wordwrap: false };
    const text = htmlToText(html, noWrap);
    expect(text).to.equal('foo\n\n----------------------------------------\n\nbar');
  });

});
});

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc