html-to-text
Advanced tools
Comparing version 8.0.0 to 8.1.0
@@ -21,3 +21,3 @@ module.exports = { | ||
'class-methods-use-this': 'error', | ||
'complexity': ['error', 14], | ||
'complexity': ['error', 15], | ||
'consistent-return': 'error', | ||
@@ -83,3 +83,3 @@ 'curly': 'error', | ||
'max-len': ['error', 130], | ||
'max-lines-per-function': ['error', 80], | ||
'max-lines-per-function': ['error', 81], | ||
'max-nested-callbacks': ['error', 4], | ||
@@ -86,0 +86,0 @@ 'max-params': ['error', 5], |
# Changelog | ||
## Version 8.1.0 | ||
* Fix for too many newlines in certain cases when `preserveNewlines` option is used. Addresses [#232](https://github.com/html-to-text/node-html-to-text/issues/232); | ||
* Link and image formatters now have a `linkBrackets` option - it accepts an array of two strings (default: `['[', ']']`) or `false` to remove the brackets. Addresses [#236](https://github.com/html-to-text/node-html-to-text/issues/236); | ||
* `noLinkBrackets` formatters option is now deprecated. | ||
All commits: [8.0.0...8.1.0](https://github.com/html-to-text/node-html-to-text/compare/8.0.0...8.1.0) | ||
## Version 8.0.0 | ||
@@ -29,3 +37,3 @@ | ||
Base elements can be arranged in output text in the order of matched selectors (default, to keep it closer to the old implementation) or in the order of appearance in sourse HTML document. | ||
Base elements can be arranged in output text in the order of matched selectors (default, to keep it closer to the old implementation) or in the order of appearance in source HTML document. | ||
@@ -113,3 +121,3 @@ BREAKING CHANGE: previous implementation was treating id selectors in the same way as class selectors (could match `<foo id="a b">` with `foo#a` selector). New implementation is closer to the spec and doesn't expect multiple ids on an element. You can achieve the old behavior with `foo[id~=a]` selector in case you rely on it for some poorly formatted documents (note that it has different specificity though). | ||
Please refer to the Readme to see how things are wired now, in case you were using them for anything othen than dealing with the lack of block-level tags support. | ||
Please refer to the Readme to see how things are wired now, in case you were using them for anything other than dealing with the lack of block-level tags support. | ||
@@ -216,3 +224,3 @@ ### Tables support was improved | ||
* Extra space ploblem fixed. #88 | ||
* Extra space problem fixed. #88 | ||
@@ -258,3 +266,3 @@ ## Version 2.1.0 | ||
* Uppercase tag processing added. Table center support added. #56 | ||
* Unuused dependencies removed. | ||
* Unused dependencies removed. | ||
@@ -261,0 +269,0 @@ ## Version 1.3.2 |
@@ -32,3 +32,3 @@ | ||
this.picker = picker; | ||
this.whitepaceProcessor = new WhitespaceProcessor(options); | ||
this.whitespaceProcessor = new WhitespaceProcessor(options); | ||
/** @type { StackItem } */ | ||
@@ -139,15 +139,27 @@ this._stackItem = new BlockStackItem(options); | ||
if ( | ||
this.whitepaceProcessor.testContainsWords(str) || // There are words to add; | ||
(str.length && !this._stackItem.stashedLineBreaks) // or at least spaces to take into account. | ||
) { | ||
if (this._stackItem.stashedLineBreaks) { | ||
this._stackItem.inlineTextBuilder.startNewLine(this._stackItem.stashedLineBreaks); | ||
str.length === 0 || // empty string | ||
( | ||
this._stackItem.stashedLineBreaks && // stashed linebreaks make whitespace irrelevant | ||
!this.whitespaceProcessor.testContainsWords(str) // no words to add | ||
) | ||
) { return; } | ||
if (this.options.preserveNewlines) { | ||
const newlinesNumber = this.whitespaceProcessor.countNewlinesNoWords(str); | ||
if (newlinesNumber > 0) { | ||
this._stackItem.inlineTextBuilder.startNewLine(newlinesNumber); | ||
// keep stashedLineBreaks unchanged | ||
return; | ||
} | ||
this.whitepaceProcessor.shrinkWrapAdd( | ||
str, | ||
this._stackItem.inlineTextBuilder, | ||
(this._wordTransformer && !noWordTransform) ? this._getCombinedWordTransformer() : undefined | ||
); | ||
this._stackItem.stashedLineBreaks = 0; // inline text doesn't introduce line breaks | ||
} | ||
if (this._stackItem.stashedLineBreaks) { | ||
this._stackItem.inlineTextBuilder.startNewLine(this._stackItem.stashedLineBreaks); | ||
} | ||
this.whitespaceProcessor.shrinkWrapAdd( | ||
str, | ||
this._stackItem.inlineTextBuilder, | ||
(this._wordTransformer && !noWordTransform) ? this._getCombinedWordTransformer() : undefined | ||
); | ||
this._stackItem.stashedLineBreaks = 0; // inline text doesn't introduce line breaks | ||
} | ||
@@ -325,3 +337,3 @@ | ||
* | ||
* Number value is depreceted. | ||
* Number value is deprecated. | ||
* | ||
@@ -328,0 +340,0 @@ * @param { number } [optionsObjectOrColSpacing.colSpacing = 3] |
@@ -129,2 +129,14 @@ const he = require('he'); | ||
function withBrackets (str, brackets) { | ||
if (!brackets) { return str; } | ||
const lbr = (typeof brackets[0] === 'string') | ||
? brackets[0] | ||
: '['; | ||
const rbr = (typeof brackets[1] === 'string') | ||
? brackets[1] | ||
: ']'; | ||
return lbr + str + rbr; | ||
} | ||
/** | ||
@@ -148,4 +160,4 @@ * Process an image. | ||
: (!alt) | ||
? '[' + src + ']' | ||
: alt + ' [' + src + ']'; | ||
? withBrackets(src, formatOptions.linkBrackets) | ||
: alt + ' ' + withBrackets(src, formatOptions.linkBrackets); | ||
@@ -190,5 +202,3 @@ builder.addInline(text); | ||
? href | ||
: (formatOptions.noLinkBrackets) | ||
? ' ' + href | ||
: ' [' + href + ']', | ||
: ' ' + withBrackets(href, formatOptions.linkBrackets), | ||
{ noWordTransform: true } | ||
@@ -205,3 +215,3 @@ ); | ||
* @param { FormatOptions } formatOptions Options specific to a formatter. | ||
* @param { () => string } nextPrefixCallback Function that returns inreasing index each time it is called. | ||
* @param { () => string } nextPrefixCallback Function that returns increasing index each time it is called. | ||
*/ | ||
@@ -215,3 +225,3 @@ function formatList (elem, walk, builder, formatOptions, nextPrefixCallback) { | ||
const listItems = (elem.children || []) | ||
// it might be more accuurate to check only for html spaces here, but no significant benefit | ||
// it might be more accurate to check only for html spaces here, but no significant benefit | ||
.filter(child => child.type !== 'text' || !/^\s*$/.test(child.data)) | ||
@@ -218,0 +228,0 @@ .map(function (child) { |
@@ -122,22 +122,2 @@ | ||
/** | ||
* Set a nested property of an object. | ||
* | ||
* @param { object } obj The object to modify. | ||
* @param { string[] } path The path to the property. | ||
* @param { any } value The value to set. | ||
*/ | ||
function set (obj, path, value) { | ||
const valueKey = path.pop(); | ||
for (const key of path) { | ||
let nested = obj[key]; | ||
if (!nested) { | ||
nested = {}; | ||
obj[key] = nested; | ||
} | ||
obj = nested; | ||
} | ||
obj[valueKey] = value; | ||
} | ||
/** | ||
* Deduplicate an array by a given key callback. | ||
@@ -175,5 +155,4 @@ * Item properties are merged recursively and with the preference for last defined values. | ||
numberToRoman: numberToRoman, | ||
set: set, | ||
splitClassesAndIds: splitClassesAndIds, | ||
trimCharacter: trimCharacter | ||
}; |
@@ -9,3 +9,3 @@ const { hp2Builder } = require('@selderee/plugin-htmlparser2'); | ||
const defaultFormatters = require('./formatter'); | ||
const { limitedDepthRecursive, mergeDuplicatesPreferLast, set } = require('./helper'); | ||
const { limitedDepthRecursive, mergeDuplicatesPreferLast, get } = require('./helper'); | ||
@@ -56,4 +56,4 @@ // eslint-disable-next-line import/no-unassigned-import | ||
ignoreHref: false, | ||
noAnchorUrl: true, | ||
noLinkBrackets: false | ||
linkBrackets: ['[', ']'], | ||
noAnchorUrl: true | ||
} | ||
@@ -84,3 +84,7 @@ }, | ||
}, | ||
{ selector: 'img', format: 'image', options: { baseUrl: null } }, | ||
{ | ||
selector: 'img', | ||
format: 'image', | ||
options: { baseUrl: null, linkBrackets: ['[', ']'] } | ||
}, | ||
{ selector: 'main', format: 'block' }, | ||
@@ -247,2 +251,15 @@ { selector: 'nav', format: 'block' }, | ||
function set (obj, path, value) { | ||
const valueKey = path.pop(); | ||
for (const key of path) { | ||
let nested = obj[key]; | ||
if (!nested) { | ||
nested = {}; | ||
obj[key] = nested; | ||
} | ||
obj = nested; | ||
} | ||
obj[valueKey] = value; | ||
} | ||
function copyFormatterOption (source, format, target) { | ||
@@ -299,2 +316,8 @@ if (options[source] === undefined) { return; } | ||
} | ||
for (const definition of selectorDefinitions) { | ||
if (definition.format === 'anchor' && get(definition, ['options', 'noLinkBrackets'])) { | ||
set(definition, ['options', 'linkBrackets'], false); | ||
} | ||
} | ||
} | ||
@@ -301,0 +324,0 @@ |
@@ -12,3 +12,3 @@ | ||
* | ||
* For more informations see the [he](https://github.com/mathiasbynens/he) module. | ||
* For more information see the [he](https://github.com/mathiasbynens/he) module. | ||
* | ||
@@ -73,9 +73,9 @@ * @property { object.< string, FormatCallback > } [formatters = {}] | ||
* | ||
* For more informations see the [he](https://github.com/mathiasbynens/he) module. | ||
* For more information see the [he](https://github.com/mathiasbynens/he) module. | ||
* | ||
* @property { boolean } [isAttributeValue = false] | ||
* TLDR: If set to `true` - leave attribute values raw, don't parse them as text content. | ||
* TL;DR: If set to `true` - leave attribute values raw, don't parse them as text content. | ||
* | ||
* @property { boolean } [strict = false] | ||
* TLDR: If set to `true` - throw an error on invalid HTML input. | ||
* TL;DR: If set to `true` - throw an error on invalid HTML input. | ||
*/ | ||
@@ -180,8 +180,10 @@ | ||
* | ||
* @property { [string, string] | false } [linkBrackets] | ||
* (Only for: `anchor` and `image` formatters.) Surround links with these brackets. Default: `['[', ']']`. | ||
* | ||
* Set to `false` or `['', '']` to disable. | ||
* | ||
* @property { boolean } [noAnchorUrl = true] | ||
* (Only for: `anchor` formatter.) Ignore anchor links (where `href='#...'`). | ||
* | ||
* @property { boolean } [noLinkBrackets = false] | ||
* (Only for: `anchor` formatter.) Don't print brackets around links. | ||
* | ||
* @property { string } [itemPrefix = ' * '] | ||
@@ -188,0 +190,0 @@ * (Only for: `unorderedList` formatter.) String prefix for each list item. |
@@ -36,6 +36,7 @@ | ||
this.allWhitespaceOrEmptyRe = new RegExp(`^[${whitespaceCodes}]*$`); | ||
this.newlineOrNonWhitespaceRe = new RegExp(`(\\n|[^\\n${whitespaceCodes}])`, 'g'); | ||
if (options.preserveNewlines) { | ||
const wordOrNewlineRe = new RegExp(`\n|[^\n${whitespaceCodes}]+`, 'gm'); | ||
const wordOrNewlineRe = new RegExp(`\\n|[^\\n${whitespaceCodes}]+`, 'gm'); | ||
@@ -132,4 +133,26 @@ /** | ||
/** | ||
* Return the number of newlines if there are no words. | ||
* | ||
* If any word is found then return zero regardless of the actual number of newlines. | ||
* | ||
* @param { string } text Input string. | ||
* @returns { number } | ||
*/ | ||
countNewlinesNoWords (text) { | ||
this.newlineOrNonWhitespaceRe.lastIndex = 0; | ||
let counter = 0; | ||
let match; | ||
while ((match = this.newlineOrNonWhitespaceRe.exec(text)) !== null) { | ||
if (match[0] === '\n') { | ||
counter++; | ||
} else { | ||
return 0; | ||
} | ||
} | ||
return counter; | ||
} | ||
} | ||
module.exports = { WhitespaceProcessor: WhitespaceProcessor }; |
{ | ||
"name": "html-to-text", | ||
"version": "8.0.0", | ||
"version": "8.1.0", | ||
"description": "Advanced html to plain text converter", | ||
@@ -34,3 +34,3 @@ "license": "MIT", | ||
"scripts": { | ||
"cover": "nyc --reporter=lcov --reporter=text-summary npm test", | ||
"cover": "c8 --reporter=lcov --reporter=text-summary mocha -t 20000", | ||
"example": "node ./example/html-to-text.js", | ||
@@ -50,11 +50,11 @@ "lint": "eslint .", | ||
"devDependencies": { | ||
"c8": "^7.10.0", | ||
"chai": "^4.3.4", | ||
"eslint": "^7.28.0", | ||
"eslint": "^7.32.0", | ||
"eslint-plugin-filenames": "^1.3.2", | ||
"eslint-plugin-import": "^2.23.4", | ||
"eslint-plugin-import": "^2.24.2", | ||
"eslint-plugin-jsdoc": "^33.3.0", | ||
"eslint-plugin-mocha": "^8.1.0", | ||
"mocha": "^8.4.0", | ||
"nyc": "^15.1.0" | ||
"eslint-plugin-mocha": "^8.2.0", | ||
"mocha": "^8.4.0" | ||
} | ||
} |
@@ -81,5 +81,5 @@ # html-to-text | ||
`baseElements.selectors` | `['body']` | Elements matching any of provided selectors will be processed and included in the output text, with all inner content.<br/>Refer to [Supported selectors](#supported-selectors) section below. | ||
`baseElements.orderBy` | `selectors` | `'selectors'` - arrange base elements in the same order as `baseElements.selectors` array;<br/>`'occurrence'` - arrange base elements in the order they are found in the input document. | ||
`baseElements.orderBy` | `'selectors'` | `'selectors'` - arrange base elements in the same order as `baseElements.selectors` array;<br/>`'occurrence'` - arrange base elements in the order they are found in the input document. | ||
`baseElements.returnDomByDefault` | `true` | Convert the entire document if none of provided selectors match. | ||
`decodeOptions` | `{ isAttributeValue: false, strict: false }` | Text decoding options given to `he.decode`. For more informations see the [he](https://github.com/mathiasbynens/he) module. | ||
`decodeOptions` | `{ isAttributeValue: false, strict: false }` | Text decoding options given to `he.decode`. For more information see the [he](https://github.com/mathiasbynens/he) module. | ||
`formatters` | `{}` | An object with custom formatting functions for specific elements (see [Override formatting](#override-formatting) section below). | ||
@@ -111,3 +111,3 @@ `limits` | | Describes how to limit the output text in case of large HTML documents. | ||
`noAnchorUrl` | 6.0 | *9.0* | `selectors: [ { selector: 'a', options: { noAnchorUrl: true } } ]` | ||
`noLinkBrackets` | 6.0 | *9.0* | `selectors: [ { selector: 'a', options: { noLinkBrackets: true } } ]` | ||
`noLinkBrackets` | 6.0 | *9.0* | `selectors: [ { selector: 'a', options: { linkBrackets: false } } ]` | ||
`returnDomByDefault` | 8.0 | | `baseElements: { returnDomByDefault: true }` | ||
@@ -171,3 +171,3 @@ `singleNewLineParagraphs` | 6.0 | *9.0* | `selectors: [`<br/>`{ selector: 'p', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } },`<br/>`{ selector: 'pre', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } }`<br/>`]` | ||
Following selectors have a formatter specified as a part of the default configuration. Everything can be overriden, but you don't have to repeat the `format` or options that you don't want to override. (But keep in mind this is only true for the same selector. There is no connection between different selectors.) | ||
Following selectors have a formatter specified as a part of the default configuration. Everything can be overridden, but you don't have to repeat the `format` or options that you don't want to override. (But keep in mind this is only true for the same selector. There is no connection between different selectors.) | ||
@@ -217,6 +217,6 @@ Selector | Default format | Notes | ||
`baseUrl` | `null` | `anchor`, `image` | Server host for link `href` attributes and image `src` attributes relative to the root (the ones that start with `/`).<br/>For example, with `baseUrl = 'http://asdf.com'` and `<a href='/dir/subdir'>...</a>` the link in the text will be `http://asdf.com/dir/subdir`.<br/>Keep in mind that `baseUrl` should not end with a `/`. | ||
`linkBrackets` | `['[', ']']` | `anchor`, `image` | Surround links with these brackets.<br/>Set to `false` or `['', '']` to disable. | ||
`hideLinkHrefIfSameAsText` | `false` | `anchor` | By default links are translated in the following way:<br/>`<a href='link'>text</a>` => becomes => `text [link]`.<br/>If this option is set to `true` and `link` and `text` are the same, `[link]` will be omitted and only `text` will be present. | ||
`ignoreHref` | `false` | `anchor` | Ignore all links. Only process internal text of anchor tags. | ||
`noAnchorUrl` | `true` | `anchor` | Ignore anchor links (where `href='#...'`). | ||
`noLinkBrackets` | `false` | `anchor` | Don't print brackets around links. | ||
`itemPrefix` | `' * '` | `unorderedList` | String prefix for each list item. | ||
@@ -231,2 +231,8 @@ `uppercase` | `true` | `heading` | By default, headings (`<h1>`, `<h2>`, etc) are uppercased.<br/>Set this to `false` to leave headings as they are. | ||
##### Deprecated format options | ||
Old option | Applies to | Depr. | Rem. | Instead use | ||
------------------- | ------------------ | ----- | ---- | --------------------- | ||
`noLinkBrackets` | `anchor` | 8.1 | | `linkBrackets: false` | ||
### Override formatting | ||
@@ -233,0 +239,0 @@ |
@@ -185,2 +185,36 @@ const fs = require('fs'); | ||
it('should produce equal results regardless of newline position between blocks', function () { | ||
const newlineOutside = '<p>A</p>\n<p>B</p>'; | ||
const newlineInside = '<p>A</p><p>\nB</p>'; | ||
const r1 = convert(newlineOutside, { preserveNewlines: true }); | ||
const r2 = convert(newlineInside, { preserveNewlines: true }); | ||
expect(r1).to.equal(r2); | ||
}); | ||
it('should produce equal results for preserved newlines and BR tags', function () { | ||
const nlHtml = '<p>A</p>\n<p>B</p><p>\nC</p>'; | ||
const brHtml = '<p>A</p><br/><p>B</p><p><br/>C</p>'; | ||
const nlResult = convert(nlHtml, { preserveNewlines: true }); | ||
const brResult = convert(brHtml); | ||
expect(nlResult).to.equal(brResult); | ||
}); | ||
it('should account for trailing/leading linebreaks of adjacent blocks equally', function () { | ||
const html = '<p>A</p>\n<div>B</div>\n<div>C</div>\n<p>D</p>'; | ||
const newlineInside = 'A\n\n\nB\n\nC\n\n\nD'; | ||
expect(convert(html, { preserveNewlines: true })).to.equal(newlineInside); | ||
}); | ||
it('should work with multiple linebreaks and in presence of whitespaces', function () { | ||
const html = '<p>A</p> \n \n <p>B</p>'; | ||
const newlineInside = 'A\n\n\n\nB'; | ||
expect(convert(html, { preserveNewlines: true })).to.equal(newlineInside); | ||
}); | ||
it('should have no special behavior in presence of words among linebreaks', function () { | ||
const html = '<p>A</p> \n B \n <p>C</p>'; | ||
const newlineInside = 'A\n\n\nB\n\n\nC'; | ||
expect(convert(html, { preserveNewlines: true })).to.equal(newlineInside); | ||
}); | ||
}); | ||
@@ -187,0 +221,0 @@ |
@@ -202,2 +202,35 @@ | ||
it('should return image link without brackets if linkBrackets is set to false', function () { | ||
const html = '<img src="test.png" alt="Awesome">'; | ||
const expected = 'Awesome test.png'; | ||
const options = { | ||
selectors: [ | ||
{ selector: 'img', options: { linkBrackets: false } } | ||
] | ||
}; | ||
expect(htmlToText(html, options)).to.equal(expected); | ||
}); | ||
it('should return image link without brackets if linkBrackets is set to ["", ""]', function () { | ||
const html = '<img src="test.png" alt="Awesome">'; | ||
const expected = 'Awesome test.png'; | ||
const options = { | ||
selectors: [ | ||
{ selector: 'img', options: { linkBrackets: ['', ''] } } | ||
] | ||
}; | ||
expect(htmlToText(html, options)).to.equal(expected); | ||
}); | ||
it('should return image link with custom brackets', function () { | ||
const html = '<img src="test.png" alt="Awesome">'; | ||
const expected = 'Awesome ===> test.png <==='; | ||
const options = { | ||
selectors: [ | ||
{ selector: 'img', options: { linkBrackets: ['===> ', ' <==='] } } | ||
] | ||
}; | ||
expect(htmlToText(html, options)).to.equal(expected); | ||
}); | ||
}); | ||
@@ -242,3 +275,3 @@ | ||
it('should return link without brackets if noLinkBrackets is set to true', function () { | ||
it('should return link without brackets if linkBrackets is set to false', function () { | ||
const html = '<a href="http://my.link">test</a>'; | ||
@@ -248,3 +281,3 @@ const expected = 'test http://my.link'; | ||
selectors: [ | ||
{ selector: 'a', options: { noLinkBrackets: true } } | ||
{ selector: 'a', options: { linkBrackets: false } } | ||
] | ||
@@ -255,2 +288,24 @@ }; | ||
it('should return link without brackets if linkBrackets is set to ["", ""]', function () { | ||
const html = '<a href="http://my.link">test</a>'; | ||
const expected = 'test http://my.link'; | ||
const options = { | ||
selectors: [ | ||
{ selector: 'a', options: { linkBrackets: ['', ''] } } | ||
] | ||
}; | ||
expect(htmlToText(html, options)).to.equal(expected); | ||
}); | ||
it('should return link with custom brackets', function () { | ||
const html = '<a href="http://my.link">test</a>'; | ||
const expected = 'test ===> http://my.link <==='; | ||
const options = { | ||
selectors: [ | ||
{ selector: 'a', options: { linkBrackets: ['===> ', ' <==='] } } | ||
] | ||
}; | ||
expect(htmlToText(html, options)).to.equal(expected); | ||
}); | ||
it('should not return link for anchor if noAnchorUrl is set to true', function () { | ||
@@ -257,0 +312,0 @@ const html = '<a href="#link">test</a>'; |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
197266
39
3656
312