Huge News!Announcing our $40M Series B led by Abstract Ventures.Learn More
Socket
Sign inDemoInstall
Socket

clean-html

Package Overview
Dependencies
Maintainers
1
Versions
30
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

clean-html - npm Package Compare versions

Comparing version 1.1.3 to 1.2.0

.npmignore

8

demo.js

@@ -6,3 +6,3 @@ var cleaner = require('./index.js'),

if (!file) {
return process.stderr.write('no file specified\n');
throw 'no file specified\n';
}

@@ -12,6 +12,8 @@

if (err) {
return process.stderr.write(err);
throw err;
}
process.stdout.write(cleaner.clean(data) + '\n');
cleaner.clean(data, function (html) {
console.log(html);
});
});

@@ -1,2 +0,3 @@

var options = {};
var htmlparser = require('htmlparser2'),
options = {};

@@ -7,12 +8,16 @@ function setup(opt) {

'align',
'valign',
'bgcolor',
'border',
'cellpadding',
'cellspacing',
'color',
'width',
'disabled',
'height',
'border',
'cellpadding',
'cellspacing'
'target',
'valign',
'width'
],
'block-tags': [
'blockquote',
'div',
'h1',

@@ -24,12 +29,10 @@ 'h2',

'h6',
'div',
'hr',
'p',
'table',
'tr',
'td',
'blockquote',
'hr'
'tr'
],
'break-around-comments': true,
'break-after-br': true,
'close-empty-tags': false,
'empty-tags': [

@@ -40,8 +43,7 @@ 'br',

],
'fix-end-tags': true,
'indent': ' ',
'pretty': true,
'remove-comments': false,
'remove-empty-paras': false,
'tags-to-remove': [
'center',
'font'

@@ -57,8 +59,6 @@ ]

options['block-tags'] = opt['block-tags'] || options['block-tags'];
options['break-around-comments'] = opt['break-around-comments'] === false ? false : true;
options['break-after-br'] = opt['break-after-br'] === false ? false : true;
options['close-empty-tags'] = opt['close-empty-tags'] === true ? true : false;
options['empty-tags'] = opt['empty-tags'] || options['empty-tags'];
options['fix-end-tags'] = opt['fix-end-tags'] === false ? false : true;
options['indent'] = opt['indent'] || options['indent'];
options['pretty'] = opt['pretty'] === false ? false : true;
options['remove-comments'] = opt['remove-comments'] === true ? true : false;

@@ -85,105 +85,94 @@ options['remove-empty-paras'] = opt['remove-empty-paras'] === true ? true : false;

function replaceWhiteSpace(html) {
return html.replace(/\s/g, ' ');
}
function isEmpty(node) {
if (node.type == 'text' || node.type == 'comment') {
return !node.data.trim();
}
function removeExtraSpaces(html) {
return html.replace(/ {2,}/g, ' ');
return !node.children.length || node.children.every(isEmpty);
}
function closeEmptyTag(tag) {
return tag.replace(/ ?\/?>/, '/>');
function renderText(node) {
return node.data.replace(/\s+/g, ' ');
}
function removeTrailingSlash(tag) {
return tag.replace(/ ?\/>/, '>');
}
function renderComment(node) {
if (options['remove-comments']) {
return '';
}
function cleanAttributes(tag) {
return tag.replace(/ (\w+)=['"].+?['"]/g, function (attribute, attributeName) {
if (options['attr-to-remove'].indexOf(attributeName) > -1) {
return '';
}
if (options['break-around-comments']) {
return '\n' + '<!--' + node.data + '-->' + '\n';
}
return attribute;
});
return '<!--' + node.data + '-->';
}
function cleanTags(html) {
var openTags = [];
function renderTag(node) {
if (options['remove-empty-paras'] && node.name == 'p' && isEmpty(node)) {
return '';
}
html = html.replace(/<\/?(\w+).*?>/g, function (tag, tagName) {
tag = tag.toLowerCase();
tagName = tagName.toLowerCase();
if (options['tags-to-remove'].indexOf(tagName) > -1) {
if (options['tags-to-remove'].indexOf(node.name) > -1) {
if (!node.children.length) {
return '';
}
if (options['empty-tags'].indexOf(tagName) > -1) {
if (options['close-empty-tags']) {
tag = closeEmptyTag(tag);
} else {
tag = removeTrailingSlash(tag);
}
return render(node.children);
}
return cleanAttributes(tag);
}
var openTag = '<' + node.name,
closeTag;
if (tag.indexOf('</') == -1) {
// open tag
openTags.unshift(tagName);
return cleanAttributes(tag);
for (var attrib in node.attribs) {
if (options['attr-to-remove'].indexOf(attrib) == -1) {
openTag += ' ' + attrib + '="' + node.attribs[attrib] + '"';
}
}
if (openTags[0] == tagName) {
// close tag
openTags.shift();
openTag += '>';
return tag;
if (options['empty-tags'].indexOf(node.name) > -1) {
if (options['break-after-br'] && node.name == 'br') {
return openTag + '\n';
}
var openTagIndex = openTags.indexOf(tagName);
return openTag;
}
if (openTagIndex > -1) {
// tags are out of order - close previous tags, then close this tag
return '</' + openTags.splice(0, openTagIndex + 1).join('></') + '>';
}
closeTag = '</' + node.name + '>';
// tag was never opened or was already closed - discard
return '';
});
if (options['block-tags'].indexOf(node.name) > -1) {
openTag = '\n' + openTag + '\n';
closeTag = '\n' + closeTag + '\n';
}
if (openTags.length) {
// append remaining tags
html += '</' + openTags.join('></') + '>';
if (!node.children.length) {
return openTag + closeTag;
}
return html;
return openTag + render(node.children) + closeTag;
}
function removeComments(html) {
return html.replace(/<!--.*?-->/g, '');
}
function render(nodes) {
var html = '';
function removeEmptyParagraphs(html) {
return html.replace(/<p( \w+=['"].+?['"])?>\s*<\/p>/g, '');
}
nodes.forEach(function (node) {
if (node.type == 'root') {
html += render(node.children);
return;
}
function addLineBreaks(html) {
return html.replace(/<\/?(\w+).*?>/g, function (tag, tagName) {
if (options['block-tags'].indexOf(tagName) > -1) {
return '\n' + tag + '\n';
if (node.type == 'text') {
html += renderText(node);
return;
}
if (tagName == 'br' && options['break-after-br']) {
return tag + '\n';
if (node.type == 'comment') {
html += renderComment(node);
return;
}
return tag;
html += renderTag(node);
});
}
function removeBlankLines(html) {
return html.replace(/\s{2,}/g, '\n');

@@ -231,24 +220,24 @@ }

function clean(html, opt) {
function clean(html, opt, callback) {
if (typeof opt == 'function') {
callback = opt;
opt = null;
}
setup(opt);
html = replaceWhiteSpace(html);
html = removeExtraSpaces(html);
html = cleanTags(html);
var handler = new htmlparser.DomHandler(function (err, dom) {
if (err) {
throw err;
}
if (options['remove-comments']) {
html = removeComments(html);
}
var html = render(dom);
html = indent(html).trim();
if (options['remove-empty-paras']) {
html = removeEmptyParagraphs(html);
}
callback(html);
});
if (options['pretty']) {
html = addLineBreaks(html);
html = removeBlankLines(html);
html = indent(html);
}
return html.trim();
var parser = new htmlparser.Parser(handler);
parser.write(html);
parser.done();
}

@@ -255,0 +244,0 @@

{
"name": "clean-html",
"version": "1.1.3",
"version": "1.2.0",
"description": "HTML cleaner and beautifier",
"main": "index.js",
"dependencies": {},
"dependencies": {
"htmlparser2": "3.8.2"
},
"devDependencies": {},

@@ -8,0 +10,0 @@ "scripts": {

@@ -38,3 +38,5 @@ ## HTML cleaner and beautifier

fs.readFile(file, 'utf-8', function (err, data) {
process.stdout.write(cleaner.clean(data) + '\n');
cleaner.clean(data, function (html) {
console.log(html);
});
});

@@ -66,28 +68,2 @@ ```

If you like, you can even close the empty tags, lose the comments and get rid of that nasty presentational markup:
```javascript
var options = {
'close-empty-tags': true,
'remove-comments': true,
'add-tags-to-remove': ['table', 'tr', 'td', 'blockquote']
};
process.stdout.write(cleaner.clean(data, options) + '\n');
```
Voila!
```html
<b>Currently we have these articles available:</b>
<p>
<a href="foo.html">The History of Foo</a><br/>
An <span>informative</span> piece of information.
</p>
<p>
<a href="bar.html">A Horse Walked Into a Bar</a><br/>
The bartender said "Why the long face?"
</p>
```
## Options

@@ -100,14 +76,14 @@

Type: Array
Default: `['align', 'valign', 'bgcolor', 'color', 'width', 'height', 'border', 'cellpadding', 'cellspacing']`
Default: `['align', 'bgcolor', 'border', 'cellpadding', 'cellspacing', 'color', 'disabled', 'height', 'target', 'valign', 'width']`
### block-tags
Block level element tags. Line breaks are added before and after, and nested content is indented. Note: this option has no effect unless pretty is set to true.
Block level element tags. Line breaks are added before and after, and nested content is indented.
Type: Array
Default: `['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'p', 'table', 'tr', 'td', 'blockquote', 'hr']`
Default: `['blockquote', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'p', 'table', 'td', 'tr']`
### break-after-br
### break-around-comments
Adds line breaks after br tags. Note: this option has no effect unless pretty is set to true.
Adds line breaks before and after comments.

@@ -117,8 +93,8 @@ Type: Boolean

### close-empty-tags
### break-after-br
If set to true, adds trailing slashes to empty tags. Otherwise removes trailing slashes.
Adds line breaks after br tags.
Type: Boolean
Default: `false`
Default: `true`

@@ -132,31 +108,5 @@ ### empty-tags

### fix-end-tags
Adds end tags where they are missing. For example, this:
```html
<blockquote>Now Scotch is a real drink for a man.
```
becomes this:
```html
<blockquote>Now Scotch is a real drink for a man.</blockquote>
```
Also fixes end tags that are closed in the wrong order.
```html
You <b>belong in the <i>circus</b></i>, Spock, not a starship.
```
becomes this:
```html
You <b>belong in the <i>circus</i></b>, Spock, not a starship.
```
### indent
The string to use for indentation. e.g., a tab character or one or more spaces. Note: this option has no effect unless pretty is set to true.
The string to use for indentation. e.g., a tab character or one or more spaces.

@@ -166,9 +116,2 @@ Type: String

### pretty
Pretty prints the output by adding line breaks and indentation.
Type: Boolean
Default: `true`
### remove-comments

@@ -193,3 +136,3 @@

Type: Array
Default: `['font']`
Default: `['center', 'font']`

@@ -196,0 +139,0 @@ ## Adding values to option lists

@@ -5,58 +5,77 @@ var assert = require('assert'),

// test that text is unchanged
assert.equal(cleaner.clean('Foo Bar'), 'Foo Bar');
cleaner.clean('Foo Bar', function (html) {
assert.equal(html, 'Foo Bar');
});
// test that extra whitespace is removed
assert.equal(cleaner.clean('Foo Bar'), 'Foo Bar');
assert.equal(cleaner.clean('Foo\nBar'), 'Foo Bar');
cleaner.clean('Foo Bar', function (html) {
assert.equal(html, 'Foo Bar');
});
cleaner.clean('Foo\nBar', function (html) {
assert.equal(html, 'Foo Bar');
});
// test that uppercase tags and attributes are lowercased
assert.equal(cleaner.clean('<FOO BAR="QUX">Bam</FOO>'), '<foo bar="qux">Bam</foo>');
// test that comments are removed
cleaner.clean('<!-- foo -->', function (html) {
assert.equal(html, '<!-- foo -->');
});
cleaner.clean('<!-- foo -->', {'remove-comments': true}, function (html) {
assert.equal(html, '');
});
// test that lines breaks are added before and after comments
cleaner.clean('foo<!-- bar -->qux', function (html) {
assert.equal(html, 'foo\n<!-- bar -->\nqux');
});
cleaner.clean('foo<!-- bar -->qux', {'break-around-comments': false}, function (html) {
assert.equal(html, 'foo<!-- bar -->qux');
});
// test that empty paragraph tags are removed
cleaner.clean('<p>\n</p>', function (html) {
assert.equal(html, '<p>\n</p>');
});
cleaner.clean('<p>\n</p>', {'remove-empty-paras': true}, function (html) {
assert.equal(html, '');
});
// test that deprecated tags are removed
assert.equal(cleaner.clean('foo <font="arial">bar</font>'), 'foo bar');
cleaner.clean('<font face="arial">foo</font>', function (html) {
assert.equal(html, 'foo');
});
// test that trailing slash is removed from empty element tag
assert.equal(cleaner.clean('<br />'), '<br>');
assert.equal(cleaner.clean('<br/>'), '<br>');
assert.equal(cleaner.clean('<br>'), '<br>');
assert.equal(cleaner.clean('<br />', {'close-empty-tags': true}), '<br/>');
assert.equal(cleaner.clean('<br/>', {'close-empty-tags': true}), '<br/>');
assert.equal(cleaner.clean('<br>', {'close-empty-tags': true}), '<br/>');
// test that legacy attributes are removed
assert.equal(cleaner.clean('<foo color="red">bar</foo>'), '<foo>bar</foo>');
cleaner.clean('<span color="red">foo</span>', function (html) {
assert.equal(html, '<span>foo</span>');
});
// test that missing end tags are added
assert.equal(cleaner.clean('<quote>Now Scotch is a real drink for a man.'), '<quote>Now Scotch is a real drink for a man.</quote>');
// test that line break is added after br tag
cleaner.clean('foo<br>bar', function (html) {
assert.equal(html, 'foo<br>\nbar');
});
cleaner.clean('foo<br>bar', {'break-after-br': false}, function (html) {
assert.equal(html, 'foo<br>bar');
});
// test that end tags are closed in the right order
assert.equal(cleaner.clean('You <b>belong in the <i>circus</b></i>, Spock, not a starship.'), 'You <b>belong in the <i>circus</i></b>, Spock, not a starship.');
// test that comments are removed
assert.equal(cleaner.clean('foo<!-- bar -->'), 'foo<!-- bar -->');
assert.equal(cleaner.clean('foo<!-- bar -->', {'remove-comments': true}), 'foo');
// test that empty paragraph tags are removed
assert.equal(cleaner.clean('<p></p>', {'remove-empty-paras': true}), '');
assert.equal(cleaner.clean('<p>\n</p>', {'remove-empty-paras': true}), '');
assert.equal(cleaner.clean('<p foo="bar"></p>', {'remove-empty-paras': true}), '');
// test that line breaks are added before and after block element tags
assert.equal(cleaner.clean('foo<div></div>foo'), 'foo\n<div>\n</div>\nfoo');
assert.equal(cleaner.clean('foo<div></div>foo', {'pretty': false}), 'foo<div></div>foo');
cleaner.clean('foo<div></div>bar', function (html) {
assert.equal(html, 'foo\n<div>\n</div>\nbar');
});
// test that line break is added after br element tag
assert.equal(cleaner.clean('foo<br>foo'), 'foo<br>\nfoo');
assert.equal(cleaner.clean('foo<br>foo', {'break-after-br': false}), 'foo<br>foo');
assert.equal(cleaner.clean('foo<br>foo', {'pretty': false}), 'foo<br>foo');
// test that nested tags are indented after block element tags
assert.equal(cleaner.clean('<div>bar</div>'), '<div>\n bar\n</div>');
assert.equal(cleaner.clean('<div><div>bar</div></div>'), '<div>\n <div>\n bar\n </div>\n</div>');
assert.equal(cleaner.clean('<div>bar</div>', {'indent': ' '}), '<div>\n bar\n</div>');
assert.equal(cleaner.clean('<div>bar</div>', {'pretty': false}), '<div>bar</div>');
cleaner.clean('<div>foo</div>', function (html) {
assert.equal(html, '<div>\n foo\n</div>');
});
cleaner.clean('<div><div>foo</div></div>', function (html) {
assert.equal(html, '<div>\n <div>\n foo\n </div>\n</div>');
});
cleaner.clean('<div>foo</div>', {'indent': ' '}, function (html) {
assert.equal(html, '<div>\n foo\n</div>');
});
// test that output is trimmed
assert.equal(cleaner.clean(' Foo\n'), 'Foo');
cleaner.clean(' foo\n', function (html) {
assert.equal(html, 'foo');
});
process.stdout.write('all tests passed\n');
console.log('all tests passed');
SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc