clean-html
Advanced tools
Comparing version 1.1.3 to 1.2.0
@@ -6,3 +6,3 @@ var cleaner = require('./index.js'), | ||
if (!file) { | ||
return process.stderr.write('no file specified\n'); | ||
throw 'no file specified\n'; | ||
} | ||
@@ -12,6 +12,8 @@ | ||
if (err) { | ||
return process.stderr.write(err); | ||
throw err; | ||
} | ||
process.stdout.write(cleaner.clean(data) + '\n'); | ||
cleaner.clean(data, function (html) { | ||
console.log(html); | ||
}); | ||
}); |
191
index.js
@@ -1,2 +0,3 @@ | ||
var options = {}; | ||
var htmlparser = require('htmlparser2'), | ||
options = {}; | ||
@@ -7,12 +8,16 @@ function setup(opt) { | ||
'align', | ||
'valign', | ||
'bgcolor', | ||
'border', | ||
'cellpadding', | ||
'cellspacing', | ||
'color', | ||
'width', | ||
'disabled', | ||
'height', | ||
'border', | ||
'cellpadding', | ||
'cellspacing' | ||
'target', | ||
'valign', | ||
'width' | ||
], | ||
'block-tags': [ | ||
'blockquote', | ||
'div', | ||
'h1', | ||
@@ -24,12 +29,10 @@ 'h2', | ||
'h6', | ||
'div', | ||
'hr', | ||
'p', | ||
'table', | ||
'tr', | ||
'td', | ||
'blockquote', | ||
'hr' | ||
'tr' | ||
], | ||
'break-around-comments': true, | ||
'break-after-br': true, | ||
'close-empty-tags': false, | ||
'empty-tags': [ | ||
@@ -40,8 +43,7 @@ 'br', | ||
], | ||
'fix-end-tags': true, | ||
'indent': ' ', | ||
'pretty': true, | ||
'remove-comments': false, | ||
'remove-empty-paras': false, | ||
'tags-to-remove': [ | ||
'center', | ||
'font' | ||
@@ -57,8 +59,6 @@ ] | ||
options['block-tags'] = opt['block-tags'] || options['block-tags']; | ||
options['break-around-comments'] = opt['break-around-comments'] === false ? false : true; | ||
options['break-after-br'] = opt['break-after-br'] === false ? false : true; | ||
options['close-empty-tags'] = opt['close-empty-tags'] === true ? true : false; | ||
options['empty-tags'] = opt['empty-tags'] || options['empty-tags']; | ||
options['fix-end-tags'] = opt['fix-end-tags'] === false ? false : true; | ||
options['indent'] = opt['indent'] || options['indent']; | ||
options['pretty'] = opt['pretty'] === false ? false : true; | ||
options['remove-comments'] = opt['remove-comments'] === true ? true : false; | ||
@@ -85,105 +85,94 @@ options['remove-empty-paras'] = opt['remove-empty-paras'] === true ? true : false; | ||
function replaceWhiteSpace(html) { | ||
return html.replace(/\s/g, ' '); | ||
} | ||
function isEmpty(node) { | ||
if (node.type == 'text' || node.type == 'comment') { | ||
return !node.data.trim(); | ||
} | ||
function removeExtraSpaces(html) { | ||
return html.replace(/ {2,}/g, ' '); | ||
return !node.children.length || node.children.every(isEmpty); | ||
} | ||
function closeEmptyTag(tag) { | ||
return tag.replace(/ ?\/?>/, '/>'); | ||
function renderText(node) { | ||
return node.data.replace(/\s+/g, ' '); | ||
} | ||
function removeTrailingSlash(tag) { | ||
return tag.replace(/ ?\/>/, '>'); | ||
} | ||
function renderComment(node) { | ||
if (options['remove-comments']) { | ||
return ''; | ||
} | ||
function cleanAttributes(tag) { | ||
return tag.replace(/ (\w+)=['"].+?['"]/g, function (attribute, attributeName) { | ||
if (options['attr-to-remove'].indexOf(attributeName) > -1) { | ||
return ''; | ||
} | ||
if (options['break-around-comments']) { | ||
return '\n' + '<!--' + node.data + '-->' + '\n'; | ||
} | ||
return attribute; | ||
}); | ||
return '<!--' + node.data + '-->'; | ||
} | ||
function cleanTags(html) { | ||
var openTags = []; | ||
function renderTag(node) { | ||
if (options['remove-empty-paras'] && node.name == 'p' && isEmpty(node)) { | ||
return ''; | ||
} | ||
html = html.replace(/<\/?(\w+).*?>/g, function (tag, tagName) { | ||
tag = tag.toLowerCase(); | ||
tagName = tagName.toLowerCase(); | ||
if (options['tags-to-remove'].indexOf(tagName) > -1) { | ||
if (options['tags-to-remove'].indexOf(node.name) > -1) { | ||
if (!node.children.length) { | ||
return ''; | ||
} | ||
if (options['empty-tags'].indexOf(tagName) > -1) { | ||
if (options['close-empty-tags']) { | ||
tag = closeEmptyTag(tag); | ||
} else { | ||
tag = removeTrailingSlash(tag); | ||
} | ||
return render(node.children); | ||
} | ||
return cleanAttributes(tag); | ||
} | ||
var openTag = '<' + node.name, | ||
closeTag; | ||
if (tag.indexOf('</') == -1) { | ||
// open tag | ||
openTags.unshift(tagName); | ||
return cleanAttributes(tag); | ||
for (var attrib in node.attribs) { | ||
if (options['attr-to-remove'].indexOf(attrib) == -1) { | ||
openTag += ' ' + attrib + '="' + node.attribs[attrib] + '"'; | ||
} | ||
} | ||
if (openTags[0] == tagName) { | ||
// close tag | ||
openTags.shift(); | ||
openTag += '>'; | ||
return tag; | ||
if (options['empty-tags'].indexOf(node.name) > -1) { | ||
if (options['break-after-br'] && node.name == 'br') { | ||
return openTag + '\n'; | ||
} | ||
var openTagIndex = openTags.indexOf(tagName); | ||
return openTag; | ||
} | ||
if (openTagIndex > -1) { | ||
// tags are out of order - close previous tags, then close this tag | ||
return '</' + openTags.splice(0, openTagIndex + 1).join('></') + '>'; | ||
} | ||
closeTag = '</' + node.name + '>'; | ||
// tag was never opened or was already closed - discard | ||
return ''; | ||
}); | ||
if (options['block-tags'].indexOf(node.name) > -1) { | ||
openTag = '\n' + openTag + '\n'; | ||
closeTag = '\n' + closeTag + '\n'; | ||
} | ||
if (openTags.length) { | ||
// append remaining tags | ||
html += '</' + openTags.join('></') + '>'; | ||
if (!node.children.length) { | ||
return openTag + closeTag; | ||
} | ||
return html; | ||
return openTag + render(node.children) + closeTag; | ||
} | ||
function removeComments(html) { | ||
return html.replace(/<!--.*?-->/g, ''); | ||
} | ||
function render(nodes) { | ||
var html = ''; | ||
function removeEmptyParagraphs(html) { | ||
return html.replace(/<p( \w+=['"].+?['"])?>\s*<\/p>/g, ''); | ||
} | ||
nodes.forEach(function (node) { | ||
if (node.type == 'root') { | ||
html += render(node.children); | ||
return; | ||
} | ||
function addLineBreaks(html) { | ||
return html.replace(/<\/?(\w+).*?>/g, function (tag, tagName) { | ||
if (options['block-tags'].indexOf(tagName) > -1) { | ||
return '\n' + tag + '\n'; | ||
if (node.type == 'text') { | ||
html += renderText(node); | ||
return; | ||
} | ||
if (tagName == 'br' && options['break-after-br']) { | ||
return tag + '\n'; | ||
if (node.type == 'comment') { | ||
html += renderComment(node); | ||
return; | ||
} | ||
return tag; | ||
html += renderTag(node); | ||
}); | ||
} | ||
function removeBlankLines(html) { | ||
return html.replace(/\s{2,}/g, '\n'); | ||
@@ -231,24 +220,24 @@ } | ||
function clean(html, opt) { | ||
function clean(html, opt, callback) { | ||
if (typeof opt == 'function') { | ||
callback = opt; | ||
opt = null; | ||
} | ||
setup(opt); | ||
html = replaceWhiteSpace(html); | ||
html = removeExtraSpaces(html); | ||
html = cleanTags(html); | ||
var handler = new htmlparser.DomHandler(function (err, dom) { | ||
if (err) { | ||
throw err; | ||
} | ||
if (options['remove-comments']) { | ||
html = removeComments(html); | ||
} | ||
var html = render(dom); | ||
html = indent(html).trim(); | ||
if (options['remove-empty-paras']) { | ||
html = removeEmptyParagraphs(html); | ||
} | ||
callback(html); | ||
}); | ||
if (options['pretty']) { | ||
html = addLineBreaks(html); | ||
html = removeBlankLines(html); | ||
html = indent(html); | ||
} | ||
return html.trim(); | ||
var parser = new htmlparser.Parser(handler); | ||
parser.write(html); | ||
parser.done(); | ||
} | ||
@@ -255,0 +244,0 @@ |
{ | ||
"name": "clean-html", | ||
"version": "1.1.3", | ||
"version": "1.2.0", | ||
"description": "HTML cleaner and beautifier", | ||
"main": "index.js", | ||
"dependencies": {}, | ||
"dependencies": { | ||
"htmlparser2": "3.8.2" | ||
}, | ||
"devDependencies": {}, | ||
@@ -8,0 +10,0 @@ "scripts": { |
@@ -38,3 +38,5 @@ ## HTML cleaner and beautifier | ||
fs.readFile(file, 'utf-8', function (err, data) { | ||
process.stdout.write(cleaner.clean(data) + '\n'); | ||
cleaner.clean(data, function (html) { | ||
console.log(html); | ||
}); | ||
}); | ||
@@ -66,28 +68,2 @@ ``` | ||
If you like, you can even close the empty tags, lose the comments and get rid of that nasty presentational markup: | ||
```javascript | ||
var options = { | ||
'close-empty-tags': true, | ||
'remove-comments': true, | ||
'add-tags-to-remove': ['table', 'tr', 'td', 'blockquote'] | ||
}; | ||
process.stdout.write(cleaner.clean(data, options) + '\n'); | ||
``` | ||
Voila! | ||
```html | ||
<b>Currently we have these articles available:</b> | ||
<p> | ||
<a href="foo.html">The History of Foo</a><br/> | ||
An <span>informative</span> piece of information. | ||
</p> | ||
<p> | ||
<a href="bar.html">A Horse Walked Into a Bar</a><br/> | ||
The bartender said "Why the long face?" | ||
</p> | ||
``` | ||
## Options | ||
@@ -100,14 +76,14 @@ | ||
Type: Array | ||
Default: `['align', 'valign', 'bgcolor', 'color', 'width', 'height', 'border', 'cellpadding', 'cellspacing']` | ||
Default: `['align', 'bgcolor', 'border', 'cellpadding', 'cellspacing', 'color', 'disabled', 'height', 'target', 'valign', 'width']` | ||
### block-tags | ||
Block level element tags. Line breaks are added before and after, and nested content is indented. Note: this option has no effect unless pretty is set to true. | ||
Block level element tags. Line breaks are added before and after, and nested content is indented. | ||
Type: Array | ||
Default: `['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'p', 'table', 'tr', 'td', 'blockquote', 'hr']` | ||
Default: `['blockquote', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'p', 'table', 'td', 'tr']` | ||
### break-after-br | ||
### break-around-comments | ||
Adds line breaks after br tags. Note: this option has no effect unless pretty is set to true. | ||
Adds line breaks before and after comments. | ||
@@ -117,8 +93,8 @@ Type: Boolean | ||
### close-empty-tags | ||
### break-after-br | ||
If set to true, adds trailing slashes to empty tags. Otherwise removes trailing slashes. | ||
Adds line breaks after br tags. | ||
Type: Boolean | ||
Default: `false` | ||
Default: `true` | ||
@@ -132,31 +108,5 @@ ### empty-tags | ||
### fix-end-tags | ||
Adds end tags where they are missing. For example, this: | ||
```html | ||
<blockquote>Now Scotch is a real drink for a man. | ||
``` | ||
becomes this: | ||
```html | ||
<blockquote>Now Scotch is a real drink for a man.</blockquote> | ||
``` | ||
Also fixes end tags that are closed in the wrong order. | ||
```html | ||
You <b>belong in the <i>circus</b></i>, Spock, not a starship. | ||
``` | ||
becomes this: | ||
```html | ||
You <b>belong in the <i>circus</i></b>, Spock, not a starship. | ||
``` | ||
### indent | ||
The string to use for indentation. e.g., a tab character or one or more spaces. Note: this option has no effect unless pretty is set to true. | ||
The string to use for indentation. e.g., a tab character or one or more spaces. | ||
@@ -166,9 +116,2 @@ Type: String | ||
### pretty | ||
Pretty prints the output by adding line breaks and indentation. | ||
Type: Boolean | ||
Default: `true` | ||
### remove-comments | ||
@@ -193,3 +136,3 @@ | ||
Type: Array | ||
Default: `['font']` | ||
Default: `['center', 'font']` | ||
@@ -196,0 +139,0 @@ ## Adding values to option lists |
103
test.js
@@ -5,58 +5,77 @@ var assert = require('assert'), | ||
// test that text is unchanged | ||
assert.equal(cleaner.clean('Foo Bar'), 'Foo Bar'); | ||
cleaner.clean('Foo Bar', function (html) { | ||
assert.equal(html, 'Foo Bar'); | ||
}); | ||
// test that extra whitespace is removed | ||
assert.equal(cleaner.clean('Foo Bar'), 'Foo Bar'); | ||
assert.equal(cleaner.clean('Foo\nBar'), 'Foo Bar'); | ||
cleaner.clean('Foo Bar', function (html) { | ||
assert.equal(html, 'Foo Bar'); | ||
}); | ||
cleaner.clean('Foo\nBar', function (html) { | ||
assert.equal(html, 'Foo Bar'); | ||
}); | ||
// test that uppercase tags and attributes are lowercased | ||
assert.equal(cleaner.clean('<FOO BAR="QUX">Bam</FOO>'), '<foo bar="qux">Bam</foo>'); | ||
// test that comments are removed | ||
cleaner.clean('<!-- foo -->', function (html) { | ||
assert.equal(html, '<!-- foo -->'); | ||
}); | ||
cleaner.clean('<!-- foo -->', {'remove-comments': true}, function (html) { | ||
assert.equal(html, ''); | ||
}); | ||
// test that lines breaks are added before and after comments | ||
cleaner.clean('foo<!-- bar -->qux', function (html) { | ||
assert.equal(html, 'foo\n<!-- bar -->\nqux'); | ||
}); | ||
cleaner.clean('foo<!-- bar -->qux', {'break-around-comments': false}, function (html) { | ||
assert.equal(html, 'foo<!-- bar -->qux'); | ||
}); | ||
// test that empty paragraph tags are removed | ||
cleaner.clean('<p>\n</p>', function (html) { | ||
assert.equal(html, '<p>\n</p>'); | ||
}); | ||
cleaner.clean('<p>\n</p>', {'remove-empty-paras': true}, function (html) { | ||
assert.equal(html, ''); | ||
}); | ||
// test that deprecated tags are removed | ||
assert.equal(cleaner.clean('foo <font="arial">bar</font>'), 'foo bar'); | ||
cleaner.clean('<font face="arial">foo</font>', function (html) { | ||
assert.equal(html, 'foo'); | ||
}); | ||
// test that trailing slash is removed from empty element tag | ||
assert.equal(cleaner.clean('<br />'), '<br>'); | ||
assert.equal(cleaner.clean('<br/>'), '<br>'); | ||
assert.equal(cleaner.clean('<br>'), '<br>'); | ||
assert.equal(cleaner.clean('<br />', {'close-empty-tags': true}), '<br/>'); | ||
assert.equal(cleaner.clean('<br/>', {'close-empty-tags': true}), '<br/>'); | ||
assert.equal(cleaner.clean('<br>', {'close-empty-tags': true}), '<br/>'); | ||
// test that legacy attributes are removed | ||
assert.equal(cleaner.clean('<foo color="red">bar</foo>'), '<foo>bar</foo>'); | ||
cleaner.clean('<span color="red">foo</span>', function (html) { | ||
assert.equal(html, '<span>foo</span>'); | ||
}); | ||
// test that missing end tags are added | ||
assert.equal(cleaner.clean('<quote>Now Scotch is a real drink for a man.'), '<quote>Now Scotch is a real drink for a man.</quote>'); | ||
// test that line break is added after br tag | ||
cleaner.clean('foo<br>bar', function (html) { | ||
assert.equal(html, 'foo<br>\nbar'); | ||
}); | ||
cleaner.clean('foo<br>bar', {'break-after-br': false}, function (html) { | ||
assert.equal(html, 'foo<br>bar'); | ||
}); | ||
// test that end tags are closed in the right order | ||
assert.equal(cleaner.clean('You <b>belong in the <i>circus</b></i>, Spock, not a starship.'), 'You <b>belong in the <i>circus</i></b>, Spock, not a starship.'); | ||
// test that comments are removed | ||
assert.equal(cleaner.clean('foo<!-- bar -->'), 'foo<!-- bar -->'); | ||
assert.equal(cleaner.clean('foo<!-- bar -->', {'remove-comments': true}), 'foo'); | ||
// test that empty paragraph tags are removed | ||
assert.equal(cleaner.clean('<p></p>', {'remove-empty-paras': true}), ''); | ||
assert.equal(cleaner.clean('<p>\n</p>', {'remove-empty-paras': true}), ''); | ||
assert.equal(cleaner.clean('<p foo="bar"></p>', {'remove-empty-paras': true}), ''); | ||
// test that line breaks are added before and after block element tags | ||
assert.equal(cleaner.clean('foo<div></div>foo'), 'foo\n<div>\n</div>\nfoo'); | ||
assert.equal(cleaner.clean('foo<div></div>foo', {'pretty': false}), 'foo<div></div>foo'); | ||
cleaner.clean('foo<div></div>bar', function (html) { | ||
assert.equal(html, 'foo\n<div>\n</div>\nbar'); | ||
}); | ||
// test that line break is added after br element tag | ||
assert.equal(cleaner.clean('foo<br>foo'), 'foo<br>\nfoo'); | ||
assert.equal(cleaner.clean('foo<br>foo', {'break-after-br': false}), 'foo<br>foo'); | ||
assert.equal(cleaner.clean('foo<br>foo', {'pretty': false}), 'foo<br>foo'); | ||
// test that nested tags are indented after block element tags | ||
assert.equal(cleaner.clean('<div>bar</div>'), '<div>\n bar\n</div>'); | ||
assert.equal(cleaner.clean('<div><div>bar</div></div>'), '<div>\n <div>\n bar\n </div>\n</div>'); | ||
assert.equal(cleaner.clean('<div>bar</div>', {'indent': ' '}), '<div>\n bar\n</div>'); | ||
assert.equal(cleaner.clean('<div>bar</div>', {'pretty': false}), '<div>bar</div>'); | ||
cleaner.clean('<div>foo</div>', function (html) { | ||
assert.equal(html, '<div>\n foo\n</div>'); | ||
}); | ||
cleaner.clean('<div><div>foo</div></div>', function (html) { | ||
assert.equal(html, '<div>\n <div>\n foo\n </div>\n</div>'); | ||
}); | ||
cleaner.clean('<div>foo</div>', {'indent': ' '}, function (html) { | ||
assert.equal(html, '<div>\n foo\n</div>'); | ||
}); | ||
// test that output is trimmed | ||
assert.equal(cleaner.clean(' Foo\n'), 'Foo'); | ||
cleaner.clean(' foo\n', function (html) { | ||
assert.equal(html, 'foo'); | ||
}); | ||
process.stdout.write('all tests passed\n'); | ||
console.log('all tests passed'); |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
6
275
12478
1
163
1
+ Added htmlparser2@3.8.2
+ Added core-util-is@1.0.3 (transitive)
+ Added dom-serializer@0.2.2 (transitive)
+ Added domelementtype@1.3.1, 2.3.0 (transitive)
+ Added domhandler@2.3.0 (transitive)
+ Added domutils@1.5.1 (transitive)
+ Added entities@1.0.0, 2.2.0 (transitive)
+ Added htmlparser2@3.8.2 (transitive)
+ Added inherits@2.0.4 (transitive)
+ Added isarray@0.0.1 (transitive)
+ Added readable-stream@1.1.14 (transitive)
+ Added string_decoder@0.10.31 (transitive)