New Case Study: See how Anthropic automated 95% of dependency reviews with Socket. Learn More
Socket
Sign in | Demo | Install
Socket

degausser

Package Overview
Dependencies
Maintainers
2
Versions
17
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

degausser - npm Package Compare versions

Comparing version 2.0.0 to 2.1.0

src/domWalker.js

718

dist/degausser.js

@@ -7,3 +7,114 @@ (function (global, factory) {

// Char codes for \t, \n, and non-&nbsp; space character
/**
 * Guards a transpiled class constructor against being invoked without `new`.
 *
 * @param {*} instance - The `this` value the constructor was called with.
 * @param {Function} Constructor - The class the instance must belong to.
 * @throws {TypeError} When `instance` is not an instance of `Constructor`.
 */
function _classCallCheck(instance, Constructor) {
  var calledWithNew = instance instanceof Constructor;
  if (calledWithNew) {
    return;
  }
  throw new TypeError("Cannot call a class as a function");
}
/**
 * Installs a list of property descriptors on `target`.
 *
 * Each descriptor is normalized in place before being defined: it is made
 * non-enumerable unless it already asked to be enumerable, always
 * configurable, and writable when it carries a `value`.
 *
 * @param {Object} target - Object receiving the properties.
 * @param {Array<Object>} props - Descriptors, each with a `key` naming the property.
 */
function _defineProperties(target, props) {
  var position = 0;
  while (position < props.length) {
    var entry = props[position];
    entry.enumerable = entry.enumerable || false;
    entry.configurable = true;
    if ("value" in entry) {
      // Only data descriptors may carry `writable`.
      entry.writable = true;
    }
    Object.defineProperty(target, entry.key, entry);
    position += 1;
  }
}
/**
 * Attaches instance and static members to a transpiled class constructor.
 *
 * @param {Function} Constructor - The class constructor being assembled.
 * @param {Array<Object>} [protoProps] - Descriptors for `Constructor.prototype`.
 * @param {Array<Object>} [staticProps] - Descriptors for `Constructor` itself.
 * @returns {Function} The same `Constructor`, for chaining.
 */
function _createClass(Constructor, protoProps, staticProps) {
  // Instance members live on the prototype.
  if (protoProps) {
    _defineProperties(Constructor.prototype, protoProps);
  }

  // Static members live on the constructor function.
  if (staticProps) {
    _defineProperties(Constructor, staticProps);
  }

  return Constructor;
}
/**
 * Best-effort conversion of a value without a usable iterator into an array.
 *
 * @param {*} o - Candidate value.
 * @param {number} [minLen] - Forwarded to `_arrayLikeToArray` as its length cap.
 * @returns {Array|undefined} An array copy for strings, Maps, Sets, `arguments`
 *   objects and the integer typed arrays matched below; `undefined` otherwise.
 */
function _unsupportedIterableToArray(o, minLen) {
  if (!o) {
    return undefined;
  }
  if (typeof o === "string") {
    return _arrayLikeToArray(o, minLen);
  }

  // "[object Xyz]" -> "Xyz"
  var tag = Object.prototype.toString.call(o).slice(8, -1);
  if (tag === "Object" && o.constructor) {
    // Plain-tagged objects are classified by their constructor's name instead.
    tag = o.constructor.name;
  }

  switch (tag) {
    case "Map":
    case "Set":
      return Array.from(o);
    case "Arguments":
      return _arrayLikeToArray(o, minLen);
  }

  if (/^(?:Ui|I)nt(?:8|16|32)(?:Clamped)?Array$/.test(tag)) {
    return _arrayLikeToArray(o, minLen);
  }
  return undefined;
}
/**
 * Copies the leading elements of an array-like value into a new array.
 *
 * @param {ArrayLike<*>} arr - Source with a `length` and indexed elements.
 * @param {number} [len] - Number of elements to copy; when nullish or larger
 *   than `arr.length`, the whole source is copied.
 * @returns {Array} A fresh array of the copied elements.
 */
function _arrayLikeToArray(arr, len) {
  var count = len == null || len > arr.length ? arr.length : len;
  var copy = new Array(count);
  var idx = 0;
  while (idx < count) {
    copy[idx] = arr[idx];
    idx += 1;
  }
  return copy;
}
/**
 * Builds the driver object a transpiled `for...of` loop runs against.
 *
 * The returned object exposes four callbacks:
 *   s() - start iteration
 *   n() - advance and return the next `{ done, value }` step
 *   e(err) - report an error thrown by the loop body
 *   f() - finish: close the iterator if needed and surface any reported error
 *
 * @param {*} o - The value being iterated.
 * @returns {{s: Function, n: Function, e: Function, f: Function}} Loop driver.
 * @throws {TypeError} When `o` has no iterator and cannot be treated as an array.
 */
function _createForOfIteratorHelper(o) {
  var hasIteratorProtocol = typeof Symbol !== "undefined" && o[Symbol.iterator] != null;

  if (hasIteratorProtocol) {
    var iterator;
    var finishedNormally = true;
    var sawError = false;
    var pendingError;

    return {
      s: function () {
        iterator = o[Symbol.iterator]();
      },
      n: function () {
        var result = iterator.next();
        // Stays false while the loop is mid-flight, so f() knows to close it.
        finishedNormally = result.done;
        return result;
      },
      e: function (error) {
        // Deferred: the error is rethrown from f() after cleanup.
        sawError = true;
        pendingError = error;
      },
      f: function () {
        try {
          if (!finishedNormally && iterator.return != null) {
            iterator.return();
          }
        } finally {
          if (sawError) {
            throw pendingError;
          }
        }
      }
    };
  }

  // No iterator available: fall back to index-based access.
  if (Array.isArray(o) || (o = _unsupportedIterableToArray(o))) {
    var cursor = 0;
    var noop = function () {};

    return {
      s: noop,
      n: function () {
        if (cursor >= o.length) {
          return {
            done: true
          };
        }
        return {
          done: false,
          value: o[cursor++]
        };
      },
      e: function (error) {
        throw error;
      },
      f: noop
    };
  }

  throw new TypeError("Invalid attempt to iterate non-iterable instance.\nIn order to be iterable, non-array objects must have a [Symbol.iterator]() method.");
}
/**
 * Binds every method declared directly on the receiver's prototype to the
 * receiver, storing the bound copies as own properties. Must be invoked with
 * an explicit receiver, e.g. `autoBind.call(this)` from a constructor.
 *
 * Accessor properties (getters/setters) are skipped without being read:
 * probing them with `typeof this[prop]` would run the getter during
 * construction, and assigning over a getter-only accessor throws in strict
 * mode. The prototype's own property names are always a real array, so a
 * plain indexed loop is sufficient here.
 */
function autoBind() {
  var proto = Object.getPrototypeOf(this);
  var names = Object.getOwnPropertyNames(proto);
  for (var i = 0; i < names.length; i++) {
    var prop = names[i];
    if (prop === 'constructor') continue;
    var descriptor = Object.getOwnPropertyDescriptor(proto, prop);
    // Only data properties can hold a plain method; leave accessors alone.
    if (!descriptor || !('value' in descriptor)) continue;
    // Read through `this` so an own non-function property that shadows a
    // prototype method is still left untouched.
    if (typeof this[prop] !== 'function') continue;
    this[prop] = this[prop].bind(this);
  }
} // Char codes for \t, \n, and non-&nbsp; space character
// 9 = tab (\t), 10 = line feed (\n), 13 = carriage return (\r), 32 = space.
var whitespaces = [9, 10, 13, 32];

@@ -15,2 +126,8 @@

// States for a collector's pending line break (`lastBreak`): nothing pending,
// one pending break, or a pending double break (requested for <p> blocks).
var BreakType = {
NONE: 'none',
SINGLE: 'single',
DOUBLE: 'double'
};
var trimBeginAndEnd = function trimBeginAndEnd(string) {

@@ -79,196 +196,525 @@ // Get the first and last non-whitespace character index

var degausser = function degausser(parentNode) {
// If there's no Node, just return
if (!parentNode) {
return null;
} // Tracking Entities
var StringCollector = /*#__PURE__*/function () {
function StringCollector() {
_classCallCheck(this, StringCollector);
this.runs = [];
this.text = [];
this.hasEncounteredFirstCell = false;
this.lastBreak = null;
autoBind.call(this);
}
var runs = [];
var text = [];
var haveEncounteredFirstCell = false;
var lastBreak = null;
var breakType = {
NONE: 'none',
SINGLE: 'single',
DOUBLE: 'double'
};
_createClass(StringCollector, [{
key: "addBreak",
value: function addBreak(_double) {
if (this.lastBreak === null) {
// The only time it should be null is at the beginning of document
return;
}
var addBreak = function addBreak(_double) {
if (lastBreak === null) {
// The only time it should be null is at the beginning of document
return;
if (_double) {
this.lastBreak = BreakType.DOUBLE;
} else if (this.lastBreak !== BreakType.DOUBLE) {
this.lastBreak = BreakType.SINGLE;
}
}
}, {
key: "processBreaks",
value: function processBreaks() {
if (!this.lastBreak) {
return;
}
if (_double) {
lastBreak = breakType.DOUBLE;
} else if (lastBreak !== breakType.DOUBLE) {
lastBreak = breakType.SINGLE;
switch (this.lastBreak) {
case BreakType.SINGLE:
this.runs.push('\n');
break;
case BreakType.DOUBLE:
this.runs.push('\n\n');
break;
}
this.lastBreak = BreakType.NONE;
}
};
}, {
key: "processText",
value: function processText() {
if (this.text.length === 0) {
return;
} // Trim
var processBreaks = function processBreaks() {
if (!lastBreak) {
return;
var trimmed = trimBeginAndEnd(this.text.join(''));
if (!trimmed) {
// Trimmed into an empty string
// Preserve all preceding breaks
this.text = [];
return;
}
if (this.lastBreak === null) {
this.lastBreak = BreakType.NONE;
}
this.runs.push(trimBeginAndEnd(collapseWhitespace(trimmed)));
this.text = [];
}
}, {
key: "processElementNode",
value: function processElementNode(node, isOpening) {
var tag = node.tagName.toLowerCase(); // Special case for Preformatted
switch (lastBreak) {
case breakType.SINGLE:
runs.push('\n');
break;
if (tag === 'pre') {
this.processText();
this.addBreak(false);
this.processBreaks();
this.runs.push(node.textContent);
this.lastBreak = BreakType.SINGLE;
return true;
} // Process other tags
case breakType.DOUBLE:
runs.push('\n\n');
break;
switch (tag) {
case 'br':
this.processText();
this.processBreaks();
this.runs.push('\n');
return true;
case 'wbr':
this.processBreaks();
this.text.push("\u200B");
return true;
}
if (node.hasAttribute('alt')) {
this.processBreaks();
this.text.push(" ".concat(node.getAttribute('alt'), " "));
return true;
}
this.processBlockConstruct(tag, isOpening);
return false;
}
}, {
key: "processBlockConstruct",
value: function processBlockConstruct(tag, isOpening) {
if (phrasingConstructs.includes(tag)) {
// Do not process phrasing tags as block constructs
return;
}
lastBreak = breakType.NONE;
};
if (tag === 'th' || tag === 'td') {
// Special Block
if (isOpening) {
// I'm assuming the DOM will fix all table element malformations
if (!this.hasEncounteredFirstCell) {
this.hasEncounteredFirstCell = true;
} else {
this.processBreaks();
this.runs.push('\t');
}
} else {
this.processText();
}
var processText = function processText() {
if (text.length === 0) {
return;
} // Trim
return;
} // Regular Block
var trimmed = trimBeginAndEnd(text.join(''));
this.processText();
if (!trimmed) {
// Trimmed into an empty string
// Preserve all preceding breaks
text = [];
return;
if (tag === 'tr') {
this.hasEncounteredFirstCell = false;
}
if (tag === 'p') {
this.addBreak(true);
}
this.addBreak(false);
}
}, {
key: "processTextNode",
value: function processTextNode(node) {
var string = node.textContent.normalize(); // Trim
if (lastBreak === null) {
lastBreak = breakType.NONE;
var trimmed = trimBeginAndEnd(string);
if (trimmed) {
this.processBreaks();
}
this.text.push(string);
}
}, {
key: "getResult",
value: function getResult() {
// Get Stragglers
this.processText();
return this.runs.join('');
}
}]);
runs.push(trimBeginAndEnd(collapseWhitespace(trimmed)));
text = [];
};
return StringCollector;
}();
var processBlockConstruct = function processBlockConstruct(tag, opening) {
if (phrasingConstructs.includes(tag)) {
return;
} // Not a phrasing construct, therefore is Block
var MapType = {
TEXT: 'Text',
BREAK: 'Break'
};
var MapCollector = /*#__PURE__*/function () {
function MapCollector() {
_classCallCheck(this, MapCollector);
this.map = [];
this.text = [];
this.hasEncounteredFirstCell = false;
this.lastBreak = null;
autoBind.call(this);
}
if (tag === 'th' || tag === 'td') {
// Special Block
if (opening) {
// I'm assuming the DOM will fix all table element malformations
if (!haveEncounteredFirstCell) {
haveEncounteredFirstCell = true;
} else {
processBreaks();
runs.push('\t');
}
} else {
processText();
_createClass(MapCollector, [{
key: "addBreak",
value: function addBreak(_double) {
if (this.lastBreak === null) {
// The only time it should be null is at the beginning of document
return;
}
return;
} // Regular Blocks
if (_double) {
this.lastBreak = BreakType.DOUBLE;
} else if (this.lastBreak !== BreakType.DOUBLE) {
this.lastBreak = BreakType.SINGLE;
}
}
}, {
key: "processBreaks",
value: function processBreaks() {
if (!this.lastBreak) {
return;
}
switch (this.lastBreak) {
case BreakType.SINGLE:
this.map.push({
type: MapType.BREAK,
"double": false
});
break;
processText();
case BreakType.DOUBLE:
this.map.push({
type: MapType.BREAK,
"double": true
});
break;
}
if (tag === 'tr') {
haveEncounteredFirstCell = false;
this.lastBreak = BreakType.NONE;
}
}, {
key: "processText",
value: function processText() {
var _this$map;
if (tag === 'p') {
addBreak(true);
}
if (this.text.length === 0) {
return;
}
addBreak(false);
};
var joinedText = this.text.map(function (element) {
return element.string;
}).join(''); // TODO: might have to check for null string here
var processTextNode = function processTextNode(node) {
var string = node.textContent.normalize(); // Trim
var trimmed = trimBeginAndEnd(joinedText);
var trimmed = trimBeginAndEnd(string);
if (!trimmed) {
// Trimmed into an empty string
// Preserve all preceding breaks
this.text = [];
return;
}
if (trimmed) {
processBreaks();
var fullText = trimBeginAndEnd(collapseWhitespace(trimmed));
var blockMap = [];
var currentIndexOfString = 0;
var _iterator = _createForOfIteratorHelper(this.text),
_step;
try {
for (_iterator.s(); !(_step = _iterator.n()).done;) {
var textMap = _step.value;
var shrunkText = trimBeginAndEnd(collapseWhitespace(textMap.string));
if (!shrunkText) {
continue;
}
var index = fullText.indexOf(shrunkText);
if (index < 0) {
throw new Error("Could not find shrunk string \"".concat(shrunkText, "\" in \"").concat(fullText, "\""));
}
blockMap.push({
type: MapType.TEXT,
node: textMap.node,
start: currentIndexOfString + index,
content: shrunkText,
length: shrunkText.length
});
fullText = fullText.slice(index + shrunkText.length);
currentIndexOfString += shrunkText.length + index;
} // Do some more magic on block map
} catch (err) {
_iterator.e(err);
} finally {
_iterator.f();
}
for (var i = 1; i < blockMap.length; ++i) {
if (blockMap[i].start - blockMap[i - 1].start !== blockMap[i - 1].length) {
blockMap[i - 1].length = blockMap[i].start - blockMap[i - 1].start;
}
}
(_this$map = this.map).push.apply(_this$map, blockMap);
if (this.lastBreak === null) {
this.lastBreak = BreakType.NONE;
}
this.text = [];
}
}, {
key: "processElementNode",
value: function processElementNode(node, isOpening) {
var tag = node.tagName.toLowerCase(); // Special case for Preformatted
text.push(string);
};
if (tag === 'pre') {
this.processText();
this.addBreak(false);
this.processBreaks();
this.lastBreak = BreakType.SINGLE;
this.map.push({
type: MapType.TEXT,
node: node,
content: node.textContent,
length: node.textContent.length
});
return true;
} // Process other tags
var processElementNode = function processElementNode(node) {
var tag = node.tagName && node.tagName.toLowerCase(); // Special case for Preformatted
if (tag === 'pre') {
processText();
addBreak(false);
processBreaks();
runs.push(node.textContent);
lastBreak = breakType.SINGLE;
return;
switch (tag) {
case 'br':
this.processText();
this.processBreaks();
this.map.push({
type: MapType.TEXT,
node: node,
content: '\n',
length: 1
});
return true;
case 'wbr':
this.processBreaks();
this.text.push({
node: node,
string: "\u200B"
});
return true;
}
if (node.hasAttribute('alt')) {
this.processBreaks();
this.text.push({
node: node,
string: " ".concat(node.getAttribute('alt'), " ")
});
return true;
}
this.processBlockConstruct(node, isOpening);
return false;
}
}, {
key: "processBlockConstruct",
value: function processBlockConstruct(node, isOpening) {
var tag = node.tagName.toLowerCase();
processBlockConstruct(tag, true);
if (phrasingConstructs.includes(tag)) {
// Do not process phrasing tags as block constructs
return;
}
if (node.hasChildNodes()) {
node.childNodes.forEach(function (child) {
processNode(child);
});
} // Process other tags
if (tag === 'th' || tag === 'td') {
// Special Block
if (isOpening) {
// I'm assuming the DOM will fix all table element malformations
if (!this.hasEncounteredFirstCell) {
this.hasEncounteredFirstCell = true;
} else {
this.processBreaks();
this.map.push({
type: MapType.TEXT,
node: node,
content: '\t',
length: 1
});
}
} else {
this.processText();
}
return;
}
switch (tag) {
case 'br':
processText();
processBreaks();
runs.push('\n');
break;
this.processText();
case 'wbr':
processBreaks();
text.push("\u200B");
break;
if (tag === 'tr') {
this.hasEncounteredFirstCell = false;
}
if (tag === 'p') {
this.addBreak(true);
}
this.addBreak(false);
}
}, {
key: "processTextNode",
value: function processTextNode(node) {
var string = node.textContent.normalize(); // Trim
if (node.hasAttribute('alt')) {
processBreaks();
text.push(" ".concat(node.getAttribute('alt'), " "));
var trimmed = trimBeginAndEnd(string);
if (trimmed) {
this.processBreaks();
}
this.text.push({
node: node,
string: string
});
}
}, {
key: "getResult",
value: function getResult() {
var result = [];
var runningIndex = 0;
processBlockConstruct(tag, false);
};
var _iterator2 = _createForOfIteratorHelper(this.map),
_step2;
var processNode = function processNode(node) {
switch (node.nodeType) {
case Node.TEXT_NODE:
processTextNode(node);
break;
try {
for (_iterator2.s(); !(_step2 = _iterator2.n()).done;) {
var entity = _step2.value;
case Node.ELEMENT_NODE:
if (blacklist.includes(node.tagName.toLowerCase())) {
return;
}
switch (entity.type) {
case MapType.TEXT:
result.push({
node: entity.node,
content: entity.content,
start: runningIndex,
length: entity.length
});
runningIndex += entity.length;
break;
processElementNode(node);
break;
case MapType.BREAK:
var lastResult = result[result.length - 1];
case Node.DOCUMENT_NODE:
case Node.DOCUMENT_FRAGMENT_NODE:
if (node.hasChildNodes()) {
node.childNodes.forEach(function (child) {
processNode(child);
});
if (entity["double"]) {
lastResult.length += 2;
runningIndex += 2;
} else {
lastResult.length += 1;
runningIndex += 1;
}
break;
}
}
} catch (err) {
_iterator2.e(err);
} finally {
_iterator2.f();
}
break;
return result;
}
};
}]);
processNode(parentNode); // Get any stragglers
return MapCollector;
}();
processText();
return runs.join('');
/**
 * Walks the tree rooted at `parentNode`, feeding every node to `collector`,
 * and returns whatever the collector produced.
 *
 * @param {*} parentNode - Root to walk; a falsy value short-circuits.
 * @param {Object} collector - Receives the nodes and exposes `getResult()`.
 * @returns {*} `collector.getResult()`, or `undefined` when there is no root.
 */
var walkDOM = function walkDOM(parentNode, collector) {
  if (parentNode) {
    processNode(parentNode, collector);
    return collector.getResult();
  }
  return undefined;
};
/**
 * Dispatches a single DOM node to the right handler based on its node type.
 *
 * - Text nodes go straight to `collector.processTextNode`.
 * - Element nodes are handed to `processElementNode`, unless their lower-cased
 *   tag name is listed in `blacklist`.
 * - Document and document-fragment nodes have each child processed in turn.
 * - Any other node type is ignored.
 *
 * @param {Node} node - Node to process.
 * @param {Object} collector - Collector passed through to the handlers.
 */
var processNode = function processNode(node, collector) {
  var kind = node.nodeType;

  if (kind === Node.TEXT_NODE) {
    collector.processTextNode(node);
    return;
  }

  if (kind === Node.ELEMENT_NODE) {
    var isBlacklisted = blacklist.includes(node.tagName.toLowerCase());
    if (!isBlacklisted) {
      processElementNode(node, collector);
    }
    return;
  }

  if (kind === Node.DOCUMENT_NODE || kind === Node.DOCUMENT_FRAGMENT_NODE) {
    if (node.hasChildNodes()) {
      node.childNodes.forEach(function (child) {
        processNode(child, collector);
      });
    }
  }
};
/**
 * Processes one element: notifies the collector that the element opens,
 * walks its children, then notifies the collector that it closes.
 *
 * If the collector returns a truthy value for the opening call, the element
 * is treated as fully handled: children are not visited and no closing call
 * is made.
 *
 * @param {Element} node - Element to process.
 * @param {Object} collector - Exposes `processElementNode(node, isOpening)`.
 */
var processElementNode = function processElementNode(node, collector) {
  var handledOnOpen = collector.processElementNode(node, true);

  if (!handledOnOpen) {
    if (node.hasChildNodes()) {
      node.childNodes.forEach(function (child) {
        processNode(child, collector);
      });
    }
    collector.processElementNode(node, false);
  }
};
/**
 * Converts the DOM tree rooted at `parentNode` into its collected result.
 *
 * @param {Node} parentNode - Root of the tree to convert.
 * @param {Object} [options] - Optional settings; `null` is treated like an
 *   omitted argument instead of throwing on the `map` lookup.
 * @param {boolean} [options.map] - When truthy a `MapCollector` is used and
 *   its result returned; otherwise a `StringCollector` is used.
 * @returns {*} Whatever `walkDOM` returns for the chosen collector.
 */
var degausser = function degausser(parentNode) {
  // `!= null` covers both an omitted/undefined and an explicit null argument.
  var options = arguments.length > 1 && arguments[1] != null ? arguments[1] : {};
  // Build only the collector that will actually be used.
  var collector = options.map ? new MapCollector() : new StringCollector();
  return walkDOM(parentNode, collector);
};
exports.degausser = degausser;

@@ -275,0 +721,0 @@

21

package.json
{
"name": "degausser",
"version": "2.0.0",
"version": "2.1.0",
"description": "Transforms HTML to plain text by eliminating tags from a document.",

@@ -32,11 +32,12 @@ "author": "FlowPub",

"devDependencies": {
"@babel/core": "^7.8.7",
"@babel/preset-env": "^7.8.7",
"@rollup/plugin-node-resolve": "^7.1.1",
"glob": "^7.1.6",
"jest": "^25.1.0",
"prettier": "^1.19.1",
"rollup": "^2.0.5",
"rollup-plugin-babel": "^4.4.0"
}
"@babel/core": "7.9.6",
"@babel/preset-env": "7.9.6",
"@rollup/plugin-node-resolve": "7.1.3",
"glob": "7.1.6",
"jest": "26.0.1",
"prettier": "2.0.5",
"rollup": "2.7.1",
"rollup-plugin-babel": "4.4.0"
},
"dependencies": {}
}

@@ -1,196 +0,13 @@

import {
blacklist,
trimBeginAndEnd,
collapseWhitespace,
phrasingConstructs,
} from './util.js'
import { StringCollector } from './stringCollector'
import { MapCollector } from './mapCollector'
import { walkDOM } from './domWalker'
export const degausser = parentNode => {
// If there's no Node, just return
if (!parentNode) {
return null
}
export const degausser = (parentNode, options = {}) => {
let collector = new StringCollector()
// Tracking Entities
const runs = []
let text = []
let haveEncounteredFirstCell = false
let lastBreak = null
const breakType = {
NONE: 'none',
SINGLE: 'single',
DOUBLE: 'double',
if (options.map) {
collector = new MapCollector()
}
const addBreak = double => {
if (lastBreak === null) {
// The only time it should be null is at the beginning of document
return
}
if (double) {
lastBreak = breakType.DOUBLE
} else if (lastBreak !== breakType.DOUBLE) {
lastBreak = breakType.SINGLE
}
}
const processBreaks = () => {
if (!lastBreak) {
return
}
switch (lastBreak) {
case breakType.SINGLE:
runs.push('\n')
break
case breakType.DOUBLE:
runs.push('\n\n')
break
}
lastBreak = breakType.NONE
}
const processText = () => {
if (text.length === 0) {
return
}
// Trim
const trimmed = trimBeginAndEnd(text.join(''))
if (!trimmed) {
// Trimmed into an empty string
// Preserve all preceding breaks
text = []
return
}
if (lastBreak === null) {
lastBreak = breakType.NONE
}
runs.push(trimBeginAndEnd(collapseWhitespace(trimmed)))
text = []
}
const processBlockConstruct = (tag, opening) => {
if (phrasingConstructs.includes(tag)) {
return
}
// Not a phrasing construct, therefore is Block
if (tag === 'th' || tag === 'td') {
// Special Block
if (opening) {
// I'm assuming the DOM will fix all table element malformations
if (!haveEncounteredFirstCell) {
haveEncounteredFirstCell = true
} else {
processBreaks()
runs.push('\t')
}
} else {
processText()
}
return
}
// Regular Blocks
processText()
if (tag === 'tr') {
haveEncounteredFirstCell = false
}
if (tag === 'p') {
addBreak(true)
}
addBreak(false)
}
const processTextNode = node => {
const string = node.textContent.normalize()
// Trim
const trimmed = trimBeginAndEnd(string)
if (trimmed) {
processBreaks()
}
text.push(string)
}
const processElementNode = node => {
const tag = node.tagName && node.tagName.toLowerCase()
// Special case for Preformatted
if (tag === 'pre') {
processText()
addBreak(false)
processBreaks()
runs.push(node.textContent)
lastBreak = breakType.SINGLE
return
}
processBlockConstruct(tag, true)
if (node.hasChildNodes()) {
node.childNodes.forEach(child => {
processNode(child)
})
}
// Process other tags
switch (tag) {
case 'br':
processText()
processBreaks()
runs.push('\n')
break
case 'wbr':
processBreaks()
text.push('\u200B')
break
}
if (node.hasAttribute('alt')) {
processBreaks()
text.push(` ${node.getAttribute('alt')} `)
}
processBlockConstruct(tag, false)
}
const processNode = node => {
switch (node.nodeType) {
case Node.TEXT_NODE:
processTextNode(node)
break
case Node.ELEMENT_NODE:
if (blacklist.includes(node.tagName.toLowerCase())) {
return
}
processElementNode(node)
break
case Node.DOCUMENT_NODE:
case Node.DOCUMENT_FRAGMENT_NODE:
if (node.hasChildNodes()) {
node.childNodes.forEach(child => {
processNode(child)
})
}
break
}
}
processNode(parentNode)
// Get any stragglers
processText()
return runs.join('')
return walkDOM(parentNode, collector)
}

@@ -0,8 +1,21 @@

/**
 * Binds every method declared directly on the receiver's prototype to the
 * receiver, storing the bound copies as own properties. Must be invoked with
 * an explicit receiver, e.g. `autoBind.call(this)` from a constructor.
 *
 * Accessor properties (getters/setters) are skipped without being read:
 * probing them with `typeof this[prop]` would run the getter during
 * construction, and assigning over a getter-only accessor throws in strict
 * mode.
 */
function autoBind() {
  const proto = Object.getPrototypeOf(this)
  for (const prop of Object.getOwnPropertyNames(proto)) {
    if (prop === 'constructor') continue
    const descriptor = Object.getOwnPropertyDescriptor(proto, prop)
    // Only data properties can hold a plain method; leave accessors alone.
    if (!descriptor || !('value' in descriptor)) continue
    // Read through `this` so an own non-function property that shadows a
    // prototype method is still left untouched.
    if (typeof this[prop] !== 'function') continue
    this[prop] = this[prop].bind(this)
  }
}
// Char codes for \t, \n, and non-&nbsp; space character
const whitespaces = [9, 10, 13, 32]
const isCharWhitespace = charCode => {
const isCharWhitespace = (charCode) => {
return whitespaces.includes(charCode)
}
const trimBeginAndEnd = string => {
// Enumerates the pending line-break states: no break, a single break, or a
// double break.
const BreakType = {
NONE: 'none',
SINGLE: 'single',
DOUBLE: 'double',
}
const trimBeginAndEnd = (string) => {
// Get the first and last non-whitespace character index

@@ -38,3 +51,3 @@ let firstNonWhite = null,

}
const collapseWhitespace = string => {
const collapseWhitespace = (string) => {
// Collapse all other sequential whitespace into a single whitespace

@@ -135,2 +148,9 @@ const textElements = []

export { blacklist, trimBeginAndEnd, collapseWhitespace, phrasingConstructs }
export {
autoBind,
blacklist,
BreakType,
trimBeginAndEnd,
collapseWhitespace,
phrasingConstructs,
}
SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc