Comparing version 2.0.0 to 2.1.0
@@ -7,3 +7,114 @@ (function (global, factory) { | ||
// Char codes for \t, \n, \r, and the space character | |
/**
 * Transpiler-emitted guard that rejects calling a class constructor as a
 * plain function (i.e. without `new`).
 *
 * @param {*} instance - The `this` value the constructor body received.
 * @param {Function} Constructor - The constructor `instance` must derive from.
 * @throws {TypeError} When `instance` is not an instance of `Constructor`.
 */
function _classCallCheck(instance, Constructor) {
  if (!(instance instanceof Constructor)) {
    throw new TypeError("Cannot call a class as a function");
  }
}
/**
 * Defines each descriptor in `props` on `target` under `descriptor.key`.
 *
 * Note: the descriptor objects are normalised in place before being applied:
 * `enumerable` defaults to false, `configurable` is forced to true, and data
 * descriptors (those with a `value`) are made writable.
 *
 * @param {Object} target - Object receiving the properties.
 * @param {Array<Object>} props - Property descriptors, each carrying a `key`.
 */
function _defineProperties(target, props) {
  for (var i = 0; i < props.length; i++) {
    var descriptor = props[i];
    descriptor.enumerable = descriptor.enumerable || false;
    descriptor.configurable = true;
    // Only data descriptors may carry `writable`; accessors must not.
    if ("value" in descriptor) {
      descriptor.writable = true;
    }
    Object.defineProperty(target, descriptor.key, descriptor);
  }
}
/**
 * Installs prototype and static members on a constructor.
 *
 * @param {Function} Constructor - The constructor being assembled.
 * @param {Array<Object>} [protoProps] - Descriptors defined on `Constructor.prototype`.
 * @param {Array<Object>} [staticProps] - Descriptors defined on `Constructor` itself.
 * @returns {Function} The same `Constructor`, for chaining.
 */
function _createClass(Constructor, protoProps, staticProps) {
  if (protoProps) {
    _defineProperties(Constructor.prototype, protoProps);
  }
  if (staticProps) {
    _defineProperties(Constructor, staticProps);
  }
  return Constructor;
}
/**
 * Best-effort conversion of a value without a usable iterator into an array.
 *
 * Strings, `arguments` objects and the integer typed arrays matched by the
 * regex below are copied by index; Map and Set go through `Array.from`.
 *
 * @param {*} o - Value to convert.
 * @param {number} [minLen] - Optional cap on the number of copied elements.
 * @returns {Array|undefined} The array, or `undefined` when `o` is falsy or
 *   of an unrecognised kind (callers treat that as "not iterable").
 */
function _unsupportedIterableToArray(o, minLen) {
  if (!o) {
    return;
  }
  if (typeof o === "string") {
    return _arrayLikeToArray(o, minLen);
  }
  // "[object Foo]" -> "Foo"
  var n = Object.prototype.toString.call(o).slice(8, -1);
  // Plain objects report "Object"; fall back to the constructor's name.
  if (n === "Object" && o.constructor) {
    n = o.constructor.name;
  }
  if (n === "Map" || n === "Set") {
    return Array.from(o);
  }
  if (n === "Arguments" || /^(?:Ui|I)nt(?:8|16|32)(?:Clamped)?Array$/.test(n)) {
    return _arrayLikeToArray(o, minLen);
  }
}
/**
 * Copies the first `len` indexed elements of an array-like into a new array.
 *
 * @param {ArrayLike<*>} arr - Source with a `length` and indexed elements.
 * @param {number} [len] - Number of elements to copy; when omitted, null, or
 *   larger than `arr.length`, the whole source is copied.
 * @returns {Array} A new array holding the copied elements.
 */
function _arrayLikeToArray(arr, len) {
  if (len == null || len > arr.length) {
    len = arr.length;
  }
  var arr2 = new Array(len);
  for (var i = 0; i < len; i++) {
    arr2[i] = arr[i];
  }
  return arr2;
}
/**
 * Transpiler-emitted helper that backs compiled `for...of` loops.
 *
 * The returned object exposes four hooks used by the compiled loop:
 *   s()    start the iteration
 *   n()    advance and return the next `{ done, value }` step
 *   e(err) report an error thrown by the loop body
 *   f()    finalise (close the iterator, surface a recorded error)
 *
 * @param {*} o - The value being iterated.
 * @returns {{s: Function, n: Function, e: Function, f: Function}} Loop hooks.
 * @throws {TypeError} When `o` has no iterator and cannot be converted to an array.
 */
function _createForOfIteratorHelper(o) {
  if (typeof Symbol === "undefined" || o[Symbol.iterator] == null) {
    // No iterator protocol available: iterate by index instead.
    if (Array.isArray(o) || (o = _unsupportedIterableToArray(o))) {
      var i = 0;
      var F = function () {};
      return {
        s: F,
        n: function () {
          if (i >= o.length) {
            return {
              done: true
            };
          }
          return {
            done: false,
            value: o[i++]
          };
        },
        // Nothing needs closing on this path, so the error propagates at once.
        e: function (e) {
          throw e;
        },
        f: F
      };
    }
    throw new TypeError("Invalid attempt to iterate non-iterable instance.\nIn order to be iterable, non-array objects must have a [Symbol.iterator]() method.");
  }

  var it;
  var normalCompletion = true;
  var didErr = false;
  var err;
  return {
    s: function () {
      it = o[Symbol.iterator]();
    },
    n: function () {
      var step = it.next();
      // Tracks whether the iterator ran to completion or was left mid-way.
      normalCompletion = step.done;
      return step;
    },
    // The error is held back so f() can close the iterator first.
    e: function (e) {
      didErr = true;
      err = e;
    },
    f: function () {
      try {
        // Early exit: give the iterator a chance to release resources.
        if (!normalCompletion && it.return != null) {
          it.return();
        }
      } finally {
        if (didErr) {
          throw err;
        }
      }
    }
  };
}
/**
 * Binds every method found on the receiver's prototype to the receiver, so
 * the methods keep their `this` when passed around detached.
 *
 * Must be invoked with an explicit receiver: `autoBind.call(instance)`.
 * Only own properties of the immediate prototype are considered, and the
 * `constructor` entry is left alone.
 *
 * Accessor properties are skipped by inspecting the property descriptor:
 * reading `this[prop]` for them would run the getter as a side effect, and
 * assigning to a getter-only accessor throws in strict mode.
 */
function autoBind() {
  var proto = Object.getPrototypeOf(this);
  var names = Object.getOwnPropertyNames(proto);

  for (var i = 0; i < names.length; i++) {
    var prop = names[i];
    if (prop === 'constructor') continue;

    var descriptor = Object.getOwnPropertyDescriptor(proto, prop);
    // Accessor (get/set) property: do not touch it.
    if (!descriptor || !('value' in descriptor)) continue;

    if (typeof this[prop] !== 'function') continue;
    this[prop] = this[prop].bind(this);
  }
} // Char codes for \t, \n, \r, and the space character
var whitespaces = [9, 10, 13, 32]; | ||
@@ -15,2 +126,8 @@ | ||
// Kinds of pending line break tracked by the collectors: NONE emits nothing,
// SINGLE emits one newline and DOUBLE emits two. Frozen because the object is
// shared by every collector instance and must not be altered at runtime.
var BreakType = Object.freeze({
  NONE: 'none',
  SINGLE: 'single',
  DOUBLE: 'double'
});
var trimBeginAndEnd = function trimBeginAndEnd(string) { | ||
@@ -79,196 +196,525 @@ // Get the first and last non-whitespace character index | ||
var degausser = function degausser(parentNode) { | ||
// If there's no Node, just return | ||
if (!parentNode) { | ||
return null; | ||
} // Tracking Entities | ||
var StringCollector = /*#__PURE__*/function () { | ||
function StringCollector() { | ||
_classCallCheck(this, StringCollector); | ||
this.runs = []; | ||
this.text = []; | ||
this.hasEncounteredFirstCell = false; | ||
this.lastBreak = null; | ||
autoBind.call(this); | ||
} | ||
var runs = []; | ||
var text = []; | ||
var haveEncounteredFirstCell = false; | ||
var lastBreak = null; | ||
var breakType = { | ||
NONE: 'none', | ||
SINGLE: 'single', | ||
DOUBLE: 'double' | ||
}; | ||
_createClass(StringCollector, [{ | ||
key: "addBreak", | ||
value: function addBreak(_double) { | ||
if (this.lastBreak === null) { | ||
// The only time it should be null is at the beginning of document | ||
return; | ||
} | ||
var addBreak = function addBreak(_double) { | ||
if (lastBreak === null) { | ||
// The only time it should be null is at the beginning of document | ||
return; | ||
if (_double) { | ||
this.lastBreak = BreakType.DOUBLE; | ||
} else if (this.lastBreak !== BreakType.DOUBLE) { | ||
this.lastBreak = BreakType.SINGLE; | ||
} | ||
} | ||
}, { | ||
key: "processBreaks", | ||
value: function processBreaks() { | ||
if (!this.lastBreak) { | ||
return; | ||
} | ||
if (_double) { | ||
lastBreak = breakType.DOUBLE; | ||
} else if (lastBreak !== breakType.DOUBLE) { | ||
lastBreak = breakType.SINGLE; | ||
switch (this.lastBreak) { | ||
case BreakType.SINGLE: | ||
this.runs.push('\n'); | ||
break; | ||
case BreakType.DOUBLE: | ||
this.runs.push('\n\n'); | ||
break; | ||
} | ||
this.lastBreak = BreakType.NONE; | ||
} | ||
}; | ||
}, { | ||
key: "processText", | ||
value: function processText() { | ||
if (this.text.length === 0) { | ||
return; | ||
} // Trim | ||
var processBreaks = function processBreaks() { | ||
if (!lastBreak) { | ||
return; | ||
var trimmed = trimBeginAndEnd(this.text.join('')); | ||
if (!trimmed) { | ||
// Trimmed into an empty string | ||
// Preserve all preceding breaks | ||
this.text = []; | ||
return; | ||
} | ||
if (this.lastBreak === null) { | ||
this.lastBreak = BreakType.NONE; | ||
} | ||
this.runs.push(trimBeginAndEnd(collapseWhitespace(trimmed))); | ||
this.text = []; | ||
} | ||
}, { | ||
key: "processElementNode", | ||
value: function processElementNode(node, isOpening) { | ||
var tag = node.tagName.toLowerCase(); // Special case for Preformatted | ||
switch (lastBreak) { | ||
case breakType.SINGLE: | ||
runs.push('\n'); | ||
break; | ||
if (tag === 'pre') { | ||
this.processText(); | ||
this.addBreak(false); | ||
this.processBreaks(); | ||
this.runs.push(node.textContent); | ||
this.lastBreak = BreakType.SINGLE; | ||
return true; | ||
} // Process other tags | ||
case breakType.DOUBLE: | ||
runs.push('\n\n'); | ||
break; | ||
switch (tag) { | ||
case 'br': | ||
this.processText(); | ||
this.processBreaks(); | ||
this.runs.push('\n'); | ||
return true; | ||
case 'wbr': | ||
this.processBreaks(); | ||
this.text.push("\u200B"); | ||
return true; | ||
} | ||
if (node.hasAttribute('alt')) { | ||
this.processBreaks(); | ||
this.text.push(" ".concat(node.getAttribute('alt'), " ")); | ||
return true; | ||
} | ||
this.processBlockConstruct(tag, isOpening); | ||
return false; | ||
} | ||
}, { | ||
key: "processBlockConstruct", | ||
value: function processBlockConstruct(tag, isOpening) { | ||
if (phrasingConstructs.includes(tag)) { | ||
// Do not process phrasing tags as block constructs | ||
return; | ||
} | ||
lastBreak = breakType.NONE; | ||
}; | ||
if (tag === 'th' || tag === 'td') { | ||
// Special Block | ||
if (isOpening) { | ||
// I'm assuming the DOM will fix all table element malformations | ||
if (!this.hasEncounteredFirstCell) { | ||
this.hasEncounteredFirstCell = true; | ||
} else { | ||
this.processBreaks(); | ||
this.runs.push('\t'); | ||
} | ||
} else { | ||
this.processText(); | ||
} | ||
var processText = function processText() { | ||
if (text.length === 0) { | ||
return; | ||
} // Trim | ||
return; | ||
} // Regular Block | ||
var trimmed = trimBeginAndEnd(text.join('')); | ||
this.processText(); | ||
if (!trimmed) { | ||
// Trimmed into an empty string | ||
// Preserve all preceding breaks | ||
text = []; | ||
return; | ||
if (tag === 'tr') { | ||
this.hasEncounteredFirstCell = false; | ||
} | ||
if (tag === 'p') { | ||
this.addBreak(true); | ||
} | ||
this.addBreak(false); | ||
} | ||
}, { | ||
key: "processTextNode", | ||
value: function processTextNode(node) { | ||
var string = node.textContent.normalize(); // Trim | ||
if (lastBreak === null) { | ||
lastBreak = breakType.NONE; | ||
var trimmed = trimBeginAndEnd(string); | ||
if (trimmed) { | ||
this.processBreaks(); | ||
} | ||
this.text.push(string); | ||
} | ||
}, { | ||
key: "getResult", | ||
value: function getResult() { | ||
// Get Stragglers | ||
this.processText(); | ||
return this.runs.join(''); | ||
} | ||
}]); | ||
runs.push(trimBeginAndEnd(collapseWhitespace(trimmed))); | ||
text = []; | ||
}; | ||
return StringCollector; | ||
}(); | ||
var processBlockConstruct = function processBlockConstruct(tag, opening) { | ||
if (phrasingConstructs.includes(tag)) { | ||
return; | ||
} // Not a phrasing construct, therefore is Block | ||
var MapType = { | ||
TEXT: 'Text', | ||
BREAK: 'Break' | ||
}; | ||
var MapCollector = /*#__PURE__*/function () { | ||
function MapCollector() { | ||
_classCallCheck(this, MapCollector); | ||
this.map = []; | ||
this.text = []; | ||
this.hasEncounteredFirstCell = false; | ||
this.lastBreak = null; | ||
autoBind.call(this); | ||
} | ||
if (tag === 'th' || tag === 'td') { | ||
// Special Block | ||
if (opening) { | ||
// I'm assuming the DOM will fix all table element malformations | ||
if (!haveEncounteredFirstCell) { | ||
haveEncounteredFirstCell = true; | ||
} else { | ||
processBreaks(); | ||
runs.push('\t'); | ||
} | ||
} else { | ||
processText(); | ||
_createClass(MapCollector, [{ | ||
key: "addBreak", | ||
value: function addBreak(_double) { | ||
if (this.lastBreak === null) { | ||
// The only time it should be null is at the beginning of document | ||
return; | ||
} | ||
return; | ||
} // Regular Blocks | ||
if (_double) { | ||
this.lastBreak = BreakType.DOUBLE; | ||
} else if (this.lastBreak !== BreakType.DOUBLE) { | ||
this.lastBreak = BreakType.SINGLE; | ||
} | ||
} | ||
}, { | ||
key: "processBreaks", | ||
value: function processBreaks() { | ||
if (!this.lastBreak) { | ||
return; | ||
} | ||
switch (this.lastBreak) { | ||
case BreakType.SINGLE: | ||
this.map.push({ | ||
type: MapType.BREAK, | ||
"double": false | ||
}); | ||
break; | ||
processText(); | ||
case BreakType.DOUBLE: | ||
this.map.push({ | ||
type: MapType.BREAK, | ||
"double": true | ||
}); | ||
break; | ||
} | ||
if (tag === 'tr') { | ||
haveEncounteredFirstCell = false; | ||
this.lastBreak = BreakType.NONE; | ||
} | ||
}, { | ||
key: "processText", | ||
value: function processText() { | ||
var _this$map; | ||
if (tag === 'p') { | ||
addBreak(true); | ||
} | ||
if (this.text.length === 0) { | ||
return; | ||
} | ||
addBreak(false); | ||
}; | ||
var joinedText = this.text.map(function (element) { | ||
return element.string; | ||
}).join(''); // TODO: might have to check for null string here | ||
var processTextNode = function processTextNode(node) { | ||
var string = node.textContent.normalize(); // Trim | ||
var trimmed = trimBeginAndEnd(joinedText); | ||
var trimmed = trimBeginAndEnd(string); | ||
if (!trimmed) { | ||
// Trimmed into an empty string | ||
// Preserve all preceding breaks | ||
this.text = []; | ||
return; | ||
} | ||
if (trimmed) { | ||
processBreaks(); | ||
var fullText = trimBeginAndEnd(collapseWhitespace(trimmed)); | ||
var blockMap = []; | ||
var currentIndexOfString = 0; | ||
var _iterator = _createForOfIteratorHelper(this.text), | ||
_step; | ||
try { | ||
for (_iterator.s(); !(_step = _iterator.n()).done;) { | ||
var textMap = _step.value; | ||
var shrunkText = trimBeginAndEnd(collapseWhitespace(textMap.string)); | ||
if (!shrunkText) { | ||
continue; | ||
} | ||
var index = fullText.indexOf(shrunkText); | ||
if (index < 0) { | ||
throw new Error("Could not find shrunk string \"".concat(shrunkText, "\" in \"").concat(fullText, "\"")); | ||
} | ||
blockMap.push({ | ||
type: MapType.TEXT, | ||
node: textMap.node, | ||
start: currentIndexOfString + index, | ||
content: shrunkText, | ||
length: shrunkText.length | ||
}); | ||
fullText = fullText.slice(index + shrunkText.length); | ||
currentIndexOfString += shrunkText.length + index; | ||
} // Do some more magic on block map | ||
} catch (err) { | ||
_iterator.e(err); | ||
} finally { | ||
_iterator.f(); | ||
} | ||
for (var i = 1; i < blockMap.length; ++i) { | ||
if (blockMap[i].start - blockMap[i - 1].start !== blockMap[i - 1].length) { | ||
blockMap[i - 1].length = blockMap[i].start - blockMap[i - 1].start; | ||
} | ||
} | ||
(_this$map = this.map).push.apply(_this$map, blockMap); | ||
if (this.lastBreak === null) { | ||
this.lastBreak = BreakType.NONE; | ||
} | ||
this.text = []; | ||
} | ||
}, { | ||
key: "processElementNode", | ||
value: function processElementNode(node, isOpening) { | ||
var tag = node.tagName.toLowerCase(); // Special case for Preformatted | ||
text.push(string); | ||
}; | ||
if (tag === 'pre') { | ||
this.processText(); | ||
this.addBreak(false); | ||
this.processBreaks(); | ||
this.lastBreak = BreakType.SINGLE; | ||
this.map.push({ | ||
type: MapType.TEXT, | ||
node: node, | ||
content: node.textContent, | ||
length: node.textContent.length | ||
}); | ||
return true; | ||
} // Process other tags | ||
var processElementNode = function processElementNode(node) { | ||
var tag = node.tagName && node.tagName.toLowerCase(); // Special case for Preformatted | ||
if (tag === 'pre') { | ||
processText(); | ||
addBreak(false); | ||
processBreaks(); | ||
runs.push(node.textContent); | ||
lastBreak = breakType.SINGLE; | ||
return; | ||
switch (tag) { | ||
case 'br': | ||
this.processText(); | ||
this.processBreaks(); | ||
this.map.push({ | ||
type: MapType.TEXT, | ||
node: node, | ||
content: '\n', | ||
length: 1 | ||
}); | ||
return true; | ||
case 'wbr': | ||
this.processBreaks(); | ||
this.text.push({ | ||
node: node, | ||
string: "\u200B" | ||
}); | ||
return true; | ||
} | ||
if (node.hasAttribute('alt')) { | ||
this.processBreaks(); | ||
this.text.push({ | ||
node: node, | ||
string: " ".concat(node.getAttribute('alt'), " ") | ||
}); | ||
return true; | ||
} | ||
this.processBlockConstruct(node, isOpening); | ||
return false; | ||
} | ||
}, { | ||
key: "processBlockConstruct", | ||
value: function processBlockConstruct(node, isOpening) { | ||
var tag = node.tagName.toLowerCase(); | ||
processBlockConstruct(tag, true); | ||
if (phrasingConstructs.includes(tag)) { | ||
// Do not process phrasing tags as block constructs | ||
return; | ||
} | ||
if (node.hasChildNodes()) { | ||
node.childNodes.forEach(function (child) { | ||
processNode(child); | ||
}); | ||
} // Process other tags | ||
if (tag === 'th' || tag === 'td') { | ||
// Special Block | ||
if (isOpening) { | ||
// I'm assuming the DOM will fix all table element malformations | ||
if (!this.hasEncounteredFirstCell) { | ||
this.hasEncounteredFirstCell = true; | ||
} else { | ||
this.processBreaks(); | ||
this.map.push({ | ||
type: MapType.TEXT, | ||
node: node, | ||
content: '\t', | ||
length: 1 | ||
}); | ||
} | ||
} else { | ||
this.processText(); | ||
} | ||
return; | ||
} | ||
switch (tag) { | ||
case 'br': | ||
processText(); | ||
processBreaks(); | ||
runs.push('\n'); | ||
break; | ||
this.processText(); | ||
case 'wbr': | ||
processBreaks(); | ||
text.push("\u200B"); | ||
break; | ||
if (tag === 'tr') { | ||
this.hasEncounteredFirstCell = false; | ||
} | ||
if (tag === 'p') { | ||
this.addBreak(true); | ||
} | ||
this.addBreak(false); | ||
} | ||
}, { | ||
key: "processTextNode", | ||
value: function processTextNode(node) { | ||
var string = node.textContent.normalize(); // Trim | ||
if (node.hasAttribute('alt')) { | ||
processBreaks(); | ||
text.push(" ".concat(node.getAttribute('alt'), " ")); | ||
var trimmed = trimBeginAndEnd(string); | ||
if (trimmed) { | ||
this.processBreaks(); | ||
} | ||
this.text.push({ | ||
node: node, | ||
string: string | ||
}); | ||
} | ||
}, { | ||
key: "getResult", | ||
value: function getResult() { | ||
var result = []; | ||
var runningIndex = 0; | ||
processBlockConstruct(tag, false); | ||
}; | ||
var _iterator2 = _createForOfIteratorHelper(this.map), | ||
_step2; | ||
var processNode = function processNode(node) { | ||
switch (node.nodeType) { | ||
case Node.TEXT_NODE: | ||
processTextNode(node); | ||
break; | ||
try { | ||
for (_iterator2.s(); !(_step2 = _iterator2.n()).done;) { | ||
var entity = _step2.value; | ||
case Node.ELEMENT_NODE: | ||
if (blacklist.includes(node.tagName.toLowerCase())) { | ||
return; | ||
} | ||
switch (entity.type) { | ||
case MapType.TEXT: | ||
result.push({ | ||
node: entity.node, | ||
content: entity.content, | ||
start: runningIndex, | ||
length: entity.length | ||
}); | ||
runningIndex += entity.length; | ||
break; | ||
processElementNode(node); | ||
break; | ||
case MapType.BREAK: | ||
var lastResult = result[result.length - 1]; | ||
case Node.DOCUMENT_NODE: | ||
case Node.DOCUMENT_FRAGMENT_NODE: | ||
if (node.hasChildNodes()) { | ||
node.childNodes.forEach(function (child) { | ||
processNode(child); | ||
}); | ||
if (entity["double"]) { | ||
lastResult.length += 2; | ||
runningIndex += 2; | ||
} else { | ||
lastResult.length += 1; | ||
runningIndex += 1; | ||
} | ||
break; | ||
} | ||
} | ||
} catch (err) { | ||
_iterator2.e(err); | ||
} finally { | ||
_iterator2.f(); | ||
} | ||
break; | ||
return result; | ||
} | ||
}; | ||
}]); | ||
processNode(parentNode); // Get any stragglers | ||
return MapCollector; | ||
}(); | ||
processText(); | ||
return runs.join(''); | ||
/**
 * Walks the DOM tree rooted at `parentNode`, feeding every node to
 * `collector`, and returns whatever the collector produces.
 *
 * @param {Node} parentNode - Root of the tree to walk.
 * @param {Object} collector - Receives the nodes; must expose `getResult()`.
 * @returns {*} `collector.getResult()`, or `undefined` when `parentNode` is falsy.
 */
var walkDOM = function walkDOM(parentNode, collector) {
  // Nothing to walk: bail out before touching the collector.
  if (!parentNode) {
    return;
  }

  processNode(parentNode, collector);
  return collector.getResult();
};
/**
 * Routes a single DOM node to the collector based on its `nodeType`.
 *
 * - Text nodes go to `collector.processTextNode`.
 * - Element nodes are handed to `processElementNode`, unless their tag is
 *   listed in `blacklist`, in which case the element is ignored.
 * - Documents and document fragments are transparent: only their children
 *   are processed.
 * - Any other node type is ignored.
 *
 * @param {Node} node - Node to process.
 * @param {Object} collector - Collector receiving the content.
 */
var processNode = function processNode(node, collector) {
  switch (node.nodeType) {
    case Node.TEXT_NODE:
      collector.processTextNode(node);
      break;

    case Node.ELEMENT_NODE:
      if (blacklist.includes(node.tagName.toLowerCase())) {
        return;
      }

      processElementNode(node, collector);
      break;

    case Node.DOCUMENT_NODE:
    case Node.DOCUMENT_FRAGMENT_NODE:
      if (node.hasChildNodes()) {
        // Index loop instead of childNodes.forEach: works for any array-like
        // child list, including NodeLists that do not implement forEach.
        var children = node.childNodes;

        for (var i = 0; i < children.length; i++) {
          processNode(children[i], collector);
        }
      }

      break;
  }
};
/**
 * Processes an element: notifies the collector that the element opens, walks
 * its children, then notifies the collector that the element closes.
 *
 * When the collector's opening call returns a truthy value the element is
 * treated as fully handled: its children are not visited and no closing
 * call is made.
 *
 * @param {Element} node - Element to process.
 * @param {Object} collector - Collector exposing `processElementNode(node, isOpening)`.
 */
var processElementNode = function processElementNode(node, collector) {
  var skipRest = collector.processElementNode(node, true);

  if (skipRest) {
    return;
  }

  if (node.hasChildNodes()) {
    // Index loop instead of childNodes.forEach: works for any array-like
    // child list, including NodeLists that do not implement forEach.
    var children = node.childNodes;

    for (var i = 0; i < children.length; i++) {
      processNode(children[i], collector);
    }
  }

  collector.processElementNode(node, false);
};
/**
 * Converts the DOM tree rooted at `parentNode` into its collected form.
 *
 * @param {Node} parentNode - Root node to convert.
 * @param {Object} [options] - Optional settings (second positional argument).
 * @param {boolean} [options.map] - When truthy a MapCollector is used and its
 *   result is returned; otherwise a StringCollector is used.
 * @returns {*} The collector's result, as returned by `walkDOM`.
 */
var degausser = function degausser(parentNode) {
  // Read from `arguments` so the function's declared arity stays at one.
  var options = arguments.length > 1 ? arguments[1] : undefined;
  // `!= null` lets an explicit null behave like an omitted options object.
  var wantsMap = options != null && options.map;
  // Build only the collector that will actually be used.
  var collector = wantsMap ? new MapCollector() : new StringCollector();
  return walkDOM(parentNode, collector);
};
exports.degausser = degausser; | ||
@@ -275,0 +721,0 @@ |
{ | ||
"name": "degausser", | ||
"version": "2.0.0", | ||
"version": "2.1.0", | ||
"description": "Transforms HTML to plain text by eliminating tags from a document.", | ||
@@ -32,11 +32,12 @@ "author": "FlowPub", | ||
"devDependencies": { | ||
"@babel/core": "^7.8.7", | ||
"@babel/preset-env": "^7.8.7", | ||
"@rollup/plugin-node-resolve": "^7.1.1", | ||
"glob": "^7.1.6", | ||
"jest": "^25.1.0", | ||
"prettier": "^1.19.1", | ||
"rollup": "^2.0.5", | ||
"rollup-plugin-babel": "^4.4.0" | ||
} | ||
"@babel/core": "7.9.6", | ||
"@babel/preset-env": "7.9.6", | ||
"@rollup/plugin-node-resolve": "7.1.3", | ||
"glob": "7.1.6", | ||
"jest": "26.0.1", | ||
"prettier": "2.0.5", | ||
"rollup": "2.7.1", | ||
"rollup-plugin-babel": "4.4.0" | ||
}, | ||
"dependencies": {} | ||
} |
@@ -1,196 +0,13 @@ | ||
import { | ||
blacklist, | ||
trimBeginAndEnd, | ||
collapseWhitespace, | ||
phrasingConstructs, | ||
} from './util.js' | ||
import { StringCollector } from './stringCollector' | ||
import { MapCollector } from './mapCollector' | ||
import { walkDOM } from './domWalker' | ||
export const degausser = parentNode => { | ||
// If there's no Node, just return | ||
if (!parentNode) { | ||
return null | ||
} | ||
export const degausser = (parentNode, options = {}) => { | ||
let collector = new StringCollector() | ||
// Tracking Entities | ||
const runs = [] | ||
let text = [] | ||
let haveEncounteredFirstCell = false | ||
let lastBreak = null | ||
const breakType = { | ||
NONE: 'none', | ||
SINGLE: 'single', | ||
DOUBLE: 'double', | ||
if (options.map) { | ||
collector = new MapCollector() | ||
} | ||
const addBreak = double => { | ||
if (lastBreak === null) { | ||
// The only time it should be null is at the beginning of document | ||
return | ||
} | ||
if (double) { | ||
lastBreak = breakType.DOUBLE | ||
} else if (lastBreak !== breakType.DOUBLE) { | ||
lastBreak = breakType.SINGLE | ||
} | ||
} | ||
const processBreaks = () => { | ||
if (!lastBreak) { | ||
return | ||
} | ||
switch (lastBreak) { | ||
case breakType.SINGLE: | ||
runs.push('\n') | ||
break | ||
case breakType.DOUBLE: | ||
runs.push('\n\n') | ||
break | ||
} | ||
lastBreak = breakType.NONE | ||
} | ||
const processText = () => { | ||
if (text.length === 0) { | ||
return | ||
} | ||
// Trim | ||
const trimmed = trimBeginAndEnd(text.join('')) | ||
if (!trimmed) { | ||
// Trimmed into an empty string | ||
// Preserve all preceding breaks | ||
text = [] | ||
return | ||
} | ||
if (lastBreak === null) { | ||
lastBreak = breakType.NONE | ||
} | ||
runs.push(trimBeginAndEnd(collapseWhitespace(trimmed))) | ||
text = [] | ||
} | ||
const processBlockConstruct = (tag, opening) => { | ||
if (phrasingConstructs.includes(tag)) { | ||
return | ||
} | ||
// Not a phrasing construct, therefore is Block | ||
if (tag === 'th' || tag === 'td') { | ||
// Special Block | ||
if (opening) { | ||
// I'm assuming the DOM will fix all table element malformations | ||
if (!haveEncounteredFirstCell) { | ||
haveEncounteredFirstCell = true | ||
} else { | ||
processBreaks() | ||
runs.push('\t') | ||
} | ||
} else { | ||
processText() | ||
} | ||
return | ||
} | ||
// Regular Blocks | ||
processText() | ||
if (tag === 'tr') { | ||
haveEncounteredFirstCell = false | ||
} | ||
if (tag === 'p') { | ||
addBreak(true) | ||
} | ||
addBreak(false) | ||
} | ||
const processTextNode = node => { | ||
const string = node.textContent.normalize() | ||
// Trim | ||
const trimmed = trimBeginAndEnd(string) | ||
if (trimmed) { | ||
processBreaks() | ||
} | ||
text.push(string) | ||
} | ||
const processElementNode = node => { | ||
const tag = node.tagName && node.tagName.toLowerCase() | ||
// Special case for Preformatted | ||
if (tag === 'pre') { | ||
processText() | ||
addBreak(false) | ||
processBreaks() | ||
runs.push(node.textContent) | ||
lastBreak = breakType.SINGLE | ||
return | ||
} | ||
processBlockConstruct(tag, true) | ||
if (node.hasChildNodes()) { | ||
node.childNodes.forEach(child => { | ||
processNode(child) | ||
}) | ||
} | ||
// Process other tags | ||
switch (tag) { | ||
case 'br': | ||
processText() | ||
processBreaks() | ||
runs.push('\n') | ||
break | ||
case 'wbr': | ||
processBreaks() | ||
text.push('\u200B') | ||
break | ||
} | ||
if (node.hasAttribute('alt')) { | ||
processBreaks() | ||
text.push(` ${node.getAttribute('alt')} `) | ||
} | ||
processBlockConstruct(tag, false) | ||
} | ||
const processNode = node => { | ||
switch (node.nodeType) { | ||
case Node.TEXT_NODE: | ||
processTextNode(node) | ||
break | ||
case Node.ELEMENT_NODE: | ||
if (blacklist.includes(node.tagName.toLowerCase())) { | ||
return | ||
} | ||
processElementNode(node) | ||
break | ||
case Node.DOCUMENT_NODE: | ||
case Node.DOCUMENT_FRAGMENT_NODE: | ||
if (node.hasChildNodes()) { | ||
node.childNodes.forEach(child => { | ||
processNode(child) | ||
}) | ||
} | ||
break | ||
} | ||
} | ||
processNode(parentNode) | ||
// Get any stragglers | ||
processText() | ||
return runs.join('') | ||
return walkDOM(parentNode, collector) | ||
} |
@@ -0,8 +1,21 @@ | ||
/**
 * Binds every method found on the receiver's prototype to the receiver, so
 * the methods keep their `this` when passed around detached.
 *
 * Must be invoked with an explicit receiver: `autoBind.call(instance)`.
 * Only own properties of the immediate prototype are considered, and the
 * `constructor` entry is left alone.
 *
 * Accessor properties are skipped by inspecting the property descriptor:
 * reading `this[prop]` for them would run the getter as a side effect, and
 * assigning to a getter-only accessor throws in strict mode.
 */
function autoBind() {
  const proto = Object.getPrototypeOf(this)
  for (const prop of Object.getOwnPropertyNames(proto)) {
    if (prop === 'constructor') continue

    const descriptor = Object.getOwnPropertyDescriptor(proto, prop)
    // Accessor (get/set) property: do not touch it.
    if (!descriptor || !('value' in descriptor)) continue

    if (typeof this[prop] !== 'function') continue
    this[prop] = this[prop].bind(this)
  }
}
// Char codes for \t, \n, \r, and the space character
const whitespaces = [9, 10, 13, 32] | ||
const isCharWhitespace = charCode => { | ||
const isCharWhitespace = (charCode) => { | ||
return whitespaces.includes(charCode) | ||
} | ||
const trimBeginAndEnd = string => { | ||
// Kinds of pending line break. Frozen because this object is exported and
// shared, so it must not be altered at runtime.
const BreakType = Object.freeze({
  NONE: 'none',
  SINGLE: 'single',
  DOUBLE: 'double',
})
const trimBeginAndEnd = (string) => { | ||
// Get the first and last non-whitespace character index | ||
@@ -38,3 +51,3 @@ let firstNonWhite = null, | ||
} | ||
const collapseWhitespace = string => { | ||
const collapseWhitespace = (string) => { | ||
// Collapse all other sequential whitespace into a single whitespace | ||
@@ -135,2 +148,9 @@ const textElements = [] | ||
export { blacklist, trimBeginAndEnd, collapseWhitespace, phrasingConstructs } | ||
export { | ||
autoBind, | ||
blacklist, | ||
BreakType, | ||
trimBeginAndEnd, | ||
collapseWhitespace, | ||
phrasingConstructs, | ||
} |
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
36094
9
1141