Comparing version 2.4.0 to 2.4.1
@@ -143,2 +143,6 @@ (function (global, factory) { | ||
/**
 * Report whether a character code is one of the two new line codes this module recognises.
 * @param {number} charCode code unit, e.g. from String.prototype.charCodeAt
 * @returns {boolean} true for 10 (line feed) or 13 (carriage return), false otherwise
 */
var isCharNewLine = function isCharNewLine(charCode) {
  switch (charCode) {
    case 10: // line feed
    case 13: // carriage return
      return true;
    default:
      return false;
  }
};
var BreakType = { | ||
@@ -149,3 +153,88 @@ NONE: 'none', | ||
}; | ||
/**
 * Drop leading whitespace (as classified by isCharWhitespace) from a string.
 * A string with no non-whitespace character, including the empty string, is returned unchanged.
 * @param {string} string text to trim
 * @returns {string} the text starting at its first non-whitespace character
 */
var trimBeginOnly = function trimBeginOnly(string) {
  var length = string.length;
  var start = 0;
  // Step over every leading whitespace character
  while (start < length && isCharWhitespace(string.charCodeAt(start))) {
    start++;
  }
  // Reached the end without meeting content: hand the input back untouched
  if (start === length) {
    return string;
  }
  return string.slice(start);
};
/**
 * Strip the trailing whitespace run from a string, but only when that run contains a new line.
 *
 * Scanning backwards from the end of the string:
 * - if a new line character (per isCharNewLine) is met before the last non-whitespace
 *   character, everything after that last non-whitespace character is removed, i.e. the
 *   new line(s) together with the other whitespace on either side of them;
 * - if the trailing whitespace contains no new line, the string is returned unchanged,
 *   so plain trailing spaces are preserved.
 *
 * NOTE(review): assumes isCharWhitespace reports new line characters as whitespace;
 * confirm against that helper.
 *
 * @param {string} string text to trim
 * @returns {string|null} the trimmed text, or null when the scan finds no
 * non-whitespace character (this includes the empty string)
 */
var trimEndNewLine = function trimEndNewLine(string) {
  // Index of the last non-whitespace character, recorded only if a new line follows it
  var lastNonNewLine = null;
  var foundNewLineCharacter = false;
  var foundNonWhiteSpaceCharacter = false;
  for (var index = string.length - 1; index >= 0; index--) {
    var charCode = string.charCodeAt(index);
    var isNewLine = isCharNewLine(charCode);
    if (isCharWhitespace(charCode)) {
      if (!isNewLine) {
        // Ordinary whitespace: keep scanning towards the start
        continue;
      }
      foundNewLineCharacter = true;
    } else {
      foundNonWhiteSpaceCharacter = true;
    }
    if (!isNewLine) {
      // First real content seen from the end; it is the cut point only if a new line followed it
      if (foundNewLineCharacter) {
        lastNonNewLine = index;
      }
      break;
    }
  }
  if (!foundNonWhiteSpaceCharacter) {
    return null;
  }
  // No new line in the trailing whitespace: nothing to trim
  if (lastNonNewLine === null) {
    return string;
  }
  // Index 0 is a valid cut point (e.g. "a\n"), so it must not be tested for truthiness
  return string.slice(0, lastNonNewLine + 1);
};
/**
 * Run a string through trimBeginOnly and then through trimEndNewLine.
 * @param {string} string text to trim
 * @returns {*} whatever trimEndNewLine returns for the start-trimmed text
 */
var trimAllExceptEndWhiteSpace = function trimAllExceptEndWhiteSpace(string) {
  var withoutLeadingWhitespace = trimBeginOnly(string);
  return trimEndNewLine(withoutLeadingWhitespace);
};
var trimBeginAndEnd = function trimBeginAndEnd(string) { | ||
@@ -208,2 +297,6 @@ // Get the first and last non-whitespace character index | ||
/**
 * Run a string through collapseWhitespace and then through trimBeginAndEnd.
 * @param {string} string text to normalise
 * @returns {*} whatever trimBeginAndEnd returns for the collapsed text
 */
var trimAndCollapseWhitespace = function trimAndCollapseWhitespace(string) {
  var collapsed = collapseWhitespace(string);
  return trimBeginAndEnd(collapsed);
};
var blacklist = ['base', 'command', 'link', 'meta', 'noscript', 'script', 'style', 'title', // special cases | ||
@@ -374,3 +467,21 @@ // "html", | ||
case BreakType.DOUBLE: | ||
this.runs.push('\n\n'); | ||
var paragraphBreakAdded = false; // iterate through runs backwards: | ||
for (var i = this.runs.length - 1; i >= 0; i--) { | ||
var run = this.runs[i]; | ||
if (run === '\n\n') { | ||
// found double break | ||
paragraphBreakAdded = true; | ||
break; | ||
} else if (run !== '\n') { | ||
// found text content | ||
break; | ||
} | ||
} | ||
if (!paragraphBreakAdded) { | ||
this.runs.push('\n\n'); | ||
} | ||
break; | ||
@@ -382,4 +493,4 @@ } | ||
}, { | ||
key: "processText", | ||
value: function processText() { | ||
key: "processTextAndTrim", | ||
value: function processTextAndTrim(trimmingFunction) { | ||
if (this.text.length === 0) { | ||
@@ -390,7 +501,7 @@ return; | ||
var trimmed = trimBeginAndEnd(this.text.join('')); | ||
var trimmed = trimmingFunction(this.text.join('')); | ||
if (!trimmed) { | ||
// Trimmed into an empty string | ||
// Preserve all preceding breaks | ||
// Preserve all preceding breaks | ||
this.text = []; | ||
@@ -404,6 +515,17 @@ return; | ||
this.runs.push(trimBeginAndEnd(collapseWhitespace(trimmed))); | ||
this.runs.push(trimmingFunction(trimmed)); | ||
this.text = []; | ||
} | ||
}, { | ||
key: "processText", | ||
value: function processText() { | ||
var trimEndSpaces = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : true; | ||
if (trimEndSpaces) { | ||
this.processTextAndTrim(trimAndCollapseWhitespace); | ||
} else { | ||
this.processTextAndTrim(trimAllExceptEndWhiteSpace); | ||
} | ||
} | ||
}, { | ||
key: "processElementNode", | ||
@@ -429,3 +551,3 @@ value: function processElementNode(node, isOpening) { | ||
case 'br': | ||
this.processText(); | ||
this.processText(false); | ||
this.processBreaks(); | ||
@@ -562,6 +684,22 @@ this.runs.push('\n'); | ||
case BreakType.DOUBLE: | ||
this.map.push({ | ||
type: MapType.BREAK, | ||
"double": true | ||
}); | ||
var paragraphBreakAdded = false; // iterate through map backwards: | ||
for (var i = this.map.length - 1; i >= 0; --i) { | ||
var map = this.map[i]; | ||
if (map.type === MapType.BREAK && map["double"]) { | ||
paragraphBreakAdded = true; | ||
break; | ||
} else if (!this.isSingleBreak(map)) { | ||
break; | ||
} | ||
} | ||
if (!paragraphBreakAdded) { | ||
this.map.push({ | ||
type: MapType.BREAK, | ||
"double": true | ||
}); | ||
} | ||
break; | ||
@@ -573,4 +711,11 @@ } | ||
}, { | ||
key: "processText", | ||
value: function processText() { | ||
key: "isSingleBreak", | ||
value: function isSingleBreak(mapObject) { | ||
var isSingleBreak = mapObject.type === MapType.BREAK && !mapObject["double"]; | ||
var isNewLine = mapObject.type === MapType.TEXT && mapObject.content === '\n'; | ||
return isSingleBreak || isNewLine; | ||
} | ||
}, { | ||
key: "processTextAndTrim", | ||
value: function processTextAndTrim(trimmingFunction) { | ||
var _this$map; | ||
@@ -586,7 +731,7 @@ | ||
var trimmed = trimBeginAndEnd(joinedText); | ||
var trimmed = trimmingFunction(joinedText); | ||
if (!trimmed) { | ||
// Trimmed into an empty string | ||
// Preserve all preceding breaks | ||
// Preserve all preceding breaks | ||
this.text = []; | ||
@@ -596,3 +741,3 @@ return; | ||
var fullText = trimBeginAndEnd(collapseWhitespace(trimmed)); | ||
var fullText = trimmingFunction(trimmed); | ||
var blockMap = []; | ||
@@ -607,3 +752,3 @@ var currentIndexOfString = 0; | ||
var textMap = _step.value; | ||
var shrunkText = trimBeginAndEnd(collapseWhitespace(textMap.string)); | ||
var shrunkText = trimmingFunction(textMap.string); | ||
@@ -652,2 +797,13 @@ if (!shrunkText) { | ||
}, { | ||
key: "processText", | ||
value: function processText() { | ||
var trimEndSpaces = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : true; | ||
if (trimEndSpaces) { | ||
this.processTextAndTrim(trimAndCollapseWhitespace); | ||
} else { | ||
this.processTextAndTrim(trimAllExceptEndWhiteSpace); | ||
} | ||
} | ||
}, { | ||
key: "processElementNode", | ||
@@ -678,3 +834,3 @@ value: function processElementNode(node, isOpening) { | ||
case 'br': | ||
this.processText(); | ||
this.processText(false); | ||
this.processBreaks(); | ||
@@ -681,0 +837,0 @@ this.map.push({ |
{ | ||
"name": "degausser", | ||
"version": "2.4.0", | ||
"version": "2.4.1", | ||
"description": "Transforms HTML to plain text by eliminating tags from a document.", | ||
@@ -5,0 +5,0 @@ "author": "FlowPub", |
@@ -5,3 +5,2 @@ import { | ||
trimBeginAndEnd, | ||
collapseWhitespace, | ||
isCharWhitespace, | ||
@@ -12,2 +11,4 @@ phrasingConstructs, | ||
elementCanHaveAltText, | ||
trimAllExceptEndWhiteSpace, | ||
trimAndCollapseWhitespace, | ||
} from './util' | ||
@@ -59,6 +60,19 @@ | ||
case BreakType.DOUBLE: | ||
this.map.push({ | ||
type: MapType.BREAK, | ||
double: true, | ||
}) | ||
let paragraphBreakAdded = false | ||
// iterate through map backwards: | ||
for (let i = this.map.length - 1; i >= 0; --i) { | ||
const map = this.map[i] | ||
if (map.type === MapType.BREAK && map.double) { | ||
paragraphBreakAdded = true | ||
break | ||
} else if (!this.isSingleBreak(map)) { | ||
break | ||
} | ||
} | ||
if (!paragraphBreakAdded) { | ||
this.map.push({ | ||
type: MapType.BREAK, | ||
double: true, | ||
}) | ||
} | ||
break | ||
@@ -70,3 +84,9 @@ } | ||
processText() { | ||
isSingleBreak(mapObject) { | ||
const isSingleBreak = mapObject.type === MapType.BREAK && !mapObject.double | ||
const isNewLine = mapObject.type === MapType.TEXT && mapObject.content === '\n' | ||
return isSingleBreak || isNewLine | ||
} | ||
processTextAndTrim(trimmingFunction) { | ||
if (this.text.length === 0) { | ||
@@ -78,7 +98,6 @@ return | ||
// TODO: might have to check for null string here | ||
const trimmed = trimBeginAndEnd(joinedText) | ||
const trimmed = trimmingFunction(joinedText) | ||
if (!trimmed) { | ||
// Trimmed into an empty string | ||
// Preserve all preceding breaks | ||
// Preserve all preceding breaks | ||
this.text = [] | ||
@@ -88,3 +107,3 @@ return | ||
let fullText = trimBeginAndEnd(collapseWhitespace(trimmed)) | ||
let fullText = trimmingFunction(trimmed) | ||
@@ -95,3 +114,3 @@ let blockMap = [] | ||
for (const textMap of this.text) { | ||
const shrunkText = trimBeginAndEnd(collapseWhitespace(textMap.string)) | ||
const shrunkText = trimmingFunction(textMap.string) | ||
if (!shrunkText) { | ||
@@ -140,2 +159,10 @@ continue | ||
processText(trimEndSpaces = true) { | ||
if (trimEndSpaces) { | ||
this.processTextAndTrim(trimAndCollapseWhitespace) | ||
} else { | ||
this.processTextAndTrim(trimAllExceptEndWhiteSpace) | ||
} | ||
} | ||
processElementNode(node, isOpening) { | ||
@@ -176,3 +203,3 @@ if ( | ||
case 'br': | ||
this.processText() | ||
this.processText(false) | ||
this.processBreaks() | ||
@@ -179,0 +206,0 @@ |
@@ -5,3 +5,2 @@ import { | ||
trimBeginAndEnd, | ||
collapseWhitespace, | ||
phrasingConstructs, | ||
@@ -11,2 +10,4 @@ isElementBlacklisted, | ||
elementCanHaveAltText, | ||
trimAndCollapseWhitespace, | ||
trimAllExceptEndWhiteSpace, | ||
} from './util' | ||
@@ -49,3 +50,18 @@ | ||
case BreakType.DOUBLE: | ||
this.runs.push('\n\n') | ||
let paragraphBreakAdded = false | ||
// iterate through runs backwards: | ||
for (let i = this.runs.length - 1; i >= 0; i--) { | ||
const run = this.runs[i] | ||
if (run === '\n\n') { | ||
// found double break | ||
paragraphBreakAdded = true | ||
break | ||
} else if (run !== '\n') { | ||
// found text content | ||
break | ||
} | ||
} | ||
if (!paragraphBreakAdded) { | ||
this.runs.push('\n\n') | ||
} | ||
break | ||
@@ -57,3 +73,3 @@ } | ||
processText() { | ||
processTextAndTrim(trimmingFunction) { | ||
if (this.text.length === 0) { | ||
@@ -64,6 +80,6 @@ return | ||
// Trim | ||
const trimmed = trimBeginAndEnd(this.text.join('')) | ||
const trimmed = trimmingFunction(this.text.join('')) | ||
if (!trimmed) { | ||
// Trimmed into an empty string | ||
// Preserve all preceding breaks | ||
// Preserve all preceding breaks | ||
this.text = [] | ||
@@ -77,6 +93,14 @@ return | ||
this.runs.push(trimBeginAndEnd(collapseWhitespace(trimmed))) | ||
this.runs.push(trimmingFunction(trimmed)) | ||
this.text = [] | ||
} | ||
processText(trimEndSpaces = true) { | ||
if (trimEndSpaces) { | ||
this.processTextAndTrim(trimAndCollapseWhitespace) | ||
} else { | ||
this.processTextAndTrim(trimAllExceptEndWhiteSpace) | ||
} | ||
} | ||
processElementNode(node, isOpening) { | ||
@@ -111,3 +135,3 @@ if ( | ||
case 'br': | ||
this.processText() | ||
this.processText(false) | ||
this.processBreaks() | ||
@@ -114,0 +138,0 @@ this.runs.push('\n') |
@@ -14,2 +14,6 @@ function autoBind() { | ||
/**
 * Report whether a character code is one of the two new line codes this module recognises.
 * @param {number} charCode code unit, e.g. from String.prototype.charCodeAt
 * @returns {boolean} true for 10 (line feed) or 13 (carriage return), false otherwise
 */
const isCharNewLine = (charCode) => charCode === 10 || charCode === 13
const BreakType = { | ||
@@ -21,2 +25,82 @@ NONE: 'none', | ||
/**
 * Drop leading whitespace (as classified by isCharWhitespace) from a string.
 * A string with no non-whitespace character, including the empty string, is returned unchanged.
 * @param {string} string text to trim
 * @returns {string} the text starting at its first non-whitespace character
 */
const trimBeginOnly = (string) => {
  for (let index = 0; index < string.length; index++) {
    // The first non-whitespace character marks where the kept text begins
    if (!isCharWhitespace(string.charCodeAt(index))) {
      return string.slice(index)
    }
  }
  // Nothing but whitespace (or empty): hand the input back untouched
  return string
}
/**
 * Strip the trailing whitespace run from a string, but only when that run contains a new line.
 *
 * Scanning backwards from the end of the string:
 * - if a new line character (per isCharNewLine) is met before the last non-whitespace
 *   character, everything after that last non-whitespace character is removed, i.e. the
 *   new line(s) together with the other whitespace on either side of them;
 * - if the trailing whitespace contains no new line, the string is returned unchanged,
 *   so plain trailing spaces are preserved.
 *
 * NOTE(review): assumes isCharWhitespace reports new line characters as whitespace;
 * confirm against that helper.
 *
 * @param {string} string text to trim
 * @returns {string|null} the trimmed text, or null when the scan finds no
 * non-whitespace character (this includes the empty string)
 */
const trimEndNewLine = (string) => {
  // Index of the last non-whitespace character, recorded only if a new line follows it
  let lastNonNewLine = null
  let foundNewLineCharacter = false
  let foundNonWhiteSpaceCharacter = false

  for (let index = string.length - 1; index >= 0; index--) {
    const charCode = string.charCodeAt(index)
    const isNewLine = isCharNewLine(charCode)

    if (isCharWhitespace(charCode)) {
      if (!isNewLine) {
        // Ordinary whitespace: keep scanning towards the start
        continue
      }
      foundNewLineCharacter = true
    } else {
      foundNonWhiteSpaceCharacter = true
    }

    if (!isNewLine) {
      // First real content seen from the end; it is the cut point only if a new line followed it
      if (foundNewLineCharacter) {
        lastNonNewLine = index
      }
      break
    }
  }

  if (!foundNonWhiteSpaceCharacter) {
    return null
  }

  // No new line in the trailing whitespace: nothing to trim
  if (lastNonNewLine === null) {
    return string
  }

  // Index 0 is a valid cut point (e.g. "a\n"), so it must not be tested for truthiness
  return string.slice(0, lastNonNewLine + 1)
}
/**
 * Run a string through trimBeginOnly and then through trimEndNewLine.
 * @param {string} string text to trim
 * @returns {*} whatever trimEndNewLine returns for the start-trimmed text
 */
const trimAllExceptEndWhiteSpace = (string) => {
  const withoutLeadingWhitespace = trimBeginOnly(string)
  return trimEndNewLine(withoutLeadingWhitespace)
}
const trimBeginAndEnd = (string) => { | ||
@@ -53,2 +137,3 @@ // Get the first and last non-whitespace character index | ||
} | ||
const collapseWhitespace = (string) => { | ||
@@ -84,2 +169,6 @@ // Collapse all other sequential whitespace into a single whitespace | ||
/**
 * Run a string through collapseWhitespace and then through trimBeginAndEnd.
 * @param {string} string text to normalise
 * @returns {*} whatever trimBeginAndEnd returns for the collapsed text
 */
const trimAndCollapseWhitespace = (string) => {
  const collapsed = collapseWhitespace(string)
  return trimBeginAndEnd(collapsed)
}
const blacklist = [ | ||
@@ -257,3 +346,7 @@ 'base', | ||
BreakType, | ||
trimBeginOnly, | ||
trimEndNewLine, | ||
trimBeginAndEnd, | ||
trimAllExceptEndWhiteSpace, | ||
trimAndCollapseWhitespace, | ||
collapseWhitespace, | ||
@@ -260,0 +353,0 @@ phrasingConstructs, |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
62239
1884