New Case Study: See how Anthropic automated 95% of dependency reviews with Socket. Learn More
Socket
Sign in | Demo | Install
Socket

tm-content-parser

Package Overview
Dependencies
Maintainers
11
Versions
21
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

tm-content-parser - npm Package Compare versions

Comparing version 1.0.20 to 1.0.23

59

index.js

@@ -15,4 +15,3 @@ "use strict";

/**
 * Return the 'IGNORE' marker for script elements.
 *
 * @param {object} content - wrapped element exposing `.is(selector)`
 * @returns {?string} 'IGNORE' when `content.is('script')` is truthy, otherwise null
 */
const parseScript = (content) => {
  if (content.is('script')) {
    return 'IGNORE';
  }
  return null;
};
/**
 * Hand <table> elements over to parseTable.
 *
 * @param {object} ele - wrapped element exposing `.is(selector)`
 * @param {string} markup - passed to parseTable unchanged
 * @returns {?object} parseTable(markup) when `ele.is("table")` is truthy, otherwise null
 */
const parseHTMLTable = (ele, markup) => {
  if (!ele.is("table")) {
    return null;
  }
  return parseTable(markup);
};
const parseBasicHTML = (ele, markup) => ele.is("ul,ol,h1,h2,h3,iframe") ? parseParagraph(markup) : null;
const parseBasicHTML = (ele, markup) => ele.is("ul,ol,table,h1,h2,h3,iframe") ? parseParagraph(markup) : null;
const parseEscenicImage = (ele, content) => {

@@ -163,3 +162,2 @@ if (ele.is('p')) {

return parseScript(ele) ||
parseHTMLTable(ele, markup) ||
parseBasicHTML(ele, markup)||

@@ -211,56 +209,2 @@ parseEscenicImage(ele, content) ||

// For every <tr>, build an array with one entry per <td>: the length of
// $(td).has("b"), i.e. 1 when the cell contains a <b> and 0 when it does not
// (a per-cell flag rather than a count of <b> tags).
const tableCountBolds = ($) => {
  const boldFlagForCell = (idx, td) => $(td).has("b").length;
  const flagsForRow = (tr) => $(tr).find("td").map(boldFlagForCell).toArray();
  return $("tr").toArray().map(flagsForRow);
};
// Return the fraction (0..1) of truthy cells in a 2-dimensional flag table.
// An empty table (no cells at all) yields 0 rather than NaN from 0/0.
const tableBoldPercent = (table) => {
  // One-level flatten via reduce/concat instead of Array.prototype.flat,
  // which is newer than the rest of the syntax used around this helper.
  const cells = table.reduce((acc, row) => acc.concat(row), []);
  if (cells.length === 0) {
    return 0;
  }
  const countBold = cells.filter(x => x).length;
  return countBold / cells.length;
};
// Build a 2-dimensional array of cell text: one inner array per <tr>,
// one string per <td> found inside that row.
const tableToArray = ($) => {
  const cellText = (idx, td) => $(td).text();
  return $("tr").toArray().map((tr) => {
    const cells = $(tr).find("td");
    return cells.map(cellText).toArray();
  });
};
// Collect the text of every <th> matched by $ into a plain array of strings.
const tableGetHeaders = ($) => {
  const headerCells = $("th").toArray();
  return headerCells.map((th) => $(th).text());
};
/**
 * Parse HTML table markup into a static "table" content object.
 *
 * Headers come from <th> cells when present. Otherwise the first row is
 * promoted to headers only if every cell in it contains a <b> while the
 * table as a whole is not entirely bold.
 *
 * @param {string} text - HTML markup loaded with cheerio
 * @returns {{type: string, static_content: {type: string, headers: Array, rows: Array}}}
 */
const parseTable = text => {
  const $ = cheerio.load(text, {decodeEntities: false});
  // Rows without any <td> come back as [] and are dropped.
  const rows = tableToArray($).filter(arr => arr.length);
  let headers = tableGetHeaders($);
  if (!headers.length) { // no <th>. check bold rules
    const boldTable = tableCountBolds($).filter(arr => arr.length);
    // Guard: with no <td> rows at all there is no first row to inspect;
    // reading boldTable[0].length here used to throw a TypeError.
    if (boldTable.length) {
      const firstRowAllBold = boldTable[0].every(x => x);
      const boldPercent = tableBoldPercent(boldTable);
      const tableAllBold = Math.abs(1.0 - boldPercent) < 0.001;
      // if first row is all bold AND table is not all with bold then use first row as header
      if (!tableAllBold && firstRowAllBold) {
        headers = rows.shift();
      }
    }
  }
  const static_content = {
    type: 'table',
    headers,
    rows
  };
  return {
    type: 'static',
    static_content
  };
};
const removeEmptyHtmlTags = markup => {

@@ -322,2 +266,3 @@ const processedMarkup = R.replace(/<(\w*)><\/\1>|<(\w*)>\s<\/\2>/g, '', markup);

R.replace(/<script async.*?\/>/g, ''),
R.replace(/<\/?div.*?>/g, ""),
parser,

@@ -324,0 +269,0 @@ R.map(R.partial(parseText, [content])),

6

package.json
{
"name": "tm-content-parser",
"version": "1.0.20",
"version": "1.0.23",
"description": "Trinity Mirror Content Type Parser",

@@ -31,3 +31,3 @@ "main": "index.js",

],
"dependencies": {
"dependencies":{
"cheerio": "0.22.0",

@@ -49,3 +49,3 @@ "co": "4.6.0",

},
"tmAppsNode": "6.10.3"
"tmAppsNode":"6.10.3"
}

@@ -37,6 +37,2 @@ # tm-content-parser

## Release Log
1.0.18 - use newer node version
1.0.17 - pulled AB-700 branch
1.0.15 - extract <th> from tables
1.0.15 - parse html tables, produce new type: table
1.0.11 - not parsing &amp; as this broke urls querystrings.
1.0.11 - not parsing &amp; as this broke urls querystrings.

@@ -142,116 +142,2 @@ "use strict";

// Tests for parseTable, which is obtained through contentTypeParser.__get__("parseTable").
// Each case feeds literal table markup and checks the returned
// { type, static_content: { type, headers, rows } } structure.
describe('test table parser', () => {
const parseTable = contentTypeParser.__get__("parseTable");
// No <th>, and the first row that has cells is not entirely bold: headers are
// expected to stay empty. The leading empty <tr> must not show up in rows.
it('should parse table without <th>', () => {
const table = `
<table>
<tr>
</tr>
<tr>
<td>Alfreds Futterkiste</td>
<td><b>Maria Anders</b></td>
<td>Germany</td>
</tr>
<tr>
<td>Centro comercial Moctezuma</td>
<td>Francisco Chang</td>
<td><b>Mexico</b></td>
</tr>
</table>`;
const parsed = parseTable(table);
parsed.type.should.eql("static");
parsed.static_content.type.should.eql("table");
parsed.static_content.rows.should.eql([
["Alfreds Futterkiste","Maria Anders","Germany"],
["Centro comercial Moctezuma","Francisco Chang","Mexico"]
]);
parsed.static_content.headers.should.eql([]);
});
// <th> cells are expected as headers; the <th>-only row contributes nothing to rows.
it('should extract table header from <th>', () => {
const table = `
<table>
<tr>
<th>Company</th>
<th>Contact</th>
<th>Country</th>
</tr>
<tr>
<td>Alfreds Futterkiste</td>
<td><b>Maria Anders</b></td>
<td>Germany</td>
</tr>
<tr>
<td>Centro comercial Moctezuma</td>
<td>Francisco Chang</td>
<td><b>Mexico</b></td>
</tr>
</table>`;
const parsed = parseTable(table);
parsed.type.should.eql("static");
parsed.static_content.type.should.eql("table");
parsed.static_content.rows.should.eql([
[ 'Alfreds Futterkiste', 'Maria Anders', 'Germany' ],
[ 'Centro comercial Moctezuma', 'Francisco Chang', 'Mexico' ]
]);
parsed.static_content.headers.should.eql([ 'Company', 'Contact', 'Country' ]);
});
// No <th>, first row entirely bold, other rows only partly bold: the first row
// is expected to become the headers and to be removed from rows.
it('should extract header from first row with all bold', () => {
const table = `
<table>
<tr>
<td><b>Company</b></td>
<td><b>Contact</b></td>
<td><b>Country</b></td>
</tr>
<tr>
<td>Alfreds Futterkiste</td>
<td><b>Maria Anders</b></td>
<td>Germany</td>
</tr>
<tr>
<td>Centro comercial Moctezuma</td>
<td>Francisco Chang</td>
<td><b>Mexico</b></td>
</tr>
</table>`;
const parsed = parseTable(table);
parsed.type.should.eql("static");
parsed.static_content.type.should.eql("table");
parsed.static_content.rows.should.eql([
[ 'Alfreds Futterkiste', 'Maria Anders', 'Germany' ],
[ 'Centro comercial Moctezuma', 'Francisco Chang', 'Mexico' ]
]);
parsed.static_content.headers.should.eql([ 'Company', 'Contact', 'Country' ]);
});
// Every cell is bold, so the first row must NOT be promoted: both rows stay in
// rows and headers stays empty.
// NOTE(review): the test title contains the typo "hedaers".
// NOTE(review): two cells in the second row open <b> without closing it;
// confirm whether the malformed markup is intentional.
it('should not add hedaers when all table elements are bold', () => {
const table = `
<table>
<tr>
<td><b>Company</b></td>
<td><b>Contact</b></td>
<td><b>Country</b></td>
</tr>
<tr>
<td><b>Alfreds Futterkiste</td>
<td><b>Maria Anders</b></td>
<td><b>Germany</td>
</tr>
</table>`;
const parsed = parseTable(table);
parsed.type.should.eql("static");
parsed.static_content.type.should.eql("table");
parsed.static_content.rows.should.eql([
[ 'Company', 'Contact', 'Country' ],
[ 'Alfreds Futterkiste', 'Maria Anders', 'Germany' ]
]
);
parsed.static_content.headers.should.eql([]);
});
});
describe('idAfterContent', () => {

@@ -258,0 +144,0 @@ const idAfterContent = contentTypeParser.__get__("idAfterContent");

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc