tm-content-parser
Advanced tools
Comparing version 1.0.20 to 1.0.23
59
index.js
@@ -15,4 +15,3 @@ "use strict"; | ||
// Element classifiers. Each receives an object exposing `.is(selector)` and returns
// a result for elements it recognises, or null so the next classifier can be tried.
// parseScript: yields the sentinel string 'IGNORE' for <script> elements, otherwise null.
const parseScript = content => content.is('script') ? 'IGNORE' : null; | ||
// parseHTMLTable: hands the markup to parseTable when the element is a <table>, otherwise null.
const parseHTMLTable = (ele, markup) => ele.is("table") ? parseTable(markup) : null; | ||
// parseBasicHTML: hands the markup to parseParagraph when the element matches the selector list, otherwise null.
// NOTE(review): the declaration appears twice back to back; these look like the two sides of a diff hunk.
// The first variant omits `table` from the selector list, the second includes it - confirm which one applies.
const parseBasicHTML = (ele, markup) => ele.is("ul,ol,h1,h2,h3,iframe") ? parseParagraph(markup) : null; | ||
const parseBasicHTML = (ele, markup) => ele.is("ul,ol,table,h1,h2,h3,iframe") ? parseParagraph(markup) : null; | ||
const parseEscenicImage = (ele, content) => { | ||
@@ -163,3 +162,2 @@ if (ele.is('p')) { | ||
return parseScript(ele) || | ||
parseHTMLTable(ele, markup) || | ||
parseBasicHTML(ele, markup)|| | ||
@@ -211,56 +209,2 @@ parseEscenicImage(ele, content) || | ||
// Build a grid with one inner array per <tr> and one number per <td> found in that row.
// Each number is the size of `$(td).has("b")`: 1 when the cell contains a <b>, 0 when it does not.
const tableCountBolds = ($) => {
  const boldFlagsForRow = (row) => {
    const cells = $(row).find("td");
    return cells.map((position, cell) => $(cell).has("b").length).toArray();
  };
  const rowElements = $("tr").toArray();
  return rowElements.map((row) => boldFlagsForRow(row));
};
// Return the fraction (0..1) of cells in a 2-D grid whose value is truthy.
//   table   - array of rows, each row an array of per-cell flags/counts
//   returns - truthy cells / total cells, or 0 when the grid holds no cells
const tableBoldPercent = (table) => {
  // Flatten one level by hand: Array.prototype.flat only exists on Node 11+.
  const flat = table.reduce((cells, row) => cells.concat(row), []);
  if (flat.length === 0) {
    // An empty grid would otherwise evaluate 0/0 and leak NaN to the caller.
    return 0;
  }
  const countBold = flat.filter(x => x).length;
  return countBold / flat.length;
};
// Convert the loaded markup into a 2-D array of cell text:
// one inner array per <tr>, one string per <td> found inside that row.
const tableToArray = ($) => {
  const rowElements = $("tr").toArray();
  return rowElements.map((rowElement) =>
    $(rowElement)
      .find("td")
      .map((position, cell) => $(cell).text())
      .toArray()
  );
};
// Collect the text of every <th> element in the loaded markup into a flat array.
const tableGetHeaders = ($) => {
  const headerCells = $("th").toArray();
  return headerCells.map((headerCell) => $(headerCell).text());
};
// Parse HTML table markup into a `static` content item of type `table`.
//   text    - markup containing the table
//   returns - { type: 'static', static_content: { type: 'table', headers, rows } }
// Headers come from <th> cells when any exist. Otherwise the first data row is promoted
// to headers when every cell in it is bold and the table as a whole is not entirely bold.
const parseTable = text => {
  const $ = cheerio.load(text, {decodeEntities: false});
  const rows = tableToArray($).filter(arr => arr.length); // remove empty inner arrays []
  let headers = tableGetHeaders($);
  if (!headers.length) { // no <th>. check bold rules
    const boldTable = tableCountBolds($).filter(arr => arr.length);
    // A table with no <td> rows leaves boldTable empty; fall back to an empty first row
    // so that reading boldTable[0].length cannot throw.
    const firstRow = boldTable.length ? boldTable[0] : [];
    const firstRowAllBold = firstRow.length > 0 && firstRow.filter(x => x).length === firstRow.length;
    const boldPercent = tableBoldPercent(boldTable);
    const tableAllBold = Math.abs(1.0 - boldPercent) < 0.001;
    // if first row is all bold AND table is not all with bold then use first row as header
    if (!tableAllBold && firstRowAllBold) {
      headers = rows.shift();
    }
  }
  const static_content = {
    type: 'table',
    headers,
    rows
  };
  return {
    type: 'static',
    static_content
  };
};
const removeEmptyHtmlTags = markup => { | ||
@@ -322,2 +266,3 @@ const processedMarkup = R.replace(/<(\w*)><\/\1>|<(\w*)>\s<\/\2>/g, '', markup); | ||
R.replace(/<script async.*?\/>/g, ''), | ||
R.replace(/<\/?div.*?>/g, ""), | ||
parser, | ||
@@ -324,0 +269,0 @@ R.map(R.partial(parseText, [content])), |
{ | ||
"name": "tm-content-parser", | ||
"version": "1.0.20", | ||
"version": "1.0.23", | ||
"description": "Trinity Mirror Content Type Parser", | ||
@@ -31,3 +31,3 @@ "main": "index.js", | ||
], | ||
"dependencies": { | ||
"dependencies":{ | ||
"cheerio": "0.22.0", | ||
@@ -49,3 +49,3 @@ "co": "4.6.0", | ||
}, | ||
"tmAppsNode": "6.10.3" | ||
"tmAppsNode":"6.10.3" | ||
} |
@@ -37,6 +37,2 @@ # tm-content-parser | ||
## Release Log | ||
1.0.18 - use newer node version | ||
1.0.17 - pulled AB-700 branch | ||
1.0.15 - extract <th> from tables | ||
1.0.15 - parse html tables, produce new type: table | ||
1.0.11 - no longer parsing & because this broke URL query strings. | ||
1.0.11 - no longer parsing & because this broke URL query strings. |
@@ -142,116 +142,2 @@ "use strict"; | ||
// Unit tests for the private parseTable helper, reached through the module's __get__ accessor.
// They cover the three header rules: explicit <th> cells, an all-bold first row promoted to
// headers, and no promotion when every cell in the table is bold.
describe('test table parser', () => {
  const parseTable = contentTypeParser.__get__("parseTable");

  // No <th> and no all-bold first row: headers stay empty and the empty <tr> is dropped.
  it('should parse table without <th>', () => {
    const table = `
      <table>
        <tr>
        </tr>
        <tr>
          <td>Alfreds Futterkiste</td>
          <td><b>Maria Anders</b></td>
          <td>Germany</td>
        </tr>
        <tr>
          <td>Centro comercial Moctezuma</td>
          <td>Francisco Chang</td>
          <td><b>Mexico</b></td>
        </tr>
      </table>`;
    const parsed = parseTable(table);
    parsed.type.should.eql("static");
    parsed.static_content.type.should.eql("table");
    parsed.static_content.rows.should.eql([
      ["Alfreds Futterkiste", "Maria Anders", "Germany"],
      ["Centro comercial Moctezuma", "Francisco Chang", "Mexico"]
    ]);
    parsed.static_content.headers.should.eql([]);
  });

  // Explicit <th> cells become the headers.
  it('should extract table header from <th>', () => {
    const table = `
      <table>
        <tr>
          <th>Company</th>
          <th>Contact</th>
          <th>Country</th>
        </tr>
        <tr>
          <td>Alfreds Futterkiste</td>
          <td><b>Maria Anders</b></td>
          <td>Germany</td>
        </tr>
        <tr>
          <td>Centro comercial Moctezuma</td>
          <td>Francisco Chang</td>
          <td><b>Mexico</b></td>
        </tr>
      </table>`;
    const parsed = parseTable(table);
    parsed.type.should.eql("static");
    parsed.static_content.type.should.eql("table");
    parsed.static_content.rows.should.eql([
      ['Alfreds Futterkiste', 'Maria Anders', 'Germany'],
      ['Centro comercial Moctezuma', 'Francisco Chang', 'Mexico']
    ]);
    parsed.static_content.headers.should.eql(['Company', 'Contact', 'Country']);
  });

  // Without <th>, a first row whose cells are all bold is promoted to headers.
  it('should extract header from first row with all bold', () => {
    const table = `
      <table>
        <tr>
          <td><b>Company</b></td>
          <td><b>Contact</b></td>
          <td><b>Country</b></td>
        </tr>
        <tr>
          <td>Alfreds Futterkiste</td>
          <td><b>Maria Anders</b></td>
          <td>Germany</td>
        </tr>
        <tr>
          <td>Centro comercial Moctezuma</td>
          <td>Francisco Chang</td>
          <td><b>Mexico</b></td>
        </tr>
      </table>`;
    const parsed = parseTable(table);
    parsed.type.should.eql("static");
    parsed.static_content.type.should.eql("table");
    parsed.static_content.rows.should.eql([
      ['Alfreds Futterkiste', 'Maria Anders', 'Germany'],
      ['Centro comercial Moctezuma', 'Francisco Chang', 'Mexico']
    ]);
    parsed.static_content.headers.should.eql(['Company', 'Contact', 'Country']);
  });

  // When every cell is bold, bold carries no header meaning, so nothing is promoted.
  it('should not add headers when all table elements are bold', () => {
    const table = `
      <table>
        <tr>
          <td><b>Company</b></td>
          <td><b>Contact</b></td>
          <td><b>Country</b></td>
        </tr>
        <tr>
          <td><b>Alfreds Futterkiste</b></td>
          <td><b>Maria Anders</b></td>
          <td><b>Germany</b></td>
        </tr>
      </table>`;
    const parsed = parseTable(table);
    parsed.type.should.eql("static");
    parsed.static_content.type.should.eql("table");
    parsed.static_content.rows.should.eql([
      ['Company', 'Contact', 'Country'],
      ['Alfreds Futterkiste', 'Maria Anders', 'Germany']
    ]);
    parsed.static_content.headers.should.eql([]);
  });
});
describe('idAfterContent', () => { | ||
@@ -258,0 +144,0 @@ const idAfterContent = contentTypeParser.__get__("idAfterContent"); |
Sorry, the diff of this file is not supported yet
New author
Supply chain risk: A new npm collaborator published a version of the package for the first time. New collaborators are usually benign additions to a project, but they do indicate a change to the security surface area of a package.
Found 1 instance in 1 package
69282
11
1367
37
1