Comparing version 0.11.0 to 1.0.0
// Generated by CoffeeScript 2.0.0-beta7 | ||
void function () { | ||
var _, addSiblings, biggestTitleChunk, formatter, getObjectTag, getScore, getSiblingsContent, getSiblingsScore, getVideoAttrs, isBoostable, isHighlinkDensity, isNodescoreThresholdMet, isTableAndNoParaExist, postCleanup, stopwords, updateNodeCount, updateScore; | ||
var _, addSiblings, biggestTitleChunk, cleanText, cleanTitle, formatter, getObjectTag, getScore, getSiblingsContent, getSiblingsScore, getVideoAttrs, isBoostable, isHighlinkDensity, isNodescoreThresholdMet, isTableAndNoParaExist, postCleanup, rawTitle, stopwords, updateNodeCount, updateScore; | ||
_ = require('lodash'); | ||
@@ -8,15 +8,46 @@ stopwords = require('./stopwords'); | ||
module.exports = { | ||
date: function (doc) { | ||
var cache$, cache$1, cache$2, cache$3, cache$4, dateCandidates; | ||
dateCandidates = doc("meta[property='article:published_time'], meta[itemprop*='datePublished'], meta[name='dcterms.modified'], meta[name='dcterms.date'], meta[name='DC.date.issued'], meta[name='dc.date.issued'], meta[name='dc.date.modified'], meta[name='dc.date.created'], meta[name='DC.date'], meta[name='DC.Date'], meta[name='dc.date'], meta[name='date'], time[itemprop*='pubDate'], time[itemprop*='pubdate'], span[itemprop*='datePublished'], span[property*='datePublished'], p[itemprop*='datePublished'], p[property*='datePublished'], div[itemprop*='datePublished'], div[property*='datePublished'], li[itemprop*='datePublished'], li[property*='datePublished'], time, span[class*='date'], p[class*='date'], div[class*='date']"); | ||
return (null != dateCandidates && null != (cache$ = dateCandidates.first()) && null != (cache$1 = cache$.attr('content')) ? cache$1.trim() : void 0) || (null != dateCandidates && null != (cache$2 = dateCandidates.first()) && null != (cache$3 = cache$2.attr('datetime')) ? cache$3.trim() : void 0) || cleanText(null != dateCandidates && null != (cache$4 = dateCandidates.first()) ? cache$4.text() : void 0) || null; | ||
}, | ||
copyright: function (doc) { | ||
var cache$, copyright, copyrightCandidates, text; | ||
copyrightCandidates = doc("p[class*='copyright'], div[class*='copyright'], span[class*='copyright'], li[class*='copyright'], p[id*='copyright'], div[id*='copyright'], span[id*='copyright'], li[id*='copyright']"); | ||
text = null != copyrightCandidates && null != (cache$ = copyrightCandidates.first()) ? cache$.text() : void 0; | ||
if (!text) { | ||
text = doc('body').text().replace(/\s*[\r\n]+\s*/g, '. '); | ||
if (!(text.indexOf('\xa9') > 0)) | ||
return null; | ||
} | ||
copyright = text.replace(/.*?©(\s*copyright)?([^,;:.|\r\n]+).*/gi, '$2').trim(); | ||
return cleanText(copyright); | ||
}, | ||
author: function (doc) { | ||
var authorCandidates, authorList, cache$, cache$1, cache$2, cache$3, cache$4, cache$5, fallbackAuthor; | ||
authorCandidates = doc("meta[property='article:author'], meta[property='og:article:author'], meta[name='author'], meta[name='dcterms.creator'], meta[name='DC.creator'], meta[name='DC.Creator'], meta[name='dc.creator'], meta[name='creator']"); | ||
authorList = []; | ||
authorCandidates.each(function () { | ||
var author, cache$, cache$1; | ||
author = null != (cache$ = doc(this)) && null != (cache$1 = cache$.attr('content')) ? cache$1.trim() : void 0; | ||
if (author) | ||
return authorList.push(author); | ||
}); | ||
if (authorList.length === 0) { | ||
fallbackAuthor = (null != (cache$ = doc("span[class*='author']").first()) ? cache$.text() : void 0) || (null != (cache$1 = doc("p[class*='author']").first()) ? cache$1.text() : void 0) || (null != (cache$2 = doc("div[class*='author']").first()) ? cache$2.text() : void 0) || (null != (cache$3 = doc("span[class*='byline']").first()) ? cache$3.text() : void 0) || (null != (cache$4 = doc("p[class*='byline']").first()) ? cache$4.text() : void 0) || (null != (cache$5 = doc("div[class*='byline']").first()) ? cache$5.text() : void 0); | ||
if (fallbackAuthor) | ||
authorList.push(cleanText(fallbackAuthor)); | ||
} | ||
return authorList; | ||
}, | ||
publisher: function (doc) { | ||
var cache$, cache$1, publisherCandidates; | ||
publisherCandidates = doc("meta[property='og:site_name'], meta[name='dc.publisher'], meta[name='DC.publisher'], meta[name='DC.Publisher']"); | ||
if (null != publisherCandidates && null != (cache$ = publisherCandidates.first()) && null != (cache$1 = cache$.attr('content'))) | ||
return cache$1.trim(); | ||
}, | ||
title: function (doc) { | ||
var titleElement, titleText, usedDelimeter; | ||
titleElement = doc("meta[property='og:title']"); | ||
if (titleElement) | ||
titleText = titleElement.attr('content'); | ||
if (!titleText) { | ||
titleElement = doc('title').first(); | ||
titleText = titleElement.text(); | ||
} | ||
if (!titleElement) | ||
return null; | ||
usedDelimeter = false; | ||
_.each([ | ||
var titleText; | ||
titleText = rawTitle(doc); | ||
return cleanTitle(titleText, [ | ||
'|', | ||
@@ -26,10 +57,13 @@ ' - ', | ||
':' | ||
], function (c) { | ||
if (titleText.indexOf(c) >= 0 && !usedDelimeter) { | ||
titleText = biggestTitleChunk(titleText, c); | ||
return usedDelimeter = true; | ||
} | ||
}); | ||
return titleText.replace(/�/g, '').trim(); | ||
]); | ||
}, | ||
softTitle: function (doc) { | ||
var titleText; | ||
titleText = rawTitle(doc); | ||
return cleanTitle(titleText, [ | ||
'|', | ||
' - ', | ||
'\xbb' | ||
]); | ||
}, | ||
text: function (doc, topNode, lang) { | ||
@@ -419,2 +453,35 @@ if (topNode) { | ||
}; | ||
// Normalise a scraped string for output.
// text: the raw string; null/undefined is treated as empty.
// Returns the text with CR/LF/TAB turned into spaces, single-line HTML
// comments and U+FFFD replacement characters removed, runs of whitespace
// collapsed to one space, and leading/trailing whitespace trimmed.
cleanText = function (text) {
  if (text == null) {
    return '';
  }
  return String(text)
    // Line breaks first: '.' in the comment pattern does not match newlines,
    // so multi-line comments only match once they are on one line.
    .replace(/[\r\n\t]/g, ' ')
    .replace(/<!--.+?-->/g, '')
    .replace(/�/g, '')
    // Collapse last, so gaps left by the removals above are closed as well.
    .replace(/\s\s+/g, ' ')
    .trim();
};
// Reduce a raw title to its main part and clean it.
// title: raw title string (a falsy value is treated as '').
// delimiters: separators to try, in priority order.
// The title is split only on the first listed delimiter it contains, and
// biggestTitleChunk chooses the part to keep. The result goes through
// cleanText.
cleanTitle = function (title, delimiters) {
  var result;
  result = title || '';
  _.each(delimiters, function (delimiter) {
    if (result.indexOf(delimiter) < 0) {
      // Not present: keep looking at the next delimiter.
      return;
    }
    result = biggestTitleChunk(result, delimiter);
    // An explicit false stops lodash's iteration, so only one delimiter is
    // ever applied.
    return false;
  });
  return cleanText(result);
};
// Pick the document's raw title text.
// doc: selector function for the parsed document.
// Candidates, in priority order: the og:title meta `content`, the first
// h1 whose class contains "title", the <title> element, the first h1 and
// the first h2. Returns the first candidate that is non-blank, trimmed,
// or '' when every candidate is blank or missing.
rawTitle = function (doc) {
  var candidates, i, readFirst, readText, trimmed;
  // Apply `read` to the first element matching `selector`, if there is one.
  readFirst = function (selector, read) {
    var firstMatch, matches;
    matches = doc(selector);
    firstMatch = matches != null ? matches.first() : null;
    return firstMatch != null ? read(firstMatch) : void 0;
  };
  readText = function (node) {
    return node.text();
  };
  candidates = [
    readFirst("meta[property='og:title']", function (node) {
      return node.attr('content');
    }),
    readFirst("h1[class*='title']", readText),
    readFirst('title', readText),
    readFirst('h1', readText),
    readFirst('h2', readText)
  ];
  for (i = 0; i < candidates.length; i++) {
    trimmed = candidates[i] && candidates[i].trim();
    if (trimmed) {
      return trimmed;
    }
  }
  return '';
};
}.call(this); |
@@ -13,2 +13,7 @@ // Generated by CoffeeScript 2.0.0-beta7 | ||
title: extractor.title(doc), | ||
softTitle: extractor.softTitle(doc), | ||
date: extractor.date(doc), | ||
author: extractor.author(doc), | ||
publisher: extractor.publisher(doc), | ||
copyright: extractor.copyright(doc), | ||
favicon: extractor.favicon(doc), | ||
@@ -35,2 +40,27 @@ description: extractor.description(doc), | ||
}, | ||
softTitle: function () { | ||
var doc; | ||
doc = getParsedDoc.call(this, html); | ||
return null != this.softTitle_ ? this.softTitle_ : this.softTitle_ = extractor.softTitle(doc); | ||
}, | ||
date: function () { | ||
var doc; | ||
doc = getParsedDoc.call(this, html); | ||
return null != this.date_ ? this.date_ : this.date_ = extractor.date(doc); | ||
}, | ||
copyright: function () { | ||
var doc; | ||
doc = getParsedDoc.call(this, html); | ||
return null != this.copyright_ ? this.copyright_ : this.copyright_ = extractor.copyright(doc); | ||
}, | ||
author: function () { | ||
var doc; | ||
doc = getParsedDoc.call(this, html); | ||
return null != this.author_ ? this.author_ : this.author_ = extractor.author(doc); | ||
}, | ||
publisher: function () { | ||
var doc; | ||
doc = getParsedDoc.call(this, html); | ||
return null != this.publisher_ ? this.publisher_ : this.publisher_ = extractor.publisher(doc); | ||
}, | ||
favicon: function () { | ||
@@ -37,0 +67,0 @@ var doc; |
{ | ||
"name": "unfluff", | ||
"version": "0.11.0", | ||
"version": "1.0.0", | ||
"description": "A web page content extractor", | ||
@@ -5,0 +5,0 @@ "homepage": "https://github.com/ageitgey/node-unfluff", |
@@ -57,2 +57,7 @@ # unfluff | ||
- `title` - The document's title (from the <title> tag) | ||
- `softTitle` - A version of `title` with less truncation (for example, it is not cut at a colon) | ||
- `date` - The document's publication date | ||
- `copyright` - The document's copyright line, if present | ||
- `author` - The document's author(s), as an array of strings | ||
- `publisher` - The document's publisher (website name) | ||
- `text` - The main text of the document with all the junk thrown away | ||
@@ -131,3 +136,10 @@ - `image` - The main image for the document (what's used by facebook, etc.) | ||
{ | ||
"title": "Shovel Knight review: rewrite history", | ||
"title": "Shovel Knight review", | ||
"softTitle": "Shovel Knight review: rewrite history", | ||
"date": "2014-06-26T13:00:03Z", | ||
"copyright": "2016 Vox Media Inc Designed in house", | ||
"author": [ | ||
"Griffin McElroy" | ||
], | ||
"publisher": "Polygon", | ||
"text": "Shovel Knight is inspired by the past in all the right ways — but it's far from stuck in it. [.. snip ..]", | ||
@@ -164,2 +176,7 @@ "image": "http://cdn2.vox-cdn.com/uploads/chorus_image/image/34834129/jellyfish_hero.0_cinema_1280.0.png", | ||
console.log(data.title()); | ||
console.log(data.softTitle()); | ||
console.log(data.date()); | ||
console.log(data.copyright()); | ||
console.log(data.author()); | ||
console.log(data.publisher()); | ||
console.log(data.text()); | ||
@@ -166,0 +183,0 @@ console.log(data.image()); |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
No v1
QualityPackage is not semver >=1. This means it is not stable and does not support ^ ranges.
Found 1 instance in 1 package
5302725
165
2041
0
216