Huge News! Announcing our $40M Series B led by Abstract Ventures. Learn More
Socket
Sign in | Demo | Install
Socket

unfluff

Package Overview
Dependencies
Maintainers
1
Versions
20
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

unfluff - npm Package Compare versions

Comparing version 0.11.0 to 1.0.0

data/stopwords/stopwords-cs.txt

107

lib/extractor.js
// Generated by CoffeeScript 2.0.0-beta7
void function () {
var _, addSiblings, biggestTitleChunk, formatter, getObjectTag, getScore, getSiblingsContent, getSiblingsScore, getVideoAttrs, isBoostable, isHighlinkDensity, isNodescoreThresholdMet, isTableAndNoParaExist, postCleanup, stopwords, updateNodeCount, updateScore;
var _, addSiblings, biggestTitleChunk, cleanText, cleanTitle, formatter, getObjectTag, getScore, getSiblingsContent, getSiblingsScore, getVideoAttrs, isBoostable, isHighlinkDensity, isNodescoreThresholdMet, isTableAndNoParaExist, postCleanup, rawTitle, stopwords, updateNodeCount, updateScore;
_ = require('lodash');

@@ -8,15 +8,46 @@ stopwords = require('./stopwords');

module.exports = {
date: function (doc) {
var cache$, cache$1, cache$2, cache$3, cache$4, dateCandidates;
dateCandidates = doc("meta[property='article:published_time'], meta[itemprop*='datePublished'], meta[name='dcterms.modified'], meta[name='dcterms.date'], meta[name='DC.date.issued'], meta[name='dc.date.issued'], meta[name='dc.date.modified'], meta[name='dc.date.created'], meta[name='DC.date'], meta[name='DC.Date'], meta[name='dc.date'], meta[name='date'], time[itemprop*='pubDate'], time[itemprop*='pubdate'], span[itemprop*='datePublished'], span[property*='datePublished'], p[itemprop*='datePublished'], p[property*='datePublished'], div[itemprop*='datePublished'], div[property*='datePublished'], li[itemprop*='datePublished'], li[property*='datePublished'], time, span[class*='date'], p[class*='date'], div[class*='date']");
return (null != dateCandidates && null != (cache$ = dateCandidates.first()) && null != (cache$1 = cache$.attr('content')) ? cache$1.trim() : void 0) || (null != dateCandidates && null != (cache$2 = dateCandidates.first()) && null != (cache$3 = cache$2.attr('datetime')) ? cache$3.trim() : void 0) || cleanText(null != dateCandidates && null != (cache$4 = dateCandidates.first()) ? cache$4.text() : void 0) || null;
},
copyright: function (doc) {
var cache$, copyright, copyrightCandidates, text;
copyrightCandidates = doc("p[class*='copyright'], div[class*='copyright'], span[class*='copyright'], li[class*='copyright'], p[id*='copyright'], div[id*='copyright'], span[id*='copyright'], li[id*='copyright']");
text = null != copyrightCandidates && null != (cache$ = copyrightCandidates.first()) ? cache$.text() : void 0;
if (!text) {
text = doc('body').text().replace(/\s*[\r\n]+\s*/g, '. ');
if (!(text.indexOf('\xa9') > 0))
return null;
}
copyright = text.replace(/.*?©(\s*copyright)?([^,;:.|\r\n]+).*/gi, '$2').trim();
return cleanText(copyright);
},
author: function (doc) {
var authorCandidates, authorList, cache$, cache$1, cache$2, cache$3, cache$4, cache$5, fallbackAuthor;
authorCandidates = doc("meta[property='article:author'], meta[property='og:article:author'], meta[name='author'], meta[name='dcterms.creator'], meta[name='DC.creator'], meta[name='DC.Creator'], meta[name='dc.creator'], meta[name='creator']");
authorList = [];
authorCandidates.each(function () {
var author, cache$, cache$1;
author = null != (cache$ = doc(this)) && null != (cache$1 = cache$.attr('content')) ? cache$1.trim() : void 0;
if (author)
return authorList.push(author);
});
if (authorList.length === 0) {
fallbackAuthor = (null != (cache$ = doc("span[class*='author']").first()) ? cache$.text() : void 0) || (null != (cache$1 = doc("p[class*='author']").first()) ? cache$1.text() : void 0) || (null != (cache$2 = doc("div[class*='author']").first()) ? cache$2.text() : void 0) || (null != (cache$3 = doc("span[class*='byline']").first()) ? cache$3.text() : void 0) || (null != (cache$4 = doc("p[class*='byline']").first()) ? cache$4.text() : void 0) || (null != (cache$5 = doc("div[class*='byline']").first()) ? cache$5.text() : void 0);
if (fallbackAuthor)
authorList.push(cleanText(fallbackAuthor));
}
return authorList;
},
publisher: function (doc) {
var cache$, cache$1, publisherCandidates;
publisherCandidates = doc("meta[property='og:site_name'], meta[name='dc.publisher'], meta[name='DC.publisher'], meta[name='DC.Publisher']");
if (null != publisherCandidates && null != (cache$ = publisherCandidates.first()) && null != (cache$1 = cache$.attr('content')))
return cache$1.trim();
},
title: function (doc) {
var titleElement, titleText, usedDelimeter;
titleElement = doc("meta[property='og:title']");
if (titleElement)
titleText = titleElement.attr('content');
if (!titleText) {
titleElement = doc('title').first();
titleText = titleElement.text();
}
if (!titleElement)
return null;
usedDelimeter = false;
_.each([
var titleText;
titleText = rawTitle(doc);
return cleanTitle(titleText, [
'|',

@@ -26,10 +57,13 @@ ' - ',

':'
], function (c) {
if (titleText.indexOf(c) >= 0 && !usedDelimeter) {
titleText = biggestTitleChunk(titleText, c);
return usedDelimeter = true;
}
});
return titleText.replace(/�/g, '').trim();
]);
},
softTitle: function (doc) {
var titleText;
titleText = rawTitle(doc);
return cleanTitle(titleText, [
'|',
' - ',
'\xbb'
]);
},
text: function (doc, topNode, lang) {

@@ -419,2 +453,35 @@ if (topNode) {

};
/**
 * Normalize a piece of extracted text.
 *
 * Turns CR/LF/tab into spaces, strips HTML comments and the `�` character
 * (presumably the U+FFFD replacement character left by bad decoding),
 * collapses runs of whitespace to a single space and trims the result.
 *
 * Whitespace is collapsed after the removals, so deleting a comment or a
 * `�` that sat between two spaces no longer leaves a double space behind.
 *
 * @param {?string} text - Text to clean; null/undefined is treated as empty.
 * @returns {string} The cleaned text ('' for null/undefined input).
 */
cleanText = function (text) {
  if (text == null) {
    return '';
  }
  return text
    // Line breaks become spaces first so `.+?` can span multi-line comments.
    .replace(/[\r\n\t]/g, ' ')
    .replace(/<!--.+?-->/g, '')
    .replace(/�/g, '')
    .replace(/\s\s+/g, ' ')
    .trim();
};
/**
 * Shorten a title by splitting it on at most one delimiter.
 *
 * Delimiters are tried in the order given; the first one that occurs in the
 * title is handed to `biggestTitleChunk` and no further delimiter is
 * applied. The result is passed through `cleanText`.
 *
 * @param {?string} title - Raw title; a falsy value is treated as ''.
 * @param {string[]} delimiters - Candidate delimiters, in priority order.
 * @returns {string} The cleaned (possibly shortened) title.
 */
cleanTitle = function (title, delimiters) {
  var remaining = title || '';
  var alreadySplit = false;

  _.each(delimiters, function (delimiter) {
    // Only the first delimiter that is present may be used.
    if (alreadySplit || remaining.indexOf(delimiter) < 0) {
      return;
    }
    remaining = biggestTitleChunk(remaining, delimiter);
    alreadySplit = true;
  });

  return cleanText(remaining);
};
/**
 * Find the unprocessed title of the document.
 *
 * Candidates are read, in priority order, from: the og:title <meta>
 * `content`, the first <h1> whose class contains "title", the <title> tag,
 * the first <h1>, and the first <h2>. All five lookups are performed up
 * front; the first candidate that is non-blank after trimming wins.
 *
 * @param {Function} doc - Selector function for the parsed page
 *   (presumably a cheerio instance; confirm against the caller).
 * @returns {string} The trimmed title, or '' when no candidate has text.
 */
rawTitle = function (doc) {
  var firstMatch = function (selector) {
    var matches = doc(selector);
    return matches != null ? matches.first() : void 0;
  };
  var firstText = function (selector) {
    var element = firstMatch(selector);
    return element != null ? element.text() : void 0;
  };

  var ogTitle = firstMatch("meta[property='og:title']");
  var candidates = [
    ogTitle != null ? ogTitle.attr('content') : void 0,
    firstText("h1[class*='title']"),
    firstText('title'),
    firstText('h1'),
    firstText('h2')
  ];

  for (var i = 0; i < candidates.length; i++) {
    var candidate = candidates[i];
    if (candidate && candidate.trim()) {
      return candidate.trim();
    }
  }
  return '';
};
}.call(this);

@@ -13,2 +13,7 @@ // Generated by CoffeeScript 2.0.0-beta7

title: extractor.title(doc),
softTitle: extractor.softTitle(doc),
date: extractor.date(doc),
author: extractor.author(doc),
publisher: extractor.publisher(doc),
copyright: extractor.copyright(doc),
favicon: extractor.favicon(doc),

@@ -35,2 +40,27 @@ description: extractor.description(doc),

},
softTitle: function () {
var doc;
doc = getParsedDoc.call(this, html);
return null != this.softTitle_ ? this.softTitle_ : this.softTitle_ = extractor.softTitle(doc);
},
date: function () {
var doc;
doc = getParsedDoc.call(this, html);
return null != this.date_ ? this.date_ : this.date_ = extractor.date(doc);
},
copyright: function () {
var doc;
doc = getParsedDoc.call(this, html);
return null != this.copyright_ ? this.copyright_ : this.copyright_ = extractor.copyright(doc);
},
author: function () {
var doc;
doc = getParsedDoc.call(this, html);
return null != this.author_ ? this.author_ : this.author_ = extractor.author(doc);
},
publisher: function () {
var doc;
doc = getParsedDoc.call(this, html);
return null != this.publisher_ ? this.publisher_ : this.publisher_ = extractor.publisher(doc);
},
favicon: function () {

@@ -37,0 +67,0 @@ var doc;

2

package.json
{
"name": "unfluff",
"version": "0.11.0",
"version": "1.0.0",
"description": "A web page content extractor",

@@ -5,0 +5,0 @@ "homepage": "https://github.com/ageitgey/node-unfluff",

@@ -57,2 +57,7 @@ # unfluff

- `title` - The document's title (from the &lt;title&gt; tag)
- `softTitle` - A version of `title` with less truncation
- `date` - The document's publication date
- `copyright` - The document's copyright line, if present
- `author` - The document's author
- `publisher` - The document's publisher (website name)
- `text` - The main text of the document with all the junk thrown away

@@ -131,3 +136,10 @@ - `image` - The main image for the document (what's used by facebook, etc.)

{
"title": "Shovel Knight review: rewrite history",
"title": "Shovel Knight review",
"softTitle": "Shovel Knight review: rewrite history",
"date": "2014-06-26T13:00:03Z",
"copyright": "2016 Vox Media Inc Designed in house",
"author": [
"Griffin McElroy"
],
"publisher": "Polygon",
"text": "Shovel Knight is inspired by the past in all the right ways — but it's far from stuck in it. [.. snip ..]",

@@ -164,2 +176,7 @@ "image": "http://cdn2.vox-cdn.com/uploads/chorus_image/image/34834129/jellyfish_hero.0_cinema_1280.0.png",

console.log(data.title());
console.log(data.softTitle());
console.log(data.date());
console.log(data.copyright());
console.log(data.author());
console.log(data.publisher());
console.log(data.text());

@@ -166,0 +183,0 @@ console.log(data.image());

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc