Comparing version 0.0.1 to 0.1.0
(function() { | ||
var Htmlcarve, buildLinkDerivats, chardet, cheerio, combineInfoPartials, extractKeywords, extractMeta, extractOG, extractTC, iconv, link, link2, link3, link4, link5, logObject, mergeObjects, parsePageBody, request, splitToWords, | ||
var Htmlcarve, buildLinkDerivats, chardet, cheerio, combineInfoPartials, extractKeywords, iconv, link, link2, link3, link4, link5, mergeObjects, parsePageBody, parser, request, splitToWords, | ||
__hasProp = {}.hasOwnProperty; | ||
@@ -15,13 +15,25 @@ | ||
parser = { | ||
ogp: require("./ogp_parser"), | ||
tc: require("./tc_parser"), | ||
meta: require("./meta_parser") | ||
}; | ||
Htmlcarve.fromUrl = function(url, fn) { | ||
return request(url, { | ||
var headers; | ||
headers = { | ||
'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36" | ||
}; | ||
return request({ | ||
url: url, | ||
encoding: null, | ||
timeout: 10000 | ||
timeout: 10000, | ||
headers: headers | ||
}, function(error, response, body) { | ||
return parsePageBody(error, response, body, function(error, page) { | ||
var info; | ||
info = Htmlcarve.fromString(page); | ||
info.links = buildLinkDerivats(url); | ||
var data; | ||
data = Htmlcarve.fromString(page); | ||
data.links = buildLinkDerivats(url); | ||
if (fn) { | ||
return fn(null, info); | ||
return fn(null, data); | ||
} | ||
@@ -33,13 +45,14 @@ }); | ||
Htmlcarve.fromString = function(text) { | ||
var $, info; | ||
var $, data; | ||
$ = cheerio.load(text, { | ||
ignoreWhitespace: true | ||
}); | ||
info = { | ||
html_meta: extractMeta($), | ||
open_graph: extractOG($), | ||
twitter_card: extractTC($) | ||
data = {}; | ||
data.source = { | ||
html_meta: parser.meta.execute($), | ||
open_graph: parser.ogp.execute($), | ||
twitter_card: parser.tc.execute($) | ||
}; | ||
info.info = combineInfoPartials(info); | ||
return info; | ||
data.result = combineInfoPartials(data.source); | ||
return data; | ||
}; | ||
@@ -65,33 +78,2 @@ | ||
extractOG = function($) { | ||
return { | ||
title: $("meta[property='og:title']").first().attr("content"), | ||
summary: $("meta[property='og:description']").first().attr("content"), | ||
image: $("meta[property='og:image']").first().attr("content"), | ||
language: $("meta[property='og:locale']").first().attr("content") | ||
}; | ||
}; | ||
extractMeta = function($) { | ||
return { | ||
title: $("title").first().text() || $("h1").first().text() || $("h2").first().text() || $("h3").first().text(), | ||
summary: $("meta[name='description']").first().attr("content"), | ||
image: $("div img").first().attr("src"), | ||
language: $("html").attr("lang") || $("meta[http-equiv='content-language']").attr("content") || $("meta[name='language']").attr("content"), | ||
feed: $("link[type='application/rss+xml']").attr("href") || $("link[type='application/atom+xml']").attr("href") || $("link[rel='alternate']").attr("href"), | ||
favicon: $("link[rel='apple-touch-icon']").attr("href") || $("link[rel='shortcut icon']").attr("href") || $("link[rel='icon']").attr("href"), | ||
keywords: $("meta[name='keywords']").first().attr("content"), | ||
author: $("meta[name='author']").first().attr("content") | ||
}; | ||
}; | ||
extractTC = function($) { | ||
return { | ||
title: $("meta[name='twitter:title']").attr("content"), | ||
summary: $("meta[name='twitter:description']").attr("content"), | ||
image: $("meta[name='twitter:image']").attr("content"), | ||
author: $("meta[name='twitter:creator']").attr("content") | ||
}; | ||
}; | ||
buildLinkDerivats = function(link) { | ||
@@ -141,6 +123,2 @@ var url; | ||
logObject = function(str, obj) { | ||
return console.log(str, obj); | ||
}; | ||
if (process.argv[1] === __filename) { | ||
@@ -147,0 +125,0 @@ link = "http://www.spiegel.de/politik/deutschland/parteien-betonen-offenheit-vor-koalitions-sondierungen-a-927551.html"; |
{ | ||
"name": "htmlcarve", | ||
"version": "0.0.1", | ||
"version": "0.1.0", | ||
"description": "Extract essential meta-informations from any web page, fast and dead simple.", | ||
@@ -16,3 +16,7 @@ "keywords": [ | ||
}, | ||
"author": "Maximilian Stroh", | ||
"author": { | ||
"name": "Maximilian Stroh", | ||
"email": "Hisako1337@gmail.com", | ||
"web": "http://blog.plague-dev.de" | ||
}, | ||
"dependencies": { | ||
@@ -42,8 +46,9 @@ "coffee-script": "~1.6.3", | ||
], | ||
"repositories" : [ | ||
{ | ||
"type": "git", | ||
"url": "https://github.com/Anonyfox/node-htmlcarve.git" | ||
} | ||
], | ||
"repository": { | ||
"type": "git", | ||
"url": "https://github.com/Anonyfox/node-htmlcarve.git" | ||
}, | ||
"bugs": { | ||
"url": "https://github.com/Anonyfox/node-htmlcarve/issues" | ||
}, | ||
"licenses": [ | ||
@@ -54,3 +59,13 @@ { | ||
} | ||
] | ||
], | ||
"devDependencies": { | ||
"mocha": "~1.13.0", | ||
"chai": "~1.8.1", | ||
"jitter": "~1.2.1" | ||
}, | ||
"scripts": { | ||
"test": "./node_modules/mocha/bin/mocha", | ||
"tester": "./node_modules/mocha/bin/mocha --watch --growl", | ||
"compiler": "./node_modules/jitter/bin/jitter src lib" | ||
} | ||
} |
#node-htmlcarve | ||
Extract essential meta information from any web page, fast and dead simple. Do you need general information from a given HTML page, like the title, a summary, a favicon or a possible RSS feed? Just throw a URL into this module, and it'll try to find that stuff for you. | ||
**warning: this is a work-in-progress and isn't ready for anything other than tinkering around. there are currently *no tests* written!!** | ||
## Installation | ||
Clone this repository, grab the single coffeescript/javascript-file, or simply use NPM: | ||
```npm install htmlcarve``` | ||
(not yet published) | ||
```npm install htmlcarve``` | ||
@@ -23,16 +19,80 @@ ##Usage | ||
htmlcarve.fromUrl "http://venturebeat.com/", (error, data) -> | ||
console.log data.info | ||
console.log data.result | ||
``` | ||
The returned `data` object has several attributes, where `data.info` is the compressed general result. It has the following fields of interest: | ||
##Samples | ||
* `title` #=> the "headline" of the page. | ||
* `summary` #=> a short snippet describing the content | ||
* `image` #=> an image to illustrate that stuff | ||
* `author` #=> any author/creator of the page/content | ||
* `language` #=> the language of this content. | ||
* `feed` #=> the url of a rss/atom-feed if existing | ||
* `favicon` #=> the url of a favicon of the page | ||
* `keywords` #=> an array of keywords describing the page. | ||
```Shell | ||
{ source: | ||
{ html_meta: | ||
{ title: 'Ouch: HP is now promoting PCs running Windows 7 (because Windows 8 isn\'t doing so hot) | VentureBeat | Business | by Ricardo Bilton', | ||
summary: undefined, | ||
image: 'http://venturebeat.files.wordpress.com/2014/01/patrick-collison-headshot.jpg?w=311&h=150&crop=1', | ||
language: 'en', | ||
feed: 'http://feeds.venturebeat.com/VentureBeat', | ||
favicon: 'http://0.gravatar.com/blavatar/6a5449d7551fc1e8f149b0920ca4b6f6?s=16', | ||
keywords: undefined, | ||
author: undefined }, | ||
open_graph: | ||
{ title: 'Ouch: HP is now promoting PCs running Windows 7 (because Windows 8 isn\'t doing so hot)', | ||
summary: 'HP\'s new Windows 7 promotion should tell you all you need to know about the state of its Windows 8 hardware. With its latest promotion, HP is heavily pushing PCs running Windows 7, which it says it...', | ||
image: 'http://venturebeat.files.wordpress.com/2014/01/hp-windows.png', | ||
language: undefined }, | ||
twitter_card: | ||
{ title: undefined, | ||
summary: undefined, | ||
image: undefined, | ||
author: '@chernandburn' } }, | ||
result: | ||
{ title: 'Ouch: HP is now promoting PCs running Windows 7 (because Windows 8 isn\'t doing so hot)', | ||
summary: 'HP\'s new Windows 7 promotion should tell you all you need to know about the state of its Windows 8 hardware. With its latest promotion, HP is heavily pushing PCs running Windows 7, which it says it...', | ||
image: 'http://venturebeat.files.wordpress.com/2014/01/hp-windows.png', | ||
author: '@chernandburn', | ||
language: 'en', | ||
feed: 'http://feeds.venturebeat.com/VentureBeat', | ||
favicon: 'http://0.gravatar.com/blavatar/6a5449d7551fc1e8f149b0920ca4b6f6?s=16', | ||
keywords: undefined }, | ||
links: | ||
{ deep: 'http://venturebeat.com/2014/01/20/ouch-hp-is-now-promoting-pcs-running-windows-7-because-windows-8-isnt-doing-so-hot/', | ||
shallow: 'http://venturebeat.com/2014/01/20/ouch-hp-is-now-promoting-pcs-running-windows-7-because-windows-8-isnt-doing-so-hot/', | ||
base: 'http://venturebeat.com' } } | ||
``` | ||
```Shell | ||
$ htmlcarve http://www.spin.com/articles/miserable-halloween-dream-stream/ | ||
{ source: | ||
{ html_meta: | ||
{ title: 'Stream Miserable\'s Cratering \'Halloween Dream\' | SPIN | SPIN Mix | Premieres', | ||
summary: 'Ex-Whirr singer preps solo EP for February 18 release', | ||
image: 'http://www.spin.com/sites/all/themes/zen_spin/assets/images/default-images/spin-logo.png', | ||
language: 'en', | ||
feed: 'http://www.spin.com/rss.xml', | ||
favicon: 'http://www.spin.com/favicon.ico', | ||
keywords: 'miserable', | ||
author: undefined }, | ||
open_graph: | ||
{ title: 'Stream Miserable\'s Cratering \'Halloween Dream\'', | ||
summary: 'Ex-Whirr singer preps solo EP for February 18 release', | ||
image: 'http://www.spin.com/sites/all/files/140122-miserable.jpg', | ||
language: undefined }, | ||
twitter_card: | ||
{ title: 'Stream Miserable\'s Cratering \'Halloween Dream\'', | ||
summary: 'Ex-Whirr singer preps solo EP for February 18 release', | ||
image: 'http://www.spin.com/sites/all/files/140122-miserable.jpg', | ||
author: undefined } }, | ||
result: | ||
{ title: 'Stream Miserable\'s Cratering \'Halloween Dream\'', | ||
summary: 'Ex-Whirr singer preps solo EP for February 18 release', | ||
image: 'http://www.spin.com/sites/all/files/140122-miserable.jpg', | ||
author: undefined, | ||
language: 'en', | ||
feed: 'http://www.spin.com/rss.xml', | ||
favicon: 'http://www.spin.com/favicon.ico', | ||
keywords: 'miserable' }, | ||
links: | ||
{ deep: 'http://www.spin.com/articles/miserable-halloween-dream-stream/', | ||
shallow: 'http://www.spin.com/articles/miserable-halloween-dream-stream/', | ||
base: 'http://www.spin.com' } } | ||
``` | ||
##How does this stuff work? | ||
@@ -52,6 +112,2 @@ Htmlcarve will process several steps to gather all that information. | ||
- extract keywords if none are present | ||
- write some tests ;_; | ||
- add a more useful documentation | ||
- design a better response format | ||
- clean up the code | ||
- include the full protocols, not only this quick'n'dirty hack. | ||
@@ -62,2 +118,2 @@ - include schema.org | ||
##License | ||
MIT. | ||
MIT. |
Sorry, the diff of this file is not supported yet
No bug tracker
Maintenance: Package does not have a linked bug tracker in package.json.
Found 1 instance in 1 package
No repository
Supply chain risk: Package does not have a linked source code repository. Without this field, a package will have no reference to the location of the source code used to generate the package.
Found 1 instance in 1 package
480433
16
147
1
117
0
3