New Case Study: See how Anthropic automated 95% of dependency reviews with Socket. Learn More
Socket
Sign inDemoInstall
Socket

htmlcarve

Package Overview
Dependencies
Maintainers
1
Versions
2
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

htmlcarve - npm Package Compare versions

Comparing version 0.0.1 to 0.1.0

lib/meta_parser.js

76

lib/htmlcarve.js
(function() {
var Htmlcarve, buildLinkDerivats, chardet, cheerio, combineInfoPartials, extractKeywords, extractMeta, extractOG, extractTC, iconv, link, link2, link3, link4, link5, logObject, mergeObjects, parsePageBody, request, splitToWords,
var Htmlcarve, buildLinkDerivats, chardet, cheerio, combineInfoPartials, extractKeywords, iconv, link, link2, link3, link4, link5, mergeObjects, parsePageBody, parser, request, splitToWords,
__hasProp = {}.hasOwnProperty;

@@ -15,13 +15,25 @@

parser = {
ogp: require("./ogp_parser"),
tc: require("./tc_parser"),
meta: require("./meta_parser")
};
Htmlcarve.fromUrl = function(url, fn) {
return request(url, {
var headers;
headers = {
'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36"
};
return request({
url: url,
encoding: null,
timeout: 10000
timeout: 10000,
headers: headers
}, function(error, response, body) {
return parsePageBody(error, response, body, function(error, page) {
var info;
info = Htmlcarve.fromString(page);
info.links = buildLinkDerivats(url);
var data;
data = Htmlcarve.fromString(page);
data.links = buildLinkDerivats(url);
if (fn) {
return fn(null, info);
return fn(null, data);
}

@@ -33,13 +45,14 @@ });

Htmlcarve.fromString = function(text) {
var $, info;
var $, data;
$ = cheerio.load(text, {
ignoreWhitespace: true
});
info = {
html_meta: extractMeta($),
open_graph: extractOG($),
twitter_card: extractTC($)
data = {};
data.source = {
html_meta: parser.meta.execute($),
open_graph: parser.ogp.execute($),
twitter_card: parser.tc.execute($)
};
info.info = combineInfoPartials(info);
return info;
data.result = combineInfoPartials(data.source);
return data;
};

@@ -65,33 +78,2 @@

extractOG = function($) {
return {
title: $("meta[property='og:title']").first().attr("content"),
summary: $("meta[property='og:description']").first().attr("content"),
image: $("meta[property='og:image']").first().attr("content"),
language: $("meta[property='og:locale']").first().attr("content")
};
};
extractMeta = function($) {
return {
title: $("title").first().text() || $("h1").first().text() || $("h2").first().text() || $("h3").first().text(),
summary: $("meta[name='description']").first().attr("content"),
image: $("div img").first().attr("src"),
language: $("html").attr("lang") || $("meta[http-equiv='content-language']").attr("content") || $("meta[name='language']").attr("content"),
feed: $("link[type='application/rss+xml']").attr("href") || $("link[type='application/atom+xml']").attr("href") || $("link[rel='alternate']").attr("href"),
favicon: $("link[rel='apple-touch-icon']").attr("href") || $("link[rel='shortcut icon']").attr("href") || $("link[rel='icon']").attr("href"),
keywords: $("meta[name='keywords']").first().attr("content"),
author: $("meta[name='author']").first().attr("content")
};
};
extractTC = function($) {
return {
title: $("meta[name='twitter:title']").attr("content"),
summary: $("meta[name='twitter:description']").attr("content"),
image: $("meta[name='twitter:image']").attr("content"),
author: $("meta[name='twitter:creator']").attr("content")
};
};
buildLinkDerivats = function(link) {

@@ -141,6 +123,2 @@ var url;

logObject = function(str, obj) {
return console.log(str, obj);
};
if (process.argv[1] === __filename) {

@@ -147,0 +125,0 @@ link = "http://www.spiegel.de/politik/deutschland/parteien-betonen-offenheit-vor-koalitions-sondierungen-a-927551.html";

{
"name": "htmlcarve",
"version": "0.0.1",
"version": "0.1.0",
"description": "Extract essential meta-informations from any web page, fast and dead simple.",

@@ -16,3 +16,7 @@ "keywords": [

},
"author": "Maximilian Stroh",
"author": {
"name": "Maximilian Stroh",
"email": "Hisako1337@gmail.com",
"web": "http://blog.plague-dev.de"
},
"dependencies": {

@@ -42,8 +46,9 @@ "coffee-script": "~1.6.3",

],
"repositories" : [
{
"type": "git",
"url": "https://github.com/Anonyfox/node-htmlcarve.git"
}
],
"repository": {
"type": "git",
"url": "https://github.com/Anonyfox/node-htmlcarve.git"
},
"bugs": {
"url": "https://github.com/Anonyfox/node-htmlcarve/issues"
},
"licenses": [

@@ -54,3 +59,13 @@ {

}
]
],
"devDependencies": {
"mocha": "~1.13.0",
"chai": "~1.8.1",
"jitter": "~1.2.1"
},
"scripts": {
"test": "./node_modules/mocha/bin/mocha",
"tester": "./node_modules/mocha/bin/mocha --watch --growl",
"compiler": "./node_modules/jitter/bin/jitter src lib"
}
}
#node-htmlcarve
Extract essential meta-information from any web page, fast and dead simple. Do you need general information from a given HTML site, like the title, a summary, a favicon or a possible RSS feed? Just throw a URL into this module, and it'll try to find that stuff for you.
**Warning: this is a work in progress and isn't ready for anything other than tinkering around. There are currently *no tests* written!!**
## Installation
Clone this repository, grab the single coffeescript/javascript-file, or simply use NPM:
```npm install htmlcarve```
(not yet published)
```npm install htmlcarve```

@@ -23,16 +19,80 @@ ##Usage

htmlcarve.fromUrl "http://venturebeat.com/", (error, data) ->
console.log data.info
console.log data.result
```
The returned `data` object has several attributes, where `data.info` is the compressed general result. It has the following fields of interest:
##Samples
* `title` #=> the "headline" of the page.
* `summary` #=> a short snippet describing the content
* `image` #=> an image to illustrate that stuff
* `author` #=> any author/creator of the page/content
* `language` #=> the language of this content.
* `feed` #=> the url of a rss/atom-feed if existing
* `favicon` #=> the URL of a favicon of the page
* `keywords` #=> an array of keywords describing the page.
```Shell
{ source:
{ html_meta:
{ title: 'Ouch: HP is now promoting PCs running Windows 7 (because Windows 8 isn\'t doing so hot) | VentureBeat | Business | by Ricardo Bilton',
summary: undefined,
image: 'http://venturebeat.files.wordpress.com/2014/01/patrick-collison-headshot.jpg?w=311&h=150&crop=1',
language: 'en',
feed: 'http://feeds.venturebeat.com/VentureBeat',
favicon: 'http://0.gravatar.com/blavatar/6a5449d7551fc1e8f149b0920ca4b6f6?s=16',
keywords: undefined,
author: undefined },
open_graph:
{ title: 'Ouch: HP is now promoting PCs running Windows 7 (because Windows 8 isn\'t doing so hot)',
summary: 'HP\'s new Windows 7 promotion should tell you all you need to know about the state of its Windows 8 hardware. With its latest promotion, HP is heavily pushing PCs running Windows 7, which it says it...',
image: 'http://venturebeat.files.wordpress.com/2014/01/hp-windows.png',
language: undefined },
twitter_card:
{ title: undefined,
summary: undefined,
image: undefined,
author: '@chernandburn' } },
result:
{ title: 'Ouch: HP is now promoting PCs running Windows 7 (because Windows 8 isn\'t doing so hot)',
summary: 'HP\'s new Windows 7 promotion should tell you all you need to know about the state of its Windows 8 hardware. With its latest promotion, HP is heavily pushing PCs running Windows 7, which it says it...',
image: 'http://venturebeat.files.wordpress.com/2014/01/hp-windows.png',
author: '@chernandburn',
language: 'en',
feed: 'http://feeds.venturebeat.com/VentureBeat',
favicon: 'http://0.gravatar.com/blavatar/6a5449d7551fc1e8f149b0920ca4b6f6?s=16',
keywords: undefined },
links:
{ deep: 'http://venturebeat.com/2014/01/20/ouch-hp-is-now-promoting-pcs-running-windows-7-because-windows-8-isnt-doing-so-hot/',
shallow: 'http://venturebeat.com/2014/01/20/ouch-hp-is-now-promoting-pcs-running-windows-7-because-windows-8-isnt-doing-so-hot/',
base: 'http://venturebeat.com' } }
```
```Shell
$ htmlcarve http://www.spin.com/articles/miserable-halloween-dream-stream/
{ source:
{ html_meta:
{ title: 'Stream Miserable\'s Cratering \'Halloween Dream\' | SPIN | SPIN Mix | Premieres',
summary: 'Ex-Whirr singer preps solo EP for February 18 release',
image: 'http://www.spin.com/sites/all/themes/zen_spin/assets/images/default-images/spin-logo.png',
language: 'en',
feed: 'http://www.spin.com/rss.xml',
favicon: 'http://www.spin.com/favicon.ico',
keywords: 'miserable',
author: undefined },
open_graph:
{ title: 'Stream Miserable\'s Cratering \'Halloween Dream\'',
summary: 'Ex-Whirr singer preps solo EP for February 18 release',
image: 'http://www.spin.com/sites/all/files/140122-miserable.jpg',
language: undefined },
twitter_card:
{ title: 'Stream Miserable\'s Cratering \'Halloween Dream\'',
summary: 'Ex-Whirr singer preps solo EP for February 18 release',
image: 'http://www.spin.com/sites/all/files/140122-miserable.jpg',
author: undefined } },
result:
{ title: 'Stream Miserable\'s Cratering \'Halloween Dream\'',
summary: 'Ex-Whirr singer preps solo EP for February 18 release',
image: 'http://www.spin.com/sites/all/files/140122-miserable.jpg',
author: undefined,
language: 'en',
feed: 'http://www.spin.com/rss.xml',
favicon: 'http://www.spin.com/favicon.ico',
keywords: 'miserable' },
links:
{ deep: 'http://www.spin.com/articles/miserable-halloween-dream-stream/',
shallow: 'http://www.spin.com/articles/miserable-halloween-dream-stream/',
base: 'http://www.spin.com' } }
```
##How does this stuff work?

@@ -52,6 +112,2 @@ Htmlcarve will process several steps to gather all that informations.

- extract keywords if none are present
- write some tests ;_;
- add a more useful documentation
- design a better response format
- clean up the code
- include the full protocols, not only this quick'n'dirty hack.

@@ -62,2 +118,2 @@ - include schema.org

##License
MIT.
MIT.

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc