web-auto-extractor - npm Package Compare versions

web-auto-extractor

Package Overview

Dependencies

Maintainers

Advanced tools

Install Socket

Detect and block malicious and high-risk dependencies

Install

Comparing version 1.0.0-beta.6 to 1.0.0

dist/index.js

		@@ -7,2 +7,28 @@ 'use strict';

		/**
		 * Factory exported as the module default.
		 *
		 * Each call creates a closure around a single `$html` variable and returns an
		 * object exposing `parse` and `loadCheerioObject`, both of which read or write
		 * that variable.
		 */
		exports.default = function () {
		// Cheerio object used by parse(); starts out as null.
		var $html = null;

		// Stores a caller-supplied object in `$html` so that parse() can use it.
		var loadCheerioObject = function loadCheerioObject(_$html) {
		$html = _$html;
		};

		// Runs the four parsers and returns their results keyed by format name.
		// `options` is forwarded unchanged to the Cheerio load call below.
		var parse = function parse(html, options) {
		// Only load `html` when the stored `$html` does not look like a Cheerio object.
		// NOTE(review): `$html` lives in the closure and is never reset, so a second
		// parse() call on the same instance may reuse the earlier object for the
		// metatags/jsonld parsers even if `html` differs — confirm this is intended.
		if (!($html && $html.prototype && $html.prototype.cheerio)) {
		$html = _cheerio2.default.load(html, options);
		}

		return {
		// The metatag and JSON-LD parsers receive the `$html` object...
		metatags: (0, _metatagParser2.default)($html),
		// ...while the microdata/RDFa parser receives the raw `html` plus a spec name.
		microdata: (0, _microRdfaParser2.default)(html, 'micro'),
		rdfa: (0, _microRdfaParser2.default)(html, 'rdfa'),
		jsonld: (0, _jsonldParser2.default)($html)
		};
		};

		return {
		parse: parse,
		loadCheerioObject: loadCheerioObject
		};
		};

		require('babel-polyfill');
		@@ -26,17 +52,2 @@

		// Interop helper: returns `obj` as-is when `obj.__esModule` is truthy; otherwise wraps it as `{ default: obj }`.
		function _interopRequireDefault(obj) { return obj && obj.__esModule ? obj : { default: obj }; }

		/**
		 * Module default export: an object with a single `parse(html, $html)` method.
		 */
		exports.default = {
		// Runs the four parsers on the input and returns their results keyed by format name.
		// `$html` is optional: when it does not look like a Cheerio object, `html` is
		// loaded with `{ xmlMode: true }` and the result is used instead.
		parse: function parse(html, $html) {
		if (!($html && $html.prototype && $html.prototype.cheerio)) {
		$html = _cheerio2.default.load(html, { xmlMode: true });
		}

		return {
		metatags: (0, _metatagParser2.default)($html),
		// The microdata/RDFa parser is given the raw `html`, a spec name and `$html`.
		microdata: (0, _microRdfaParser2.default)(html, 'micro', $html),
		rdfa: (0, _microRdfaParser2.default)(html, 'rdfa', $html),
		jsonld: (0, _jsonldParser2.default)($html)
		};
		}
		};
		// Interop helper: returns `obj` as-is when `obj.__esModule` is truthy; otherwise wraps it as `{ default: obj }`.
		function _interopRequireDefault(obj) { return obj && obj.__esModule ? obj : { default: obj }; }

dist/parsers/micro-rdfa-parser.js

		@@ -145,3 +145,3 @@ 'use strict';

		exports.default = function (html, specName, $) {
		exports.default = function (html, specName) {
		var handler = createHandler(specName);
		@@ -148,0 +148,0 @@ new _htmlparser2.default.Parser(handler).end(html);

package.json

		{
		"name": "web-auto-extractor",
		"version": "1.0.0-beta.6",
		"version": "1.0.0",
		"description": "Automatically extracts structured information from webpages",
		@@ -5,0 +5,0 @@ "main": "dist/index.js",

166

README.md

		# Web Auto Extractor
		[![Build Status](https://travis-ci.org/ind9/web-auto-extractor.svg?branch=master)](https://travis-ci.org/ind9/web-auto-extractor)

		Automatically extracts semantically structured information from any HTML webpage.
		Parse semantically structured information from any HTML webpage.

		Supported formats:-
		- Formats that support Schema.org vocabularies:-
		- Encodings that support [Schema.org](http://schema.org/) vocabularies:-
		- Microdata
		- RDFa-lite
		- JSON-LD
		- Miscellaneous meta tags
		- Random Meta tags

		Popularly, many websites mark up their webpages with Schema.org vocabularies for better SEO. This library helps you parse that information to JSON.

		[Demo](https://tonicdev.com/npm/web-auto-extractor) it on tonicdev

		## Installation
		`npm install web-auto-extractor`

		## Usage

		```js
		// IF CommonJS
		var WAE = require('web-auto-extractor').default
		//ES6: import WAE from 'web-auto-extractor'
		var wae = WAE.parse(sampleHTML)
		console.log(wae)
		/*
		OUTPUT
		======
		{
		microdata: { data: {..}, unnormalizedData: {..} },
		rdfa: { data: {..}, unnormalizedData: {..} },
		jsonld: { data: {..}, unnormalizedData: null },
		metatags: { data: {..}, unnormalizedData: {..} }
		}
		*/
		```
		// IF ES6
		import WAE from 'web-auto-extractor'

		### Installation
		`npm install web-auto-extractor`
		var parsed = WAE().parse(sampleHTML)

		```

		### Usage
		Let's use the following text as the `sampleHTML` in our example. It uses Schema.org vocabularies to structure Product information and is encoded in `microdata` format.

		#### Import
		```js
		> var WAE = require('web-auto-extractor').default
		//ES6: import WAE from 'web-auto-extractor'
		```

		Let's use this `sampleHTML` for our example
		```html
		@@ -76,65 +65,78 @@ <div itemscope itemtype="http://schema.org/Product">

		#### Parsing
		```js
		> var wae = WAE.parse(sampleHTML)
		```
		This returns an object with the following attributes, each of which is of the type [WAEParserObject](#waeparserobject-attributes).
		#### Result

		- microdata
		- rdfa
		- jsonld
		- metaTags
		Our `parsed` object should look like -

		```js
		// Since our sampleHTML uses microdata
		> var parsedMicrodata = wae.microdata
		```json
		{
		"microdata": {
		"Product": [
		{
		"@context": "http://schema.org/",
		"@type": "Product",
		"brand": "ACME",
		"name": "Executive Anvil",
		"image": "anvil_executive.jpg",
		"description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.",
		"mpn": "925872",
		"aggregateRating": {
		"@context": "http://schema.org/",
		"@type": "AggregateRating",
		"ratingValue": "4.4",
		"reviewCount": "89"
		},
		"offers": {
		"@context": "http://schema.org/",
		"@type": "Offer",
		"priceCurrency": "USD",
		"price": "119.99",
		"priceValidUntil": "5 November!",
		"seller": {
		"@context": "http://schema.org/",
		"@type": "Organization",
		"name": "Executive Objects"
		},
		"itemCondition": "http://schema.org/UsedCondition",
		"availability": "http://schema.org/InStock"
		}
		}
		]
		},
		"rdfa": {},
		"jsonld": {},
		"metatags": {
		"priceCurrency": [
		"USD",
		"USD"
		]
		}
		}
		```

		##### WAEParserObject Attributes
		The `parsed` object includes four objects - `microdata`, `rdfa`, `jsonld` and `metatags`. Since the above HTML does not have any information encoded in `rdfa` and `jsonld`, those two objects are empty.

		###### .data
		Gets the normalized result of the parsed format.
		## Caveat

		```js
		// Let's print this out for our example
		> parsedMicrodata.data
		This is less a caveat than a design choice: the parser is strict. It might not parse as expected if the HTML isn't encoded correctly, which can make it look as though the parser is broken.

		For example, take the following HTML snippet.

		```html
		<div itemscope itemtype="http://schema.org/Movie">
		<h1 itemprop="name">Ghostbusters</h1>
		<div itemprop="productionCompany" itemscope itemtype="http://schema.org/Organization">Black Rhino</div>
		<div itemprop="countryOfOrigin" itemscope itemtype="http://schema.org/Country">
		Country: <span itemprop="name" content="USA">United States</span><p>
		</div>
		</div>
		```
		OUTPUT:
		```json
		[
		{
		"@context": "http://schema.org/",
		"@type": "Product",
		"brand": "ACME",
		"name": "Executive Anvil",
		"image": "anvil_executive.jpg",
		"description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.",
		"mpn": "925872",
		"aggregateRating": {
		"@context": "http://schema.org/",
		"@type": "AggregateRating",
		"ratingValue": "4.4",
		"reviewCount": "89"
		},
		"offers": {
		"@context": "http://schema.org/",
		"@type": "Offer",
		"priceCurrency": "USD",
		"price": "119.99",
		"priceValidUntil": "5 November!",
		"seller": {
		"@context": "http://schema.org/",
		"@type": "Organization",
		"name": "Executive Objects"
		},
		"itemCondition": "http://schema.org/UsedCondition",
		"availability": "http://schema.org/InStock"
		}
		}
		]
		```

		###### .unnormalizedData
		Gets the unnormalized flattened intermediate result of the parsed format which includes meta information relating to the parsed properties.
		The problem here is that the `itemprop` `productionCompany`, which is of `itemtype` `Organization`, doesn't have any `itemprop` among its children (in this case, `name`).

		For more examples, [See this output here](https://github.com/ind9/web-auto-extractor/blob/master/test/resources/expectedResult.json) which uses [this HTML](https://github.com/ind9/web-auto-extractor/blob/master/test/resources/testPage.html)
		The parser assumes every `itemtype` contains an `itemprop`, or every `typeof` contains a `property` in case of `rdfa`. So the `"Black Rhino"` information is lost.

		It would be nice to fix this by adding a `non-strict` mode for parsing this information. PRs are welcome.

		## License

		MIT

tonicExample.js

		@@ -9,14 +9,9 @@ var WAE = require('web-auto-extractor').default
		// Callback passed to request(pageUrl, ...): parses the received `body` and logs a result.
		request(pageUrl, function (error, response, body) {
		var wae = WAE.parse(body)

		var wae = WAE()

		var parsed = wae.parse(body)

		// NOTE(review): `wae` was last assigned the return value of WAE() above, so this
		// logs that object rather than `parsed` — confirm whether `parsed` was intended.
		console.log(wae)
		/*
		OUTPUT
		======
		{
		microdata: { .. },
		rdfa: { .. },
		jsonld: { .. },
		metaTags: { .. }
		}
		*/

		})

dist/parsers/fields.js

Fixed alerts

Improved metrics

Worsened metrics