Socket
Socket
Sign inDemoInstall

feedparser

Package Overview
Dependencies
32
Maintainers
1
Versions
100
Alerts
File Explorer

Advanced tools

Install Socket

Detect and block malicious and high-risk dependencies

Install

Comparing version 0.15.4 to 0.16.0

test/api.js

18

bin/dump.js

@@ -5,16 +5,16 @@ #!/usr/bin/env node

*
* Usage: node dump.js <feed url or filename>
* Usage: curl <feed url> | bin/dump.js
* cat <feed file> | bin/dump.js
*
*/
var util = require('util')
, feedparser = require('../')
, file = process.argv[2];
, FeedParser = require('../');
if (!file) {
process.exit(2);
}
feedparser.parseFile(file)
process.stdin.pipe(new FeedParser())
.on('error', console.error)
.on('complete', function(){
console.log(util.inspect(arguments, null, 10, true));
.on('readable', function() {
var stream = this, item;
while (item = stream.read()) {
console.log(util.inspect(item, null, 10, true));
}
});

@@ -9,3 +9,3 @@ /*!

, fs = require('fs')
, feed = '../test/feeds/rss2sample.xml';
, feed = __dirname+'/../test/feeds/rss2sample.xml';

@@ -20,4 +20,7 @@ fs.createReadStream(feed)

})
.on('article', function(article){
console.log('Got article: %s', article.title || article.description);
.on('readable', function() {
var stream = this, item;
while (item = stream.read()) {
console.log('Got article: %s', item.title || item.description);
}
});
v0.16.0 / 2013-06-11
==================
* Update README
* Remove legacy libxml-like helpers
* Update dump script
* Update examples
* Update tests
* Emit SAXErrors and allow consumer to handle or bail on SAXErrors
* Update copyright notices
* Merge branch 'AndreasMadsen-transform-stream'
* Change stream test to not require additional dependency
* make feedparser a transform stream
v0.15.4 / 2013-06-04

@@ -3,0 +17,0 @@ ==================

/**********************************************************************
node-feedparser - A robust RSS, Atom, RDF parser for node.
http://github.com/danmactough/node-feedparser
Copyright (c) 2011 Dan MacTough
Copyright (c) 2011, 2012, 2013 Dan MacTough and contributors
http://yabfog.com

@@ -13,18 +13,54 @@

var sax = require('sax')
, request = require('request')
, addressparser = require('addressparser')
, indexOfObject = require('array-indexofobject')
, resanitize = require('resanitize')
, fs = require('fs')
, URL = require('url')
, util = require('util')
, EventEmitter = require('events').EventEmitter
, Stream = require('stream').Stream
, STATUS_CODES = require('http').STATUS_CODES
, TransformStream = require('stream').Transform
, utils = require('./utils')
;
if (TransformStream === undefined) {
TransformStream = require('readable-stream').Transform;
}
/**
* FeedParser constructor. Most apps will only use one instance.
*
* Exposes a duplex (transform) stream to parse a feed.
*
* Each article/post in the feed will have the following keys:
* - title {String}
* - description {String}
* - summary {String}
* - date {Date} (or null)
* - pubdate {Date} (or null)
* - link {String}
* - origlink {String}
* - author {String}
* - guid {String}
* - comments {String}
* - image {Object}
* - categories {Array}
* - source {Object}
* - enclosures {Array}
* - meta {Object}
* - Object.keys(meta):
* - #ns {Array} key,value pairs of each namespace declared for the feed
* - #type {String} one of 'atom', 'rss', 'rdf'
* - #version {String}
* - title {String}
* - description {String}
* - date {Date} (or null)
* - pubdate {Date} (or null)
* - link {String} i.e., to the website, not the feed
* - xmlurl {String} the canonical URL of the feed, as declared by the feed
* - author {String}
* - language {String}
* - image {Object}
* - favicon {String}
* - copyright {String}
* - generator {String}
* - categories {Array}
*
* @param {Object} options

@@ -35,2 +71,6 @@ * @api public

if (!(this instanceof FeedParser)) return new FeedParser(options);
TransformStream.call(this, {
objectMode: true
});
this.init();

@@ -40,3 +80,3 @@ this.parseOpts(options);

this.stream = sax.createStream(this.options.strict /* strict mode - no by default */, {lowercase: true, xmlns: true });
this.stream.on('error', this.handleError.bind(this, this.handleSaxError.bind(this)));
this.stream.on('error', this.handleSaxError.bind(this));
this.stream.on('processinginstruction', this.handleProcessingInstruction.bind(this));

@@ -48,7 +88,4 @@ this.stream.on('opentag', this.handleOpenTag.bind(this));

this.stream.on('end', this.handleEnd.bind(this));
Stream.call(this);
this.writable = true;
this.readable = true;
}
util.inherits(FeedParser, Stream);
util.inherits(FeedParser, TransformStream);

@@ -66,3 +103,2 @@ /*

};
this.articles = [];
this.stack = [];

@@ -76,3 +112,2 @@ this.nodes = {};

this.errors = [];
this.silenceErrors = false;
};

@@ -88,2 +123,3 @@

if (!('addmeta' in this.options)) this.options.addmeta = true;
if (!('resume_saxerror' in this.options)) this.options.resume_saxerror = true;
if ('MAX_BUFFER_LENGTH' in this.options) {

@@ -100,32 +136,17 @@ sax.MAX_BUFFER_LENGTH = this.options.MAX_BUFFER_LENGTH; // set to Infinity to have unlimited buffers

// parsing a feed
if (!this.errors.length && this.meta && !this.meta['#type']) {
this.meta['#type'] = 'INVALID'; // Set a value so we don't cause an infinite loop
if (this.meta && !this.meta['#type']) {
var e = new Error('Not a feed');
if (this.response && this.response.request && this.response.request.href) {
e.url = this.response.request.href;
}
return this.handleError(e);
}
if ('function' === typeof this.callback) {
if (this.errors.length) {
var error = this.errors.pop();
if (this.errors.length) {
error.errors = this.errors;
}
this.callback(error);
} else {
this.callback(null, this.meta, this.articles);
}
this.push(null);
};
FeedParser.prototype.handleSaxError = function (e) {
this.emit('error', e);
if (this.options.resume_saxerror) {
this.resumeSaxError();
}
if (!this.errors.length) { this.emit('complete', this.meta, this.articles); }
this.emit('end');
if (this.stream) {
this.stream.removeAllListeners('end');
this.stream.removeAllListeners('error');
}
this.stream.on('error', function() {});
this.stream._parser.close();
};
FeedParser.prototype.handleSaxError = function (){
FeedParser.prototype.resumeSaxError = function () {
if (this.stream._parser) {

@@ -137,23 +158,4 @@ this.stream._parser.error = null;

FeedParser.prototype.handleError = function (next, e){
// A SaxError will prepend an error-handling callback,
// but other calls to #handleError will not
if (next && !e) {
e = next;
next = null;
}
// Only emit the error event if we are not using CPS or
// if we have a listener on 'error' even if we are using CPS
if (!this.silenceErrors && (!this.callback || this.listeners('error').length)) {
this.emit('error', e);
}
this.errors.push(e);
if (typeof next === 'function') {
next();
} else {
['processinginstruction', 'opentag', 'closetag', 'text', 'cdata', 'end'].forEach(function(ev){
this.stream && this.stream.removeAllListeners(ev);
}, this);
this.handleEnd();
}
FeedParser.prototype.handleError = function (e){
this.emit('error', e);
};

@@ -314,4 +316,3 @@

if (this.meta.author && !item.author) item.author = this.meta.author;
this.emit('article', item);
this.articles.push(item);
this.push(item);
} else if (!this.meta.title && // We haven't yet parsed all the metadata

@@ -1001,249 +1002,12 @@ (node['#name'] === 'channel' ||

// Naive Stream API
FeedParser.prototype.write = function (data) {
FeedParser.prototype._transform = function (data, encoding, done) {
this.stream.write(data);
return true;
done();
};
FeedParser.prototype.end = function (chunk) {
if (chunk && chunk.length) this.stream.write(chunk);
FeedParser.prototype._flush = function (done) {
this.stream.end();
return true;
done();
};
/**
 * Builds a FeedParser instance for the legacy callback-style helpers
 * (parseString, parseFile, parseStream, parseUrl).
 *
 * @param {Object|Function} options - parser options, or the callback when
 *   called with a single function argument
 * @param {Function} callback - stored on the instance as `fp.callback`;
 *   invoked later as callback(error, meta, articles)
 * @returns {FeedParser} the configured parser
 * @api private
 */
function feedparser (options, callback) {
  // Support the (callback)-only calling convention.
  if (typeof options === 'function') {
    callback = options;
    options = {};
  }
  var fp = new FeedParser(options);
  fp.callback = callback;
  return fp;
}
/**
* Parses a feed contained in a string.
*
* For each article/post in a feed, emits an 'article' event
* with an object with the following keys:
* title {String}
* description {String}
* summary {String}
* date {Date} (or null)
* pubdate {Date} (or null)
* link {String}
* origlink {String}
* author {String}
* guid {String}
* comments {String}
* image {Object}
* categories {Array}
* source {Object}
* enclosures {Array}
* meta {Object}
* Object.keys(meta):
* #ns {Array} key,value pairs of each namespace declared for the feed
* #type {String} one of 'atom', 'rss', 'rdf'
* #version {String}
* title {String}
* description {String}
* date {Date} (or null)
* pubdate {Date} (or null)
* link {String} i.e., to the website, not the feed
* xmlurl {String} the canonical URL of the feed, as declared by the feed
* author {String}
* language {String}
* image {Object}
* favicon {String}
* copyright {String}
* generator {String}
* categories {Array}
*
* Emits a 'warning' event on each XML parser warning
*
* Emits an 'error' event on each XML parser error
*
* @param {String} string of XML representing the feed
* @param {Object} options
* @param {Function} callback
* @api public
*/
/**
 * Parses a feed contained in a string or Buffer.
 *
 * @param {String|Buffer} string - the raw feed XML
 * @param {Object} options
 * @param {Function} callback
 * @returns {FeedParser}
 * @api public
 */
FeedParser.parseString = function (string, options, callback) {
  var fp = feedparser(options, callback);
  // Defer one tick so the caller has a chance to attach event
  // handlers before any parse events fire.
  process.nextTick(function () {
    // Buffers are written as-is; strings are treated as UTF-8.
    var encoding = Buffer.isBuffer(string) ? null : 'utf8';
    fp.stream.on('error', fp.handleError.bind(fp));
    fp.stream.end(string, encoding);
  });
  return fp;
};
/**
* Parses a feed from a file or (for compatibility with libxml) a url.
* See parseString for more info.
*
* @param {String} path to the feed file or a fully qualified uri or parsed url object from url.parse()
* @param {Object} options
* @param {Function} callback
* @api public
*/
/**
 * Parses a feed from a local file, or delegates to parseUrl when given
 * an http(s) string or a url-like object.
 *
 * @param {String|Object} file - path, fully qualified uri, or parsed url object
 * @param {Object} options
 * @param {Function} callback
 * @returns {FeedParser}
 * @api public
 */
FeedParser.parseFile = function (file, options, callback) {
  // Anything that looks like a url (http(s) string, or an object carrying
  // href/uri/url) is fetched remotely instead of read from disk.
  var looksLikeUrl = /^https?:/.test(file) ||
    (typeof file === 'object' && ('href' in file || 'uri' in file || 'url' in file));
  if (looksLikeUrl) {
    return FeedParser.parseUrl(file, options, callback);
  }
  var fp = feedparser(options, callback);
  var source = fs.createReadStream(file);
  source.on('error', fp.handleError.bind(fp));
  source.pipe(fp.stream);
  return fp;
};
/**
* Parses a feed from a Stream.
*
* Example:
* fp = new FeedParser();
* fp.on('article', function (article){ // do something });
* fp.parseStream(fs.createReadStream('file.xml')[, callback]);
*
*
* See parseString for more info.
*
* @param {Readable Stream}
* @param {Object} options
* @param {Function} callback
* @api public
*/
/**
 * Parses a feed from a readable stream.
 *
 * @param {Stream} stream - a readable stream supplying the feed XML
 * @param {Object} options
 * @param {Function} callback
 * @returns {FeedParser}
 * @api public
 */
FeedParser.parseStream = function (stream, options, callback) {
  var fp = feedparser(options, callback);
  // A missing stream is tolerated: the parser is returned unattached,
  // matching the original short-circuit behavior.
  if (stream) {
    stream.on('error', fp.handleError.bind(fp));
    stream.pipe(fp.stream);
  }
  return fp;
};
/**
* Parses a feed from a url.
*
* Please consider whether it would be better to perform conditional GETs
* and pass in the results instead.
*
* See parseString for more info.
*
* @param {String|Object} fully qualified uri, parsed url object from url.parse(),
* or a Request object with uri|url and headers
* @param {Object} options
* @param {Function} callback
* @api public
*/
// Fetches a feed over HTTP(S) via `request` and pipes the response body
// into the parser. Normalizes several accepted url shapes (string,
// parsed-url object, request-options object) before issuing the request.
FeedParser.parseUrl = function (url, options, callback) {
  var fp = feedparser(options, callback);
  // Invoked on the HTTP response before any body data is parsed.
  var handleResponse = function (response) {
    fp.response = response;
    fp.emit('response', response);
    var code = response.statusCode;
    var codeReason = STATUS_CODES[code] || 'Unknown Failure';
    var contentType = response.headers && response.headers['content-type'] || '';
    var e = new Error();
    if (code !== 200) {
      if (code === 304) {
        // Not Modified: emit '304', drop results and suppress the usual
        // completion events, then end the parse quietly.
        fp.emit('304');
        fp.meta = fp.articles = null;
        fp.silenceErrors = true;
        fp.removeAllListeners('complete');
        fp.removeAllListeners('meta');
        fp.removeAllListeners('article');
        fp.handleEnd();
      }
      else {
        // Any other non-200 status is surfaced as an error and the
        // request is aborted so no body is parsed.
        e.message = 'Remote server responded: ' + codeReason;
        e.code = code;
        e.url = url;
        fp.handleError(e);
        response.request && response.request.abort();
      }
      return;
    }
    // 200 OK: record the Content-Type header (media type plus any
    // parameters, e.g. charset) on the feed meta.
    (function () {
      var parts = contentType.split(/; ?/);
      var mediatype = parts[0]
        , mediatype_parts = mediatype.split('/')
        , parameters = parts.length ? parts.slice(1) : [];
      fp.meta['#content-type'] = parameters.reduce(function (map, param) {
        var pair = param.split('=');
        map['@'][pair[0]] = pair[1];
        return map;
      },{
        '#': contentType,
        '@': {
          'media-type': mediatype,
          'type': mediatype_parts[0],
          'subtype': mediatype_parts[1],
        }
      });
    })();
    return;
  };
  // Make sure we have a url and normalize the request object
  var invalid = 'Invalid URL: must be a string or valid request object - %s';
  if (/^https?:/.test(url)) {
    // Plain http(s) string -> request options object.
    url = {
      uri: url
    };
  } else if (url && typeof url === 'object') {
    if ('href' in url) { // parsed url
      if (!/^https?:/.test(URL.format(url))) {
        throw (new Error(util.format(invalid, url)));
      }
      url = {
        url: url
      };
    } else {
      // Request-options object: reconcile the `url`/`uri` aliases so that
      // a string ends up in `uri` and a parsed-url object in `url`.
      if (url.url && url.uri) delete url.uri; // wtf?!
      if (! (url.url || url.uri) ) throw (new Error(util.format(invalid, url)));
      if (url.url) {
        if (/^https?:/.test(url.url)) {
          url.uri = url.url;
          delete url.url;
        } else if ( !(typeof url.url === 'object' && 'href' in url.url && /^https?:/.test(URL.format(url.url))) ) {
          // not a string, not a parsed url
          throw (new Error(util.format(invalid, url.url)));
        }
      }
      if (url.uri) {
        if ( typeof url.uri === 'object' && 'href' in url.uri && /^https?:/.test(URL.format(url.uri)) ) {
          url.url = url.uri;
          delete url.uri;
        } else if (!/^https?:/.test(url.uri)) {
          // not a string, not a parsed url
          throw (new Error(util.format(invalid, url.uri)));
        }
      }
    }
  } else {
    throw (new Error(util.format(invalid, url)));
  }
  url.headers = url.headers || {};
  // Disable compressed transfer so the sax stream receives raw XML.
  url.headers['Accept-Encoding'] = 'identity';
  // Seed xml:base with the request url unless one is already set
  // (presumably via the `feedurl` option -- TODO confirm).
  if (!fp.xmlbase.length) {
    if (url.uri) {
      fp.xmlbase.unshift({ '#name': 'xml', '#': url.uri });
    } else if (url.url) {
      fp.xmlbase.unshift({ '#name': 'xml', '#': URL.format(url.url) });
    }
  }
  request(url)
    .on('error', fp.handleError.bind(fp))
    .on('response', handleResponse)
    .pipe(fp.stream)
    ;
  return fp;
};
exports = module.exports = FeedParser;

@@ -5,3 +5,3 @@ {

"description": "Robust RSS Atom and RDF feed parsing using sax js",
"version": "0.15.4",
"version": "0.16.0",
"keywords": [

@@ -34,2 +34,3 @@ "rss",

"array-indexofobject": "0.0.1",
"readable-stream": "1.0.x",
"resanitize": "~0.1.10"

@@ -36,0 +37,0 @@ },

@@ -16,6 +16,6 @@ [![Build Status](https://secure.travis-ci.org/danmactough/node-feedparser.png?branch=master)](https://travis-ci.org/danmactough/node-feedparser)

- [sax](https://github.com/isaacs/sax-js)
- [request](https://github.com/mikeal/request)
- [addressparser](https://github.com/andris9/addressparser)
- [resanitize](https://github.com/danmactough/node-resanitize)
- [array-indexofobject](https://github.com/danmactough/node-array-indexofobject)
- [readable-stream](https://github.com/isaacs/readable-stream) (only if using Node <= v0.8.x)

@@ -28,11 +28,26 @@ ## Installation

## Changes since v0.13.x
## Changes since v0.15.x
- The ability to handle `.pipe()` is back. The libxml-like helper methods will
probably be going away in the next minor (or major) version release -- as soon
as I'm sure the stream API is stable and compatible with Node v0.10.x.
- The libxml-like helper methods have been removed. There is now just one input
interface: the stream interface.
- Events:
- `304`, `response` - removed, as Feedparser no longer fetches urls
- `article`, `complete` - removed; use the stream interface
- `data` - all readable streams will emit a `data` event, but this puts the
stream into "old" v0.8-style push streams
- `end` - stream behavior dictates that the `end` event will never fire if
you don't read any data from the stream; you can kick the Feedparser stream
to work like an "old" v0.8-style push stream (and get the old `end` event
behavior) by calling `.resume()`.
- `SAXErrors` are emitted as `error` events. By default, they are automatically
resumed. Pass `{ resume_saxerror: false }` as an option if you want to manually
handle `SAXErrors` (abort parsing, perhaps).
## Usage
The easiest way to use feedparser is to just give it a [readable stream](http://nodejs.org/api/stream.html#stream_readable_stream).
The easiest way to use feedparser is to just give it a [readable stream](http://nodejs.org/api/stream.html#stream_readable_stream).
It will then return a readable object stream.

@@ -52,10 +67,19 @@ ```js

})
.on('article', function (article) {
// do something else
.on('readable', function () {
// do something else, then do the next thing
})
.on('end', function () {
// do the next thing
});
```
Or:
```js
var FeedParser = require('feedparser')
, request = require('request');
request('http://somefeedurl.xml')
.pipe(new FeedParser([options]))
.pipe([some other stream])
```
### options

@@ -69,3 +93,3 @@

- `addmeta` - Set to `false` to override Feedparser's default behavior, which
is to add the feed's `meta` information to each `article`.
is to add the feed's `meta` information to each article.

@@ -82,25 +106,8 @@ - `feedurl` - The url (string) of the feed. FeedParser is very good at

## libxml-like Helper Methods (deprecated)
- `resume_saxerror` - Set to `false` to override Feedparser's default behavior, which
is to emit any `SAXError` on `error` and then automatically resume parsing. In
my experience, `SAXErrors` are not usually fatal, so this is usually helpful
behavior. If you want total control over handling these errors and optionally
aborting parsing the feed, use this option.
### parser.parseString(string, [options], [callback])
- `string` - the contents of the feed
### parser.parseFile(filename, [options], [callback])
- `filename` - a local filename or remote url
### parser.parseUrl(url, [options], [callback])
The first argument can be either a url or a `request` options object. The only
required option is uri, all others are optional. See
[request](https://github.com/mikeal/request#requestoptions-callback) for details
about what that `request` options object might look like.
- `url` - fully qualified uri or a parsed url object from url.parse()
### parser.parseStream(readableStream, [options], [callback])
- `readableStream` - a [Readable Stream](http://nodejs.org/api/stream.html#stream_readable_stream)
## Examples

@@ -112,22 +119,17 @@

### Events Emitted
### Transform Stream
* `error` - called with `error` whenever there is a an error of any kind (SAXError, Feedparser error, request error, etc.)
* `meta` - called with `meta` when it has been parsed
* `article` - called with a single `article` when each article has been parsed
* `complete` - called with `meta` and `articles` when parsing is complete
* `end` - called with no parameters when parsing is complete or aborted (e.g., due to error)
* `response` - called with the HTTP `response` only when a url has been fetched via parseUrl or parseFile
* `304` - called with no parameters when a url has been fetched with a conditional GET via parseUrl or parseFile and the remote server responds with '304 Not Modified'
Feedparser is a [transform stream](http://nodejs.org/api/stream.html#stream_class_stream_transform) operating in "object mode": XML in -> Javascript objects out.
Each readable chunk is an object representing an article in the feed.
### callback(error, meta, articles)
### Events Emitted
You can provide a callback (i.e., via the libxml-like helper methods) to be
called when the feed is finished being parsed. Because the helper methods are
deprecated, you should consider the ability to provide a callback deprecated,
as well. Use events instead.
* `meta` - called with feed `meta` when it has been parsed
* `error` - called with `error` whenever there is a Feedparser error of any kind (SAXError, Feedparser error, etc.)
## What is the parsed output produced by feedparser?
Feedparser parses each feed into a `meta` portion and one or more `articles`.
Feedparser parses each feed into a `meta` (emitted on the `meta` event) portion
and one or more `articles` (emitted on the `data` event or readable after the `readable`
event is emitted).

@@ -216,3 +218,3 @@ Regardless of the format of the feed, the `meta` and each `article` contain a

Copyright (c) 2011-2013 Dan MacTough &lt;danmactough@gmail.com&gt;
Copyright (c) 2011, 2012, 2013 Dan MacTough and contributors

@@ -219,0 +221,0 @@ Permission is hereby granted, free of charge, to any person obtaining a copy of

@@ -1,26 +0,21 @@

describe('feedparser', function(){
describe('categories', function(){
var feed = __dirname + '/feeds/category-feed.xml'
, meta = {}
, articles = {}
;
var feed = __dirname + '/feeds/category-feed.xml';
describe('categories with comma in them', function(){
before(function(done){
FeedParser.parseFile(feed, function (error, _meta, _articles) {
assert.ifError(error);
meta = _meta;
articles = _articles;
done();
});
});
describe('article', function(){
it('should should not seperate by comma', function() {
assert.deepEqual(articles[0].categories, [
it('should not seperate by comma', function (done) {
fs.createReadStream(feed).pipe(new FeedParser())
.once('readable', function () {
var stream = this;
assert.deepEqual(stream.read().categories, [
'Water Pollution',
'Gowanus Canal (Brooklyn, NY)'
]);
done();
})
.on('error', function (err) {
assert.ifError(err);
done(err);
});
});
});
});

@@ -5,7 +5,7 @@ /*global assert:true, FeedParser:true, server:true*/

assert = require('assert');
var fs = require('fs')
, path = require('path')
var path = require('path')
, zlib = require('zlib')
, gzip = zlib.createGzip();
fs = require('fs');
FeedParser = require('../');

@@ -12,0 +12,0 @@ server = function (done) {

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc