robots-parser
Comparing version 1.0.0 to 1.0.1
 {
 	"name": "robots-parser",
-	"version": "1.0.0",
+	"version": "1.0.1",
 	"description": "Robots.txt parser.",
@@ -10,7 +10,7 @@ "main": "index.js",
 	"scripts": {
-		"test": "./node_modules/mocha/bin/mocha test"
+		"test": "istanbul cover _mocha --print detail --report html --dir ./report/coverage"
 	},
-	"repository" : {
-		"type" : "git",
-		"url" : "https://github.com/samclarke/robots-parser.git"
+	"repository": {
+		"type": "git",
+		"url": "https://github.com/samclarke/robots-parser.git"
 	},
@@ -20,5 +20,6 @@ "author": "Sam Clarke <sam@samclarke.com>",
 	"devDependencies": {
-		"chai": "^1.9.1",
-		"mocha": "^1.21.4"
+		"chai": "^3.5.0",
+		"istanbul": "^0.4.5",
+		"mocha": "^3.1.2"
 	}
 }
@@ -102,3 +102,3 @@ var libUrl = require('url');
-		if (!line || line[0].indexOf('#') === 0) {
+		if (!line || !line[0] || line[0].indexOf('#') === 0) {
 			continue;
@@ -113,3 +113,5 @@ }
-				currentUserAgents.push(formatUserAgent(line[1]));
+				if (line[1]) {
+					currentUserAgents.push(formatUserAgent(line[1]));
+				}
 				break;
@@ -126,10 +128,14 @@ case 'disallow':
 			case 'sitemap':
-				robots.addSitemap(line[1]);
+				if (line[1]) {
+					robots.addSitemap(line[1]);
+				}
 				break;
 			case 'host':
-				robots.setPreferredHost(line[1].toLowerCase());
+				if (line[1]) {
+					robots.setPreferredHost(line[1].toLowerCase());
+				}
 				break;
 		}
-		isNoneUserAgentState = line[0] !== 'user-agent';
+		isNoneUserAgentState = line[0].toLowerCase() !== 'user-agent';
 	}
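With these guards in place, directives that have no value are skipped instead of being handed to the directive handlers. A minimal sketch of the resulting behaviour, based on the new tests later in this diff (the URL and robots.txt contents are illustrative):

    var robotsParser = require('robots-parser');

    // An empty "User-agent:" line opens no group, so the disallow rules that
    // follow are not attached to any agent and every URL stays allowed.
    var robots = robotsParser('http://www.example.com/robots.txt', [
    	'User-agent:',
    	'Disallow: /fish/',
    	'Disallow: /test.html'
    ].join('\n'));

    console.log(robots.isAllowed('http://www.example.com/fish/')); // true
    console.log(robots.getPreferredHost());                        // null, no host directive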
@@ -257,6 +263,2 @@ }
  *
- * If the rules don't specify it will return the
- * return true unless defaultToDissallow option
- * is set to true.
- *
  * @param {string} url
@@ -267,3 +269,2 @@ * @param {string?} ua
 Robots.prototype.isAllowed = function (url, ua) {
-	var rule = true;
 	var parsedUrl = libUrl.parse(url);
@@ -282,11 +283,5 @@ var userAgent = formatUserAgent(ua || '*');
-	if (!this._rules[userAgent]) {
-		userAgent = '*';
-	}
+	var rules = this._rules[userAgent] || this._rules['*'] || [];
-	if (this._rules[userAgent]) {
-		rule = isPathAllowed(parsedUrl.path, this._rules[userAgent]);
-	}
-	return rule;
+	return isPathAllowed(parsedUrl.path, rules);
 };
@@ -337,2 +332,3 @@
-module.exports = Robots;
+module.exports = Robots;
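The rewritten isAllowed collapses the user-agent lookup into one expression: the requested agent's rules are used when they exist, otherwise the '*' group applies, and an empty rule set allows everything. A short usage sketch of that behaviour (agent names and URLs are illustrative; the fallback itself is exercised by the existing "fallback to *" tests below):

    var robotsParser = require('robots-parser');

    var robots = robotsParser('http://www.example.com/robots.txt', [
    	'User-agent: *',
    	'Disallow: /secret.html',
    	'',
    	'User-agent: b',
    	'Disallow: /'
    ].join('\n'));

    // An agent with its own group does not fall back to '*':
    console.log(robots.isAllowed('http://www.example.com/index.html', 'b'));  // false

    // An agent without a group of its own uses the '*' rules:
    console.log(robots.isAllowed('http://www.example.com/secret.html', 'a')); // false
    console.log(robots.isAllowed('http://www.example.com/index.html', 'a'));  // true

The remaining hunks are from the test suite: they drop the mocha done callbacks from tests that are entirely synchronous and add coverage for comments, invalid lines, empty directives, multi-agent groups and null robots.txt contents.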
@@ -14,3 +14,3 @@ var robotsParser = require('../index');
 	disallowed.forEach(function (url) {
-		expect(robots.isAllowed(url)).to.equal(false);
+		expect(robots.isDisallowed(url)).to.equal(true);
 	});
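The shared helper now checks the disallowed list through isDisallowed rather than negating isAllowed; the two calls report the same decision from opposite directions. A tiny sketch (URL and contents are illustrative):

    var robotsParser = require('robots-parser');

    var robots = robotsParser('http://www.example.com/robots.txt', [
    	'User-agent: *',
    	'Disallow: /secret.html'
    ].join('\n'));

    console.log(robots.isAllowed('http://www.example.com/secret.html'));    // false
    console.log(robots.isDisallowed('http://www.example.com/secret.html')); // true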
@@ -20,3 +20,3 @@ }
 describe('Robots', function () {
-	it('should parse the disallow directive', function (done) {
+	it('should parse the disallow directive', function () {
 		var contents = [
@@ -40,7 +40,5 @@ 'User-agent: *',
 		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
-		done();
 	});
-	it('should parse the allow directive', function (done) {
+	it('should parse the allow directive', function () {
 		var contents = [
@@ -67,7 +65,5 @@ 'User-agent: *',
 		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
-		done();
 	});
-	it('should parse patterns', function (done) {
+	it('should parse patterns', function () {
 		var contents = [
@@ -92,7 +88,5 @@ 'User-agent: *',
 		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
-		done();
 	});
-	it('should have the correct order presidence for allow and disallow', function (done) {
+	it('should have the correct order presidence for allow and disallow', function () {
 		var contents = [
@@ -119,7 +113,5 @@ 'User-agent: *',
 		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
-		done();
 	});
-	it('should ignore rules that are not in a group', function (done) {
+	it('should ignore rules that are not in a group', function () {
 		var contents = [
@@ -137,9 +129,90 @@ 'Disallow: /secret.html',
 		testRobots('http://www.example.com/robots.txt', contents, allowed, []);
-		done();
 	});
+	it('should ignore comments', function () {
+		var contents = [
+			'#',
+			'# This is a comment',
+			'#',
+			'User-agent: *',
+			'# This is a comment',
+			'Disallow: /fish/',
+			'# Disallow: fish',
+			'Disallow: /test.html'
+		].join('\n');
+		var allowed = [
+			'http://www.example.com/fish',
+			'http://www.example.com/Test.html'
+		];
+		var disallowed = [
+			'http://www.example.com/fish/index.php',
+			'http://www.example.com/fish/',
+			'http://www.example.com/test.html'
+		];
+		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
+	});
-	it('should return undefined for invalid urls', function (done) {
+	it('should ignore invalid lines', function () {
+		var contents = [
+			'invalid line',
+			'User-agent: *',
+			'Disallow: /fish/',
+			':::::another invalid line:::::',
+			'Disallow: /test.html',
+			'Unknown: tule'
+		].join('\n');
+		var allowed = [
+			'http://www.example.com/fish',
+			'http://www.example.com/Test.html'
+		];
+		var disallowed = [
+			'http://www.example.com/fish/index.php',
+			'http://www.example.com/fish/',
+			'http://www.example.com/test.html'
+		];
+		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
+	});
+	it('should ignore empty user-agent lines', function () {
+		var contents = [
+			'User-agent:',
+			'Disallow: /fish/',
+			'Disallow: /test.html'
+		].join('\n');
+		var allowed = [
+			'http://www.example.com/fish',
+			'http://www.example.com/Test.html',
+			'http://www.example.com/fish/index.php',
+			'http://www.example.com/fish/',
+			'http://www.example.com/test.html'
+		];
+		var disallowed = [];
+		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
+	});
+	it('should support groups with multiple user agents (case insensitive)', function () {
+		var contents = [
+			'User-agent: agenta',
+			'User-agent: agentb',
+			'Disallow: /fish',
+		].join('\n');
+		var robots = robotsParser('http://www.example.com/robots.txt', contents);
+		expect(robots.isAllowed("http://www.example.com/fish", "agenta")).to.equal(false);
+	});
+	it('should return undefined for invalid urls', function () {
 		var contents = [
 			'User-agent: *',
 			'Disallow: /secret.html',
@@ -160,7 +233,5 @@ 'Disallow: /test',
 		});
-		done();
 	});
-	it('should handle Unicode and punycode URLs', function (done) {
+	it('should handle Unicode and punycode URLs', function () {
 		var contents = [
@@ -183,7 +254,5 @@ 'User-agent: *',
 		testRobots('http://www.münich.com/robots.txt', contents, allowed, disallowed);
-		done();
 	});
-	it('should allow all if empty robots.txt', function (done) {
+	it('should allow all if empty robots.txt', function () {
 		var allowed = [
@@ -200,7 +269,12 @@ 'http://www.example.com/secret.html',
 		});
-		done();
 	});
+	it('should treat null as allowing all', function () {
+		var robots = robotsParser('http://www.example.com/robots.txt', null);
+		expect(robots.isAllowed("http://www.example.com/", "userAgent")).to.equal(true);
+		expect(robots.isAllowed("http://www.example.com/")).to.equal(true);
+	});
-	it('should parse the crawl-delay directive', function (done) {
+	it('should parse the crawl-delay directive', function () {
 		var contents = [
@@ -224,7 +298,6 @@ 'user-agent: a',
 		expect(robots.getCrawlDelay('d')).to.equal(10);
-		done();
+		expect(robots.getCrawlDelay()).to.equal(undefined);
 	});
-	it('should ignore invalid crawl-delay directives', function (done) {
+	it('should ignore invalid crawl-delay directives', function () {
 		var contents = [
@@ -248,7 +321,5 @@ 'user-agent: a',
 		expect(robots.getCrawlDelay('d')).to.equal(undefined);
-		done();
 	});
-	it('should parse the sitemap directive', function (done) {
+	it('should parse the sitemap directive', function () {
 		var contents = [
@@ -273,7 +344,5 @@ 'user-agent: a',
 		]);
-		done();
 	});
-	it('should parse the host directive', function (done) {
+	it('should parse the host directive', function () {
 		var contents = [
@@ -293,7 +362,24 @@ 'user-agent: a',
 		expect(robots.getPreferredHost()).to.equal('example.com');
-		done();
 	});
+	it('should parse empty and invalid directives', function () {
+		var contents = [
+			'user-agent:',
+			'user-agent:::: a::',
+			'crawl-delay:',
+			'crawl-delay:::: 0:',
+			'host:',
+			'host:: example.com',
+			'sitemap:',
+			'sitemap:: site:map.xml',
+			'disallow:',
+			'disallow::: /:',
+			'allow:',
+			'allow::: /:',
+		].join('\n');
+		var robots = robotsParser('http://www.example.com/robots.txt', contents);
+	});
-	it('should treat only the last host directive as valid', function (done) {
+	it('should treat only the last host directive as valid', function () {
 		var contents = [
@@ -314,7 +400,5 @@ 'user-agent: a',
 		expect(robots.getPreferredHost()).to.equal('example.com');
-		done();
 	});
-	it('should return null when there is no host directive', function (done) {
+	it('should return null when there is no host directive', function () {
 		var contents = [
@@ -331,7 +415,5 @@ 'user-agent: a',
 		expect(robots.getPreferredHost()).to.equal(null);
-		done();
 	});
-	it('should fallback to * when a UA has no rules of its own', function (done) {
+	it('should fallback to * when a UA has no rules of its own', function () {
 		var contents = [
@@ -354,7 +436,5 @@ 'user-agent: *',
 		expect(robots.getCrawlDelay('dd')).to.equal(1);
-		done();
 	});
-	it('should not fallback to * when a UA has rules', function (done) {
+	it('should not fallback to * when a UA has rules', function () {
 		var contents = [
@@ -371,7 +451,5 @@ 'user-agent: *',
 		expect(robots.getCrawlDelay('b')).to.equal(undefined);
-		done();
 	});
-	it('should ignore version numbers in the UA string', function (done) {
+	it('should ignore version numbers in the UA string', function () {
 		var contents = [
@@ -395,5 +473,4 @@ 'user-agent: *',
 		expect(robots.getCrawlDelay('b / 1.0')).to.equal(12);
-		done();
 	});
 });
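One of the newly covered behaviours, pulled out as a quick usage sketch (the URL and agent name are illustrative): passing null as the robots.txt contents is treated like an empty file, so every URL is allowed.

    var robotsParser = require('robots-parser');

    var robots = robotsParser('http://www.example.com/robots.txt', null);

    console.log(robots.isAllowed('http://www.example.com/', 'userAgent')); // true
    console.log(robots.isAllowed('http://www.example.com/'));              // true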