robots-parser
Comparing version 2.1.1 to 2.2.0
package.json
 {
   "name": "robots-parser",
-  "version": "2.1.1",
+  "version": "2.2.0",
   "description": "Robots.txt parser.",
@@ -10,3 +10,3 @@ "main": "index.js",
   "scripts": {
-    "test": "istanbul cover _mocha --print detail --report html --dir ./report/coverage"
+    "test": "nyc --clean --reporter html --report-dir ./report/coverage ./node_modules/.bin/mocha"
   },
@@ -20,6 +20,6 @@ "repository": {
   "devDependencies": {
-    "chai": "^3.5.0",
-    "istanbul": "^0.4.5",
-    "mocha": "^3.1.2"
+    "chai": "^4.2.0",
+    "mocha": "^6.1.4",
+    "nyc": "^14.1.1"
   }
 }
README.md
@@ -46,3 +46,3 @@ # Robots Parser [](https://deepscan.io/dashboard/#view=project&pid=1275&bid=3378) [](https://github.com/samclarke/robots-parser/blob/master/license.md)
 robots.getSitemaps(); // ['http://example.com/sitemap.xml']
-robots.getPreferedHost(); // example.com
+robots.getPreferredHost(); // example.com
 ```
@@ -99,2 +99,9 @@
+### Version 2.2.0:
+ * Fix bug with matching wildcard patterns against some URLs
+   – Thanks to @ckylape for reporting and fixing
+ * Changed matching algorithm to match Google's implementation in google/robotstxt
+ * Changed order of precedence to match current spec
 ### Version 2.1.1:
@@ -101,0 +108,0 @@
Robots.js
@@ -81,3 +81,3 @@ var URL = require('url').URL;
  * unicode characters.
- *
+ *
  * @param {string} path
@@ -111,29 +111,65 @@ * @return {string}
 /**
- * Converts the pattern into a regexp if it is a wildcard
- * pattern.
+ * Matches a pattern with the specified path
  *
- * Returns a string if the pattern isn't a wildcard pattern
+ * Uses same algorithm to match patterns as the Google implementation in
+ * google/robotstxt so it should be consistent with the spec.
  *
- * @param {string} pattern
- * @return {string|RegExp}
+ * @see https://github.com/google/robotstxt/blob/f465f0ede81099dd8bc4aeb2966b3a892bd488b3/robots.cc#L74
+ * @param {string} pattern
+ * @param {string} path
+ * @return {boolean}
  * @private
  */
-function parsePattern(pattern) {
-	var regexSpecialChars = /[\-\[\]\/\{\}\(\)\+\?\.\\\^\$\|]/g;
-	// Treat consecutive wildcards as one (#12)
-	var wildCardPattern = /\*+/g;
-	var endOfLinePattern = /\\\$$/;
+function matches(pattern, path) {
+	// I've added extra comments to try make this easier to understand
 
-	pattern = normaliseEncoding(pattern)
+	// Stores the lengths of all the current matching substrings.
+	// Maximum number of possible matching lengths is every length in path plus
+	// 1 to handle 0 length too (if pattern starts with * which is zero or more)
+	var matchingLengths = new Array(path.length + 1);
+	var numMatchingLengths = 1;
 
-	if (pattern.indexOf('*') < 0 && pattern.indexOf('$') < 0) {
-		return pattern;
-	}
+	// Initially longest match is 0
+	matchingLengths[0] = 0;
 
+	for (var p = 0; p < pattern.length; p++) {
+		// If $ is at the end of pattern then we must match the whole path.
+		// Which is true if the longest matching length matches path length
+		if (pattern[p] === '$' && p + 1 === pattern.length) {
+			return matchingLengths[numMatchingLengths - 1] === path.length;
+		}
+
+		// Handle wildcards
+		if (pattern[p] == '*') {
+			// Wildcard so all substrings minus the current smallest matching
+			// length are matches
+			numMatchingLengths = path.length - matchingLengths[0] + 1;
+
+			// Update matching lengths to include the smallest all the way up
+			// to numMatchingLengths
+			// Don't update smallest possible match as * matches zero or more
+			// so the smallest current match is also valid
+			for (var i = 1; i < numMatchingLengths; i++) {
+				matchingLengths[i] = matchingLengths[i - 1] + 1;
+			}
+		} else {
+			// Check the char at the matching length matches the pattern, if it
+			// does increment it and add it as a valid length, ignore if not.
+			var numMatches = 0;
+			for (var i = 0; i < numMatchingLengths; i++) {
+				if (matchingLengths[i] < path.length && path[matchingLengths[i]] === pattern[p]) {
+					matchingLengths[numMatches++] = matchingLengths[i] + 1;
+				}
+			}
+
+			// No paths matched the current pattern char so not a match
+			if (numMatches == 0) {
+				return false;
+			}
+
+			numMatchingLengths = numMatches;
+		}
+	}
 
-	pattern = pattern
-		.replace(regexSpecialChars, '\\$&')
-		.replace(wildCardPattern, '(?:.*)')
-		.replace(endOfLinePattern, '$');
-
-	return new RegExp(pattern);
+	return true;
 }
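To make the new algorithm easier to follow, here is a minimal standalone sketch of the same length-set matcher (rule bookkeeping and normaliseEncoding omitted), exercised against patterns and paths taken from the test changes further down. Instead of compiling the pattern to a RegExp as 2.1.1 did, it tracks the set of prefix lengths of the path that the pattern could have consumed so far:

```js
// Standalone sketch of the matcher added in 2.2.0 (simplified for illustration)
function matches(pattern, path) {
	var matchingLengths = new Array(path.length + 1);
	var numMatchingLengths = 1;

	// Initially only the empty prefix matches
	matchingLengths[0] = 0;

	for (var p = 0; p < pattern.length; p++) {
		// A trailing $ anchors the pattern to the end of the path
		if (pattern[p] === '$' && p + 1 === pattern.length) {
			return matchingLengths[numMatchingLengths - 1] === path.length;
		}

		if (pattern[p] === '*') {
			// * matches zero or more characters, so every length from the
			// current shortest match up to the full path is now a match
			numMatchingLengths = path.length - matchingLengths[0] + 1;
			for (var i = 1; i < numMatchingLengths; i++) {
				matchingLengths[i] = matchingLengths[i - 1] + 1;
			}
		} else {
			// Keep only the lengths whose next path character equals the
			// current pattern character
			var numMatches = 0;
			for (var i = 0; i < numMatchingLengths; i++) {
				if (matchingLengths[i] < path.length && path[matchingLengths[i]] === pattern[p]) {
					matchingLengths[numMatches++] = matchingLengths[i] + 1;
				}
			}
			if (numMatches === 0) {
				return false;
			}
			numMatchingLengths = numMatches;
		}
	}

	return true;
}

console.log(matches('/fish*.php', '/fishheads/catfish.php?parameters')); // true
console.log(matches('/*.dext$', '/Fish.dext'));    // true ($ matches the end of the path)
console.log(matches('/*.dext$', '/Fish.dext1'));   // false
console.log(matches('/dir*', '/directory.html'));  // true (patterns are anchored to the start)
console.log(matches('/dir*', '/folder/dir.html')); // false
```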
@@ -207,16 +243,13 @@
-		if (typeof rule.pattern === 'string') {
-			if (path.indexOf(rule.pattern) !== 0) {
-				continue;
-			}
+		if (!matches(rule.pattern, path)) {
+			continue;
+		}
 
-			// The longest matching rule takes precedence
-			if (!matchingRule || rule.pattern.length > matchingRule.pattern.length) {
-				matchingRule = rule;
-			}
-			// The first matching pattern takes precedence
-			// over all other rules including other patterns
-		} else if (rule.pattern.test(path)) {
-			return rule;
+		// The longest matching rule takes precedence
+		if (!matchingRule || rule.pattern.length > matchingRule.pattern.length) {
+			matchingRule = rule;
+		} else if (rule.pattern.length == matchingRule.pattern.length &&
+				rule.allow && !matchingRule.allow) {
+			matchingRule = rule;
 		}
 	}
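The practical effect of the new precedence block: all rules now go through matches(), the longest matching pattern wins, and when an Allow and a Disallow of equal length both match, Allow wins the tie (previously the first matching wildcard pattern returned immediately, regardless of length). A quick sketch against the library's public API, using rules and URLs borrowed from the updated tests:

```js
var robotsParser = require('robots-parser');

var robots = robotsParser('http://www.example.com/robots.txt', [
	'User-agent: *',
	'Disallow: /aa/',
	'Allow: /aa/',
	'Disallow: /*/',
	'Allow: /x/'
].join('\n'));

// Equal-length Allow ('/aa/') and Disallow ('/aa/') both match: Allow wins the tie
console.log(robots.isAllowed('http://www.example.com/aa/')); // true

// '/x/' and '/*/' are the same length too, so Allow wins again
console.log(robots.isAllowed('http://www.example.com/x/')); // true

// Only the wildcard Disallow ('/*/') matches here, so the URL is blocked
console.log(robots.isAllowed('http://www.example.com/xx/')); // false
```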
@@ -229,6 +262,6 @@
  * Converts provided string into an URL object.
- *
+ *
  * Will return null if provided string is not a valid URL.
- *
- * @param {string} url
+ *
+ * @param {string} url
  * @return {?URL}
@@ -277,3 +310,3 @@ * @private
 	rules[userAgent].push({
-		pattern: parsePattern(pattern),
+		pattern: normaliseEncoding(pattern),
 		allow: allow,
@@ -280,0 +313,0 @@ lineNumber: lineNumber
test/Robots.js
@@ -51,3 +51,4 @@ var robotsParser = require('../index');
 			'http://www.example.com/fish/test.html',
-			'http://www.example.com/Test.html'
+			'http://www.example.com/Test.html',
+			'http://www.example.com/test.html'
 		];
@@ -58,3 +59,2 @@
 			'http://www.example.com/fish/',
-			'http://www.example.com/test.html'
 		];
@@ -69,3 +69,4 @@
 			'Disallow: /fish*.php',
-			'Disallow: /*.dext$'
+			'Disallow: /*.dext$',
+			'Disallow: /dir*'
 		].join('\n');
@@ -75,3 +76,5 @@
 			'http://www.example.com/Fish.PHP',
-			'http://www.example.com/Fish.dext1'
+			'http://www.example.com/Fish.dext1',
+			'http://www.example.com/folder/dir.html',
+			'http://www.example.com/folder/dir/test.html'
 		];
@@ -83,3 +86,5 @@
 			'http://www.example.com/AnYthInG.dext',
-			'http://www.example.com/Fish.dext.dext'
+			'http://www.example.com/Fish.dext.dext',
+			'http://www.example.com/dir/test.html',
+			'http://www.example.com/directory.html'
 		];
@@ -90,3 +95,3 @@
-	it('should have the correct order presidence for allow and disallow', function () {
+	it('should have the correct order precedence for allow and disallow', function () {
 		var contents = [
@@ -98,2 +103,6 @@ 'User-agent: *',
 			'Allow: /test/',
+			'Disallow: /aa/',
+			'Allow: /aa/',
+			'Allow: /bb/',
+			'Disallow: /bb/',
 		].join('\n');
@@ -103,3 +112,7 @@
 			'http://www.example.com/test/index.html',
-			'http://www.example.com/test/'
+			'http://www.example.com/fish/index.php',
+			'http://www.example.com/test/',
+			'http://www.example.com/aa/',
+			'http://www.example.com/bb/',
+			'http://www.example.com/x/'
 		];
@@ -110,3 +123,2 @@
 			'http://www.example.com/fishheads/catfish.php?parameters',
-			'http://www.example.com/fish/index.php',
 			'http://www.example.com/test'
@@ -118,2 +130,88 @@ ];
+	it('should have the correct order precedence for wildcards', function () {
+		var contents = [
+			'User-agent: *',
+			'Disallow: /*/',
+			'Allow: /x/',
+		].join('\n');
+
+		var allowed = [
+			'http://www.example.com/x/',
+			'http://www.example.com/fish.php',
+			'http://www.example.com/test'
+		];
+
+		var disallowed = [
+			'http://www.example.com/a/',
+			'http://www.example.com/xx/',
+			'http://www.example.com/test/index.html'
+		];
+
+		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
+	});
+
+	it('should parse lines delimited by \\r', function () {
+		var contents = [
+			'User-agent: *',
+			'Disallow: /fish/',
+			'Disallow: /test.html'
+		].join('\r');
+
+		var allowed = [
+			'http://www.example.com/fish',
+			'http://www.example.com/Test.html'
+		];
+
+		var disallowed = [
+			'http://www.example.com/fish/index.php',
+			'http://www.example.com/fish/',
+			'http://www.example.com/test.html'
+		];
+
+		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
+	});
+
+	it('should parse lines delimited by \\r\\n', function () {
+		var contents = [
+			'User-agent: *',
+			'Disallow: /fish/',
+			'Disallow: /test.html'
+		].join('\r\n');
+
+		var allowed = [
+			'http://www.example.com/fish',
+			'http://www.example.com/Test.html'
+		];
+
+		var disallowed = [
+			'http://www.example.com/fish/index.php',
+			'http://www.example.com/fish/',
+			'http://www.example.com/test.html'
+		];
+
+		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
+	});
+
+	it('should parse lines delimited by mixed line endings', function () {
+		var contents = [
+			'User-agent: *\r',
+			'Disallow: /fish/\r\n',
+			'Disallow: /test.html\n\n'
+		].join('');
+
+		var allowed = [
+			'http://www.example.com/fish',
+			'http://www.example.com/Test.html'
+		];
+
+		var disallowed = [
+			'http://www.example.com/fish/index.php',
+			'http://www.example.com/fish/',
+			'http://www.example.com/test.html'
+		];
+
+		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
+	});
+
 	it('should ignore rules that are not in a group', function () {
@@ -602,3 +700,6 @@ var contents = [
 			'disallow: /t*t',
+			// check UA returns -1 if no matching UA and also handles patterns with both allow and disallow
+			'',
+			'User-agent: c',
+			'Disallow: /fish*.php',
+			'Allow: /fish/index.php'
 		].join('\n');
@@ -614,8 +715,32 @@
 		expect(robots.getMatchingLineNumber('http://www.example.com/fish/')).to.equal(4);
-		expect(robots.getMatchingLineNumber('http://www.example.com/test.html')).to.equal(5);
+		expect(robots.getMatchingLineNumber('http://www.example.com/test.html')).to.equal(7);
+		expect(robots.getMatchingLineNumber('http://www.example.com/test.html', 'a')).to.equal(10);
+		expect(robots.getMatchingLineNumber('http://www.example.com/test.html', 'b')).to.equal(14);
+		expect(robots.getMatchingLineNumber('http://www.example.com/fish.php', 'c')).to.equal(17);
+		expect(robots.getMatchingLineNumber('http://www.example.com/fish/index.php', 'c')).to.equal(18);
 	});
+
+	it('should handle large wildcards efficiently', function () {
+		var contents = [
+			'User-agent: *',
+			'Disallow: /' + '*'.repeat(2048) + '.html',
+		].join('\n');
+
+		var allowed = [
+			'http://www.example.com/' + 'sub'.repeat(2048) + 'folder/index.php',
+		];
+
+		var disallowed = [
+			'http://www.example.com/secret.html'
+		];
+
+		const start = Date.now();
+		testRobots('http://www.eXample.com/robots.txt', contents, allowed, disallowed);
+		const end = Date.now();
+
+		// Should take less than 500 ms (high to allow for the variability of
+		// machines running the test, should normally be much less)
+		expect(end - start).to.be.lessThan(500);
+	});
 });