robots-parser
Comparing version 2.1.1 to 2.2.0
package.json
 {
   "name": "robots-parser",
-  "version": "2.1.1",
+  "version": "2.2.0",
   "description": "Robots.txt parser.",
@@ -10,3 +10,3 @@ "main": "index.js",
   "scripts": {
-    "test": "istanbul cover _mocha --print detail --report html --dir ./report/coverage"
+    "test": "nyc --clean --reporter html --report-dir ./report/coverage ./node_modules/.bin/mocha"
   },
@@ -20,6 +20,6 @@ "repository": {
   "devDependencies": {
-    "chai": "^3.5.0",
-    "istanbul": "^0.4.5",
-    "mocha": "^3.1.2"
+    "chai": "^4.2.0",
+    "mocha": "^6.1.4",
+    "nyc": "^14.1.1"
   }
 }
README.md
@@ -46,3 +46,3 @@ # Robots Parser [](https://deepscan.io/dashboard/#view=project&pid=1275&bid=3378) [](https://github.com/samclarke/robots-parser/blob/master/license.md)
 robots.getSitemaps(); // ['http://example.com/sitemap.xml']
-robots.getPreferedHost(); // example.com
+robots.getPreferredHost(); // example.com
 ```
@@ -99,2 +99,9 @@
+### Version 2.2.0:
+ * Fix bug with matching wildcard patterns against some URLs
+   – Thanks to @ckylape for reporting and fixing
+ * Changed matching algorithm to match Google's implementation in google/robotstxt
+ * Changed order of precedence to match current spec
 ### Version 2.1.1:
@@ -101,0 +108,0 @@
Robots.js
@@ -81,3 +81,3 @@ var URL = require('url').URL;
  * unicode characters.
- *
+ *
  * @param {string} path
@@ -111,29 +111,65 @@ * @return {string}
 /**
- * Converts the pattern into a regexp if it is a wildcard
- * pattern.
+ * Matches a pattern with the specified path
  *
- * Returns a string if the pattern isn't a wildcard pattern
+ * Uses same algorithm to match patterns as the Google implementation in
+ * google/robotstxt so it should be consistent with the spec.
  *
- * @param {string} pattern
- * @return {string|RegExp}
+ * @see https://github.com/google/robotstxt/blob/f465f0ede81099dd8bc4aeb2966b3a892bd488b3/robots.cc#L74
+ * @param {string} pattern
+ * @param {string} path
+ * @return {boolean}
  * @private
  */
-function parsePattern(pattern) {
-	var regexSpecialChars = /[\-\[\]\/\{\}\(\)\+\?\.\\\^\$\|]/g;
-	// Treat consecutive wildcards as one (#12)
-	var wildCardPattern = /\*+/g;
-	var endOfLinePattern = /\\\$$/;
+function matches(pattern, path) {
+	// I've added extra comments to try make this easier to understand
 
-	pattern = normaliseEncoding(pattern)
+	// Stores the lengths of all the current matching substrings.
+	// Maximum number of possible matching lengths is every length in path plus
+	// 1 to handle 0 length too (if pattern starts with * which is zero or more)
+	var matchingLengths = new Array(path.length + 1);
+	var numMatchingLengths = 1;
 
-	if (pattern.indexOf('*') < 0 && pattern.indexOf('$') < 0) {
-		return pattern;
-	}
+	// Initially longest match is 0
+	matchingLengths[0] = 0;
 
+	for (var p = 0; p < pattern.length; p++) {
+		// If $ is at the end of pattern then we must match the whole path.
+		// Which is true if the longest matching length matches path length
+		if (pattern[p] === '$' && p + 1 === pattern.length) {
+			return matchingLengths[numMatchingLengths - 1] === path.length;
+		}
+
+		// Handle wildcards
+		if (pattern[p] == '*') {
+			// Wildcard so all substrings minus the current smallest matching
+			// length are matches
+			numMatchingLengths = path.length - matchingLengths[0] + 1;
+
+			// Update matching lengths to include the smallest all the way up
+			// to numMatchingLengths
+			// Don't update smallest possible match as * matches zero or more
+			// so the smallest current match is also valid
+			for (var i = 1; i < numMatchingLengths; i++) {
+				matchingLengths[i] = matchingLengths[i - 1] + 1;
+			}
+		} else {
+			// Check the char at the matching length matches the pattern, if it
+			// does increment it and add it as a valid length, ignore if not.
+			var numMatches = 0;
+			for (var i = 0; i < numMatchingLengths; i++) {
+				if (matchingLengths[i] < path.length && path[matchingLengths[i]] === pattern[p]) {
+					matchingLengths[numMatches++] = matchingLengths[i] + 1;
+				}
+			}
+
+			// No paths matched the current pattern char so not a match
+			if (numMatches == 0) {
+				return false;
+			}
+
+			numMatchingLengths = numMatches;
+		}
+	}
 
-	pattern = pattern
-		.replace(regexSpecialChars, '\\$&')
-		.replace(wildCardPattern, '(?:.*)')
-		.replace(endOfLinePattern, '$');
-
-	return new RegExp(pattern);
+	return true;
 }
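To make the new algorithm easier to follow, here is a minimal standalone sketch of the same length-set matcher (rule bookkeeping and normaliseEncoding omitted), exercised against patterns and paths taken from the test changes further down. Instead of compiling the pattern to a RegExp as 2.1.1 did, it tracks the set of prefix lengths of the path that the pattern could have consumed so far:

```js
// Standalone sketch of the matcher added in 2.2.0 (simplified for illustration)
function matches(pattern, path) {
	var matchingLengths = new Array(path.length + 1);
	var numMatchingLengths = 1;

	// Initially only the empty prefix matches
	matchingLengths[0] = 0;

	for (var p = 0; p < pattern.length; p++) {
		// A trailing $ anchors the pattern to the end of the path
		if (pattern[p] === '$' && p + 1 === pattern.length) {
			return matchingLengths[numMatchingLengths - 1] === path.length;
		}

		if (pattern[p] === '*') {
			// * matches zero or more characters, so every length from the
			// current shortest match up to the full path is now a match
			numMatchingLengths = path.length - matchingLengths[0] + 1;
			for (var i = 1; i < numMatchingLengths; i++) {
				matchingLengths[i] = matchingLengths[i - 1] + 1;
			}
		} else {
			// Keep only the lengths whose next path character equals the
			// current pattern character
			var numMatches = 0;
			for (var i = 0; i < numMatchingLengths; i++) {
				if (matchingLengths[i] < path.length && path[matchingLengths[i]] === pattern[p]) {
					matchingLengths[numMatches++] = matchingLengths[i] + 1;
				}
			}
			if (numMatches === 0) {
				return false;
			}
			numMatchingLengths = numMatches;
		}
	}

	return true;
}

console.log(matches('/fish*.php', '/fishheads/catfish.php?parameters')); // true
console.log(matches('/*.dext$', '/Fish.dext'));    // true ($ matches the end of the path)
console.log(matches('/*.dext$', '/Fish.dext1'));   // false
console.log(matches('/dir*', '/directory.html'));  // true (patterns are anchored to the start)
console.log(matches('/dir*', '/folder/dir.html')); // false
```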
@@ -207,16 +243,13 @@
-		if (typeof rule.pattern === 'string') {
-			if (path.indexOf(rule.pattern) !== 0) {
-				continue;
-			}
+		if (!matches(rule.pattern, path)) {
+			continue;
+		}
 
-			// The longest matching rule takes precedence
-			if (!matchingRule || rule.pattern.length > matchingRule.pattern.length) {
-				matchingRule = rule;
-			}
-			// The first matching pattern takes precedence
-			// over all other rules including other patterns
-		} else if (rule.pattern.test(path)) {
-			return rule;
+		// The longest matching rule takes precedence
+		if (!matchingRule || rule.pattern.length > matchingRule.pattern.length) {
+			matchingRule = rule;
+		} else if (rule.pattern.length == matchingRule.pattern.length &&
+				rule.allow && !matchingRule.allow) {
+			matchingRule = rule;
 		}
 	}
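The practical effect of the new precedence block: all rules now go through matches(), the longest matching pattern wins, and when an Allow and a Disallow of equal length both match, Allow wins the tie (previously the first matching wildcard pattern returned immediately, regardless of length). A quick sketch against the library's public API, using rules and URLs borrowed from the updated tests:

```js
var robotsParser = require('robots-parser');

var robots = robotsParser('http://www.example.com/robots.txt', [
	'User-agent: *',
	'Disallow: /aa/',
	'Allow: /aa/',
	'Disallow: /*/',
	'Allow: /x/'
].join('\n'));

// Equal-length Allow ('/aa/') and Disallow ('/aa/') both match: Allow wins the tie
console.log(robots.isAllowed('http://www.example.com/aa/')); // true

// '/x/' and '/*/' are the same length too, so Allow wins again
console.log(robots.isAllowed('http://www.example.com/x/')); // true

// Only the wildcard Disallow ('/*/') matches here, so the URL is blocked
console.log(robots.isAllowed('http://www.example.com/xx/')); // false
```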
@@ -229,6 +262,6 @@
  * Converts provided string into an URL object.
- *
+ *
  * Will return null if provided string is not a valid URL.
- *
- * @param {string} url
+ *
+ * @param {string} url
  * @return {?URL}
@@ -277,3 +310,3 @@ * @private
 	rules[userAgent].push({
-		pattern: parsePattern(pattern),
+		pattern: normaliseEncoding(pattern),
 		allow: allow,
@@ -280,0 +313,0 @@ lineNumber: lineNumber
test/Robots.js
@@ -51,3 +51,4 @@ var robotsParser = require('../index');
 			'http://www.example.com/fish/test.html',
-			'http://www.example.com/Test.html'
+			'http://www.example.com/Test.html',
+			'http://www.example.com/test.html'
 		];
@@ -58,3 +59,2 @@
 			'http://www.example.com/fish/',
-			'http://www.example.com/test.html'
 		];
@@ -69,3 +69,4 @@
 			'Disallow: /fish*.php',
-			'Disallow: /*.dext$'
+			'Disallow: /*.dext$',
+			'Disallow: /dir*'
 		].join('\n');
@@ -75,3 +76,5 @@
 			'http://www.example.com/Fish.PHP',
-			'http://www.example.com/Fish.dext1'
+			'http://www.example.com/Fish.dext1',
+			'http://www.example.com/folder/dir.html',
+			'http://www.example.com/folder/dir/test.html'
 		];
@@ -83,3 +86,5 @@
 			'http://www.example.com/AnYthInG.dext',
-			'http://www.example.com/Fish.dext.dext'
+			'http://www.example.com/Fish.dext.dext',
+			'http://www.example.com/dir/test.html',
+			'http://www.example.com/directory.html'
 		];
@@ -90,3 +95,3 @@
-	it('should have the correct order presidence for allow and disallow', function () {
+	it('should have the correct order precedence for allow and disallow', function () {
 		var contents = [
@@ -98,2 +103,6 @@ 'User-agent: *',
 			'Allow: /test/',
+			'Disallow: /aa/',
+			'Allow: /aa/',
+			'Allow: /bb/',
+			'Disallow: /bb/',
 		].join('\n');
@@ -103,3 +112,7 @@
 			'http://www.example.com/test/index.html',
-			'http://www.example.com/test/'
+			'http://www.example.com/fish/index.php',
+			'http://www.example.com/test/',
+			'http://www.example.com/aa/',
+			'http://www.example.com/bb/',
+			'http://www.example.com/x/'
 		];
@@ -110,3 +123,2 @@
 			'http://www.example.com/fishheads/catfish.php?parameters',
-			'http://www.example.com/fish/index.php',
 			'http://www.example.com/test'
@@ -118,2 +130,88 @@ ];
+	it('should have the correct order precedence for wildcards', function () {
+		var contents = [
+			'User-agent: *',
+			'Disallow: /*/',
+			'Allow: /x/',
+		].join('\n');
+
+		var allowed = [
+			'http://www.example.com/x/',
+			'http://www.example.com/fish.php',
+			'http://www.example.com/test'
+		];
+
+		var disallowed = [
+			'http://www.example.com/a/',
+			'http://www.example.com/xx/',
+			'http://www.example.com/test/index.html'
+		];
+
+		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
+	});
+
+	it('should parse lines delimited by \\r', function () {
+		var contents = [
+			'User-agent: *',
+			'Disallow: /fish/',
+			'Disallow: /test.html'
+		].join('\r');
+
+		var allowed = [
+			'http://www.example.com/fish',
+			'http://www.example.com/Test.html'
+		];
+
+		var disallowed = [
+			'http://www.example.com/fish/index.php',
+			'http://www.example.com/fish/',
+			'http://www.example.com/test.html'
+		];
+
+		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
+	});
+
+	it('should parse lines delimited by \\r\\n', function () {
+		var contents = [
+			'User-agent: *',
+			'Disallow: /fish/',
+			'Disallow: /test.html'
+		].join('\r\n');
+
+		var allowed = [
+			'http://www.example.com/fish',
+			'http://www.example.com/Test.html'
+		];
+
+		var disallowed = [
+			'http://www.example.com/fish/index.php',
+			'http://www.example.com/fish/',
+			'http://www.example.com/test.html'
+		];
+
+		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
+	});
+
+	it('should parse lines delimited by mixed line endings', function () {
+		var contents = [
+			'User-agent: *\r',
+			'Disallow: /fish/\r\n',
+			'Disallow: /test.html\n\n'
+		].join('');
+
+		var allowed = [
+			'http://www.example.com/fish',
+			'http://www.example.com/Test.html'
+		];
+
+		var disallowed = [
+			'http://www.example.com/fish/index.php',
+			'http://www.example.com/fish/',
+			'http://www.example.com/test.html'
+		];
+
+		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
+	});
+
 	it('should ignore rules that are not in a group', function () {
@@ -602,3 +700,6 @@ var contents = [
 			'disallow: /t*t',
+			// check UA returns -1 if no matching UA and also handles patterns with both allow and disallow
+			'',
+			'User-agent: c',
+			'Disallow: /fish*.php',
+			'Allow: /fish/index.php'
 		].join('\n');
@@ -614,8 +715,32 @@
 		expect(robots.getMatchingLineNumber('http://www.example.com/fish/')).to.equal(4);
-		expect(robots.getMatchingLineNumber('http://www.example.com/test.html')).to.equal(5);
+		expect(robots.getMatchingLineNumber('http://www.example.com/test.html')).to.equal(7);
+		expect(robots.getMatchingLineNumber('http://www.example.com/test.html', 'a')).to.equal(10);
+		expect(robots.getMatchingLineNumber('http://www.example.com/test.html', 'b')).to.equal(14);
+		expect(robots.getMatchingLineNumber('http://www.example.com/fish.php', 'c')).to.equal(17);
+		expect(robots.getMatchingLineNumber('http://www.example.com/fish/index.php', 'c')).to.equal(18);
 	});
+
+	it('should handle large wildcards efficiently', function () {
+		var contents = [
+			'User-agent: *',
+			'Disallow: /' + '*'.repeat(2048) + '.html',
+		].join('\n');
+
+		var allowed = [
+			'http://www.example.com/' + 'sub'.repeat(2048) + 'folder/index.php',
+		];
+
+		var disallowed = [
+			'http://www.example.com/secret.html'
+		];
+
+		const start = Date.now();
+		testRobots('http://www.eXample.com/robots.txt', contents, allowed, disallowed);
+		const end = Date.now();
+
+		// Should take less than 500 ms (high to allow for the variability of
+		// machines running the test, should normally be much less)
+		expect(end - start).to.be.lessThan(500);
+	});
 });