robots-parser
Comparing version 2.2.0 to 2.3.0
package.json in 2.2.0:

{
  "name": "robots-parser",
  "version": "2.2.0",
  "description": "Robots.txt parser.",
  "main": "index.js",
  "directories": {
    "test": "tests"
  },
  "scripts": {
    "test": "nyc --clean --reporter html --report-dir ./report/coverage ./node_modules/.bin/mocha"
  },
  "repository": {
    "type": "git",
    "url": "https://github.com/samclarke/robots-parser.git"
  },
  "author": "Sam Clarke <sam@samclarke.com>",
  "license": "MIT",
  "devDependencies": {
    "chai": "^4.2.0",
    "mocha": "^6.1.4",
    "nyc": "^14.1.1"
  }
}

package.json in 2.3.0:

{
  "name": "robots-parser",
  "version": "2.3.0",
  "description": "Robots.txt parser.",
  "main": "index.js",
  "directories": {
    "test": "tests"
  },
  "scripts": {
    "test": "nyc --reporter=text-summary --reporter=html --reporter=lcovonly mocha"
  },
  "repository": {
    "type": "git",
    "url": "https://github.com/samclarke/robots-parser.git"
  },
  "author": "Sam Clarke <sam@samclarke.com>",
  "license": "MIT",
  "files": [
    "/Robots.js",
    "/index.js"
  ],
  "prettier": {
    "tabWidth": 4,
    "useTabs": true,
    "singleQuote": true,
    "trailingComma": "none"
  },
  "devDependencies": {
    "chai": "^4.2.0",
    "mocha": "^6.1.4",
    "nyc": "^14.1.1"
  }
}
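Nothing about how the package is consumed changes between these two manifests: the entry point is still `"main": "index.js"`; 2.3.0 adds a `files` whitelist and a Prettier config and switches the nyc reporters. As a reminder of the consumer-facing API, a minimal sketch (the robots.txt body and bot name are invented):

```js
// Minimal usage sketch; the robots.txt contents are invented for illustration.
var robotsParser = require('robots-parser');

var robots = robotsParser('http://www.example.com/robots.txt', [
    'User-agent: *',
    'Disallow: /private/'
].join('\n'));

robots.isAllowed('http://www.example.com/index.html', 'Sams-Bot/1.0'); // true
robots.isAllowed('http://www.example.com/private/page.html', 'Sams-Bot/1.0'); // false
```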
README.md:

@@ -1,2 +0,2 @@
-# Robots Parser [](https://deepscan.io/dashboard/#view=project&pid=1275&bid=3378) [](https://github.com/samclarke/robots-parser/blob/master/license.md)
+# Robots Parser [](https://deepscan.io/dashboard#view=project&tid=457&pid=16277&bid=344939) [](https://github.com/samclarke/robots-parser/blob/master/license.md) [](https://coveralls.io/github/samclarke/robots-parser?branch=master)
@@ -98,5 +98,28 @@ NodeJS robots.txt parser.

### Version 2.3.0:
 * Fixed bug where if the user-agent passed to `isAllowed()` / `isDisallowed()` is called "constructor" it would throw an error.
 * Added support for relative URLs. This does not affect the default behavior so can safely be upgraded.
   Relative matching is only allowed if both the robots.txt URL and the URLs being checked are relative.

   For example:

   ```js
   var robots = robotsParser('/robots.txt', [
       'User-agent: *',
       'Disallow: /dir/',
       'Disallow: /test.html',
       'Allow: /dir/test.html',
       'Allow: /test.html'
   ].join('\n'));

   robots.isAllowed('/test.html', 'Sams-Bot/1.0'); // false
   robots.isAllowed('/dir/test.html', 'Sams-Bot/1.0'); // true
   robots.isDisallowed('/dir/test2.html', 'Sams-Bot/1.0'); // true
   ```

### Version 2.2.0:
-* Fix bug that with matching wildcard patterns with some URLs
+* Fixed bug that with matching wildcard patterns with some URLs
   – Thanks to @ckylape for reporting and fixing

@@ -103,0 +126,0 @@ * Changed matching algorithm to match Google's implementation in google/robotstxt
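The first changelog entry above (the "constructor" user-agent fix) has no example of its own, so here is a small illustration; the robots.txt body is invented, and the point is only that the literal user-agent string "constructor" no longer throws:

```js
var robotsParser = require('robots-parser');

var robots = robotsParser('http://www.example.com/robots.txt', [
    'User-agent: *',
    'Disallow: /admin/'
].join('\n'));

// In 2.2.0 this threw, because "constructor" collided with a property the
// internal rules object inherited; in 2.3.0 it is treated like any other
// unknown user-agent and falls back to the * group.
robots.isAllowed('http://www.example.com/admin/', 'constructor'); // false
```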
Parser source:

@@ -89,3 +89,3 @@ var URL = require('url').URL;
         return urlEncodeToUpper(encodeURI(path).replace(/%25/g, '%'));
-    } catch(e) {
+    } catch (e) {
         return path;
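Only the `catch` spacing changes in this hunk, but the visible line shows the helper's core step: percent-encode the path with `encodeURI` and collapse the `%25` sequences it produces for characters that were already encoded, falling back to the raw path if encoding throws. A standalone gloss of that step (the `encodePath` name is made up, and this is not the library's full `urlEncodeToUpper` helper, which, judging by its name, also uppercases the percent-encoding):

```js
// Illustration only: encode, then undo encodeURI's double-encoding of "%".
function encodePath(path) {
    return encodeURI(path).replace(/%25/g, '%');
}

encodePath('/path with spaces/'); // "/path%20with%20spaces/"
encodePath('/already%20encoded'); // "/already%20encoded" (not "%2520...")
```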
@@ -159,3 +159,6 @@ }
     for (var i = 0; i < numMatchingLengths; i++) {
-        if (matchingLengths[i] < path.length && path[matchingLengths[i]] === pattern[p]) {
+        if (
+            matchingLengths[i] < path.length &&
+            path[matchingLengths[i]] === pattern[p]
+        ) {
             matchingLengths[numMatches++] = matchingLengths[i] + 1;
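This hunk only reflows the condition inside the wildcard matcher. For reference, the behaviour that matcher implements is easiest to see through the public API, where `*` matches any run of characters and `$` anchors the end of the URL (robots.txt body invented):

```js
var robotsParser = require('robots-parser');

var robots = robotsParser('http://www.example.com/robots.txt', [
    'User-agent: *',
    'Disallow: /*.pdf$'
].join('\n'));

robots.isAllowed('http://www.example.com/report.pdf', 'Sams-Bot/1.0'); // false
robots.isAllowed('http://www.example.com/report.pdf?page=2', 'Sams-Bot/1.0'); // true ($ requires the URL to end in .pdf)
```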
@@ -187,3 +190,3 @@ }
     var isNoneUserAgentState = true;
-    for (var i=0; i < lines.length; i++) {
+    for (var i = 0; i < lines.length; i++) {
         var line = lines[i];
@@ -239,6 +242,6 @@
 function findRule(path, rules) {
-    var matchingRule = null;
+    var matchedRule = null;
-    for (var i=0; i < rules.length; i++) {
+    for (var i = 0; i < rules.length; i++) {
         var rule = rules[i];
@@ -250,11 +253,15 @@ if (!matches(rule.pattern, path)) {
         // The longest matching rule takes precedence
-        if (!matchingRule || rule.pattern.length > matchingRule.pattern.length) {
-            matchingRule = rule;
-        } else if (rule.pattern.length == matchingRule.pattern.length &&
-            rule.allow && !matchingRule.allow) {
-            matchingRule = rule;
+        // If rules are the same length then allow takes precedence
+        if (!matchedRule || rule.pattern.length > matchedRule.pattern.length) {
+            matchedRule = rule;
+        } else if (
+            rule.pattern.length == matchedRule.pattern.length &&
+            rule.allow &&
+            !matchedRule.allow
+        ) {
+            matchedRule = rule;
         }
     }

-    return matchingRule;
+    return matchedRule;
 }
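The comments in this hunk state the precedence that the `matchingRule` to `matchedRule` rename preserves: the longest matching pattern wins, and when an Allow and a Disallow pattern are the same length, Allow wins. Checked through the public API with an invented robots.txt:

```js
var robotsParser = require('robots-parser');

var robots = robotsParser('http://www.example.com/robots.txt', [
    'User-agent: *',
    'Disallow: /page',
    'Allow: /page'
].join('\n'));

// Equal-length patterns, so the Allow rule takes precedence.
robots.isAllowed('http://www.example.com/page', 'Sams-Bot/1.0'); // true
```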
@@ -273,4 +280,8 @@
     try {
-        return new URL(url);
-    } catch(e) {
+        // Specify a URL to be used with relative paths
+        // Using non-existent subdomain so can never cause conflict unless
+        // trying to crawl it but doesn't exist and even if tried worst that can
+        // happen is it allows relative URLs on it.
+        return new URL(url, 'http://robots-relative.samclarke.com/');
+    } catch (e) {
         return null;
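The added base argument relies on standard WHATWG `URL` resolution: an absolute input ignores the base, while a relative input resolves against it, which is what makes relative robots.txt URLs parseable at all. Roughly:

```js
// Absolute URLs are unaffected by the placeholder base.
new URL('http://www.example.com/robots.txt', 'http://robots-relative.samclarke.com/').href;
// => "http://www.example.com/robots.txt"

// Relative URLs resolve against it, so they can be parsed and compared.
new URL('/dir/test.html', 'http://robots-relative.samclarke.com/').href;
// => "http://robots-relative.samclarke.com/dir/test.html"
```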
@@ -280,3 +291,2 @@ }
 function Robots(url, contents) {
@@ -286,3 +296,3 @@ this._url = parseUrl(url) || {};
-    this._rules = {};
+    this._rules = Object.create(null);
     this._sitemaps = [];
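Swapping the plain `{}` rules map for `Object.create(null)` is what fixes the "constructor" user-agent bug listed in the changelog: a plain object inherits properties such as `constructor` from `Object.prototype`, so looking up a user-agent by that name returned a function instead of a rule list. A quick demonstration of the difference:

```js
var plain = {};
var bare = Object.create(null);

typeof plain['constructor']; // "function" (inherited from Object.prototype)
typeof bare['constructor'];  // "undefined" (no prototype chain to inherit from)
```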
@@ -364,8 +374,10 @@ this._preferredHost = null;
-    parsedUrl.port = parsedUrl.port || '80';
+    parsedUrl.port = parsedUrl.port || 80;

     // The base URL must match otherwise this robots.txt is not valid for it.
-    if (parsedUrl.protocol !== this._url.protocol ||
+    if (
+        parsedUrl.protocol !== this._url.protocol ||
         parsedUrl.hostname !== this._url.hostname ||
-        parsedUrl.port !== this._url.port) {
+        parsedUrl.port !== this._url.port
+    ) {
         return;
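This guard is why a robots.txt only answers for URLs on the same protocol, hostname, and port: on a mismatch the method returns early with no verdict, so `isAllowed()` / `isDisallowed()` come back `undefined`. A sketch with invented hosts:

```js
var robotsParser = require('robots-parser');

var robots = robotsParser('http://www.example.com/robots.txt', [
    'User-agent: *',
    'Disallow: /private/'
].join('\n'));

// Different host, so this robots.txt has nothing to say about the URL.
robots.isAllowed('http://other.example.org/private/', 'Sams-Bot/1.0'); // undefined
```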
@@ -375,3 +387,3 @@ }
     var rules = this._rules[userAgent] || this._rules['*'] || [];
-    var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search)
+    var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search);
     var rule = findRule(path, rules);
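Finally, note that the path handed to `findRule()` is `pathname + search`, so query strings take part in rule matching (robots.txt invented):

```js
var robotsParser = require('robots-parser');

var robots = robotsParser('http://www.example.com/robots.txt', [
    'User-agent: *',
    'Disallow: /search?q='
].join('\n'));

robots.isAllowed('http://www.example.com/search?q=test', 'Sams-Bot/1.0'); // false
robots.isAllowed('http://www.example.com/search', 'Sams-Bot/1.0'); // true
```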