robots-parser
Comparing versions 2.1.1 and 2.4.0
package.json

Version 2.1.1:

{
	"name": "robots-parser",
	"version": "2.1.1",
	"description": "Robots.txt parser.",
	"main": "index.js",
	"directories": {
		"test": "tests"
	},
	"scripts": {
		"test": "istanbul cover _mocha --print detail --report html --dir ./report/coverage"
	},
	"repository": {
		"type": "git",
		"url": "https://github.com/samclarke/robots-parser.git"
	},
	"author": "Sam Clarke <sam@samclarke.com>",
	"license": "MIT",
	"devDependencies": {
		"chai": "^3.5.0",
		"istanbul": "^0.4.5",
		"mocha": "^3.1.2"
	}
}

Version 2.4.0:

{
	"name": "robots-parser",
	"version": "2.4.0",
	"description": "NodeJS robots.txt parser with support for wildcard (*) matching.",
	"keywords": [
		"robots.txt",
		"parser",
		"user-agent",
		"scraper",
		"spider",
		"bot",
		"robots-exclusion-standard"
	],
	"main": "index.js",
	"directories": {
		"test": "tests"
	},
	"scripts": {
		"test": "nyc --reporter=text-summary --reporter=html --reporter=lcovonly mocha"
	},
	"repository": {
		"type": "git",
		"url": "https://github.com/samclarke/robots-parser.git"
	},
	"author": "Sam Clarke <sam@samclarke.com>",
	"license": "MIT",
	"files": [
		"/Robots.js",
		"/index.js",
		"/index.d.ts"
	],
	"prettier": {
		"tabWidth": 4,
		"useTabs": true,
		"singleQuote": true,
		"trailingComma": "none"
	},
	"devDependencies": {
		"chai": "^4.3.4",
		"mocha": "^9.1.3",
		"nyc": "^15.1.0"
	},
	"types": "./index.d.ts"
}
readme.md
@@ -1,2 +0,2 @@
# Robots Parser [](https://deepscan.io/dashboard/#view=project&pid=1275&bid=3378) [](https://github.com/samclarke/robots-parser/blob/master/license.md)
# Robots Parser [](https://www.npmjs.com/package/robots-parser) [](https://deepscan.io/dashboard#view=project&tid=457&pid=16277&bid=344939) [](https://github.com/samclarke/robots-parser/blob/master/license.md) [](https://coveralls.io/github/samclarke/robots-parser?branch=master)
@@ -7,9 +7,9 @@ NodeJS robots.txt parser.
* User-agent:
* Allow:
* Disallow:
* Sitemap:
* Crawl-delay:
* Host:
* Paths with wildcards (*) and EOL matching ($)
- User-agent:
- Allow:
- Disallow:
- Sitemap:
- Crawl-delay:
- Host:
- Paths with wildcards (\*) and EOL matching ($)
@@ -42,3 +42,3 @@ ## Installation
robots.isAllowed('http://www.example.com/test.html', 'Sams-Bot/1.0'); // false
robots.isAllowed('http://www.example.com/test.html', 'Sams-Bot/1.0'); // true
robots.isAllowed('http://www.example.com/dir/test.html', 'Sams-Bot/1.0'); // true
@@ -48,7 +48,7 @@ robots.isDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0'); // true
robots.getSitemaps(); // ['http://example.com/sitemap.xml']
robots.getPreferedHost(); // example.com
robots.getPreferredHost(); // example.com
```
### isAllowed(url, [ua])
**boolean or undefined**
@@ -60,4 +60,4 @@
### isDisallowed(url, [ua])
**boolean or undefined**
@@ -69,4 +69,4 @@
### getMatchingLineNumber(url, [ua])
**number or undefined**
@@ -80,4 +80,4 @@
### getCrawlDelay([ua])
**number or undefined**
@@ -89,4 +89,4 @@
### getSitemaps()
**array**
@@ -96,4 +96,4 @@
### getPreferredHost()
**string or null**
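Taken together, the methods documented above can be exercised as in the following sketch. The robots.txt content and the commented return values are illustrative assumptions, not part of the diff; actual results depend on the rules supplied:

```js
var robotsParser = require('robots-parser');

var robots = robotsParser('http://www.example.com/robots.txt', [
	'User-agent: *',
	'Disallow: /dir/',
	'Crawl-delay: 5',
	'Sitemap: http://example.com/sitemap.xml',
	'Host: example.com'
].join('\n'));

robots.isAllowed('http://www.example.com/index.html', 'Sams-Bot/1.0');       // true (no rule matches)
robots.isDisallowed('http://www.example.com/dir/page.html', 'Sams-Bot/1.0'); // true (matches 'Disallow: /dir/')
robots.getMatchingLineNumber('http://www.example.com/dir/page.html');        // 2 (the 'Disallow: /dir/' line)
robots.getCrawlDelay('Sams-Bot/1.0');                                        // 5
robots.getSitemaps();                                                        // ['http://example.com/sitemap.xml']
robots.getPreferredHost();                                                   // 'example.com'
```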
@@ -103,17 +103,52 @@
# Changes
### Version 2.4.0:
- Added Typescript definitions – Thanks to @danhab99 for creating
- Added SECURITY.md policy and CodeQL scanning
### Version 2.3.0:
- Fixed bug where if the user-agent passed to `isAllowed()` / `isDisallowed()` is called "constructor" it would throw an error.
- Added support for relative URLs. This does not affect the default behavior so can safely be upgraded.
  Relative matching is only allowed if both the robots.txt URL and the URLs being checked are relative.
  For example:
```js
var robots = robotsParser('/robots.txt', [
	'User-agent: *',
	'Disallow: /dir/',
	'Disallow: /test.html',
	'Allow: /dir/test.html',
	'Allow: /test.html'
].join('\n'));

robots.isAllowed('/test.html', 'Sams-Bot/1.0'); // false
robots.isAllowed('/dir/test.html', 'Sams-Bot/1.0'); // true
robots.isDisallowed('/dir/test2.html', 'Sams-Bot/1.0'); // true
```
### Version 2.2.0:
- Fixed bug with matching wildcard patterns with some URLs – Thanks to @ckylape for reporting and fixing
- Changed matching algorithm to match Google's implementation in google/robotstxt
- Changed order of precedence to match current spec (see the sketch below)
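As an illustration of the new order of precedence, a hypothetical robots.txt (not taken from the changelog) shows the two tie-breaking rules: the longest matching pattern wins, and when patterns are the same length an allow rule beats a disallow rule:

```js
var robots = robotsParser('http://www.example.com/robots.txt', [
	'User-agent: *',
	'Allow: /p',
	'Disallow: /page',
	'Disallow: /tie',
	'Allow: /tie'
].join('\n'));

robots.isAllowed('http://www.example.com/page', 'Sams-Bot/1.0'); // false ('/page' is longer than '/p')
robots.isAllowed('http://www.example.com/tie', 'Sams-Bot/1.0');  // true (same length, allow wins)
```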
### Version 2.1.1:
* Fix bug that could be used to cause rule checking to take a long time – Thanks to @andeanfog
- Fix bug that could be used to cause rule checking to take a long time – Thanks to @andeanfog
### Version 2.1.0:
* Removed use of punycode module API's as new URL API handles it
* Improved test coverage
* Added tests for percent encoded paths and improved support
* Added `getMatchingLineNumber()` method
* Fixed bug with comments on same line as directive
- Removed use of punycode module API's as new URL API handles it
- Improved test coverage
- Added tests for percent encoded paths and improved support
- Added `getMatchingLineNumber()` method
- Fixed bug with comments on same line as directive
@@ -124,43 +159,42 @@ ### Version 2.0.0:
* Update code to not use deprecated URL module API's. – Thanks to @kdzwinel
- Update code to not use deprecated URL module API's. – Thanks to @kdzwinel
### Version 1.0.2:
* Fixed error caused by invalid URLs missing the protocol.
- Fixed error caused by invalid URLs missing the protocol.
### Version 1.0.1:
* Fixed bug with the "user-agent" rule being treated as case sensitive. – Thanks to @brendonboshell
* Improved test coverage. – Thanks to @schornio
- Fixed bug with the "user-agent" rule being treated as case sensitive. – Thanks to @brendonboshell
- Improved test coverage. – Thanks to @schornio
### Version 1.0.0:
* Initial release.
- Initial release.
# License

The MIT License (MIT)

Copyright (c) 2014 Sam Clarke

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
Robots.js
@@ -81,3 +81,3 @@ var URL = require('url').URL;
 * unicode characters.
 *
 * @param {string} path
@@ -90,3 +90,3 @@ * @return {string}
		return urlEncodeToUpper(encodeURI(path).replace(/%25/g, '%'));
	} catch(e) {
	} catch (e) {
		return path;
@@ -112,29 +112,68 @@ }
/**
 * Converts the pattern into a regexp if it is a wildcard
 * pattern.
 *
 * Returns a string if the pattern isn't a wildcard pattern
 *
 * @param {string} pattern
 * @return {string|RegExp}
 * @private
 */
function parsePattern(pattern) {
	var regexSpecialChars = /[\-\[\]\/\{\}\(\)\+\?\.\\\^\$\|]/g;
	// Treat consecutive wildcards as one (#12)
	var wildCardPattern = /\*+/g;
	var endOfLinePattern = /\\\$$/;

	pattern = normaliseEncoding(pattern)

	if (pattern.indexOf('*') < 0 && pattern.indexOf('$') < 0) {
		return pattern;
	}

	pattern = pattern
		.replace(regexSpecialChars, '\\$&')
		.replace(wildCardPattern, '(?:.*)')
		.replace(endOfLinePattern, '$');

	return new RegExp(pattern);
}
/**
 * Matches a pattern with the specified path
 *
 * Uses same algorithm to match patterns as the Google implementation in
 * google/robotstxt so it should be consistent with the spec.
 *
 * @see https://github.com/google/robotstxt/blob/f465f0ede81099dd8bc4aeb2966b3a892bd488b3/robots.cc#L74
 * @param {string} pattern
 * @param {string} path
 * @return {boolean}
 * @private
 */
function matches(pattern, path) {
	// I've added extra comments to try make this easier to understand

	// Stores the lengths of all the current matching substrings.
	// Maximum number of possible matching lengths is every length in path plus
	// 1 to handle 0 length too (if pattern starts with * which is zero or more)
	var matchingLengths = new Array(path.length + 1);
	var numMatchingLengths = 1;

	// Initially longest match is 0
	matchingLengths[0] = 0;

	for (var p = 0; p < pattern.length; p++) {
		// If $ is at the end of pattern then we must match the whole path.
		// Which is true if the longest matching length matches path length
		if (pattern[p] === '$' && p + 1 === pattern.length) {
			return matchingLengths[numMatchingLengths - 1] === path.length;
		}

		// Handle wildcards
		if (pattern[p] == '*') {
			// Wildcard so all substrings minus the current smallest matching
			// length are matches
			numMatchingLengths = path.length - matchingLengths[0] + 1;

			// Update matching lengths to include the smallest all the way up
			// to numMatchingLengths
			// Don't update smallest possible match as * matches zero or more
			// so the smallest current match is also valid
			for (var i = 1; i < numMatchingLengths; i++) {
				matchingLengths[i] = matchingLengths[i - 1] + 1;
			}
		} else {
			// Check the char at the matching length matches the pattern, if it
			// does increment it and add it as a valid length, ignore if not.
			var numMatches = 0;
			for (var i = 0; i < numMatchingLengths; i++) {
				if (
					matchingLengths[i] < path.length &&
					path[matchingLengths[i]] === pattern[p]
				) {
					matchingLengths[numMatches++] = matchingLengths[i] + 1;
				}
			}

			// No paths matched the current pattern char so not a match
			if (numMatches == 0) {
				return false;
			}

			numMatchingLengths = numMatches;
		}
	}

	return true;
}
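// Illustrative behaviour of the new matches() helper (these example calls and
// results are not part of the diff; they assume the pattern has already been
// normalised, as the callers further down do):
//
//   matches('/fish*', '/fishheads/catfish.html')   // true  ('*' spans the rest of the path)
//   matches('/*.php$', '/index.php')               // true  ('$' anchors the match to the end)
//   matches('/*.php$', '/index.php?id=1')          // false (the query string defeats the '$' anchor)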
@@ -152,3 +191,3 @@
	var isNoneUserAgentState = true;
	for (var i=0; i < lines.length; i++) {
	for (var i = 0; i < lines.length; i++) {
		var line = lines[i];
@@ -204,24 +243,25 @@
function findRule(path, rules) {
	var matchingRule = null;

	for (var i=0; i < rules.length; i++) {
		var rule = rules[i];

		if (typeof rule.pattern === 'string') {
			if (path.indexOf(rule.pattern) !== 0) {
				continue;
			}

			// The longest matching rule takes precedence
			if (!matchingRule || rule.pattern.length > matchingRule.pattern.length) {
				matchingRule = rule;
			}
		// The first matching pattern takes precedence
		// over all other rules including other patterns
		} else if (rule.pattern.test(path)) {
			return rule;
		}
	}

	return matchingRule;
}
function findRule(path, rules) {
	var matchedRule = null;

	for (var i = 0; i < rules.length; i++) {
		var rule = rules[i];

		if (!matches(rule.pattern, path)) {
			continue;
		}

		// The longest matching rule takes precedence
		// If rules are the same length then allow takes precedence
		if (!matchedRule || rule.pattern.length > matchedRule.pattern.length) {
			matchedRule = rule;
		} else if (
			rule.pattern.length == matchedRule.pattern.length &&
			rule.allow &&
			!matchedRule.allow
		) {
			matchedRule = rule;
		}
	}

	return matchedRule;
}
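// Illustrative behaviour of the new findRule() (hypothetical rules arrays, not
// part of the diff; real rules also carry a lineNumber, which is not used here):
//
//   findRule('/page', [
//       { pattern: '/p', allow: true },
//       { pattern: '/page', allow: false }
//   ]);
//   // returns the '/page' rule: the longest matching pattern wins
//
//   findRule('/page', [
//       { pattern: '/page', allow: false },
//       { pattern: '/page', allow: true }
//   ]);
//   // returns the allow rule: on equal length, allow takes precedence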
@@ -231,6 +271,6 @@
 * Converts provided string into an URL object.
 *
 * Will return null if provided string is not a valid URL.
 *
 * @param {string} url
 * @return {?URL}
@@ -241,4 +281,8 @@ * @private
	try {
		return new URL(url);
	} catch(e) {
		// Specify a URL to be used with relative paths
		// Using non-existent subdomain so can never cause conflict unless
		// trying to crawl it but doesn't exist and even if tried worst that can
		// happen is it allows relative URLs on it.
		return new URL(url, 'http://robots-relative.samclarke.com/');
	} catch (e) {
		return null;
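// Illustrative (not part of the diff): because a base URL is now always supplied,
// a relative robots.txt URL parses instead of throwing, e.g.
//   parseUrl('/robots.txt').hostname === 'robots-relative.samclarke.com'
// while an absolute URL ignores the base entirely.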
@@ -248,3 +292,2 @@ }
function Robots(url, contents) {
@@ -254,3 +297,3 @@ this._url = parseUrl(url) || {};
	this._rules = {};
	this._rules = Object.create(null);
	this._sitemaps = [];
@@ -282,3 +325,3 @@ this._preferredHost = null;
	rules[userAgent].push({
		pattern: parsePattern(pattern),
		pattern: normaliseEncoding(pattern),
		allow: allow,
@@ -333,8 +376,10 @@ lineNumber: lineNumber
	parsedUrl.port = parsedUrl.port || '80';
	parsedUrl.port = parsedUrl.port || 80;

	// The base URL must match otherwise this robots.txt is not valid for it.
	if (parsedUrl.protocol !== this._url.protocol ||
		parsedUrl.hostname !== this._url.hostname ||
		parsedUrl.port !== this._url.port) {
	if (
		parsedUrl.protocol !== this._url.protocol ||
		parsedUrl.hostname !== this._url.hostname ||
		parsedUrl.port !== this._url.port
	) {
		return;
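// Illustrative (not part of the diff): a robots.txt loaded from
// http://www.example.com/robots.txt is not valid for
// https://www.example.com/page or http://www.example.com:8080/page because the
// protocol or port differs, so the public isAllowed() / isDisallowed() calls
// return the documented undefined for such URLs.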
@@ -344,3 +389,3 @@ }
	var rules = this._rules[userAgent] || this._rules['*'] || [];

	var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search)
	var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search);
	var rule = findRule(path, rules);
URL strings
Supply chain risk: the package contains fragments of external URLs or IP addresses, which it may be accessing at runtime.
Found 1 instance in 1 package.