robots-parser - npm Package Compare versions

Comparing version 2.1.1 to 2.2.0


package.json
{
	"name": "robots-parser",
-	"version": "2.1.1",
+	"version": "2.2.0",
	"description": "Robots.txt parser.",

@@ -10,3 +10,3 @@ "main": "index.js",

	"scripts": {
-		"test": "istanbul cover _mocha --print detail --report html --dir ./report/coverage"
+		"test": "nyc --clean --reporter html --report-dir ./report/coverage ./node_modules/.bin/mocha"
	},

@@ -20,6 +20,6 @@ "repository": {

	"devDependencies": {
-		"chai": "^3.5.0",
-		"istanbul": "^0.4.5",
-		"mocha": "^3.1.2"
+		"chai": "^4.2.0",
+		"mocha": "^6.1.4",
+		"nyc": "^14.1.1"
	}
}

README.md

@@ -46,3 +46,3 @@ # Robots Parser [![DeepScan Grade](https://deepscan.io/api/projects/1275/branches/3378/badge/grade.svg)](https://deepscan.io/dashboard/#view=project&pid=1275&bid=3378) [![GitHub license](https://img.shields.io/github/license/samclarke/robots-parser.svg)](https://github.com/samclarke/robots-parser/blob/master/license.md)

robots.getSitemaps(); // ['http://example.com/sitemap.xml']
-robots.getPreferedHost(); // example.com
+robots.getPreferredHost(); // example.com
```
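For context, here is a minimal sketch of how the renamed getter fits into typical usage of the library; the robots.txt contents below are assumed for illustration and are not part of this excerpt:

```js
var robotsParser = require('robots-parser');

// Hypothetical robots.txt contents, for illustration only
var robots = robotsParser('http://example.com/robots.txt', [
	'User-agent: *',
	'Disallow: /private/',
	'Sitemap: http://example.com/sitemap.xml',
	'Host: example.com'
].join('\n'));

robots.isAllowed('http://example.com/index.html');   // true
robots.isDisallowed('http://example.com/private/x'); // true
robots.getSitemaps();       // ['http://example.com/sitemap.xml']
robots.getPreferredHost();  // 'example.com'
```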

@@ -99,2 +99,9 @@

+### Version 2.2.0:
+ * Fix bug with matching wildcard patterns with some URLs
+ – Thanks to @ckylape for reporting and fixing
+ * Changed matching algorithm to match Google's implementation in google/robotstxt
+ * Changed order of precedence to match current spec
### Version 2.1.1:

index.js

@@ -81,3 +81,3 @@ var URL = require('url').URL;

* unicode characters.
*
* @param {string} path

@@ -111,29 +111,65 @@ * @return {string}

/**
- * Converts the pattern into a regexp if it is a wildcard
- * pattern.
+ * Matches a pattern with the specified path
 *
- * Returns a string if the pattern isn't a wildcard pattern
+ * Uses the same algorithm to match patterns as the Google implementation in
+ * google/robotstxt so it should be consistent with the spec.
 *
- * @param {string} pattern
- * @return {string|RegExp}
+ * @see https://github.com/google/robotstxt/blob/f465f0ede81099dd8bc4aeb2966b3a892bd488b3/robots.cc#L74
+ * @param {string} pattern
+ * @param {string} path
+ * @return {boolean}
 * @private
 */
-function parsePattern(pattern) {
-	var regexSpecialChars = /[\-\[\]\/\{\}\(\)\+\?\.\\\^\$\|]/g;
-	// Treat consecutive wildcards as one (#12)
-	var wildCardPattern = /\*+/g;
-	var endOfLinePattern = /\\\$$/;
-
-	pattern = normaliseEncoding(pattern)
-
-	if (pattern.indexOf('*') < 0 && pattern.indexOf('$') < 0) {
-		return pattern;
-	}
-
-	pattern = pattern
-		.replace(regexSpecialChars, '\\$&')
-		.replace(wildCardPattern, '(?:.*)')
-		.replace(endOfLinePattern, '$');
-
-	return new RegExp(pattern);
+function matches(pattern, path) {
+	// I've added extra comments to try to make this easier to understand
+
+	// Stores the lengths of all the current matching substrings.
+	// Maximum number of possible matching lengths is every length in path plus
+	// 1 to handle 0 length too (if pattern starts with * which is zero or more)
+	var matchingLengths = new Array(path.length + 1);
+	var numMatchingLengths = 1;
+
+	// Initially longest match is 0
+	matchingLengths[0] = 0;
+
+	for (var p = 0; p < pattern.length; p++) {
+		// If $ is at the end of pattern then we must match the whole path.
+		// Which is true if the longest matching length matches path length
+		if (pattern[p] === '$' && p + 1 === pattern.length) {
+			return matchingLengths[numMatchingLengths - 1] === path.length;
+		}
+
+		// Handle wildcards
+		if (pattern[p] == '*') {
+			// Wildcard so all substrings minus the current smallest matching
+			// length are matches
+			numMatchingLengths = path.length - matchingLengths[0] + 1;
+
+			// Update matching lengths to include the smallest all the way up
+			// to numMatchingLengths
+			// Don't update smallest possible match as * matches zero or more
+			// so the smallest current match is also valid
+			for (var i = 1; i < numMatchingLengths; i++) {
+				matchingLengths[i] = matchingLengths[i - 1] + 1;
+			}
+		} else {
+			// Check the char at the matching length matches the pattern, if it
+			// does increment it and add it as a valid length, ignore if not.
+			var numMatches = 0;
+			for (var i = 0; i < numMatchingLengths; i++) {
+				if (matchingLengths[i] < path.length && path[matchingLengths[i]] === pattern[p]) {
+					matchingLengths[numMatches++] = matchingLengths[i] + 1;
+				}
+			}
+
+			// No paths matched the current pattern char so not a match
+			if (numMatches == 0) {
+				return false;
+			}
+
+			numMatchingLengths = numMatches;
+		}
+	}
+
+	return true;
}
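To make the new matching semantics concrete, here are a few illustrative calls. matches() is private to the module, so this is a behavioural sketch rather than public API; the expected results follow directly from the algorithm above:

```js
// Patterns are anchored at the start of the path:
matches('/fish', '/fish/salmon.html');            // true: literal prefix match
matches('/fish', '/catfish');                     // false: must match from the first character

// '*' matches zero or more characters anywhere in the pattern:
matches('/fish*.php', '/fishheads/catfish.php');  // true: '*' spans 'heads/cat'

// '$' at the end of the pattern anchors the match to the end of the path:
matches('/*.php$', '/index.php');                 // true: path ends with '.php'
matches('/*.php$', '/index.php?x=1');             // false: path continues past '.php'
```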

@@ -207,16 +243,13 @@

-if (typeof rule.pattern === 'string') {
-	if (path.indexOf(rule.pattern) !== 0) {
-		continue;
-	}
+if (!matches(rule.pattern, path)) {
+	continue;
+}

-	// The longest matching rule takes precedence
-	if (!matchingRule || rule.pattern.length > matchingRule.pattern.length) {
-		matchingRule = rule;
-	}
-// The first matching pattern takes precedence
-// over all other rules including other patterns
-} else if (rule.pattern.test(path)) {
-	return rule;
-}
+// The longest matching rule takes precedence
+if (!matchingRule || rule.pattern.length > matchingRule.pattern.length) {
+	matchingRule = rule;
+} else if (rule.pattern.length == matchingRule.pattern.length &&
+	rule.allow && !matchingRule.allow) {
+	matchingRule = rule;
+}
}
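A sketch of how this precedence plays out end to end; the robots.txt contents here are assumed for illustration:

```js
var robotsParser = require('robots-parser');

var robots = robotsParser('http://example.com/robots.txt', [
	'User-agent: *',
	'Disallow: /fish',
	'Allow: /fish/summer'
].join('\n'));

// Both rules match, but the Allow pattern is longer, so it wins:
robots.isAllowed('http://example.com/fish/summer.html'); // true

// Only the Disallow rule matches here:
robots.isAllowed('http://example.com/fish/salmon.html'); // false
```

When an Allow and a Disallow rule match with equal pattern length, the `else if` branch above breaks the tie in favour of the Allow rule.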

@@ -229,6 +262,6 @@

* Converts provided string into an URL object.
*
* Will return null if provided string is not a valid URL.
*
* @param {string} url
* @return {?URL}

@@ -277,3 +310,3 @@ * @private

rules[userAgent].push({
-	pattern: parsePattern(pattern),
+	pattern: normaliseEncoding(pattern),
	allow: allow,
	lineNumber: lineNumber

test/Robots.js

@@ -51,3 +51,4 @@ var robotsParser = require('../index');

'http://www.example.com/fish/test.html',
-'http://www.example.com/Test.html'
+'http://www.example.com/Test.html',
+'http://www.example.com/test.html'
];

@@ -58,3 +59,2 @@

'http://www.example.com/fish/',
-'http://www.example.com/test.html'
];

@@ -69,3 +69,4 @@

'Disallow: /fish*.php',
-'Disallow: /*.dext$'
+'Disallow: /*.dext$',
+'Disallow: /dir*'
].join('\n');

@@ -75,3 +76,5 @@

'http://www.example.com/Fish.PHP',
-'http://www.example.com/Fish.dext1'
+'http://www.example.com/Fish.dext1',
+'http://www.example.com/folder/dir.html',
+'http://www.example.com/folder/dir/test.html'
];

@@ -83,3 +86,5 @@

'http://www.example.com/AnYthInG.dext',
-'http://www.example.com/Fish.dext.dext'
+'http://www.example.com/Fish.dext.dext',
+'http://www.example.com/dir/test.html',
+'http://www.example.com/directory.html'
];

@@ -90,3 +95,3 @@

-it('should have the correct order presidence for allow and disallow', function () {
+it('should have the correct order precedence for allow and disallow', function () {
var contents = [

@@ -98,2 +103,6 @@ 'User-agent: *',

'Allow: /test/',
+'Disallow: /aa/',
+'Allow: /aa/',
+'Allow: /bb/',
+'Disallow: /bb/',
].join('\n');

@@ -103,3 +112,7 @@

'http://www.example.com/test/index.html',
-'http://www.example.com/test/'
+'http://www.example.com/fish/index.php',
+'http://www.example.com/test/',
+'http://www.example.com/aa/',
+'http://www.example.com/bb/',
+'http://www.example.com/x/'
];

@@ -110,3 +123,2 @@

'http://www.example.com/fishheads/catfish.php?parameters',
-'http://www.example.com/fish/index.php',
'http://www.example.com/test'

@@ -118,2 +130,88 @@ ];

+it('should have the correct order precedence for wildcards', function () {
+	var contents = [
+		'User-agent: *',
+		'Disallow: /*/',
+		'Allow: /x/',
+	].join('\n');
+
+	var allowed = [
+		'http://www.example.com/x/',
+		'http://www.example.com/fish.php',
+		'http://www.example.com/test'
+	];
+
+	var disallowed = [
+		'http://www.example.com/a/',
+		'http://www.example.com/xx/',
+		'http://www.example.com/test/index.html'
+	];
+
+	testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
+});
+
+it('should parse lines delimited by \\r', function () {
+	var contents = [
+		'User-agent: *',
+		'Disallow: /fish/',
+		'Disallow: /test.html'
+	].join('\r');
+
+	var allowed = [
+		'http://www.example.com/fish',
+		'http://www.example.com/Test.html'
+	];
+
+	var disallowed = [
+		'http://www.example.com/fish/index.php',
+		'http://www.example.com/fish/',
+		'http://www.example.com/test.html'
+	];
+
+	testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
+});
+
+it('should parse lines delimited by \\r\\n', function () {
+	var contents = [
+		'User-agent: *',
+		'Disallow: /fish/',
+		'Disallow: /test.html'
+	].join('\r\n');
+
+	var allowed = [
+		'http://www.example.com/fish',
+		'http://www.example.com/Test.html'
+	];
+
+	var disallowed = [
+		'http://www.example.com/fish/index.php',
+		'http://www.example.com/fish/',
+		'http://www.example.com/test.html'
+	];
+
+	testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
+});
+
+it('should parse lines delimited by mixed line endings', function () {
+	var contents = [
+		'User-agent: *\r',
+		'Disallow: /fish/\r\n',
+		'Disallow: /test.html\n\n'
+	].join('');
+
+	var allowed = [
+		'http://www.example.com/fish',
+		'http://www.example.com/Test.html'
+	];
+
+	var disallowed = [
+		'http://www.example.com/fish/index.php',
+		'http://www.example.com/fish/',
+		'http://www.example.com/test.html'
+	];
+
+	testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
+});
it('should ignore rules that are not in a group', function () {

@@ -602,3 +700,6 @@ var contents = [

'disallow: /t*t',
+// check UA returns -1 if no matching UA and also handles patterns with both allow and disallow
+'',
+'User-agent: c',
+'Disallow: /fish*.php',
+'Allow: /fish/index.php'
].join('\n');

@@ -614,8 +715,32 @@

expect(robots.getMatchingLineNumber('http://www.example.com/fish/')).to.equal(4);
-expect(robots.getMatchingLineNumber('http://www.example.com/test.html')).to.equal(5);
+expect(robots.getMatchingLineNumber('http://www.example.com/test.html')).to.equal(7);
+expect(robots.getMatchingLineNumber('http://www.example.com/test.html', 'a')).to.equal(10);
+expect(robots.getMatchingLineNumber('http://www.example.com/test.html', 'b')).to.equal(14);
+expect(robots.getMatchingLineNumber('http://www.example.com/fish.php', 'c')).to.equal(17);
+expect(robots.getMatchingLineNumber('http://www.example.com/fish/index.php', 'c')).to.equal(18);
});
+it('should handle large wildcards efficiently', function () {
+	var contents = [
+		'User-agent: *',
+		'Disallow: /' + '*'.repeat(2048) + '.html',
+	].join('\n');
+
+	var allowed = [
+		'http://www.example.com/' + 'sub'.repeat(2048) + 'folder/index.php',
+	];
+
+	var disallowed = [
+		'http://www.example.com/secret.html'
+	];
+
+	const start = Date.now();
+	testRobots('http://www.eXample.com/robots.txt', contents, allowed, disallowed);
+	const end = Date.now();
+
+	// Should take less than 500 ms (high to allow for variability of
+	// machines running the test, should normally be much less)
+	expect(end - start).to.be.lessThan(500);
+});
});