robots-txt-guard
Comparing version 0.1.1 to 0.2.1
@@ -16,14 +16,32 @@ 'use strict';
     .forEach(function (group) {
-      var rules = group.rules
-        .filter(function (rule) {
-          return !!rule.path;
-        })
-        .map(function (rule) {
-          return {
-            pattern: patterns.path(rule.path),
-            allow: rule.rule.toLowerCase() !== 'disallow'
-          };
-        })
+      const accessibilityRules = group.rules
+        .filter(({ rule, path }) => !!path && ['allow', 'disallow'].includes(rule.toLowerCase()))
+        .reduce((group, { rule, path }) => {
+          const repeatedPath = group.find((rule) => rule.path === path);
+          if (repeatedPath) {
+            if (rule.toLowerCase() === 'allow') {
+              repeatedPath.rule = 'allow';
+            }
+          } else {
+            group.push({
+              rule,
+              path
+            });
+          }
+          return group;
+        }, [])
+        .map(({ rule, path }) => ({
+          pattern: patterns.path(path),
+          allow: rule.toLowerCase() !== 'disallow'
+        }))
+        .sort(moreSpecificFirst);
+
+      const indexabilityRules = group.rules
+        .filter(({ rule, path }) => !!path && ['noindex'].includes(rule.toLowerCase()))
+        .map(({ rule, path }) => ({
+          pattern: patterns.path(path),
+          allow: rule.toLowerCase() !== 'noindex'
+        }))
         .sort(moreSpecificFirst);

       group.agents
@@ -33,3 +51,4 @@ .forEach(function (agent) {
           pattern: patterns.userAgent(agent),
-          rules: rules
+          accessibilityRules,
+          indexabilityRules
         });
@@ -42,4 +61,3 @@ });
   function findGroup(userAgent) {
-    for (var i = 0; i < groups.length; i++) {
-      var group = groups[i];
+    for (const group of groups) {
       if (group.pattern.test(userAgent)) {
@@ -53,4 +71,3 @@ return group;
   function matchRule(rules, path) {
-    for (var i = 0; i < rules.length; i++) {
-      var rule = rules[i];
+    for (const rule of rules) {
       if (rule.pattern.test(path)) {
@@ -60,10 +77,10 @@ return rule.allow;
     }
-    // no rule matched? assume allowed
+    // no rule matched? assume true
     return true;
   }

-  function isAllowed(userAgent, path) {
+  function isRuleSetAllowed(ruleSet, userAgent, path) {
     var group = findGroup(userAgent);
     if (group) {
-      return matchRule(group.rules, path);
+      return matchRule(group[ruleSet], path);
     }
@@ -74,7 +91,15 @@ // no group matched? assume allowed
-  function isDissalowAll(userAgent) {
+  function isAllowed(userAgent, path) {
+    return isRuleSetAllowed('accessibilityRules', userAgent, path);
+  }
+
+  function isIndexable(userAgent, path) {
+    return isRuleSetAllowed('indexabilityRules', userAgent, path);
+  }
+
+  function isDisallowAll(userAgent) {
     var group = findGroup(userAgent);
     if (group) {
-      var allowRules = group.rules.filter(function (rule) {
-        return rule.allow;
+      var allowRules = group.accessibilityRules.filter(function ({ pattern, allow }) {
+        return allow || pattern.specificity > 1;
       });
@@ -89,4 +114,5 @@ return allowRules.length <= 0;
     isAllowed: isAllowed,
-    isDissalowAll: isDissalowAll
+    isDisallowAll: isDisallowAll,
+    isIndexable: isIndexable
   };
 };
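The core of the 0.2.1 change is visible above: where 0.1.1 built a single `rules` list per group, the new code splits each group into `accessibilityRules` (allow/disallow) and `indexabilityRules` (noindex), and a new `reduce` pass collapses duplicate paths so that `allow` beats `disallow` for the same path, regardless of rule order. A minimal sketch of that dedup step in isolation (the `dedupeAccessRules` name and the `console.log` harness are ours, not part of the package):

```js
'use strict';

// Standalone sketch of the new dedup pass over plain { rule, path }
// records; just the filter + reduce steps from the diff in isolation.
function dedupeAccessRules (rules) {
  return rules
    .filter(({ rule, path }) => !!path && ['allow', 'disallow'].includes(rule.toLowerCase()))
    .reduce((acc, { rule, path }) => {
      const repeated = acc.find((existing) => existing.path === path);
      if (repeated) {
        // Same path listed twice: 'allow' wins over 'disallow'.
        if (rule.toLowerCase() === 'allow') {
          repeated.rule = 'allow';
        }
      } else {
        acc.push({ rule, path });
      }
      return acc;
    }, []);
}

console.log(dedupeAccessRules([
  { rule: 'disallow', path: '/fish' },
  { rule: 'ALLOW', path: '/fish' },
  { rule: 'noindex', path: '/fish' } // filtered out: not an access rule
]));
// -> [ { rule: 'allow', path: '/fish' } ]
```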
 {
   "name": "robots-txt-guard",
-  "version": "0.1.1",
+  "version": "0.2.1",
   "description": "Validate urls against robots.txt rules.",
   "main": "lib/guard.js",
@@ -20,3 +20,4 @@ # robots-txt-guard [![Build Status](https://travis-ci.org/Woorank/robots-txt-guard.svg)](https://travis-ci.org/Woorank/robots-txt-guard)
       { rule: 'disallow', path: '/tmp/*' },
-      { rule: 'disallow', path: '/temporary/*' }
+      { rule: 'disallow', path: '/temporary/*' },
+      { rule: 'noindex', path: '/temporary/*' }
     ]
@@ -29,2 +30,5 @@ }]
 robotsTxt.isAllowed('googlebot-news', '/home.html'); // true
+robotsTxt.isIndexable('googlebot', '/tmp/*'); // true
+robotsTxt.isIndexable('googlebot', '/temporary/*'); // false
 ```
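Read together with the config above, the new README lines exercise the added `isIndexable` method. A hedged end-to-end sketch of the 0.2.1 surface, assuming pre-parsed input in the shape the README documents (in practice this object usually comes from a robots.txt parser such as robots-txt-parse):

```js
'use strict';

const guard = require('robots-txt-guard');

// Pre-parsed robots.txt, in the shape the README shows above.
const robotsTxt = guard({
  groups: [{
    agents: [ '*' ],
    rules: [
      { rule: 'disallow', path: '/tmp/*' },
      { rule: 'disallow', path: '/temporary/*' },
      { rule: 'noindex', path: '/temporary/*' }
    ]
  }]
});

console.log(robotsTxt.isAllowed('googlebot', '/tmp/file'));         // false: disallowed
console.log(robotsTxt.isIndexable('googlebot', '/tmp/file'));       // true: no noindex rule matches
console.log(robotsTxt.isIndexable('googlebot', '/temporary/file')); // false: noindex matches
console.log(robotsTxt.isDisallowAll('googlebot'));                  // false: only specific paths blocked
```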
@@ -38,5 +38,141 @@ /*global describe, it*/
     assert.ok(robotsTxt.isAllowed('default', '/hello'), '5');
   });

+  // https://stackoverflow.com/a/4589497/419436
+  it('allow should get priority', function () {
+    // all groups should behave the same, regardless of the order of the rules
+    var robotsTxt = guard({
+      groups: [{
+        agents: [ 'agent1' ],
+        rules: [
+          { rule: 'allow', path: '/fish' },
+          { rule: 'disallow', path: '/fish' }
+        ]
+      }, {
+        agents: [ 'agent2' ],
+        rules: [
+          { rule: 'disallow', path: '/fish' },
+          { rule: 'allow', path: '/fish' }
+        ]
+      }, {
+        agents: [ 'agent3' ],
+        rules: [
+          { rule: 'disallow', path: '/fish' },
+          { rule: 'ALLOW', path: '/fish' }
+        ]
+      }]
+    });
+
+    assert.ok(robotsTxt.isAllowed('agent1', '/hello'), '1');
+    assert.ok(robotsTxt.isAllowed('agent1', '/fish'), '2');
+    assert.ok(robotsTxt.isAllowed('agent2', '/hello'), '3');
+    assert.ok(robotsTxt.isAllowed('agent2', '/fish'), '4');
+    assert.ok(robotsTxt.isAllowed('agent3', '/hello'), '5');
+    assert.ok(robotsTxt.isAllowed('agent3', '/fish'), '6');
+    assert.ok(robotsTxt.isAllowed('default', '/hello'), '7');
+  });
+
+  it('should have the correct behaviour when no / is added at the end of the path', function () {
+    // both groups should behave the same, regardless of the order of the rules
+    var robotsTxt = guard({
+      groups: [{
+        agents: [ 'agent1' ],
+        rules: [
+          { rule: 'allow', path: '/fish' },
+          { rule: 'disallow', path: '/' }
+        ]
+      }, {
+        agents: [ 'agent2' ],
+        rules: [
+          { rule: 'disallow', path: '/fish' }
+        ]
+      }]
+    });
+
+    assert.notOk(robotsTxt.isAllowed('agent1', '/hello'), '1');
+    assert.ok(robotsTxt.isAllowed('agent1', '/fish'), '2');
+    assert.ok(robotsTxt.isAllowed('agent1', '/fish01'), '3');
+    assert.ok(robotsTxt.isAllowed('agent1', '/fish/'), '4');
+    assert.ok(robotsTxt.isAllowed('agent2', '/hello'), '5');
+    assert.notOk(robotsTxt.isAllowed('agent2', '/fish'), '6');
+    assert.notOk(robotsTxt.isAllowed('agent2', '/fish01'), '7');
+    assert.notOk(robotsTxt.isAllowed('agent2', '/fish/'), '8');
+    assert.ok(robotsTxt.isAllowed('default', '/hello'), '9');
+    assert.notOk(robotsTxt.isDisallowAll('agent1'), '10');
+    assert.notOk(robotsTxt.isDisallowAll('agent2'), '11');
+  });
+
+  it('should have the correct behaviour when / is added at the end of the path', function () {
+    // both groups should behave the same, regardless of the order of the rules
+    var robotsTxt = guard({
+      groups: [{
+        agents: [ 'agent1' ],
+        rules: [
+          { rule: 'allow', path: '/fish/' },
+          { rule: 'disallow', path: '/' }
+        ]
+      }, {
+        agents: [ 'agent2' ],
+        rules: [
+          { rule: 'disallow', path: '/fish/' }
+        ]
+      }]
+    });
+
+    assert.notOk(robotsTxt.isAllowed('agent1', '/hello'), '1');
+    assert.notOk(robotsTxt.isAllowed('agent1', '/fish'), '2');
+    assert.notOk(robotsTxt.isAllowed('agent1', '/fish01'), '3');
+    assert.ok(robotsTxt.isAllowed('agent1', '/fish/'), '4');
+    assert.ok(robotsTxt.isAllowed('agent1', '/fish/path1'), '5');
+    assert.ok(robotsTxt.isAllowed('agent2', '/hello'), '6');
+    assert.ok(robotsTxt.isAllowed('agent2', '/fish'), '7');
+    assert.ok(robotsTxt.isAllowed('agent2', '/fish01'), '8');
+    assert.notOk(robotsTxt.isAllowed('agent2', '/fish/'), '9');
+    assert.ok(robotsTxt.isAllowed('default', '/hello'), '10');
+    assert.notOk(robotsTxt.isDisallowAll('agent1'), '11');
+    assert.notOk(robotsTxt.isDisallowAll('agent2'), '12');
+  });
+
+  it('noindex shouldn\'t interfere with allow', function () {
+    // both groups should behave the same, regardless of the order of the rules
+    var robotsTxt = guard({
+      groups: [{
+        agents: [ 'agent1' ],
+        rules: [
+          { rule: 'noindex', path: '/fish' },
+          { rule: 'disallow', path: '/fish' }
+        ]
+      }, {
+        agents: [ 'agent2' ],
+        rules: [
+          { rule: 'disallow', path: '/fish' },
+          { rule: 'noindex', path: '/fish' }
+        ]
+      }, {
+        agents: [ 'agent3' ],
+        rules: [
+          { rule: 'disallow', path: '/' },
+          { rule: 'noindex', path: '/fish' }
+        ]
+      }]
+    });
+
+    assert.notOk(robotsTxt.isAllowed('agent1', '/fish'), '1');
+    assert.notOk(robotsTxt.isAllowed('agent2', '/fish'), '2');
+    assert.notOk(robotsTxt.isAllowed('agent3', '/fish'), '3');
+  });
+
   it('should pick most specific agent', function () {
@@ -79,3 +215,2 @@
     assert.ok(robotsTxt.isAllowed('agent2', '/disallow3'), '12');
   });
@@ -97,3 +232,2 @@
     assert.ok(robotsTxt.isAllowed('agent', '/path'), '2');
   });
@@ -119,7 +253,33 @@
-    assert.isTrue(robotsTxt.isDissalowAll('somebot'));
-    assert.isFalse(robotsTxt.isDissalowAll('googlebot'));
+    assert.isTrue(robotsTxt.isDisallowAll('somebot'));
+    assert.isFalse(robotsTxt.isDisallowAll('googlebot'));
   });

+  it('should detect disallow all', function () {
+    var robotsTxt = guard({
+      groups: [{
+        agents: [ '*' ],
+        rules: [
+          { rule: 'disallow', path: '/' },
+          { rule: 'noindex', path: '/fish' }
+        ]
+      }]
+    });
+
+    assert.isTrue(robotsTxt.isDisallowAll('somebot'));
+  });
+
+  it('should detect that not all paths are disallowed when only disallowing specific paths', function () {
+    var robotsTxt = guard({
+      groups: [{
+        agents: [ '*' ],
+        rules: [
+          { rule: 'disallow', path: '/fish' }
+        ]
+      }]
+    });
+
+    assert.isFalse(robotsTxt.isDisallowAll('somebot'));
+  });
+
   it('should correctly detect if path is allowed with noindex', function () {
@@ -149,2 +309,43 @@
+  it('should detect if path is indexable', function () {
+    var robotsTxt = guard(
+      {
+        groups: [
+          {
+            agents: [ '*' ],
+            rules: [
+              { rule: 'allow', path: '/path1' },
+              { rule: 'noindex', path: '/path1' },
+              { rule: 'disallow', path: '/*/path2/' },
+              { rule: 'noindex', path: '/*/path2/' },
+              { rule: 'noindex', path: '/*/path3/' },
+              { rule: 'allow', path: '/path4/' },
+              { rule: 'disallow', path: '/path5/' }
+            ]
+          },
+          {
+            agents: [ 'googlebot' ],
+            rules: [
+              { rule: 'disallow', path: '/path1' },
+              { rule: 'allow', path: '/path2' },
+              { rule: 'noindex', path: '/path3' }
+            ]
+          }
+        ]
+      }
+    );
+
+    assert.ok(robotsTxt.isIndexable('*', '/'), '1');
+    assert.notOk(robotsTxt.isIndexable('*', '/path1'), '2');
+    assert.notOk(robotsTxt.isIndexable('*', '/*/path2/'), '3');
+    assert.notOk(robotsTxt.isIndexable('*', '/*/path3/'), '4');
+    assert.ok(robotsTxt.isIndexable('*', '/path4/'), '5');
+    assert.ok(robotsTxt.isIndexable('*', '/path5/'), '6');
+    assert.ok(robotsTxt.isIndexable('googlebot', '/path1/'), '7');
+    assert.ok(robotsTxt.isIndexable('googlebot', '/path2/'), '8');
+    assert.notOk(robotsTxt.isIndexable('googlebot', '/path3/'), '9');
+  });
 });
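One behaviour the new tests pin down: `isDisallowAll` (note the typo fix from `isDissalowAll`) now counts a path-specific disallow as evidence that not everything is blocked, via the `allow || pattern.specificity > 1` filter in guard.js. Only a group whose sole surviving access rule is the bare catch-all `disallow: '/'` is treated as blocking everything. A small sketch of that contract, mirroring the two new test cases (agent and variable names are ours):

```js
'use strict';

const guard = require('robots-txt-guard');

// Catch-all disallow: nothing survives the specificity filter.
const blanket = guard({
  groups: [{
    agents: [ '*' ],
    rules: [ { rule: 'disallow', path: '/' } ]
  }]
});

// Path-specific disallow: the rule is specific, so the site is not fully blocked.
const targeted = guard({
  groups: [{
    agents: [ '*' ],
    rules: [ { rule: 'disallow', path: '/fish' } ]
  }]
});

console.log(blanket.isDisallowAll('somebot'));  // true: only the catch-all disallow remains
console.log(targeted.isDisallowAll('somebot')); // false: '/fish' leaves the rest of the site open
```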
License Policy Violation
This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
New author
Supply chain risk: A new npm collaborator published a version of the package for the first time. New collaborators are usually benign additions to a project, but do indicate a change to the security surface area of a package.
Found 1 instance in 1 package