robots-txt-guard - npm Package Compare versions

Comparing version 0.1.1 to 0.2.1


lib/guard.js

@@ -16,14 +16,32 @@ 'use strict';

 .forEach(function (group) {
-  var rules = group.rules
-    .filter(function (rule) {
-      return !!rule.path;
-    })
-    .map(function (rule) {
-      return {
-        pattern: patterns.path(rule.path),
-        allow: rule.rule.toLowerCase() !== 'disallow'
-      };
-    })
+  const accessibilityRules = group.rules
+    .filter(({ rule, path }) => !!path && ['allow', 'disallow'].includes(rule.toLowerCase()))
+    .reduce((group, { rule, path }) => {
+      const repeatedPath = group.find((rule) => rule.path === path);
+      if (repeatedPath) {
+        if (rule.toLowerCase() === 'allow') {
+          repeatedPath.rule = 'allow';
+        }
+      } else {
+        group.push({
+          rule,
+          path
+        });
+      }
+      return group;
+    }, [])
+    .map(({ rule, path }) => ({
+      pattern: patterns.path(path),
+      allow: rule.toLowerCase() !== 'disallow'
+    }))
     .sort(moreSpecificFirst);
+  const indexabilityRules = group.rules
+    .filter(({ rule, path }) => !!path && ['noindex'].includes(rule.toLowerCase()))
+    .map(({ rule, path }) => ({
+      pattern: patterns.path(path),
+      allow: rule.toLowerCase() !== 'noindex'
+    }))
+    .sort(moreSpecificFirst);
 group.agents

@@ -33,3 +51,4 @@ .forEach(function (agent) {

       pattern: patterns.userAgent(agent),
-      rules: rules
+      accessibilityRules,
+      indexabilityRules
     });

@@ -42,4 +61,3 @@ });

 function findGroup(userAgent) {
-  for (var i = 0; i < groups.length; i++) {
-    var group = groups[i];
+  for (const group of groups) {
     if (group.pattern.test(userAgent)) {

@@ -53,4 +71,3 @@ return group;

 function matchRule(rules, path) {
-  for (var i = 0; i < rules.length; i++) {
-    var rule = rules[i];
+  for (const rule of rules) {
     if (rule.pattern.test(path)) {

@@ -60,10 +77,10 @@ return rule.allow;

   }
-  // no rule matched? assume allowed
+  // no rule matched? assume true
   return true;
 }
-function isAllowed(userAgent, path) {
+function isRuleSetAllowed(ruleSet, userAgent, path) {
   var group = findGroup(userAgent);
   if (group) {
-    return matchRule(group.rules, path);
+    return matchRule(group[ruleSet], path);
   }

@@ -74,7 +91,15 @@ // no group matched? assume allowed

-function isDissalowAll(userAgent) {
+function isAllowed(userAgent, path) {
+  return isRuleSetAllowed('accessibilityRules', userAgent, path);
+}
+function isIndexable(userAgent, path) {
+  return isRuleSetAllowed('indexabilityRules', userAgent, path);
+}
+function isDisallowAll(userAgent) {
   var group = findGroup(userAgent);
   if (group) {
-    var allowRules = group.rules.filter(function (rule) {
-      return rule.allow;
+    var allowRules = group.accessibilityRules.filter(function ({ pattern, allow }) {
+      return allow || pattern.specificity > 1;
     });

@@ -89,4 +114,5 @@ return allowRules.length <= 0;

     isAllowed: isAllowed,
-    isDissalowAll: isDissalowAll
+    isDisallowAll: isDisallowAll,
+    isIndexable: isIndexable
   };
 };
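In short, 0.2.1 splits each group's rules into two independently sorted rule sets: accessibilityRules (allow/disallow) and indexabilityRules (noindex), renames the misspelled isDissalowAll export to isDisallowAll, adds isIndexable, and makes isDisallowAll treat any rule more specific than '/' as evidence that not everything is disallowed. When the same path carries both an allow and a disallow rule, the new reduce step keeps a single entry and lets allow win. A minimal standalone sketch of just that merge step (the sample rules below are made up for illustration; in the package the input comes from the parsed robots.txt config passed to guard()):

```js
// Sketch of the duplicate-path merge that 0.2.1 applies before building
// accessibilityRules. Sample input only; not the package's real data flow.
const rules = [
  { rule: 'disallow', path: '/fish' },
  { rule: 'ALLOW', path: '/fish' },   // same path: allow should win, case-insensitively
  { rule: 'disallow', path: '/tmp' }
];

const merged = rules
  .filter(({ rule, path }) => !!path && ['allow', 'disallow'].includes(rule.toLowerCase()))
  .reduce((acc, { rule, path }) => {
    const repeated = acc.find((r) => r.path === path);
    if (repeated) {
      // an already-seen path is only ever upgraded to allow, never downgraded
      if (rule.toLowerCase() === 'allow') repeated.rule = 'allow';
    } else {
      acc.push({ rule, path });
    }
    return acc;
  }, []);

console.log(merged);
// [ { rule: 'allow', path: '/fish' }, { rule: 'disallow', path: '/tmp' } ]
```

This merge is what the new "allow should get priority" tests further down exercise: isAllowed is expected to return true for /fish regardless of the order in which the allow and disallow rules appear.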
package.json

 {
   "name": "robots-txt-guard",
-  "version": "0.1.1",
+  "version": "0.2.1",
   "description": "Validate urls against robots.txt rules.",

@@ -5,0 +5,0 @@ "main": "lib/guard.js",

README.md

@@ -20,3 +20,4 @@ # robots-txt-guard [![Build Status](https://travis-ci.org/Woorank/robots-txt-guard.svg)](https://travis-ci.org/Woorank/robots-txt-guard)

   { rule: 'disallow', path: '/tmp/*' },
-  { rule: 'disallow', path: '/temporary/*' }
+  { rule: 'disallow', path: '/temporary/*' },
+  { rule: 'noindex', path: '/temporary/*' }
 ]

@@ -29,2 +30,5 @@ }]

 robotsTxt.isAllowed('googlebot-news', '/home.html'); // true
+robotsTxt.isIndexable('googlebot', '/tmp/*'); // true
+robotsTxt.isIndexable('googlebot', '/temporary/*'); // false
 ```
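Putting the updated README fragments together, a usage sketch of the 0.2.1 API could look like the following. The config literal mirrors the README rules above; the agent names and extra paths are illustrative only, and guard() expects an already-parsed robots.txt structure (it does not fetch or parse robots.txt itself).

```js
var guard = require('robots-txt-guard');

// Illustrative parsed robots.txt structure, based on the README example above.
var robotsTxt = guard({
  groups: [{
    agents: [ '*' ],
    rules: [
      { rule: 'disallow', path: '/tmp/*' },
      { rule: 'disallow', path: '/temporary/*' },
      { rule: 'noindex', path: '/temporary/*' }
    ]
  }]
});

robotsTxt.isAllowed('googlebot', '/home.html');      // true  (no rule matches)
robotsTxt.isAllowed('googlebot', '/tmp/file');       // false (disallowed)
robotsTxt.isIndexable('googlebot', '/tmp/*');        // true  (disallowed but not noindexed)
robotsTxt.isIndexable('googlebot', '/temporary/*');  // false (noindex rule matches)
robotsTxt.isDisallowAll('googlebot');                // false (only specific paths are disallowed)
```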

@@ -38,5 +38,141 @@ /*global describe, it*/

   assert.ok(robotsTxt.isAllowed('default', '/hello'), '5');
 });
+// https://stackoverflow.com/a/4589497/419436
+it('allow should get priority', function () {
+  // all groups should behave the same, regardless of the order of the rules
+  var robotsTxt = guard({
+    groups: [{
+      agents: [ 'agent1' ],
+      rules: [
+        { rule: 'allow', path: '/fish' },
+        { rule: 'disallow', path: '/fish' }
+      ]
+    }, {
+      agents: [ 'agent2' ],
+      rules: [
+        { rule: 'disallow', path: '/fish' },
+        { rule: 'allow', path: '/fish' }
+      ]
+    }, {
+      agents: [ 'agent3' ],
+      rules: [
+        { rule: 'disallow', path: '/fish' },
+        { rule: 'ALLOW', path: '/fish' }
+      ]
+    }]
+  });
+  assert.ok(robotsTxt.isAllowed('agent1', '/hello'), '1');
+  assert.ok(robotsTxt.isAllowed('agent1', '/fish'), '2');
+  assert.ok(robotsTxt.isAllowed('agent2', '/hello'), '3');
+  assert.ok(robotsTxt.isAllowed('agent2', '/fish'), '4');
+  assert.ok(robotsTxt.isAllowed('agent3', '/hello'), '5');
+  assert.ok(robotsTxt.isAllowed('agent3', '/fish'), '6');
+  assert.ok(robotsTxt.isAllowed('default', '/hello'), '7');
+});
+it('should have the correct behaviour when no / is added at the end of the path', function () {
+  // both groups should behave the same, regardless of the order of the rules
+  var robotsTxt = guard({
+    groups: [{
+      agents: [ 'agent1' ],
+      rules: [
+        { rule: 'allow', path: '/fish' },
+        { rule: 'disallow', path: '/' }
+      ]
+    }, {
+      agents: [ 'agent2' ],
+      rules: [
+        { rule: 'disallow', path: '/fish' }
+      ]
+    }]
+  });
+  assert.notOk(robotsTxt.isAllowed('agent1', '/hello'), '1');
+  assert.ok(robotsTxt.isAllowed('agent1', '/fish'), '2');
+  assert.ok(robotsTxt.isAllowed('agent1', '/fish01'), '3');
+  assert.ok(robotsTxt.isAllowed('agent1', '/fish/'), '4');
+  assert.ok(robotsTxt.isAllowed('agent2', '/hello'), '5');
+  assert.notOk(robotsTxt.isAllowed('agent2', '/fish'), '6');
+  assert.notOk(robotsTxt.isAllowed('agent2', '/fish01'), '7');
+  assert.notOk(robotsTxt.isAllowed('agent2', '/fish/'), '8');
+  assert.ok(robotsTxt.isAllowed('default', '/hello'), '9');
+  assert.notOk(robotsTxt.isDisallowAll('agent1'), '10');
+  assert.notOk(robotsTxt.isDisallowAll('agent2'), '11');
+});
+it('should have the correct behaviour when / is added at the end of the path', function () {
+  // both groups should behave the same, regardless of the order of the rules
+  var robotsTxt = guard({
+    groups: [{
+      agents: [ 'agent1' ],
+      rules: [
+        { rule: 'allow', path: '/fish/' },
+        { rule: 'disallow', path: '/' }
+      ]
+    }, {
+      agents: [ 'agent2' ],
+      rules: [
+        { rule: 'disallow', path: '/fish/' }
+      ]
+    }]
+  });
+  assert.notOk(robotsTxt.isAllowed('agent1', '/hello'), '1');
+  assert.notOk(robotsTxt.isAllowed('agent1', '/fish'), '2');
+  assert.notOk(robotsTxt.isAllowed('agent1', '/fish01'), '3');
+  assert.ok(robotsTxt.isAllowed('agent1', '/fish/'), '4');
+  assert.ok(robotsTxt.isAllowed('agent1', '/fish/path1'), '5');
+  assert.ok(robotsTxt.isAllowed('agent2', '/hello'), '6');
+  assert.ok(robotsTxt.isAllowed('agent2', '/fish'), '7');
+  assert.ok(robotsTxt.isAllowed('agent2', '/fish01'), '8');
+  assert.notOk(robotsTxt.isAllowed('agent2', '/fish/'), '9');
+  assert.ok(robotsTxt.isAllowed('default', '/hello'), '10');
+  assert.notOk(robotsTxt.isDisallowAll('agent1'), '11');
+  assert.notOk(robotsTxt.isDisallowAll('agent2'), '12');
+});
+it('noindex shouldn\'t interfere with allow', function () {
+  // both groups should behave the same, regardless of the order of the rules
+  var robotsTxt = guard({
+    groups: [{
+      agents: [ 'agent1' ],
+      rules: [
+        { rule: 'noindex', path: '/fish' },
+        { rule: 'disallow', path: '/fish' }
+      ]
+    }, {
+      agents: [ 'agent2' ],
+      rules: [
+        { rule: 'disallow', path: '/fish' },
+        { rule: 'noindex', path: '/fish' }
+      ]
+    }, {
+      agents: [ 'agent3' ],
+      rules: [
+        { rule: 'disallow', path: '/' },
+        { rule: 'noindex', path: '/fish' }
+      ]
+    }]
+  });
+  assert.notOk(robotsTxt.isAllowed('agent1', '/fish'), '1');
+  assert.notOk(robotsTxt.isAllowed('agent2', '/fish'), '2');
+  assert.notOk(robotsTxt.isAllowed('agent3', '/fish'), '3');
+});
 it('should pick most specific agent', function () {

@@ -79,3 +215,2 @@

   assert.ok(robotsTxt.isAllowed('agent2', '/disallow3'), '12');
 });

@@ -97,3 +232,2 @@

   assert.ok(robotsTxt.isAllowed('agent', '/path'), '2');
 });

@@ -119,7 +253,33 @@

-  assert.isTrue(robotsTxt.isDissalowAll('somebot'));
-  assert.isFalse(robotsTxt.isDissalowAll('googlebot'));
+  assert.isTrue(robotsTxt.isDisallowAll('somebot'));
+  assert.isFalse(robotsTxt.isDisallowAll('googlebot'));
 });
+it('should detect disallow all', function () {
+  var robotsTxt = guard({
+    groups: [{
+      agents: [ '*' ],
+      rules: [
+        { rule: 'disallow', path: '/' },
+        { rule: 'noindex', path: '/fish' }
+      ]
+    }]
+  });
+  assert.isTrue(robotsTxt.isDisallowAll('somebot'));
+});
+it('should detect that not all paths are disallowed when only disallowing specific paths', function () {
+  var robotsTxt = guard({
+    groups: [{
+      agents: [ '*' ],
+      rules: [
+        { rule: 'disallow', path: '/fish' }
+      ]
+    }]
+  });
+  assert.isFalse(robotsTxt.isDisallowAll('somebot'));
+});
 it('should correctly detect if path is allowed with noindex', function () {

@@ -149,2 +309,43 @@

+it('should detect if path is indexable', function () {
+  var robotsTxt = guard(
+    {
+      groups: [
+        {
+          agents: [ '*' ],
+          rules: [
+            { rule: 'allow', path: '/path1' },
+            { rule: 'noindex', path: '/path1' },
+            { rule: 'disallow', path: '/*/path2/' },
+            { rule: 'noindex', path: '/*/path2/' },
+            { rule: 'noindex', path: '/*/path3/' },
+            { rule: 'allow', path: '/path4/' },
+            { rule: 'disallow', path: '/path5/' }
+          ]
+        },
+        {
+          agents: [ 'googlebot' ],
+          rules: [
+            { rule: 'disallow', path: '/path1' },
+            { rule: 'allow', path: '/path2' },
+            { rule: 'noindex', path: '/path3' }
+          ]
+        }
+      ]
+    }
+  );
+  assert.ok(robotsTxt.isIndexable('*', '/'), '1');
+  assert.notOk(robotsTxt.isIndexable('*', '/path1'), '2');
+  assert.notOk(robotsTxt.isIndexable('*', '/*/path2/'), '3');
+  assert.notOk(robotsTxt.isIndexable('*', '/*/path3/'), '4');
+  assert.ok(robotsTxt.isIndexable('*', '/path4/'), '5');
+  assert.ok(robotsTxt.isIndexable('*', '/path5/'), '6');
+  assert.ok(robotsTxt.isIndexable('googlebot', '/path1/'), '7');
+  assert.ok(robotsTxt.isIndexable('googlebot', '/path2/'), '8');
+  assert.notOk(robotsTxt.isIndexable('googlebot', '/path3/'), '9');
+});
 });