Socket
Socket
Sign inDemoInstall

filterhtml

Package Overview
Dependencies
0
Maintainers
1
Versions
11
Alerts
File Explorer

Advanced tools

Install Socket

Detect and block malicious and high-risk dependencies

Install

Comparing version 0.5.0 to 0.6.0

.venv/bin/activate

242

lib/FilterHTML.js

@@ -5,5 +5,23 @@ var FilterHTML = (function() {

// Single-character classifiers and lookup tables shared by the filter.

// Matches exactly one character that is a lowercase ASCII letter or a hyphen.
// NOTE(review): presumably used while scanning attribute names - the call site is not visible here.
var ATTR_REGEX = /^[a-z\-]$/;
// Matches exactly one character that is rejected inside an unquoted attribute
// value: double quote, single quote, backtick, '=', '<' or '>'.
var UNQUOTED_INVALID_REGEX = /^[\"'`=<>]$/;
// Matches exactly one whitespace character.
var WHITESPACE_REGEX = /^\s$/;
// Matches a (single-line) string that contains the sequence "&#" anywhere,
// i.e. the start of a numeric character reference.
var UNICODE_REGEX = /^.*&#.*$/;
// Matches a (single-line) string that contains a backslash immediately
// followed by a hexadecimal digit, i.e. the start of a CSS escape such as "\3A".
var CSS_ESCAPE = /^.*\\[0-9A-Fa-f].*$/;
// Fallback values keyed by rule name: when purify_value ends up with a null
// value for one of these rules, the value is replaced by the entry here
// instead of staying null.
var INVALID_ATTRIBUTE_REPLACEMENTS = {
'url': '#',
'url|empty': ''
};
// Percent-encoded replacements for individual characters; purify_url maps
// every character of a URL through this table, leaving existing %XX
// sequences untouched.
var UNSAFE_URL_CHARS = {
" ": "%20",
"%": "%25",
">": "%3E",
"<": "%3C",
"[": "%5B",
"]": "%5D",
"{": "%7B",
"}": "%7D",
"|": "%7C",
"\\": "%5C",
"^": "%5E"
};
var VOID_ELEMENTS = [

@@ -35,5 +53,13 @@ 'area',

'<': '&lt;',
'"': '&quot;'
'&': '&amp;',
';': '&semi;'
};
// Quote characters and their HTML entity replacements. escape_data consults
// this table only when its include_quotes argument is truthy.
// The second key must be the single quote: written as '\"' it is the same
// string as '"', which silently overwrote the &quot; entry and left single
// quotes unescaped.
var HTML_ESCAPE_QUOTES = {
  '"': '&quot;',
  "'": '&apos;'
};
// Both patterns are global and wrap the whole match in one capturing group.
// escape_pattern depends on that: String.prototype.split keeps captured text
// in its result, so the matched sequences can be copied through unescaped.

// A percent-encoded byte: '%' followed by exactly two hexadecimal digits.
var URL_ENCODING_MATCH = /(\%[0-9a-fA-F]{2})/g;
// An entity-like sequence: '&', then one or more of '#', digits or word
// characters, then ';'.
var ENTITY_MATCH = /(\&[\#\d\w]+;)/g;
// predefined HTML colors

@@ -245,3 +271,4 @@ var HTML_COLORS = [

HTMLFilter.prototype.filter = function(html) {
var tags, i, filtered_text, tag_output;
var tags, i, tag_text, tag_output;
var is_script_processed, is_script_escaped;

@@ -256,16 +283,20 @@ this.row = 0;

is_script_processed = !this.remove_scripts;
is_script_escaped = this.spec['script'] && (typeof this.spec['script'] === 'string');
text_chars = '';
while (this.next()) {
if (this.curr_char === '<') {
filtered_text = this.filter_text(text_chars);
if (this.state === 'script-data' && !is_script_escaped) {
tag_text = '' + text_chars;
} else {
tag_text = this.filter_text(text_chars);
}
tag_output = this.filter_tag();
if (this.state === 'script-data-less-than-sign') {
if (!this.remove_scripts) {
if (this.spec['script'] && (typeof this.spec['script'] === 'string')) {
text_chars += this.escape_data('<');
} else {
text_chars += '<';
}
text_chars += this.escape_data(this.curr_char);
if (is_script_processed) {
text_chars += '<';
text_chars += this.curr_char;
}

@@ -275,3 +306,3 @@

} else {
this.filtered_html += filtered_text;
this.filtered_html += tag_text;
text_chars = '';

@@ -281,3 +312,3 @@ this.filtered_html += tag_output;

} else {
if (this.state === 'script-data' && this.remove_scripts) {
if (this.state === 'script-data' && !is_script_processed) {
// pass

@@ -287,3 +318,3 @@ } else if (this.state === 'skip-data') {

} else {
text_chars += this.escape_data(this.curr_char);
text_chars += this.curr_char;
}

@@ -323,7 +354,9 @@ }

HTMLFilter.prototype.escape_data = function(data_char) {
HTMLFilter.prototype.escape_data = function(data_char, include_quotes) {
if (HTML_ESCAPE_CHARS[data_char]) {
return HTML_ESCAPE_CHARS[data_char];
} else if (include_quotes && HTML_ESCAPE_QUOTES[data_char]) {
return HTML_ESCAPE_QUOTES[data_char];
} else {
return this.curr_char;
return data_char;
}

@@ -342,3 +375,3 @@ };

} else {
filtered_html += text_chars;
filtered_html += this.purify_text(text_chars, false);
}

@@ -575,3 +608,4 @@

HTMLFilter.prototype.filter_attribute = function(tag_name) {
var tag_spec, attribute_name, whitespace, is_allowed, value, tag_spec;
var tag_spec, attribute_name, is_allowed, value, tag_spec;
var is_in_spec, is_in_globals, is_wildcard, is_regex;

@@ -582,5 +616,9 @@ tag_spec = this.get_tag_spec(tag_name);

whitespace = this.extract_whitespace();
this.extract_whitespace();
is_allowed = (!!tag_spec[attribute_name]) || (!!this.global_attrs[attribute_name]);
is_in_spec = (!!tag_spec[attribute_name]);
is_in_globals = (!!this.global_attrs[attribute_name]);
is_wildcard = (!!tag_spec['*']);
is_regex = (!!tag_spec['^$']);
is_allowed = is_in_spec || is_in_globals || is_wildcard || is_regex;

@@ -590,4 +628,2 @@ value = null;

this.next();
this.extract_whitespace();
value = this.filter_value(tag_name, attribute_name);

@@ -601,15 +637,26 @@ if (!value) {

is_allowed = false;
} else if (tag_spec && tag_spec[attribute_name] === 'boolean') {
value = True;
}
if (is_allowed) {
return attribute_name + '=' + value;
} else {
return null;
if (value === true) {
return attribute_name;
} else if (value !== null) {
return attribute_name + '=' + value;
}
}
return null;
};
/**
 * Decide whether a character may be part of an unquoted attribute value.
 *
 * @param {string} char - the single character to classify.
 * @returns {boolean} false for whitespace and for any character matched by
 *   UNQUOTED_INVALID_REGEX; true otherwise.
 */
HTMLFilter.prototype.is_valid_unquoted_attr_char = function(char) {
  if (WHITESPACE_REGEX.test(char)) {
    return false;
  }
  if (UNQUOTED_INVALID_REGEX.test(char)) {
    return false;
  }
  return true;
};
HTMLFilter.prototype.filter_value = function(tag_name, attribute_name) {
var value, quote, rules, global_rules, new_value, tag_spec;
var value, num_spaces, quote, rules, global_rules, new_value, tag_spec, matches;
num_spaces = this.extract_whitespace().length;
value = '';

@@ -626,3 +673,2 @@ quote = '"';

};
break;
}

@@ -634,12 +680,28 @@

this.next();
} else if (num_spaces === 0 && this.is_valid_unquoted_attr_char(this.curr_char)) {
value += this.curr_char;
while (this.is_valid_unquoted_attr_char(this.next())) {
value += self.curr_char;
}
}
rules = null;
global_rules = null;
tag_spec = this.get_tag_spec(tag_name);
if (tag_spec !== null) {
rules = tag_spec[attribute_name];
if (tag_spec[attribute_name]) {
rules = tag_spec[attribute_name];
} else if (tag_spec['*']) {
rules = tag_spec['*'];
} else if (tag_spec['^$']) {
matches = tag_spec['^$'].filter(function(pair) {
return (pair[0] instanceof RegExp) && pair[0].test(attribute_name);
});
if (matches.length > 0) {
rules = matches[0][1];
}
}
}
global_rules = null;
if (this.global_attrs && this.global_attrs[attribute_name]) {

@@ -659,8 +721,10 @@ global_rules = this.global_attrs[attribute_name];

if (global_rules && (new_value == null || new_value == '')) {
if (global_rules && (new_value == null)) {
new_value = this.purify_attribute(attribute_name, value, global_rules);
}
if (!new_value || new_value === '') {
if (new_value === null) {
return null;
} else if (new_value === true) {
return true;
} else {

@@ -674,3 +738,3 @@ return quote + new_value + quote;

parts = this.purify_value(value, rules);
parts = this.purify_value(value, rules, attribute_name);
value = parts[0];

@@ -703,3 +767,7 @@ is_purified = parts[1];

value = allowed_values.join(' ');
if (allowed_values.length > 0) {
value = allowed_values.join(' ');
} else {
value = null;
}
} else if (attribute_name === "style" && Object.prototype.toString.call(rules) == '[object Object]') {

@@ -717,7 +785,7 @@ candidate_values = value.split(';');

} else {
value = '';
value = null;
}
} else if (rules.length > 0) {
if (rules.indexOf(value) < 0) {
value = '';
value = null;
}

@@ -730,3 +798,3 @@ }

HTMLFilter.prototype.purify_value = function(value, rules) {
HTMLFilter.prototype.purify_value = function(value, rules, attribute_name) {
var purified = true;

@@ -737,6 +805,20 @@

value = null;
} else if (rules === "*") {
value = value;
} else if (rules instanceof RegExp) {
value = this.purify_regex(value, rules);
} else if (rules === "boolean") {
if (value === '' || (attribute_name !== null && value === attribute_name)) {
value = true;
} else {
value = null;
}
} else if (rules === "url") {
value = value.trim();
value = this.purify_url(value);
} else if (rules === "url|empty") {
value = value.trim();
if (value !== '') {
value = this.purify_url(value);
}
} else if (rules === "color") {

@@ -749,7 +831,29 @@ value = this.purify_color(value);

} else if (rules === "alpha") {
value = this.purify_regex(value, /^[a-zA-Z]+$/);
if (value === '') {
value = null;
} else {
value = this.purify_regex(value, /^[a-zA-Z]+$/);
}
} else if (rules === "alphanumeric") {
value = this.purify_regex(value, /^[a-zA-Z0-9]+$/);
if (value === '') {
value = null;
} else {
value = this.purify_regex(value, /^[a-zA-Z0-9]+$/);
}
} else if (rules === "alpha|empty") {
if (value !== '') {
value = this.purify_regex(value, /^[a-zA-Z]+$/);
}
} else if (rules === "alphanumeric|empty") {
if (value !== '') {
value = this.purify_regex(value, /^[a-zA-Z0-9]+$/);
}
} else if (rules === "text") {
value = this.purify_text(value, True);
} else if (typeof rules === 'string' && rules.charAt(0) === '[' && rules.charAt(rules.length-1) === ']') {
value = this.purify_set(value, rules.slice(1,-1));
if (value === '') {
value = null;
} else {
value = this.purify_set(value, rules.slice(1,-1));
}
} else if (typeof rules === 'function') {

@@ -761,5 +865,34 @@ value = rules(value);

if (value === null && INVALID_ATTRIBUTE_REPLACEMENTS[rules]) {
value = INVALID_ATTRIBUTE_REPLACEMENTS[rules];
}
return [value, purified];
};
/**
 * Run `escaper` over every character of `value`, except for substrings that
 * `pattern` matches, which are copied to the output unchanged. This lets
 * already-escaped sequences (entities, %XX bytes) survive a second pass.
 *
 * @param {RegExp} pattern - pattern whose matches must be preserved. It is
 *   expected to wrap the match in a capturing group so that split() keeps the
 *   matched text in its result (as URL_ENCODING_MATCH and ENTITY_MATCH do).
 * @param {string} value - the text to escape.
 * @param {function} escaper - called through Array.prototype.map for each
 *   character of the non-matching text; returns the replacement string.
 * @returns {string} the escaped text.
 */
HTMLFilter.prototype.escape_pattern = function(pattern, value, escaper) {
  var preserved, matches, chunks, chunk, output, i;

  // Null-prototype map: with a plain {} a literal chunk such as "constructor"
  // or "toString" would hit an inherited Object.prototype member, be treated
  // as a preserved sequence and skip the escaper entirely.
  preserved = Object.create(null);
  matches = value.match(pattern);
  if (matches) {
    matches.forEach(function(sequence) {
      preserved[sequence] = true;
    });
  }

  output = [];
  // With a capturing group in the pattern, split() interleaves the matched
  // sequences with the text between them.
  chunks = value.split(pattern);
  for (i = 0; i < chunks.length; i++) {
    chunk = chunks[i];
    if (chunk === undefined) {
      // A capture group that did not participate in a match; nothing to emit.
      continue;
    }
    if (preserved[chunk] !== true) {
      chunk = chunk.split('').map(escaper).join('');
    }
    output.push(chunk);
  }
  return output.join('');
};
HTMLFilter.prototype.purify_style = function(style, rules) {

@@ -836,13 +969,12 @@ var parts, name, value, style_rules, is_purified;

HTMLFilter.prototype.purify_url = function(url) {
var parts, scheme, allowed_scheme, escape_char;
var parts, scheme, allowed_scheme, escaper;
// strip out all encoded tag characters
for (escape_char in HTML_ESCAPE_CHARS) {
if (HTML_ESCAPE_CHARS.hasOwnProperty(escape_char)) {
url = url.replace(HTML_ESCAPE_CHARS[escape_char], '');
}
}
escaper = function(char) {
return UNSAFE_URL_CHARS[char] || char;
};
url = this.escape_pattern(URL_ENCODING_MATCH, url, escaper);
if (this.allowed_schemes.indexOf('//') < 0 && url.charAt(0) === '/' && url.charAt(1) === '/') {
return '#'; // disallow protocol-relative URLs (possible XSS vector)
return null; // disallow protocol-relative URLs (possible XSS vector)
}

@@ -856,3 +988,3 @@

scheme = '';
url = '#';
url = null;
} else {

@@ -871,3 +1003,3 @@ url = parts.slice(1).join(':');

} else {
return '#';
return null;
}

@@ -880,3 +1012,3 @@ };

if (isNaN(intVal)) {
return '';
return null;
} else {

@@ -900,7 +1032,15 @@ return '' + intVal;

} else {
return '';
return null;
}
};
/**
 * Escape text character by character via escape_data, leaving sequences
 * matched by ENTITY_MATCH as they are.
 *
 * @param {string} value - the text to escape.
 * @param {boolean} include_quotes - forwarded to escape_data as its second
 *   argument for every character.
 * @returns {string} whatever escape_pattern produces for the text.
 */
HTMLFilter.prototype.purify_text = function(value, include_quotes) {
  var filter = this;

  // escape_pattern invokes the callback without a receiver, so the filter
  // instance is captured in the closure.
  function escape_one(ch) {
    return filter.escape_data(ch, include_quotes);
  }

  return filter.escape_pattern(ENTITY_MATCH, value, escape_one);
};
var filter_html = function(html, spec, allowed_schemes, text_filter, remove) {

@@ -907,0 +1047,0 @@ var html_filter = new HTMLFilter(spec, allowed_schemes, text_filter, remove);

@@ -5,3 +5,3 @@ {

"description": "FilterHTML: A whitelisting HTML filter for Python and JavaScript",
"version": "0.5.0",
"version": "0.6.0",
"repository": {

@@ -8,0 +8,0 @@ "url": "https://github.com/dcollien/FilterHTML"

FilterHTML
---------
v0.3 - White-list tags, attributes, classes, styles. With tag-specific text filtering and tag contents removal.
v0.6 - White-list tags, attributes, classes, styles. With tag-specific text filtering and tag contents removal.
[Demo (JS)](https://dcollien.github.io/FilterHTML/index.html)
A dictionary-defined white-listing HTML filter. Useful for filtering HTML to leave behind a supported or safe sub-set.
Python and JavaScript versions
- Simple and Powerful
- No dependencies
- Python and JavaScript versions, each a **single file**:
- [FilterHTML.py](./FilterHTML.py)
- [FilterHTML.js](./lib/FilterHTML.js)

@@ -17,67 +25,72 @@ Python installation:

Browser: copy `./lib/FilterHTML.js` into your project
Browser: use `./lib/FilterHTML.js` in a &lt;script&gt; tag
Example:
Run Python Tests: `nosetests --with-coverage`
import FilterHTML
Run JavaScript Tests: `nodeunit tests/run_tests.js`
# only allow:
# <a> tags with valid href URLs
# <img> tags with valid src URLs and measurements
# <span> tags with valid color styles
whitelist = {
'a': {
'href': 'url',
'target': [
'_blank',
'_self'
],
'class': [
'button'
]
},
Filtering Example, in Python:
'img': {
'src': 'url',
'width': 'measurement',
'height': 'measurement'
},
```python
import FilterHTML
'span': {
'style': {
'color': 'color',
'background-color': 'color'
}
}
# only allow:
# <a> tags with valid href URLs
# <img> tags with valid src URLs and measurements
# <span> tags with valid color styles
whitelist = {
'a': {
'href': 'url',
'target': [
'_blank',
'_self'
],
'class': [
'button'
]
},
'img': {
'src': 'url',
'width': 'measurement',
'height': 'measurement'
},
'span': {
'style': {
'color': 'color',
'background-color': 'color'
}
}
}
# perform replacements on text (between tags)
def replace_text(text, tags):
return text.replace('sad', '<strong>happy</strong>')
# perform replacements on text (between tags)
def replace_text(text, tags):
return text.replace('sad', '<strong>happy</strong>')
# filter the unfiltered_html, using the above whitelist, using specified allowed url schemes, and a text replacement function
filtered_html = FilterHTML.filter_html(unfiltered_html, whitelist, ('http', 'https', 'mailto', 'ftp'), replace_text)
# filter the unfiltered_html, using the above whitelist, using specified allowed url schemes, and a text replacement function
filtered_html = FilterHTML.filter_html(unfiltered_html, whitelist, ('http', 'https', 'mailto', 'ftp'), replace_text)
# simpler usage: filter using the default (same as above) url schemes, and no replacement function:
filtered_html = FilterHTML.filter_html(unfiltered_html, whitelist)
# simpler usage: filter using the default (same as above) url schemes, and no replacement function:
filtered_html = FilterHTML.filter_html(unfiltered_html, whitelist)
```
What this does:
- Lets you easily define a subset of HTML and it filters out everything else
- Ensures there's no unicode encoding in attributes (e.g. &amp;#58; or \3A for CSS)
- Lets you use regular expressions, lists, function delegates or built-ins as rules/filters
- Lets you filter or match attributes on tags
- Lets you filter or match individual CSS styles in style attributes
- Lets you define allowed classes as a list
- Lets you specify a filtering function delegate for modifying text between tags (e.g. url auto-linking, emoticon parsing, #tagging, @mentioning, etc.), the output is also HTML filtered
- Lets you convert one tag into another (with specified attributes)
- Lets you completely remove contents of specified tags from HTML
- Helps to reduce XSS/code injection vulnerabilities
- Runs server-side in Python (e.g. Flask, Bottle, Django) or Javascript (e.g. Node)
- The Javascript port can also be used for client-side filtering
- Lets you **easily define a subset of HTML** and it filters out everything else
- Ensures there's **no unicode** encoding in attributes (e.g. &amp;#58; or \3A for CSS)
- Lets you use **regular expressions, lists, function delegates or built-ins** as rules/filters
- Lets you filter or match **attributes** on tags
- Lets you filter or match individual **CSS styles** in style attributes
- Lets you define **allowed classes** as a list
- Lets you specify a function delegate to define the specification for a tag, **depending on which tags it is inside**
- Lets you specify a function delegate for modifying or **filtering text nodes**, i.e. text between tags (e.g. url auto-linking, emoticon parsing, #tagging, @mentioning, etc.), the output is also HTML filtered
- Lets you **convert one tag into another** (with specified attributes)
- Lets you **completely remove contents of specified tags** from HTML
- Runs server-side in **Python** (e.g. Flask, Bottle, Django) or **JavaScript** (e.g. Node.js, io.js, **Browser**)
- Really helps to reduce XSS/code injection vulnerabilities
What this doesn't do:
- Clean up tag soup (use something else for that, like BeautifulSoup): this assumes the HTML is valid and complete. It will throw exceptions if it detects unclosed opening tags, or extra closing tags.
- Claim to be XSS-safe out of the box: be careful with your white-list specification and test it thoroughly (here's a handy resource: https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet)
- Claim to be XSS-safe out of the box: be careful with your white-list specification and test it thoroughly (here's a handy resource: https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet).

@@ -90,24 +103,25 @@ ### Class and Style filtering

e.g.
<pre>
{
'div': {
# style filtering:
'style': {
'width': 'measurement',
'height': 'measurement',
'background-color': 'color',
'text-align': ['left', 'right', 'center', 'justify', 'inherit'],
'border': border_filter_function, # implement your own function,
'border-radius': re.compile(r'^\d+px$')
}
},
'span': {
# class filtering (a list of allowed matches, strings, regex or functions):
'class': [
'icon',
re.compile(r'^icon\-[a-zA-Z0-9\-]+$')
]
}
```python
{
'div': {
# style filtering:
'style': {
'width': 'measurement',
'height': 'measurement',
'background-color': 'color',
'text-align': ['left', 'right', 'center', 'justify', 'inherit'],
'border': border_filter_function, # implement your own function,
'border-radius': re.compile(r'^\d+px$')
}
</pre>
},
'span': {
# class filtering (a list of allowed matches, strings, regex or functions):
'class': [
'icon',
re.compile(r'^icon\-[a-zA-Z0-9\-]+$')
]
}
}
```
### Text filtering/modification

@@ -118,32 +132,35 @@ - Text (between tags) can be filtered or modified with a delegate function. This function is passed each string of text between tags, as well as a list of the tags this string is inside (and their attributes). The string is replaced with the output of this function, and it is also filtered according to the supplied white-list specification.

N.B. the output HTML of the urlize function is also HTML filtered using the same spec.
```python
URLIZE_RE = '(%s)' % '|'.join([
r'<(?:f|ht)tps?://[^>]*>',
r'\b(?:f|ht)tps?://[^)<>\s]+[^.,)<>\s]',
])
URLIZE_RE = '(%s)' % '|'.join([
r'<(?:f|ht)tps?://[^>]*>',
r'\b(?:f|ht)tps?://[^)<>\s]+[^.,)<>\s]',
])
# second argument is a list of tags which this text is inside,
# each element a tuple: (tag_name, attributes)
def urlize(text, stack):
is_inside_a_tag = False
for tag in stack:
tag_name, attributes = tag
if tag_name == 'a':
is_inside_a_tag = True
break
# second argument is a list of tags which this text is inside,
# each element a tuple: (tag_name, attributes)
def urlize(text, stack):
is_inside_a_tag = False
for tag in stack:
tag_name, attributes = tag
if tag_name == 'a':
is_inside_a_tag = True
break
if is_inside_a_tag:
return text
else:
return re.sub(URLIZE_RE, r'<a href="\1">\1</a>', text)
if is_inside_a_tag:
return text
else:
return re.sub(URLIZE_RE, r'<a href="\1">\1</a>', text)
result = FilterHTML.filter_html(html, spec, text_filter=urlize)
result = FilterHTML.filter_html(html, spec, text_filter=urlize)
# script and style tag contents can be removed:
result = FilterHTML.filter_html(html, spec, text_filter=urlize, remove=['script', 'style'])
```
# script and style tag contents can be removed:
result = FilterHTML.filter_html(html, spec, text_filter=urlize, remove=['script', 'style'])
### Built-In Filters:
- "url", for parsing URLs and matching against allowed schemes (http://, ftp://, mailto:, etc.)
### Built-In Filters and Whitelist Types:
At the attribute, class, or style level of the whitelist, the following are valid filters:
- "url", for parsing URLs and matching against allowed schemes (http://, ftp://, mailto:, etc.). This also escapes unsafe URL characters (if not already escaped). Invalid URL attributes will be replaced with "#". Leading and trailing spaces will be stripped.
- "url|empty", same as above, but also allows empty-string attributes. Invalid URL attributes will be replaced with "" (empty string).
- "boolean", for attributes which have no value (are either present, or not, such as the "checked" attribute). N.B. attributes such as: checked="checked" or checked="" will keep the attribute present, all other values will incur the removal of the attribute.
- "color", for matching an HTML color value (either a string, like "red", "blue", etc. or "#fff", "#f0f0f0", or valid "rgb", "rgba", "hsl", or "hsla" values)

@@ -154,88 +171,203 @@ - "measurement", for matching style measurements, e.g. "42px", "10%", "6em", etc.

- "alphanumeric", for matching alphabetical and digit characters
- "[allowedchars]", for allowing characters specified between starting and ending "[ ]"
- "alpha|empty", for matching alphabetical characters, or empty string
- "alphanumeric|empty", for matching alphabetical and digit characters, or empty string
- "text", for matching against HTML-entity escaped text (e.g. alt attributes). Greater-than, less-than, ampersand, semicolon, and single/double-quote characters will be replaced with their HTML escaped entity equivalents. Existing escape sequences will remain unmodified.
- `"[allowedchars]"`, for allowing characters specified between starting and ending `[ ]`
- regular expressions (which must match the value, or the value will be removed)
- a function, which takes the value as an argument, and returns a string replacement (or a None/null value to reject and remove the attribute)
- "*", which matches anything, and will allow any value to remain unchanged
Matching can also be done against regular expressions or a list of allowed values. Values can also be passed through custom filtering functions.
Additionally for attributes:
- a list of allowed (string) values can be provided to all attributes
- "class" attributes can be treated like a standard attribute, or can be given a list of allowed (string) values which match against any of the provided class names. This may also include a function or regular expression to decide which class names are kept
- "style" attributes can be given an object/dictionary with the keys as style names, and any of the above filters as the values.
At the tag-level:
- An object/dictionary defines the allowed attributes (keys are attribute names, values are the above filters)
- A false boolean value to remove this tag and its contents
- A function, which takes two arguments: the tag name, and the stack of tags above the current tag in the document. This function returns either of the above (object/dictionary, or boolean)
#### Special Attribute Values
The following can be used instead of attribute names:
- "*" to allow these rules on all attributes which have not otherwise been specified
- A regular expression object (Python only), to use this rule-set for matching attributes which have not otherwise been specified
- "^$" to define a list of `[RegEx, rule]` pairs, to be used instead of the above, when a regular expression cannot be given as a key (i.e. JavaScript), or the regular expressions need to be evaluated in a specific order
e.g.
```javascript
{
"tag_name": {
"attribute_name": attribute_rules,
"^$": [
[/^regex$/, matching_attribute_rules]
],
"*": remaining_attribute_rules
}
}
```
### White-list
Define an allowed HTML subset as a JSON object (for the JS version) or a Python dictionary.
Define an allowed HTML subset as a JavaScript Object/Python Dictionary.
For regular expression filters, you can use /pattern/modifiers syntax in JavaScript (or new RegExp), or in Python: re.compile()
Python example whitelist:
White-list format for allowing a tag can use many combinations of different filtering options, e.g.
```python
{
"tag_name_a": {
# attribute filtering by list of allowed values, built-in, regex, function delegate,
# or a list of these types
"attribute_a": ["allowed-value", "another-allowed-value"],
"attribute_b": "url",
"attribute_c": re.compile(r'^regex$'),
"attribute_d": attribute_filtering_function,
"attribute_e": [
"allowed-value",
re.compile(r'^regex$'),
attribute_filtering_function
],
spec = {
# class filtering by a list of allowed values, or class-name matching regex
"class": [
"allowed-class-name",
"another-allowed-class-name",
re.compile(r'^class-name-regex$')
],
"div": {
# list allowed attribute values, as a list
"class": [
"container",
"content"
]
},
# style filtering by object of allowed styles
# filtered by: built-in, list of allowed values, regex, function delegate
"style": {
"style-name-a": "color",
"style-name-b": [
"value-1", "value-2"
],
"style-name-c": re.compile(r'^regex$'),
"style-name-d": style_filtering_function
}
},
"p": {
"class": [
"centered"
],
# style parsing
"style": {
"color": re.compile(r'^#[0-9A-Fa-f]{6}$')
}
},
# Allow this tag, but no attributes
"tag_name_b": {},
"a": {
# parse urls to ensure there's no javascript, by using the "url" string.
# disallow &# unicode encoding
# by default allowed schemes are 'http', 'https', 'mailto', and 'ftp' (as well as local URIs)
# this can be changed by passing in allowed_schemes=('http', 'myscheme')
"href": "url",
"target": [
"_blank"
]
},
# Use a function delegate to specify this tag's white-list
"tag_name_c": tag_filtering_function,
"img": {
"src": "url",
# make sure these fields are integers, by using the "int" string
"width": "int",
"height": "int"
},
# Remove this tag, and all its contents
"tag_name_d": false,
"input": {
# only allow alphabetical characters
"type": "alpha",
# allow any of these characters (within the [])
"name": "[abcdefghijklmnopqrstuvwxyz-]",
# allow alphabetical and digit characters
"value": "alphanumeric"
},
# Unlisted tags will be removed, but their contents left intact
}
```
# filter out all attributes for these tags
"hr": {},
"br": {},
"strong": {},
White-list tag filtering functions are defined as:
```python
def tag_filtering_function(tag_name, tag_stack):
# tag_name: the name of the tag being filtered
# tag_stack: a list of (tag_name, attributes) for each tag
# above the current tag (in its parsing context)
# where the last in the list is the direct parent tag
"i": {
# use a regex match
# in javascript you can use /this style/ regex.
"class": re.compile(r'^icon-[a-z0-9_]+$')
},
# Delete this tag and all its contents
return False
# global attributes (allowed on all elements):
# (N.B. only applies to tags already supplied as keys)
# element's specific attributes take precedence, but if they are all filtered out
# these global rules are applied to the original attribute value
"*": {
"class": ["text-left", "text-right", "text-centered"]
},
# Delete this tag, but not its contents
return None
# aliases (convert one tag to another):
# Return a custom specification for how to filter this tag
return {
'attribute_name': ['attribute_value']
}
```
# convert <b> tags to <strong> tags
"b": "strong",
Attribute/Style filtering functions are defined as:
# convert <center> tags to <p class="text-centered"> tags
"center": "p class=\"text-centered\""
```python
def attr_filter(attribute_value):
return "new-attribute-value"
# or return None, or return '' to remove this attribute
def style_filter(style_value):
return "new-style-value"
# or return None, or return '' to remove this style
```
Python example whitelist:
```python
spec = {
"div": {
# list allowed attribute values, as a list
"class": [
"container",
"content"
]
},
"p": {
"class": [
"centered"
],
# style parsing
"style": {
"color": re.compile(r'^#[0-9A-Fa-f]{6}$')
}
},
"a": {
# parse urls to ensure there's no javascript, by using the "url" string.
# disallow &# unicode encoding
# by default allowed schemes are 'http', 'https', 'mailto', and 'ftp' (as well as local URIs)
# this can be changed by passing in allowed_schemes=('http', 'myscheme')
"href": "url",
"target": [
"_blank"
]
},
"img": {
"src": "url",
# make sure these fields are integers, by using the "int" string
"width": "int",
"height": "int"
},
"input": {
# only allow alphabetical characters
"type": "alpha",
# allow any of these characters (within the [])
"name": "[abcdefghijklmnopqrstuvwxyz-]",
# allow alphabetical and digit characters
"value": "alphanumeric"
},
# filter out all attributes for these tags
"hr": {},
"br": {},
"strong": {},
"i": {
# use a regex match
# in javascript you can use /this style/ regex.
"class": re.compile(r'^icon-[a-z0-9_]+$')
},
# global attributes (allowed on all elements):
# (N.B. only applies to tags already supplied as keys)
# element's specific attributes take precedence, but if they are all filtered out
# these global rules are applied to the original attribute value
"*": {
"class": ["text-left", "text-right", "text-centered"]
},
# aliases (convert one tag to another):
# convert <b> tags to <strong> tags
"b": "strong",
# convert <center> tags to <p class="text-centered"> tags
"center": "p class=\"text-centered\""
}
```
SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc