Skip to content

Commit 69a7473

Browse files
committed
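Cleaner: cover more cases where scripts could sneak through in specially crafted style content: nested scheme names that collapse into a real scheme after filtering (e.g. "javasjavascript:cript:"), SVG data URLs inside url(), and script-capable schemes other than "javascript:".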
1 parent 54d2985 commit 69a7473

File tree

2 files changed

+73
-12
lines changed

2 files changed

+73
-12
lines changed

src/lxml/html/clean.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -76,22 +76,20 @@
7676
# All kinds of schemes besides just javascript: that can cause
7777
# execution:
7878
_find_image_dataurls = re.compile(
79-
r'^data:image/(.+);base64,', re.I).findall
80-
_is_possibly_malicious_scheme = re.compile(
79+
r'data:image/(.+);base64,', re.I).findall
80+
_possibly_malicious_schemes = re.compile(
8181
r'(javascript|jscript|livescript|vbscript|data|about|mocha):',
8282
re.I).findall
8383
# SVG images can contain script content
84-
_is_unsafe_image_type = re.compile(r"(xml|svg)", re.I).findall
84+
_is_unsafe_image_type = re.compile(r"(xml|svg)", re.I).search
8585

86-
def _is_javascript_scheme(s):
87-
is_image_url = False
86+
def _has_javascript_scheme(s):
87+
safe_image_urls = 0
8888
for image_type in _find_image_dataurls(s):
89-
is_image_url = True
9089
if _is_unsafe_image_type(image_type):
9190
return True
92-
if is_image_url:
93-
return False
94-
return bool(_is_possibly_malicious_scheme(s))
91+
safe_image_urls += 1
92+
return len(_possibly_malicious_schemes(s)) > safe_image_urls
9593

9694
_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub
9795

@@ -522,7 +520,7 @@ def _kill_elements(self, doc, condition, iterate=None):
522520
def _remove_javascript_link(self, link):
523521
# links like "j a v a s c r i p t:" might be interpreted in IE
524522
new = _substitute_whitespace('', unquote_plus(link))
525-
if _is_javascript_scheme(new):
523+
if _has_javascript_scheme(new):
526524
# FIXME: should this be None to delete?
527525
return ''
528526
return link
@@ -544,7 +542,7 @@ def _has_sneaky_javascript(self, style):
544542
style = style.replace('\\', '')
545543
style = _substitute_whitespace('', style)
546544
style = style.lower()
547-
if 'javascript:' in style:
545+
if _has_javascript_scheme(style):
548546
return True
549547
if 'expression(' in style:
550548
return True

src/lxml/html/tests/test_clean.py

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,14 +126,19 @@ def test_sneaky_js_in_math_style(self):
126126
lxml.html.tostring(clean_html(s)))
127127

128128
def test_sneaky_import_in_style(self):
129-
# Prevent "@@importimport" -> "@import" replacement.
129+
# Prevent "@@importimport" -> "@import" replacement etc.
130130
style_codes = [
131131
"@@importimport(extstyle.css)",
132132
"@ @ import import(extstyle.css)",
133133
"@ @ importimport(extstyle.css)",
134134
"@@ import import(extstyle.css)",
135135
"@ @import import(extstyle.css)",
136136
"@@importimport()",
137+
"@@importimport() ()",
138+
"@/* ... */import()",
139+
"@im/* ... */port()",
140+
"@ @import/* ... */import()",
141+
"@ /* ... */ import()",
137142
]
138143
for style_code in style_codes:
139144
html = '<style>%s</style>' % style_code
@@ -145,6 +150,41 @@ def test_sneaky_import_in_style(self):
145150
cleaned,
146151
"%s -> %s" % (style_code, cleaned))
147152

153+
def test_sneaky_schemes_in_style(self):
154+
style_codes = [
155+
"javasjavascript:cript:",
156+
"javascriptjavascript::",
157+
"javascriptjavascript:: :",
158+
"vbjavascript:cript:",
159+
]
160+
for style_code in style_codes:
161+
html = '<style>%s</style>' % style_code
162+
s = lxml.html.fragment_fromstring(html)
163+
164+
cleaned = lxml.html.tostring(clean_html(s))
165+
self.assertEqual(
166+
b'<style>/* deleted */</style>',
167+
cleaned,
168+
"%s -> %s" % (style_code, cleaned))
169+
170+
def test_sneaky_urls_in_style(self):
171+
style_codes = [
172+
"url(data:image/svg+xml;base64,...)",
173+
"url(javasjavascript:cript:)",
174+
"url(javasjavascript:cript: ::)",
175+
"url(vbjavascript:cript:)",
176+
"url(vbjavascript:cript: :)",
177+
]
178+
for style_code in style_codes:
179+
html = '<style>%s</style>' % style_code
180+
s = lxml.html.fragment_fromstring(html)
181+
182+
cleaned = lxml.html.tostring(clean_html(s))
183+
self.assertEqual(
184+
b'<style>url()</style>',
185+
cleaned,
186+
"%s -> %s" % (style_code, cleaned))
187+
148188
def test_svg_data_links(self):
149189
# Remove SVG images with potentially insecure content.
150190
svg = b'<svg onload="alert(123)" />'
@@ -188,6 +228,29 @@ def test_image_data_links(self):
188228
cleaned,
189229
"%s -> %s" % (url, cleaned))
190230

231+
def test_image_data_links_in_style(self):
232+
data = b'123'
233+
data_b64 = base64.b64encode(data).decode('ASCII')
234+
urls = [
235+
"data:image/jpeg;base64," + data_b64,
236+
"data:image/apng;base64," + data_b64,
237+
"data:image/png;base64," + data_b64,
238+
"data:image/gif;base64," + data_b64,
239+
"data:image/webp;base64," + data_b64,
240+
"data:image/bmp;base64," + data_b64,
241+
"data:image/tiff;base64," + data_b64,
242+
"data:image/x-icon;base64," + data_b64,
243+
]
244+
for url in urls:
245+
html = '<style> url(%s) </style>' % url
246+
s = lxml.html.fragment_fromstring(html)
247+
248+
cleaned = lxml.html.tostring(clean_html(s))
249+
self.assertEqual(
250+
html.encode("UTF-8"),
251+
cleaned,
252+
"%s -> %s" % (url, cleaned))
253+
191254
def test_formaction_attribute_in_button_input(self):
192255
# The formaction attribute overrides the form's action and should be
193256
# treated as a malicious link attribute

0 commit comments

Comments
 (0)