Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
218 changes: 129 additions & 89 deletions Lib/html/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,48 @@
# Character reference inside an attribute value: '&' followed by a decimal,
# hexadecimal or named reference, optionally terminated by ';' or '='.
attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')

starttagopen = re.compile('<[a-zA-Z]')
endtagopen = re.compile('</[a-zA-Z]')
piclose = re.compile('>')
# Comment terminators: '-->' and '--!>'.
commentclose = re.compile(r'--!?>')
# Abrupt comment terminators: '>' and '->'.
commentabruptclose = re.compile(r'-?>')
# Note:
#   1) if you change tagfind/attrfind remember to update locatetagend too;
#   2) if you change tagfind/attrfind and/or locatetagend the parser will
#      explode, so don't do it.
# see the HTML5 specs section "13.2.5.6 Tag open state",
# "13.2.5.8 Tag name state" and "13.2.5.33 Attribute name state".
# https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
# https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
#
# Whitespace is spelled out as [\t\n\r\f ] instead of \s in every pattern
# below.
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />]*)(?:[\t\n\r\f ]|/(?!>))*')
# Groups: 1 = attribute name, 2 = '=' plus value (optional), 3 = value.
attrfind_tolerant = re.compile(r"""
  (
    (?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]*  # attribute name
  )
  ([\t\n\r\f ]*=[\t\n\r\f ]*        # value indicator
    ('[^']*'                        # LITA-enclosed value
    |"[^"]*"                        # LIT-enclosed value
    |(?!['"])[^>\t\n\r\f ]*         # bare value
    )
  )?
  (?:[\t\n\r\f ]|/(?!>))*           # possibly followed by a space
""", re.VERBOSE)
# Matches from the first letter of a tag name (i.e. just after '<' or '</')
# through all attributes and, when present, the closing '>'.  The final '>'
# is optional, so callers must check the last matched character to tell a
# complete tag from one that is still unterminated.
locatetagend = re.compile(r"""
  [a-zA-Z][^\t\n\r\f />]*           # tag name
  [\t\n\r\f /]*                     # optional whitespace before attribute name
  (?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]*  # attribute name
    (?:[\t\n\r\f ]*=[\t\n\r\f ]*    # value indicator
      (?:'[^']*'                    # LITA-enclosed value
        |"[^"]*"                    # LIT-enclosed value
        |(?!['"])[^>\t\n\r\f ]*     # bare value
      )
    )?
    [\t\n\r\f /]*                   # possibly followed by a space
  )*
  >?
""", re.VERBOSE)
# The following variables are not used, but are temporarily left for
# backward compatibility.
locatestarttagend_tolerant = re.compile(r"""
<[a-zA-Z][^\t\n\r\f />\x00]* # tag name
(?:[\s/]* # optional whitespace before attribute name
Expand All @@ -55,8 +85,6 @@
\s* # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')

# Character reference processing logic specific to attribute values
Expand Down Expand Up @@ -100,6 +128,7 @@ class HTMLParser(_markupbase.ParserBase):
"""

CDATA_CONTENT_ELEMENTS = ("script", "style")
RCDATA_CONTENT_ELEMENTS = ("textarea", "title")

def __init__(self, *, convert_charrefs=True):
"""Initialize and reset this instance.
Expand All @@ -117,6 +146,7 @@ def reset(self):
self.lasttag = '???'
self.interesting = interesting_normal
self.cdata_elem = None
self._escapable = True
super().reset()

def feed(self, data):
Expand All @@ -138,13 +168,20 @@ def get_starttag_text(self):
"""Return full source of start tag: '<...>'."""
return self.__starttag_text

def set_cdata_mode(self, elem, *, escapable=False):
    """Switch the parser into raw-text mode for the element *elem*.

    *elem* is lower-cased and stored in ``self.cdata_elem``.  The
    ``self.interesting`` pattern is replaced by one that matches the
    element's closing tag ('</' + name, followed by whitespace, '/' or
    '>'), case-insensitively.

    *escapable* is stored in ``self._escapable``.  When it is true and
    ``self.convert_charrefs`` is false, the pattern also matches '&'.
    """
    self.cdata_elem = elem.lower()
    self._escapable = escapable
    # The lookahead ensures the tag name is complete, so '</scripts>'
    # does not end a 'script' element.
    pattern = r'</%s(?=[\t\n\r\f />])' % self.cdata_elem
    if escapable and not self.convert_charrefs:
        pattern = '&|' + pattern
    self.interesting = re.compile(pattern, re.IGNORECASE|re.ASCII)

def clear_cdata_mode(self):
    """Leave raw-text mode.

    Clears ``cdata_elem``, sets ``_escapable`` back to true and
    reinstates ``interesting_normal`` as the scanning pattern.
    """
    self.cdata_elem = None
    self._escapable = True
    self.interesting = interesting_normal

# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
Expand All @@ -165,7 +202,7 @@ def goahead(self, end):
# & near the end and see if it's followed by a space or ;.
amppos = rawdata.rfind('&', max(i, n-34))
if (amppos >= 0 and
not re.compile(r'[\s;]').search(rawdata, amppos)):
not re.compile(r'[\t\n\r\f ;]').search(rawdata, amppos)):
break # wait till we get all the text
j = n
else:
Expand All @@ -177,7 +214,7 @@ def goahead(self, end):
break
j = n
if i < j:
if self.convert_charrefs and not self.cdata_elem:
if self.convert_charrefs and self._escapable:
self.handle_data(unescape(rawdata[i:j]))
else:
self.handle_data(rawdata[i:j])
Expand All @@ -195,25 +232,43 @@ def goahead(self, end):
k = self.parse_pi(i)
elif startswith("<!", i):
k = self.parse_html_declaration(i)
elif (i + 1) < n:
elif (i + 1) < n or end:
self.handle_data("<")
k = i + 1
else:
break
if k < 0:
if not end:
break
k = rawdata.find('>', i + 1)
if k < 0:
k = rawdata.find('<', i + 1)
if k < 0:
k = i + 1
else:
k += 1
if self.convert_charrefs and not self.cdata_elem:
self.handle_data(unescape(rawdata[i:k]))
if starttagopen.match(rawdata, i): # < + letter
pass
elif startswith("</", i):
if i + 2 == n:
self.handle_data("</")
elif endtagopen.match(rawdata, i): # </ + letter
pass
else:
# bogus comment
self.handle_comment(rawdata[i+2:])
elif startswith("<!--", i):
j = n
for suffix in ("--!", "--", "-"):
if rawdata.endswith(suffix, i+4):
j -= len(suffix)
break
self.handle_comment(rawdata[i+4:j])
elif startswith("<![CDATA[", i):
self.unknown_decl(rawdata[i+3:])
elif rawdata[i:i+9].lower() == '<!doctype':
self.handle_decl(rawdata[i+2:])
elif startswith("<!", i):
# bogus comment
self.handle_comment(rawdata[i+2:])
elif startswith("<?", i):
self.handle_pi(rawdata[i+2:])
else:
self.handle_data(rawdata[i:k])
raise AssertionError("we should not get here!")
k = n
i = self.updatepos(i, k)
elif startswith("&#", i):
match = charref.match(rawdata, i)
Expand Down Expand Up @@ -261,7 +316,7 @@ def goahead(self, end):
assert 0, "interesting.search() lied"
# end while
if end and i < n:
if self.convert_charrefs and not self.cdata_elem:
if self.convert_charrefs and self._escapable:
self.handle_data(unescape(rawdata[i:n]))
else:
self.handle_data(rawdata[i:n])
Expand Down Expand Up @@ -290,8 +345,23 @@ def parse_html_declaration(self, i):
else:
return self.parse_bogus_comment(i)

# Internal -- parse comment, return length or -1 if not terminated
# see https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
def parse_comment(self, i, report=True):
    """Parse the comment that starts with '<!--' at index *i*.

    Returns the index just past the comment terminator, or -1 when no
    terminator is found in the buffered data.  When *report* is true,
    the text between '<!--' and the terminator is passed to
    ``handle_comment()``.
    """
    text = self.rawdata
    assert text.startswith('<!--', i), 'unexpected call to parse_comment()'
    body_start = i + 4
    # A regular terminator anywhere in the buffer is tried first; failing
    # that, an abrupt terminator directly after '<!--' is accepted.
    closer = (commentclose.search(text, body_start)
              or commentabruptclose.match(text, body_start))
    if closer is None:
        return -1
    if report:
        self.handle_comment(text[body_start:closer.start()])
    return closer.end()

# Internal -- parse bogus comment, return length or -1 if not terminated
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
# see https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
def parse_bogus_comment(self, i, report=1):
rawdata = self.rawdata
assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
Expand All @@ -317,6 +387,8 @@ def parse_pi(self, i):

# Internal -- handle starttag, return end or -1 if not terminated
def parse_starttag(self, i):
# See the HTML5 specs section "13.2.5.8 Tag name state"
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
self.__starttag_text = None
endpos = self.check_for_whole_start_tag(i)
if endpos < 0:
Expand Down Expand Up @@ -356,82 +428,50 @@ def parse_starttag(self, i):
self.handle_starttag(tag, attrs)
if tag in self.CDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag)
elif tag in self.RCDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag, escapable=True)
return endpos

# Internal -- check to see if we have a complete starttag; return end
# or -1 if incomplete.
def check_for_whole_start_tag(self, i):
    """Return the end index of the start tag at *i*, or -1 if incomplete.

    *i* is the index of the opening '<' in ``self.rawdata``.  The tag is
    complete only when the text matched by ``locatetagend`` ends with '>'.
    """
    rawdata = self.rawdata
    # Skip the '<'; locatetagend starts matching at the tag name.
    match = locatetagend.match(rawdata, i+1)
    assert match
    j = match.end()
    # The trailing '>' is optional in the pattern, so its absence means
    # the tag is not terminated yet.
    if rawdata[j-1] != ">":
        return -1
    return j

# Internal -- parse endtag, return end or -1 if incomplete
def parse_endtag(self, i):
    """Parse the end tag that starts with '</' at index *i*.

    Returns the index just past the end tag, or -1 when the tag is not
    complete yet.  '</>' is skipped without calling any handler; '</'
    followed by anything other than a letter or '>' is handed to
    ``parse_bogus_comment()``.  For a well-formed end tag,
    ``handle_endtag()`` is called with the lower-cased tag name and
    CDATA mode is cleared.
    """
    # See the HTML5 specs section "13.2.5.7 End tag open state"
    # https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
    rawdata = self.rawdata
    assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
    if rawdata.find('>', i+2) < 0:  # fast check
        return -1
    if not endtagopen.match(rawdata, i):  # </ + letter
        if rawdata[i+2:i+3] == '>':  # </> is ignored
            # "missing-end-tag-name" parser error
            return i+3
        return self.parse_bogus_comment(i)

    # Locate the real end of the tag; a '>' inside a quoted attribute
    # value does not terminate it.
    match = locatetagend.match(rawdata, i+2)
    assert match
    j = match.end()
    if rawdata[j-1] != ">":
        return -1

    # find the name: "13.2.5.8 Tag name state"
    # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
    match = tagfind_tolerant.match(rawdata, i+2)
    assert match
    tag = match.group(1).lower()
    self.handle_endtag(tag)
    self.clear_cdata_mode()
    return j

# Overridable -- finish processing of start+end tag: <tag.../>
def handle_startendtag(self, tag, attrs):
Expand Down
Loading
Loading