|
| 1 | +# Time: O(n) |
| 2 | +# Space: O(n) |
| 3 | + |
| 4 | +# Given a string representing a code snippet, |
| 5 | +# you need to implement a tag validator to parse the code and return whether it is valid. |
| 6 | +# A code snippet is valid if all the following rules hold: |
| 7 | +# |
| 8 | +# 1. The code must be wrapped in a valid closed tag. Otherwise, the code is invalid. |
| 9 | +# 2. A closed tag (not necessarily valid) has exactly the following format : |
| 10 | +# <TAG_NAME>TAG_CONTENT</TAG_NAME>. Among them, <TAG_NAME> is the start tag, |
| 11 | +# and </TAG_NAME> is the end tag. The TAG_NAME in start and end tags should be the same. |
| 12 | +# A closed tag is valid if and only if the TAG_NAME and TAG_CONTENT are valid. |
| 13 | +# 3. A valid TAG_NAME only contain upper-case letters, and has length in range [1,9]. |
| 14 | +# Otherwise, the TAG_NAME is invalid. |
| 15 | +# 4. A valid TAG_CONTENT may contain other valid closed tags, |
| 16 | +# cdata and any characters (see note1) EXCEPT unmatched <, |
| 17 | +# unmatched start and end tag, and unmatched or closed tags with invalid TAG_NAME. |
| 18 | +# Otherwise, the TAG_CONTENT is invalid. |
| 19 | +# 5. A start tag is unmatched if no end tag exists with the same TAG_NAME, |
| 20 | +# and vice versa. However, you also need to consider the issue of unbalanced when tags are nested. |
| 21 | +# 6. A < is unmatched if you cannot find a subsequent >. |
| 22 | +# And when you find a < or </, all the subsequent characters until the next > should be |
| 23 | +# parsed as TAG_NAME (not necessarily valid). |
| 24 | +# 7. The cdata has the following format : <![CDATA[CDATA_CONTENT]]>. |
| 25 | +# The range of CDATA_CONTENT is defined as the characters between <![CDATA[ and the first subsequent ]]>. |
| 26 | +# 8. CDATA_CONTENT may contain any characters. |
| 27 | +# The function of cdata is to forbid the validator to parse CDATA_CONTENT, |
| 28 | +# so even it has some characters that can be parsed as tag (no matter valid or invalid), |
| 29 | +# you should treat it as regular characters. |
| 30 | +# |
| 31 | +# Valid Code Examples: |
| 32 | +# Input: "<DIV>This is the first line <![CDATA[<div>]]></DIV>" |
| 33 | +# Output: True |
| 34 | +# Explanation: |
| 35 | +# The code is wrapped in a closed tag : <DIV> and </DIV>. |
| 36 | +# The TAG_NAME is valid, the TAG_CONTENT consists of some characters and cdata. |
| 37 | +# Although CDATA_CONTENT has unmatched start tag with invalid TAG_NAME, |
| 38 | +# it should be considered as plain text, not parsed as tag. |
| 39 | +# So TAG_CONTENT is valid, and then the code is valid. Thus return true. |
| 40 | +# |
| 41 | +# Input: "<DIV>>> ![cdata[]] <![CDATA[<div>]>]]>]]>>]</DIV>" |
| 42 | +# Output: True |
| 43 | +# Explanation: |
| 44 | +# We first separate the code into : start_tag|tag_content|end_tag. |
| 45 | +# start_tag -> "<DIV>" |
| 46 | +# end_tag -> "</DIV>" |
| 47 | +# tag_content could also be separated into : text1|cdata|text2. |
| 48 | +# text1 -> ">> ![cdata[]] " |
| 49 | +# cdata -> "<![CDATA[<div>]>]]>", where the CDATA_CONTENT is "<div>]>" |
| 50 | +# text2 -> "]]>>]" |
| 51 | +# |
| 52 | +# The reason why start_tag is NOT "<DIV>>>" is because of the rule 6. |
| 53 | +# The reason why cdata is NOT "<![CDATA[<div>]>]]>]]>" is because of the rule 7. |
| 54 | +# Invalid Code Examples: |
| 55 | +# Input: "<A> <B> </A> </B>" |
| 56 | +# Output: False |
| 57 | +# Explanation: Unbalanced. If "<A>" is closed, then "<B>" must be unmatched, and vice versa. |
| 58 | +# |
| 59 | +# Input: "<DIV> div tag is not closed <DIV>" |
| 60 | +# Output: False |
| 61 | +# |
| 62 | +# Input: "<DIV> unmatched < </DIV>" |
| 63 | +# Output: False |
| 64 | +# |
| 65 | +# Input: "<DIV> closed tags with invalid tag name <b>123</b> </DIV>" |
| 66 | +# Output: False |
| 67 | +# |
| 68 | +# Input: "<DIV> unmatched tags with invalid tag name </1234567890> and <CDATA[[]]> </DIV>" |
| 69 | +# Output: False |
| 70 | +# |
| 71 | +# Input: "<DIV> unmatched start tag <B> and unmatched end tag </C> </DIV>" |
| 72 | +# Output: False |
| 73 | +# Note: |
| 74 | +# For simplicity, you could assume the input code (including the any characters mentioned above) |
| 75 | +# only contain letters, digits, '<','>','/','!','[',']' and ' '. |
| 76 | + |
| 77 | +class Solution(object): |
| 78 | + def isValid(self, code): |
| 79 | + """ |
| 80 | + :type code: str |
| 81 | + :rtype: bool |
| 82 | + """ |
| 83 | + def validText(s, i): |
| 84 | + j = i |
| 85 | + i = s.find("<", i) |
| 86 | + return i != j, i |
| 87 | + |
| 88 | + def validCData(s, i): |
| 89 | + if s.find("<![CDATA[", i) != i: |
| 90 | + return False, i |
| 91 | + j = s.find("]]>", i) |
| 92 | + if j == -1: |
| 93 | + return False, i |
| 94 | + return True, j+3 |
| 95 | + |
| 96 | + def parseTagName(s, i): |
| 97 | + if s[i] != '<': |
| 98 | + return "", i |
| 99 | + j = s.find('>', i) |
| 100 | + if j == -1 or not (1 <= (j-1-i) <= 9): |
| 101 | + return "", i |
| 102 | + tag = s[i+1:j] |
| 103 | + for c in tag: |
| 104 | + if not (ord('A') <= ord(c) <= ord('Z')): |
| 105 | + return "", i |
| 106 | + return tag, j+1 |
| 107 | + |
| 108 | + def parseContent(s, i): |
| 109 | + while i < len(s): |
| 110 | + result, i = validText(s, i) |
| 111 | + if result: |
| 112 | + continue |
| 113 | + result, i = validCData(s, i) |
| 114 | + if result: |
| 115 | + continue |
| 116 | + result, i = validTag(s, i) |
| 117 | + if result: |
| 118 | + continue |
| 119 | + break |
| 120 | + return i |
| 121 | + |
| 122 | + def validTag(s, i): |
| 123 | + tag, j = parseTagName(s, i) |
| 124 | + if not tag: |
| 125 | + return False, i |
| 126 | + j = parseContent(s, j) |
| 127 | + k = j + len(tag) + 2 |
| 128 | + if k >= len(s) or s[j:k+1] != "</" + tag + ">": |
| 129 | + return False, i |
| 130 | + return True, k+1 |
| 131 | + |
| 132 | + result, i = validTag(code, 0) |
| 133 | + return result and i == len(code) |
0 commit comments