Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,27 @@ def unquote_and_decode_unicode_escape_seq(
"""
If string starts and ends with a quote, unquote it and decode Unicode escape sequences
"""
unicode_seq_pattern = re.compile(r"\\(u|U)[0-9a-fA-F]{4}")
trailing_quote = trailing_quote if trailing_quote else leading_quote

if string.startswith(leading_quote) and string.endswith(trailing_quote):
string = string[1:-1]

cleaned_string = string.encode().decode("unicode-escape")

return cleaned_string
# Decode Unicode escape sequences. This avoids issues with encoding
# This process does not handle unicode from "\U00010000" to "\U0010FFFF"
while unicode_seq_pattern.search(string):
# Get the first Unicode escape sequence.
# mypy: unicode_seq_pattern.search(string) is not None because of the while loop
unicode_seq = unicode_seq_pattern.search(string).group(0) # type: ignore
# Replace the Unicode escape sequence with the decoded character
try:
string = string.replace(
unicode_seq, unicode_seq.encode("utf-8").decode("unicode-escape")
)
except UnicodeDecodeError:
# Skip decoding if it is not possible to decode the Unicode escape sequence
break # avoid infinite loop
return string


def parse_labels(labels_str: str) -> Dict[str, str]:
Expand Down
26 changes: 26 additions & 0 deletions metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,3 +212,29 @@ def test_unquote_and_decode_unicode_escape_seq():
expected_output = "No escape sequences here"
result = unquote_and_decode_unicode_escape_seq(input_string)
assert result == expected_output

# Test with invalid Unicode escape sequences
input_string = '"No escape \\u123 sequences here"'
expected_output = "No escape \\u123 sequences here"
result = unquote_and_decode_unicode_escape_seq(input_string)
assert result == expected_output

# Test with a string that has multiple Unicode escape sequences
input_string = '"Hello \\u003cWorld\\u003e \\u003cAgain\\u003e \\u003cAgain\\u003e \\u003cAgain\\u003e"'
expected_output = "Hello <World> <Again> <Again> <Again>"
result = unquote_and_decode_unicode_escape_seq(input_string)
assert result == expected_output

# Test with a malformed escape sequence (backslash-u not followed by four hex digits), which must be left unchanged
input_string = '"Hello \\utest"'
expected_output = "Hello \\utest"
result = unquote_and_decode_unicode_escape_seq(input_string)
assert result == expected_output

# Test with special characters
input_string = (
'"Hello \\u003cWorld\\u003e \\u003cçãâÁÁà|{}()[].,/;\\+=--_*&%$#@!?\\u003e"'
)
expected_output = "Hello <World> <çãâÁÁà|{}()[].,/;\\+=--_*&%$#@!?>"
result = unquote_and_decode_unicode_escape_seq(input_string)
assert result == expected_output