Skip to content

Commit 8434999

Browse files
committed
Decode bytes to unicode in Lexer.get_tokens().
Raise TypeError if the input is neither bytes in a known encoding, nor unicode, nor a file-like object (file, StringIO). Remove function u(). Add bytes_type to compat. Add tests for non-ascii input.
1 parent b05bc5a commit 8434999

3 files changed

Lines changed: 37 additions & 20 deletions

File tree

sqlparse/compat.py

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,35 +23,23 @@
2323

2424

2525
if PY3:
26-
def u(s, encoding=None):
27-
return str(s)
28-
29-
3026
def unicode_compatible(cls):
3127
return cls
3228

33-
29+
bytes_type = bytes
3430
text_type = str
3531
string_types = (str,)
3632
from io import StringIO
3733
file_types = (StringIO, TextIOBase)
3834

3935

4036
elif PY2:
41-
def u(s, encoding=None):
42-
encoding = encoding or 'unicode-escape'
43-
try:
44-
return unicode(s)
45-
except UnicodeDecodeError:
46-
return unicode(s, encoding)
47-
48-
4937
def unicode_compatible(cls):
5038
cls.__unicode__ = cls.__str__
5139
cls.__str__ = lambda x: x.__unicode__().encode('utf-8')
5240
return cls
5341

54-
42+
bytes_type = str
5543
text_type = unicode
5644
string_types = (str, unicode,)
5745
from StringIO import StringIO

sqlparse/lexer.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
from sqlparse import tokens
1616
from sqlparse.keywords import SQL_REGEX
17-
from sqlparse.compat import file_types, string_types, u
17+
from sqlparse.compat import bytes_type, text_type, file_types
1818
from sqlparse.utils import consume
1919

2020

@@ -37,10 +37,21 @@ def get_tokens(text, encoding=None):
3737
3838
``stack`` is the initial stack (default: ``['root']``)
3939
"""
40-
if isinstance(text, string_types):
41-
text = u(text, encoding)
42-
elif isinstance(text, file_types):
43-
text = u(text.read(), encoding)
40+
if isinstance(text, file_types):
41+
text = text.read()
42+
43+
if isinstance(text, text_type):
44+
pass
45+
elif isinstance(text, bytes_type):
46+
try:
47+
text = text.decode()
48+
except UnicodeDecodeError:
49+
if not encoding:
50+
encoding = 'unicode-escape'
51+
text = text.decode(encoding)
52+
else:
53+
raise TypeError(u"Expected text or file-like object, got {!r}".
54+
format(type(text)))
4455

4556
iterable = enumerate(text)
4657
for pos, char in iterable:

tests/test_parse.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
import sqlparse
88
from sqlparse import sql, tokens as T
9-
from sqlparse.compat import StringIO
9+
from sqlparse.compat import StringIO, text_type
1010

1111

1212
def test_parse_tokenize():
@@ -403,3 +403,21 @@ def test_dbldollar_as_literal(sql, is_literal):
403403
else:
404404
for token in p.tokens:
405405
assert token.ttype != T.Literal
406+
407+
408+
def test_non_ascii():
409+
_test_non_ascii = u"insert into test (id, name) values (1, 'тест');"
410+
411+
s = _test_non_ascii
412+
stmts = sqlparse.parse(s)
413+
assert len(stmts) == 1
414+
statement = stmts[0]
415+
assert text_type(statement) == s
416+
assert statement._pprint_tree() is None
417+
418+
s = _test_non_ascii.encode('utf-8')
419+
stmts = sqlparse.parse(s, 'utf-8')
420+
assert len(stmts) == 1
421+
statement = stmts[0]
422+
assert text_type(statement) == _test_non_ascii
423+
assert statement._pprint_tree() is None

0 commit comments

Comments (0)