Skip to content

Commit 8434999

Browse files
committed
Decode bytes to unicode in Lexer.get_tokens().
Raise TypeError if the input is neither bytes in a known encoding, nor unicode, nor a file-like object (file, StringIO). Remove function u(). Add bytes_type to compat. Add tests for non-ascii input.
1 parent b05bc5a commit 8434999

3 files changed

Lines changed: 37 additions & 20 deletions

File tree

sqlparse/compat.py

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,35 +23,23 @@
2323

2424

2525
if PY3:
26-
def u(s, encoding=None):
27-
return str(s)
28-
29-
3026
def unicode_compatible(cls):
3127
return cls
3228

33-
29+
bytes_type = bytes
3430
text_type = str
3531
string_types = (str,)
3632
from io import StringIO
3733
file_types = (StringIO, TextIOBase)
3834

3935

4036
elif PY2:
41-
def u(s, encoding=None):
42-
encoding = encoding or 'unicode-escape'
43-
try:
44-
return unicode(s)
45-
except UnicodeDecodeError:
46-
return unicode(s, encoding)
47-
48-
4937
def unicode_compatible(cls):
5038
cls.__unicode__ = cls.__str__
5139
cls.__str__ = lambda x: x.__unicode__().encode('utf-8')
5240
return cls
5341

54-
42+
bytes_type = str
5543
text_type = unicode
5644
string_types = (str, unicode,)
5745
from StringIO import StringIO

sqlparse/lexer.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
from sqlparse import tokens
1616
from sqlparse.keywords import SQL_REGEX
17-
from sqlparse.compat import file_types, string_types, u
17+
from sqlparse.compat import bytes_type, text_type, file_types
1818
from sqlparse.utils import consume
1919

2020

@@ -37,10 +37,21 @@ def get_tokens(text, encoding=None):
3737
3838
``stack`` is the initial stack (default: ``['root']``)
3939
"""
40-
if isinstance(text, string_types):
41-
text = u(text, encoding)
42-
elif isinstance(text, file_types):
43-
text = u(text.read(), encoding)
40+
if isinstance(text, file_types):
41+
text = text.read()
42+
43+
if isinstance(text, text_type):
44+
pass
45+
elif isinstance(text, bytes_type):
46+
try:
47+
text = text.decode()
48+
except UnicodeDecodeError:
49+
if not encoding:
50+
encoding = 'unicode-escape'
51+
text = text.decode(encoding)
52+
else:
53+
raise TypeError(u"Expected text or file-like object, got {!r}".
54+
format(type(text)))
4455

4556
iterable = enumerate(text)
4657
for pos, char in iterable:

tests/test_parse.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
import sqlparse
88
from sqlparse import sql, tokens as T
9-
from sqlparse.compat import StringIO
9+
from sqlparse.compat import StringIO, text_type
1010

1111

1212
def test_parse_tokenize():
@@ -403,3 +403,21 @@ def test_dbldollar_as_literal(sql, is_literal):
403403
else:
404404
for token in p.tokens:
405405
assert token.ttype != T.Literal
406+
407+
408+
def test_non_ascii():
409+
_test_non_ascii = u"insert into test (id, name) values (1, 'тест');"
410+
411+
s = _test_non_ascii
412+
stmts = sqlparse.parse(s)
413+
assert len(stmts) == 1
414+
statement = stmts[0]
415+
assert text_type(statement) == s
416+
assert statement._pprint_tree() is None
417+
418+
s = _test_non_ascii.encode('utf-8')
419+
stmts = sqlparse.parse(s, 'utf-8')
420+
assert len(stmts) == 1
421+
statement = stmts[0]
422+
assert text_type(statement) == _test_non_ascii
423+
assert statement._pprint_tree() is None

0 commit comments

Comments (0)