Skip to content

Commit 081b237

Browse files
committed
Add encoding parameter to top-level functions (fixes issue20).
1 parent e664ae1 commit 081b237

File tree

7 files changed

+49
-18
lines changed

7 files changed

+49
-18
lines changed

CHANGES

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,11 @@ Bug Fixes
1010

1111
Enhancements
1212
* Improve parsing speed when SQL contains CLOBs or BLOBs (issue86).
13+
* Top-level API functions now accept an encoding keyword to parse
14+
statements in certain encodings more reliably (issue20).
15+
16+
Other
17+
* Documentation updates.
1318

1419

1520
Release 0.1.6 (Jan 01, 2013)

docs/source/api.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@ The :mod:`sqlparse` module provides the following functions on module-level.
1212

1313
.. autofunction:: sqlparse.parse
1414

15+
In most cases there's no need to set the `encoding` parameter. If
16+
`encoding` is not set, sqlparse assumes that the given SQL statement
17+
is encoded either in utf-8 or latin-1.
18+
1519

1620
.. _formatting:
1721

sqlparse/__init__.py

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,48 +18,56 @@
1818
from sqlparse.exceptions import SQLParseError
1919

2020

21-
def parse(sql):
21+
def parse(sql, encoding=None):
2222
"""Parse sql and return a list of statements.
2323
24-
*sql* is a single string containting one or more SQL statements.
25-
26-
Returns a tuple of :class:`~sqlparse.sql.Statement` instances.
24+
:param sql: A string containing one or more SQL statements.
25+
:param encoding: The encoding of the statement (optional).
26+
:returns: A tuple of :class:`~sqlparse.sql.Statement` instances.
2727
"""
28-
return tuple(parsestream(sql))
28+
return tuple(parsestream(sql, encoding))
2929

3030

31-
def parsestream(stream):
31+
def parsestream(stream, encoding=None):
3232
"""Parses sql statements from file-like object.
3333
34-
Returns a generator of Statement instances.
34+
:param stream: A file-like object.
35+
:param encoding: The encoding of the stream contents (optional).
36+
:returns: A generator of :class:`~sqlparse.sql.Statement` instances.
3537
"""
3638
stack = engine.FilterStack()
3739
stack.full_analyze()
38-
return stack.run(stream)
40+
return stack.run(stream, encoding)
3941

4042

4143
def format(sql, **options):
4244
"""Format *sql* according to *options*.
4345
4446
Available options are documented in :ref:`formatting`.
4547
46-
Returns the formatted SQL statement as string.
48+
In addition to the formatting options this function accepts the
49+
keyword "encoding" which determines the encoding of the statement.
50+
51+
:returns: The formatted SQL statement as string.
4752
"""
53+
encoding = options.pop('encoding', None)
4854
stack = engine.FilterStack()
4955
options = formatter.validate_options(options)
5056
stack = formatter.build_filter_stack(stack, options)
5157
stack.postprocess.append(filters.SerializerUnicode())
52-
return ''.join(stack.run(sql))
58+
return ''.join(stack.run(sql, encoding))
5359

5460

55-
def split(sql):
61+
def split(sql, encoding=None):
5662
"""Split *sql* into single statements.
5763
58-
Returns a list of strings.
64+
:param sql: A string containing one or more SQL statements.
65+
:param encoding: The encoding of the statement (optional).
66+
:returns: A list of strings.
5967
"""
6068
stack = engine.FilterStack()
6169
stack.split_statements = True
62-
return [unicode(stmt) for stmt in stack.run(sql)]
70+
return [unicode(stmt) for stmt in stack.run(sql, encoding)]
6371

6472

6573
from sqlparse.engine.filter import StatementFilter

sqlparse/engine/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@ def enable_grouping(self):
3636
def full_analyze(self):
3737
self.enable_grouping()
3838

39-
def run(self, sql):
40-
stream = lexer.tokenize(sql)
39+
def run(self, sql, encoding=None):
40+
stream = lexer.tokenize(sql, encoding)
4141
# Process token stream
4242
if self.preprocess:
4343
for filter_ in self.preprocess:

sqlparse/lexer.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,8 @@ def add_filter(self, filter_, **options):
224224

225225
def _decode(self, text):
226226
if sys.version_info[0] == 3:
227-
return text
227+
if isinstance(text, str):
228+
return text
228229
if self.encoding == 'guess':
229230
try:
230231
text = text.decode('utf-8')
@@ -355,11 +356,13 @@ def get_tokens_unprocessed(self, stream, stack=('root',)):
355356
break
356357

357358

358-
def tokenize(sql):
359+
def tokenize(sql, encoding=None):
359360
"""Tokenize sql.
360361
361362
Tokenize *sql* using the :class:`Lexer` and return a 2-tuple stream
362363
of ``(token type, value)`` items.
363364
"""
364365
lexer = Lexer()
366+
if encoding is not None:
367+
lexer.encoding = encoding
365368
return lexer.get_tokens(sql)

tests/files/test_cp1251.sql

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
insert into foo values (1); -- Ïåñíÿ ïðî íàäåæäó

tests/test_regressions.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import sys
44

5-
from tests.utils import TestCaseBase
5+
from tests.utils import TestCaseBase, load_file
66

77
import sqlparse
88
from sqlparse import sql
@@ -188,3 +188,13 @@ def test_dont_alias_keywords():
188188
assert len(p.tokens) == 5
189189
assert p.tokens[0].ttype is T.Keyword
190190
assert p.tokens[2].ttype is T.Keyword
191+
192+
193+
def test_format_accepts_encoding(): # issue20
194+
sql = load_file('test_cp1251.sql', 'cp1251')
195+
formatted = sqlparse.format(sql, reindent=True, encoding='cp1251')
196+
if sys.version_info < (3,):
197+
tformatted = u'insert into foo\nvalues (1); -- Песня про надежду\n'
198+
else:
199+
tformatted = 'insert into foo\nvalues (1); -- Песня про надежду\n'
200+
assert formatted == tformatted

0 commit comments

Comments
 (0)