Skip to content

Commit e0d3928

Browse files
mrmasterplan authored and andialbrecht committed
lexer documentation
1 parent f9a73a6 commit e0d3928

5 files changed

Lines changed: 151 additions & 91 deletions

File tree

docs/source/extending.rst

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
Extending :mod:`sqlparse`
2+
=========================
3+
4+
.. module:: sqlparse
5+
:synopsis: Extending parsing capability of sqlparse.
6+
7+
The :mod:`sqlparse` module uses a sql grammar that was tuned through usage and numerous
8+
PRs to fit a broad range of SQL syntaxes, but it cannot cater to every given case since
9+
some SQL dialects have adopted conflicting meanings of certain keywords. Sqlparse
10+
therefore exposes a mechanism to configure the fundamental keywords and regular
11+
expressions that parse the language as described below.
12+
13+
If you find an adaptation that works for your specific use-case, please consider
14+
contributing it back to the community by opening a PR on
15+
`GitHub <https://github.com/andialbrecht/sqlparse>`_.
16+
17+
Configuring the Lexer
18+
---------------------
19+
20+
The lexer is a singleton class that breaks down the stream of characters into language
21+
tokens. It does this by using a sequence of regular expressions and keywords that are
22+
listed in the file ``sqlparse.keywords``. Instead of applying these fixed grammar
23+
definitions directly, the lexer is default initialized in its method called
24+
``default_initialization()``. As an API user, you can adapt the Lexer configuration by
25+
applying your own configuration logic. To do so, start out by clearing previous
26+
configurations with ``.clear()``, then apply the SQL list with
27+
``.set_SQL_REGEX(SQL_REGEX)``, and apply keyword lists with ``.add_keywords(KEYWORDS)``.
28+
29+
You can do so by re-using the expressions in ``sqlparse.keywords`` (see example below),
30+
leaving parts out, or by making up your own master list.
31+
32+
See the expected types of the arguments by inspecting their structure in
33+
``sqlparse.keywords``.
34+
(For compatibility with python 3.4, this library does not use type-hints.)
35+
36+
The following example adds support for the expression ``ZORDER BY``, and adds ``BAR`` as
37+
a keyword to the lexer:
38+
39+
.. code-block:: python
40+
41+
import re
42+
43+
import sqlparse
44+
from sqlparse import keywords
45+
from sqlparse.lexer import Lexer
46+
47+
lex = Lexer()
48+
lex.clear()
49+
50+
my_regex = (r"ZORDER\s+BY\b", sqlparse.tokens.Keyword)
51+
52+
# slice the default SQL_REGEX to inject the custom object
53+
lex.set_SQL_REGEX(
54+
keywords.SQL_REGEX[:38]
55+
+ [my_regex]
56+
+ keywords.SQL_REGEX[38:]
57+
)
58+
lex.add_keywords(keywords.KEYWORDS_COMMON)
59+
lex.add_keywords(keywords.KEYWORDS_ORACLE)
60+
lex.add_keywords(keywords.KEYWORDS_PLPGSQL)
61+
lex.add_keywords(keywords.KEYWORDS_HQL)
62+
lex.add_keywords(keywords.KEYWORDS_MSACCESS)
63+
lex.add_keywords(keywords.KEYWORDS)
64+
lex.add_keywords({'BAR': sqlparse.tokens.Keyword})
65+
66+
sqlparse.parse("select * from foo zorder by bar;")

docs/source/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ Contents
2020
api
2121
analyzing
2222
ui
23+
extending
2324
changes
2425
license
2526
indices

sqlparse/keywords.py

Lines changed: 80 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -5,96 +5,92 @@
55
# This module is part of python-sqlparse and is released under
66
# the BSD License: https://opensource.org/licenses/BSD-3-Clause
77

8-
import re
9-
108
from sqlparse import tokens
119

1210
# object() only supports "is" and is useful as a marker
11+
# use this marker to specify that the given regex in SQL_REGEX
12+
# shall be processed further through a lookup in the KEYWORDS dictionaries
1313
PROCESS_AS_KEYWORD = object()
1414

1515

16-
SQL_REGEX = {
17-
'root': [
18-
(r'(--|# )\+.*?(\r\n|\r|\n|$)', tokens.Comment.Single.Hint),
19-
(r'/\*\+[\s\S]*?\*/', tokens.Comment.Multiline.Hint),
20-
21-
(r'(--|# ).*?(\r\n|\r|\n|$)', tokens.Comment.Single),
22-
(r'/\*[\s\S]*?\*/', tokens.Comment.Multiline),
23-
24-
(r'(\r\n|\r|\n)', tokens.Newline),
25-
(r'\s+?', tokens.Whitespace),
26-
27-
(r':=', tokens.Assignment),
28-
(r'::', tokens.Punctuation),
29-
30-
(r'\*', tokens.Wildcard),
31-
32-
(r"`(``|[^`])*`", tokens.Name),
33-
(r"´(´´|[^´])*´", tokens.Name),
34-
(r'((?<!\S)\$(?:[_A-ZÀ-Ü]\w*)?\$)[\s\S]*?\1', tokens.Literal),
35-
36-
(r'\?', tokens.Name.Placeholder),
37-
(r'%(\(\w+\))?s', tokens.Name.Placeholder),
38-
(r'(?<!\w)[$:?]\w+', tokens.Name.Placeholder),
39-
40-
(r'\\\w+', tokens.Command),
41-
42-
# FIXME(andi): VALUES shouldn't be listed here
43-
# see https://github.com/andialbrecht/sqlparse/pull/64
44-
# AS and IN are special, it may be followed by a parenthesis, but
45-
# are never functions, see issue183 and issue507
46-
(r'(CASE|IN|VALUES|USING|FROM|AS)\b', tokens.Keyword),
47-
48-
(r'(@|##|#)[A-ZÀ-Ü]\w+', tokens.Name),
49-
50-
# see issue #39
51-
# Spaces around period `schema . name` are valid identifier
52-
# TODO: Spaces before period not implemented
53-
(r'[A-ZÀ-Ü]\w*(?=\s*\.)', tokens.Name), # 'Name'.
54-
# FIXME(atronah): never match,
55-
# because `re.match` doesn't work with look-behind regexp feature
56-
(r'(?<=\.)[A-ZÀ-Ü]\w*', tokens.Name), # .'Name'
57-
(r'[A-ZÀ-Ü]\w*(?=\()', tokens.Name), # side effect: change kw to func
58-
(r'-?0x[\dA-F]+', tokens.Number.Hexadecimal),
59-
(r'-?\d+(\.\d+)?E-?\d+', tokens.Number.Float),
60-
(r'(?![_A-ZÀ-Ü])-?(\d+(\.\d*)|\.\d+)(?![_A-ZÀ-Ü])',
61-
tokens.Number.Float),
62-
(r'(?![_A-ZÀ-Ü])-?\d+(?![_A-ZÀ-Ü])', tokens.Number.Integer),
63-
(r"'(''|\\\\|\\'|[^'])*'", tokens.String.Single),
64-
# not a real string literal in ANSI SQL:
65-
(r'"(""|\\\\|\\"|[^"])*"', tokens.String.Symbol),
66-
(r'(""|".*?[^\\]")', tokens.String.Symbol),
67-
# sqlite names can be escaped with [square brackets]. left bracket
68-
# cannot be preceded by word character or a right bracket --
69-
# otherwise it's probably an array index
70-
(r'(?<![\w\])])(\[[^\]\[]+\])', tokens.Name),
71-
(r'((LEFT\s+|RIGHT\s+|FULL\s+)?(INNER\s+|OUTER\s+|STRAIGHT\s+)?'
72-
r'|(CROSS\s+|NATURAL\s+)?)?JOIN\b', tokens.Keyword),
73-
(r'END(\s+IF|\s+LOOP|\s+WHILE)?\b', tokens.Keyword),
74-
(r'NOT\s+NULL\b', tokens.Keyword),
75-
(r'NULLS\s+(FIRST|LAST)\b', tokens.Keyword),
76-
(r'UNION\s+ALL\b', tokens.Keyword),
77-
(r'CREATE(\s+OR\s+REPLACE)?\b', tokens.Keyword.DDL),
78-
(r'DOUBLE\s+PRECISION\b', tokens.Name.Builtin),
79-
(r'GROUP\s+BY\b', tokens.Keyword),
80-
(r'ORDER\s+BY\b', tokens.Keyword),
81-
(r'HANDLER\s+FOR\b', tokens.Keyword),
82-
(r'(LATERAL\s+VIEW\s+)'
83-
r'(EXPLODE|INLINE|PARSE_URL_TUPLE|POSEXPLODE|STACK)\b',
84-
tokens.Keyword),
85-
(r"(AT|WITH')\s+TIME\s+ZONE\s+'[^']+'", tokens.Keyword.TZCast),
86-
(r'(NOT\s+)?(LIKE|ILIKE|RLIKE)\b', tokens.Operator.Comparison),
87-
(r'(NOT\s+)?(REGEXP)\b', tokens.Operator.Comparison),
88-
# Check for keywords, also returns tokens.Name if regex matches
89-
# but the match isn't a keyword.
90-
(r'[0-9_\w][_$#\w]*', PROCESS_AS_KEYWORD),
91-
(r'[;:()\[\],\.]', tokens.Punctuation),
92-
(r'[<>=~!]+', tokens.Operator.Comparison),
93-
(r'[+/@#%^&|^-]+', tokens.Operator),
94-
]}
95-
96-
FLAGS = re.IGNORECASE | re.UNICODE
97-
SQL_REGEX = [(re.compile(rx, FLAGS).match, tt) for rx, tt in SQL_REGEX['root']]
16+
SQL_REGEX = [
17+
(r'(--|# )\+.*?(\r\n|\r|\n|$)', tokens.Comment.Single.Hint),
18+
(r'/\*\+[\s\S]*?\*/', tokens.Comment.Multiline.Hint),
19+
20+
(r'(--|# ).*?(\r\n|\r|\n|$)', tokens.Comment.Single),
21+
(r'/\*[\s\S]*?\*/', tokens.Comment.Multiline),
22+
23+
(r'(\r\n|\r|\n)', tokens.Newline),
24+
(r'\s+?', tokens.Whitespace),
25+
26+
(r':=', tokens.Assignment),
27+
(r'::', tokens.Punctuation),
28+
29+
(r'\*', tokens.Wildcard),
30+
31+
(r"`(``|[^`])*`", tokens.Name),
32+
(r"´(´´|[^´])*´", tokens.Name),
33+
(r'((?<!\S)\$(?:[_A-ZÀ-Ü]\w*)?\$)[\s\S]*?\1', tokens.Literal),
34+
35+
(r'\?', tokens.Name.Placeholder),
36+
(r'%(\(\w+\))?s', tokens.Name.Placeholder),
37+
(r'(?<!\w)[$:?]\w+', tokens.Name.Placeholder),
38+
39+
(r'\\\w+', tokens.Command),
40+
41+
# FIXME(andi): VALUES shouldn't be listed here
42+
# see https://github.com/andialbrecht/sqlparse/pull/64
43+
# AS and IN are special, it may be followed by a parenthesis, but
44+
# are never functions, see issue183 and issue507
45+
(r'(CASE|IN|VALUES|USING|FROM|AS)\b', tokens.Keyword),
46+
47+
(r'(@|##|#)[A-ZÀ-Ü]\w+', tokens.Name),
48+
49+
# see issue #39
50+
# Spaces around period `schema . name` are valid identifier
51+
# TODO: Spaces before period not implemented
52+
(r'[A-ZÀ-Ü]\w*(?=\s*\.)', tokens.Name), # 'Name'.
53+
# FIXME(atronah): never match,
54+
# because `re.match` doesn't work with look-behind regexp feature
55+
(r'(?<=\.)[A-ZÀ-Ü]\w*', tokens.Name), # .'Name'
56+
(r'[A-ZÀ-Ü]\w*(?=\()', tokens.Name), # side effect: change kw to func
57+
(r'-?0x[\dA-F]+', tokens.Number.Hexadecimal),
58+
(r'-?\d+(\.\d+)?E-?\d+', tokens.Number.Float),
59+
(r'(?![_A-ZÀ-Ü])-?(\d+(\.\d*)|\.\d+)(?![_A-ZÀ-Ü])',
60+
tokens.Number.Float),
61+
(r'(?![_A-ZÀ-Ü])-?\d+(?![_A-ZÀ-Ü])', tokens.Number.Integer),
62+
(r"'(''|\\\\|\\'|[^'])*'", tokens.String.Single),
63+
# not a real string literal in ANSI SQL:
64+
(r'"(""|\\\\|\\"|[^"])*"', tokens.String.Symbol),
65+
(r'(""|".*?[^\\]")', tokens.String.Symbol),
66+
# sqlite names can be escaped with [square brackets]. left bracket
67+
# cannot be preceded by word character or a right bracket --
68+
# otherwise it's probably an array index
69+
(r'(?<![\w\])])(\[[^\]\[]+\])', tokens.Name),
70+
(r'((LEFT\s+|RIGHT\s+|FULL\s+)?(INNER\s+|OUTER\s+|STRAIGHT\s+)?'
71+
r'|(CROSS\s+|NATURAL\s+)?)?JOIN\b', tokens.Keyword),
72+
(r'END(\s+IF|\s+LOOP|\s+WHILE)?\b', tokens.Keyword),
73+
(r'NOT\s+NULL\b', tokens.Keyword),
74+
(r'NULLS\s+(FIRST|LAST)\b', tokens.Keyword),
75+
(r'UNION\s+ALL\b', tokens.Keyword),
76+
(r'CREATE(\s+OR\s+REPLACE)?\b', tokens.Keyword.DDL),
77+
(r'DOUBLE\s+PRECISION\b', tokens.Name.Builtin),
78+
(r'GROUP\s+BY\b', tokens.Keyword),
79+
(r'ORDER\s+BY\b', tokens.Keyword),
80+
(r'HANDLER\s+FOR\b', tokens.Keyword),
81+
(r'(LATERAL\s+VIEW\s+)'
82+
r'(EXPLODE|INLINE|PARSE_URL_TUPLE|POSEXPLODE|STACK)\b',
83+
tokens.Keyword),
84+
(r"(AT|WITH')\s+TIME\s+ZONE\s+'[^']+'", tokens.Keyword.TZCast),
85+
(r'(NOT\s+)?(LIKE|ILIKE|RLIKE)\b', tokens.Operator.Comparison),
86+
(r'(NOT\s+)?(REGEXP)\b', tokens.Operator.Comparison),
87+
# Check for keywords, also returns tokens.Name if regex matches
88+
# but the match isn't a keyword.
89+
(r'[0-9_\w][_$#\w]*', PROCESS_AS_KEYWORD),
90+
(r'[;:()\[\],\.]', tokens.Punctuation),
91+
(r'[<>=~!]+', tokens.Operator.Comparison),
92+
(r'[+/@#%^&|^-]+', tokens.Operator),
93+
]
9894

9995
KEYWORDS = {
10096
'ABORT': tokens.Keyword,

sqlparse/lexer.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
# the BSD License: https://opensource.org/licenses/BSD-3-Clause
77

88
"""SQL Lexer"""
9-
9+
import re
1010
# This code is based on the SqlLexer in pygments.
1111
# http://pygments.org/
1212
# It's separated from the rest of pygments to increase performance
@@ -56,7 +56,8 @@ def clear(self):
5656

5757
def set_SQL_REGEX(self, SQL_REGEX):
5858
"""Set the list of regex that will parse the SQL."""
59-
self._SQL_REGEX = SQL_REGEX
59+
FLAGS = re.IGNORECASE | re.UNICODE
60+
self._SQL_REGEX = [(re.compile(rx, FLAGS).match, tt) for rx, tt in SQL_REGEX]
6061

6162
def add_keywords(self, keywords):
6263
"""Add keyword dictionaries. Keywords are looked up in the same order

tests/test_parse.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
"""Tests sqlparse.parse()."""
2-
import re
32
from io import StringIO
43

54
import pytest
@@ -538,10 +537,7 @@ def test_configurable_regex():
538537
lex = Lexer()
539538
lex.clear()
540539

541-
my_regex = (
542-
re.compile(r"ZORDER\s+BY\b", keywords.FLAGS).match,
543-
sqlparse.tokens.Keyword,
544-
)
540+
my_regex = (r"ZORDER\s+BY\b", sqlparse.tokens.Keyword)
545541

546542
lex.set_SQL_REGEX(keywords.SQL_REGEX[:38] + [my_regex] + keywords.SQL_REGEX[38:])
547543
lex.add_keywords(keywords.KEYWORDS_COMMON)

0 commit comments

Comments
 (0)