Skip to content

Commit 377031a

Browse files
author
Michael Schuller
committed
Improved sqlparse.utils.split_unquoted_newlines()
I've modified this function to use a regular expression for most of the work. This makes it much, much faster, especially for large queries.
1 parent 5d69341 commit 377031a

File tree

1 file changed

+38
-40
lines changed

1 file changed

+38
-40
lines changed

sqlparse/utils.py

Lines changed: 38 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
@author: piranna
55
'''
66

7+
import re
8+
79
try:
810
from collections import OrderedDict
911
except ImportError:
@@ -95,45 +97,41 @@ def wrapped_func(*args, **kwargs):
9597

9698
return wrapped_func
9799

98-
def split_unquoted_newlines(text):
99-
"""Split a string on all unquoted newlines
100100

101-
This is a fairly simplistic implementation of splitting a string on all
102-
unescaped CR, LF, or CR+LF occurrences. Only iterates the string once. Seemed
103-
easier than a complex regular expression.
104-
"""
105-
lines = ['']
106-
quoted = None
107-
escape_next = False
108-
last_char = None
109-
for c in text:
110-
escaped = False
111-
# If the previous character was an unescaped '\', this character is
112-
# escaped.
113-
if escape_next:
114-
escaped = True
115-
escape_next = False
116-
# If the current character is '\' and it is not escaped, the next
117-
# character is escaped.
118-
if c == '\\' and not escaped:
119-
escape_next = True
120-
# Start a quoted portion if a) we aren't in one already, and b) the
121-
# quote isn't escaped.
122-
if c in '"\'' and not escaped and not quoted:
123-
quoted = c
124-
# Escaped quotes (obvs) don't count as a closing match.
125-
elif c == quoted and not escaped:
126-
quoted = None
127-
128-
if not quoted and c in ['\r', '\n']:
129-
if c == '\n' and last_char == '\r':
130-
# It's a CR+LF, so don't append another line
131-
pass
132-
else:
133-
lines.append('')
134-
else:
135-
lines[-1] += c
101+
# Tokenizer for splitting SQL text on newlines that are not inside quotes.
#
# Used with re.split(): because the whole pattern is wrapped in a capturing
# group, the matched pieces are returned alongside the (usually empty) text
# between them. Each matched piece is exactly one of: a newline sequence, a
# run of unquoted text, or a complete quoted string. A quote character that
# is never closed matches none of the alternatives and therefore shows up as
# the separator text between pieces.
#
# re.DOTALL is required so that the backslash-escape alternative (\\.) also
# accepts a backslash followed by LF. Without it, an escaped line feed inside
# a quoted string makes the string look unterminated and the text is split in
# the middle of the literal (while backslash + CR was already accepted).
SPLIT_REGEX = re.compile(r"""
(
 (?:                       # Start of non-capturing group
  (?:\r\n|\r|\n)       |   # Match any single newline, or
  [^\r\n'"]+           |   # Match any character series without quotes or
                           # newlines, or
  "(?:[^"\\]|\\.)*"    |   # Match double-quoted strings, or
  '(?:[^'\\]|\\.)*'        # Match single quoted strings
 )
)
""", re.VERBOSE | re.DOTALL)

# Recognizes a piece produced by SPLIT_REGEX that is a line break
# (CR+LF, CR, or LF).
LINE_MATCH = re.compile(r'(\r\n|\r|\n)')
136122

137-
last_char = c
138-
139-
return lines
123+
def split_unquoted_newlines(text):
    """Split a string on every newline that is not inside a quoted string.

    The text is tokenized with SPLIT_REGEX; each token that LINE_MATCH
    recognizes as a line break (CR, LF, or CR+LF) starts a new output line,
    and every other non-empty token is appended to the current line.

    Returns a list of strings with the line breaks removed. The list always
    contains at least one element, so empty input yields [''].
    """
    result = ['']
    for piece in SPLIT_REGEX.split(text):
        # re.split() emits empty strings between adjacent matches; skip them.
        if not piece:
            continue
        if LINE_MATCH.match(piece) is not None:
            # An unquoted line break: open a fresh output line.
            result.append('')
        else:
            # Unquoted text or a whole quoted string (which may itself
            # contain newlines): extend the current line.
            result[-1] = result[-1] + piece
    return result

0 commit comments

Comments
 (0)