44@author: piranna
55'''
66
7+ import re
8+
79try :
810 from collections import OrderedDict
911except ImportError :
@@ -95,45 +97,41 @@ def wrapped_func(*args, **kwargs):
9597
9698 return wrapped_func
9799
# SPLIT_REGEX breaks a piece of SQL into three kinds of chunks:
#
#   * a single line break (CR+LF, CR or LF),
#   * a run of characters containing neither quotes nor line breaks,
#   * a complete single- or double-quoted string (which may itself contain
#     line breaks and backslash-escaped characters).
#
# The whole pattern is one capturing group, so SPLIT_REGEX.split() returns the
# matched chunks interleaved with whatever could not be matched (in practice
# only stray, unterminated quote characters).  split_unquoted_newlines() then
# glues the chunks back together into lines, starting a new line only at the
# chunks that are line breaks.
#
# re.DOTALL is required so that the escape alternative ``\\.`` also accepts a
# backslash followed by a line break; without it such a quoted string fails to
# match as a whole and gets split in the middle.
SPLIT_REGEX = re.compile(r"""
(
 (?:                       # Start of non-capturing group
  (?:\r\n|\r|\n)      |    # Match any single newline, or
  [^\r\n'"]+          |    # Match any character series without quotes or
                           # newlines, or
  "(?:[^"\\]|\\.)*"   |    # Match double-quoted strings, or
  '(?:[^'\\]|\\.)*'        # Match single quoted strings
 )
)
""", re.VERBOSE | re.DOTALL)

# Recognises a chunk produced by SPLIT_REGEX that is a line break.
LINE_MATCH = re.compile(r'(\r\n|\r|\n)')


def split_unquoted_newlines(text):
    """Split a string on all unquoted newlines.

    Unlike str.splitlines(), this will ignore CR/LF/CR+LF if the requisite
    character is inside of a quoted string (single or double quotes, with
    backslash escapes honoured).

    The line breaks themselves are not included in the result.  The returned
    list always has at least one element: an empty input yields [''], and a
    trailing line break yields a trailing '' entry.
    """
    # Each output line is collected as a list of chunks and joined once at the
    # end, which avoids repeatedly re-copying a growing string.
    collected = [[]]
    for chunk in SPLIT_REGEX.split(text):
        if not chunk:
            # re.split() emits empty strings between adjacent matches.
            continue
        if LINE_MATCH.match(chunk):
            # An unquoted line break: start a new output line.
            collected.append([])
        else:
            # Plain text, a quoted string, or an unmatched stray quote.
            collected[-1].append(chunk)
    return [''.join(chunks) for chunks in collected]
0 commit comments