fix #201: standardize whitespace inside tokens (#225)

* fix #201: standardize whitespace inside tokens
* chore: update changelog and primer
tconbeer · Aug 2, 2022 · fb3253f · fb3253f
1 parent a00a4f5
commit fb3253f
Show file tree

Hide file tree

Showing 6 changed files with 143 additions and 279 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
 
 ## [Unreleased]
 
+### Formatting Changes + Bug Fixes
+
+-   sqlfmt now standardizes whitespace inside word tokens ([#201](https://github.com/tconbeer/sqlfmt/issues/201))
+
 ## [0.10.0] - 2022-08-02
 
 ### Features

diff --git a/poetry.lock b/poetry.lock
diff --git a/src/sqlfmt/node_manager.py b/src/sqlfmt/node_manager.py
@@ -69,7 +69,7 @@ def create_node(self, token: Token, previous_node: Optional[Node]) -> Node:
 
         prev_token, extra_whitespace = get_previous_token(previous_node)
         prefix = self.whitespace(token, prev_token, extra_whitespace)
-        value = self.capitalize(token)
+        value = self.standardize_value(token)
 
         if token.type in (TokenType.FMT_OFF, TokenType.DATA):
             formatting_disabled = True
@@ -214,10 +214,11 @@ def whitespace(
         else:
             return SPACE
 
-    def capitalize(self, token: Token) -> str:
+    def standardize_value(self, token: Token) -> str:
         """
-        Proper style is to lowercase all keywords, statements, and names.
-        If DB identifiers can't be lowercased, they should be quoted.
+        Tokens that are words (not symbols) and aren't jinja
+        or comments should be lowercased and have any internal
+        whitespace replaced with a single space
         """
         if token.type in (
             TokenType.UNTERM_KEYWORD,
@@ -230,7 +231,7 @@ def capitalize(self, token: Token) -> str:
             TokenType.BOOLEAN_OPERATOR,
             TokenType.SET_OPERATOR,
         ):
-            return token.token.lower()
+            return " ".join(token.token.lower().split())
         elif token.type == TokenType.NAME and not self.case_sensitive_names:
             return token.token.lower()
         else:

diff --git a/src/sqlfmt_primer/primer.py b/src/sqlfmt_primer/primer.py
@@ -30,7 +30,7 @@ def get_projects() -> List[SQLProject]:
         SQLProject(
             name="gitlab",
             git_url="https://github.com/tconbeer/gitlab-analytics-sqlfmt.git",
-            git_ref="4d75449eeb2d7a97dbb63fae4458de19dd2a382a",  # sqlfmt 0d3b30e
+            git_ref="30645ca4b8a4723b9c4d4177e0f0f34ac7b40bbf",  # sqlfmt 49fa97e
             expected_changed=4,
             expected_unchanged=2413,
             expected_errored=0,
@@ -39,7 +39,7 @@ def get_projects() -> List[SQLProject]:
         SQLProject(
             name="rittman",
             git_url="https://github.com/tconbeer/rittman_ra_data_warehouse.git",
-            git_ref="0d5492b2526d5830a94037f336223fd4e0e3fbd4",  # sqlfmt 0d3b30e
+            git_ref="44bb35dd4db50e4113cac8cb8d3521da8d998e1d",  # sqlfmt 49fa97e
             expected_changed=0,
             expected_unchanged=307,
             expected_errored=4,  # true mismatching brackets

diff --git a/tests/data/unformatted/117_whitespace_in_tokens.sql b/tests/data/unformatted/117_whitespace_in_tokens.sql
@@ -0,0 +1,29 @@
+/* a multiline
+comment
+*/
+select
+top
+25
+*
+from "my table"
+where
+id not
+in (
+    1, 2, 3
+)
+union
+all
+select
+distinct
+*
+from "your table"
+)))))__SQLFMT_OUTPUT__(((((
+/* a multiline
+comment
+*/
+select top 25 *
+from "my table"
+where id not in (1, 2, 3)
+union all
+select distinct *
+from "your table"
diff --git a/tests/unit_tests/test_node_manager.py b/tests/unit_tests/test_node_manager.py
@@ -317,3 +317,26 @@ def test_bracket_whitespace(default_mode: Mode, source_string: str) -> None:
     ).parse_query(source_string=source_string)
     parsed_string = "".join(str(line) for line in q.lines)
     assert source_string == parsed_string
+
+
+@pytest.mark.parametrize(
+    "source_string,expected_string",
+    [
+        ("union\n\nall", "union all\n"),
+        ("union\n all", "union all\n"),
+        ("union    all", "union all\n"),
+        ("select\ntop\n10", "select top 10\n"),
+        ("group    by", "group by\n"),
+        ("not\nin", "not in\n"),
+        ("not\n  similar  \n to", "not similar to\n"),
+        ("right\n  outer  \n join", "right outer join\n"),
+    ],
+)
+def test_internal_whitespace(
+    default_mode: Mode, source_string: str, expected_string: str
+) -> None:
+    q = default_mode.dialect.initialize_analyzer(
+        line_length=default_mode.line_length
+    ).parse_query(source_string=source_string)
+    parsed_string = "".join(str(line) for line in q.lines)
+    assert parsed_string == expected_string