Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Do not truncate results when SHOW, DESC or EXPLAIN is executed #68

Merged
merged 3 commits into from
Dec 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

- Fix execution of SHOW, EXPLAIN and DESCRIBE commands.
- Speed up results display.
- Do not truncate results when SHOW, EXPLAIN or DESCRIBE command is executed.

## [0.12.0] - 2022-11-30

Expand Down
24 changes: 18 additions & 6 deletions streaming_jupyter_integrations/magics.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import subprocess
import sys
from functools import wraps
from typing import Any, Callable, Dict, Iterable, Tuple, Union, cast
from typing import Any, Callable, Dict, Iterable, Optional, Tuple, Union, cast

import nest_asyncio
import pandas as pd
Expand Down Expand Up @@ -36,7 +36,8 @@
from .jar_handler import JarHandler
from .reflection import get_method_names_for
from .sql_syntax_highlighting import SQLSyntaxHighlighting
from .sql_utils import inline_sql_in_cell, is_dml, is_dql, is_query
from .sql_utils import (inline_sql_in_cell, is_dml, is_dql, is_metadata_query,
is_query)
from .variable_substitution import CellContentFormatter
from .yarn import find_session_jm_address

Expand Down Expand Up @@ -341,10 +342,18 @@ async def __internal_execute_sql(self, stmt: str, display_row_kind: bool) -> Non
else:
execution_result = self.st_env.execute_sql(stmt)
print("Job started")
await self.__pull_results(execution_result, display_row_kind, is_dql(stmt))
# pandas truncates the displayed view when the number of rows exceeds the configured limit; the same applies to column width.
# If the query shows metadata, e.g. a list of tables or a list of columns, then no limit is applied.
pd_display_options = {
"display.max_rows": None if is_metadata_query(stmt) else 100,
"display.max_colwidth": None if is_metadata_query(stmt) else 100,
}
await self.__pull_results(execution_result, display_row_kind, is_dql(stmt), pd_display_options)

async def __pull_results(self, execution_result: TableResult, display_row_kind: bool,
display_results: bool) -> None:
display_results: bool, pd_display_options: Optional[Dict[str, Any]] = None) -> None:
if not pd_display_options:
pd_display_options = {}
# active polling
while not self.interrupted:
try:
Expand All @@ -355,7 +364,7 @@ async def __pull_results(self, execution_result: TableResult, display_row_kind:
# if a select query has been executing then `wait` returns as soon as the first
# row is available, so start displaying the results right away
print("Pulling query results...")
await self.display_execution_result(execution_result, display_row_kind)
await self.display_execution_result(execution_result, display_row_kind, pd_display_options)
return
else:
# if finished then return early even if the user interrupts after this
Expand Down Expand Up @@ -384,7 +393,8 @@ async def __pull_results(self, execution_result: TableResult, display_row_kind:
# usual happy path
print("Execution successful")

async def display_execution_result(self, execution_result: TableResult, display_row_kind: bool) -> pd.DataFrame:
async def display_execution_result(self, execution_result: TableResult, display_row_kind: bool,
pd_display_options: Dict[str, Any]) -> pd.DataFrame:
"""
Displays the execution result and returns a dataframe containing all the results.
Display is done in a stream-like fashion displaying the results as they come.
Expand All @@ -393,6 +403,8 @@ async def display_execution_result(self, execution_result: TableResult, display_
columns = execution_result.get_table_schema().get_field_names()
if display_row_kind:
columns = ["row_kind"] + columns
for key, value in pd_display_options.items():
pd.set_option(key, value)
df = pd.DataFrame(columns=columns)
result_kind = execution_result.get_result_kind()

Expand Down
12 changes: 10 additions & 2 deletions streaming_jupyter_integrations/sql_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,18 @@
'SELECT'
}

DQL_KEYWORDS = {
*QUERY_KEYWORDS,
# Leading keywords of statements that show metadata (e.g. a list of tables
# or columns) rather than query results.
METADATA_KEYWORDS = {
    'DESCRIBE',
    'DESC',
    'EXPLAIN',
    'SHOW',
}

# Every keyword from QUERY_KEYWORDS plus every metadata keyword.
DQL_KEYWORDS = {
    *QUERY_KEYWORDS,
    *METADATA_KEYWORDS,
}

DML_KEYWORDS = {
"INSERT",
"EXECUTE"
Expand Down Expand Up @@ -54,6 +58,10 @@ def is_dql(sql: str) -> bool:
return __first_token_is_keyword(sql, DQL_KEYWORDS)


def is_metadata_query(sql: str) -> bool:
    """Tell whether ``sql`` is a metadata statement.

    The decision is delegated to ``__first_token_is_keyword``, called with
    ``METADATA_KEYWORDS`` (DESCRIBE, DESC, EXPLAIN, SHOW).

    :param sql: the SQL statement text to classify.
    :return: whatever ``__first_token_is_keyword`` reports for ``sql``
        against the metadata keyword set.
    """
    starts_with_metadata_keyword = __first_token_is_keyword(sql, METADATA_KEYWORDS)
    return starts_with_metadata_keyword


def __first_token_is_keyword(sql: str, keywords: Iterable[str]) -> bool:
if not sql or not sql.strip():
return False
Expand Down
Loading