Skip to content

Commit 59181e4

Browse files
authored
fix(ingest): resolve missing numeric types for profiling (#11991)
1 parent ce6474d commit 59181e4

File tree

2 files changed

+37
-3
lines changed

2 files changed

+37
-3
lines changed

metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,11 @@
5757
convert_to_cardinality,
5858
)
5959
from datahub.ingestion.source.sql.sql_report import SQLSourceReport
60-
from datahub.metadata.com.linkedin.pegasus2avro.schema import EditableSchemaMetadata
60+
from datahub.ingestion.source.sql.sql_types import resolve_sql_type
61+
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
62+
EditableSchemaMetadata,
63+
NumberType,
64+
)
6165
from datahub.metadata.schema_classes import (
6266
DatasetFieldProfileClass,
6367
DatasetProfileClass,
@@ -361,6 +365,8 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
361365
platform: str
362366
env: str
363367

368+
column_types: Dict[str, str] = dataclasses.field(default_factory=dict)
369+
364370
def _get_columns_to_profile(self) -> List[str]:
365371
if not self.config.any_field_level_metrics_enabled():
366372
return []
@@ -374,6 +380,7 @@ def _get_columns_to_profile(self) -> List[str]:
374380

375381
for col_dict in self.dataset.columns:
376382
col = col_dict["name"]
383+
self.column_types[col] = str(col_dict["type"])
377384
# We expect the allow/deny patterns to specify '<table_pattern>.<column_pattern>'
378385
if not self.config._allow_deny_patterns.allowed(
379386
f"{self.dataset_name}.{col}"
@@ -430,6 +437,21 @@ def _get_column_type(self, column_spec: _SingleColumnSpec, column: str) -> None:
430437
self.dataset, column
431438
)
432439

440+
if column_spec.type_ == ProfilerDataType.UNKNOWN:
441+
try:
442+
datahub_field_type = resolve_sql_type(
443+
self.column_types[column], self.dataset.engine.dialect.name.lower()
444+
)
445+
except Exception as e:
446+
logger.debug(
447+
f"Error resolving sql type {self.column_types[column]}: {e}"
448+
)
449+
datahub_field_type = None
450+
if datahub_field_type is None:
451+
return
452+
if isinstance(datahub_field_type, NumberType):
453+
column_spec.type_ = ProfilerDataType.NUMERIC
454+
433455
@_run_with_query_combiner
434456
def _get_column_cardinality(
435457
self, column_spec: _SingleColumnSpec, column: str

metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,6 @@ def resolve_vertica_modified_type(type_string: str) -> Any:
276276
return VERTICA_SQL_TYPES_MAP[type_string]
277277

278278

279-
# see https://docs.snowflake.com/en/sql-reference/intro-summary-data-types.html
280279
SNOWFLAKE_TYPES_MAP: Dict[str, Any] = {
281280
"NUMBER": NumberType,
282281
"DECIMAL": NumberType,
@@ -312,6 +311,18 @@ def resolve_vertica_modified_type(type_string: str) -> Any:
312311
"GEOGRAPHY": None,
313312
}
314313

314+
315+
def resolve_snowflake_modified_type(type_string: str) -> Any:
    """Resolve a Snowflake type string, possibly parameterized, to a DataHub type class.

    Strips precision/scale parameters (e.g. ``DECIMAL(38,0)``, ``DECIMAL(38, 0)``,
    ``VARCHAR(16777216)``) before looking the base name up in ``SNOWFLAKE_TYPES_MAP``.

    :param type_string: Snowflake column type as reported by the source
        (expected to already be uppercased by the caller).
    :return: the mapped DataHub type class, or ``None`` if the type is unknown.
    """
    # Match a base type followed by one numeric argument and an optional second
    # (scale) argument. NOTE: the space after the comma is optional — requiring
    # exactly one whitespace char (the old r"\(\d+,\s\d+\)") made the common
    # no-space form 'DECIMAL(38,0)' fall through to the fallback and resolve
    # to None.
    match = re.match(r"([a-zA-Z_]+)\(\d+(?:,\s*\d+)?\)", type_string)
    if match:
        modified_type_base = match.group(1)  # Extract the base type name
        return SNOWFLAKE_TYPES_MAP.get(modified_type_base, None)

    # Fallback for types without precision/scale parameters.
    return SNOWFLAKE_TYPES_MAP.get(type_string, None)
324+
325+
315326
# see https://github.com/googleapis/python-bigquery-sqlalchemy/blob/main/sqlalchemy_bigquery/_types.py#L32
316327
BIGQUERY_TYPES_MAP: Dict[str, Any] = {
317328
"STRING": StringType,
@@ -380,6 +391,7 @@ def resolve_vertica_modified_type(type_string: str) -> Any:
380391
"row": RecordType,
381392
"map": MapType,
382393
"array": ArrayType,
394+
"json": RecordType,
383395
}
384396

385397
# https://docs.aws.amazon.com/athena/latest/ug/data-types.html
@@ -490,7 +502,7 @@ def resolve_sql_type(
490502
TypeClass = resolve_vertica_modified_type(column_type)
491503
elif platform == "snowflake":
492504
# Snowflake types are uppercase, so we check that.
493-
TypeClass = _merged_mapping.get(column_type.upper())
505+
TypeClass = resolve_snowflake_modified_type(column_type.upper())
494506

495507
if TypeClass:
496508
return TypeClass()

0 commit comments

Comments
 (0)