Skip to content

Commit 77f1a0c

Browse files
authored
fix(ingest/profiling): compute sample row count correctly (#10319)
1 parent 4e2cec8 commit 77f1a0c

File tree

2 files changed

+25
-11
lines changed

2 files changed

+25
-11
lines changed

metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py

Lines changed: 24 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -689,9 +689,28 @@ def generate_dataset_profile( # noqa: C901 (complexity)
689689
logger.debug(f"profiling {self.dataset_name}: flushing stage 1 queries")
690690
self.query_combiner.flush()
691691

692+
assert profile.rowCount is not None
693+
full_row_count = profile.rowCount
694+
692695
if self.config.use_sampling and not self.config.limit:
693696
self.update_dataset_batch_use_sampling(profile)
694697

698+
# Note that this row count may be different from the full_row_count if we are using sampling.
699+
row_count: int = profile.rowCount
700+
if profile.partitionSpec and "SAMPLE" in profile.partitionSpec.partition:
701+
# Querying exact row count of sample using `_get_dataset_rows`.
702+
# We are not using `self.config.sample_size` directly as the actual row count
703+
# in the sample may be different than configured `sample_size`. For BigQuery,
704+
# we've even seen 160k rows returned for a sample size of 10k.
705+
logger.debug("Recomputing row count for the sample")
706+
707+
# Note that we can't just call `self._get_dataset_rows(profile)` here because
708+
# there's some sort of caching happening that will return the full table row count
709+
# instead of the sample row count.
710+
row_count = self.dataset.get_row_count(str(self.dataset._table))
711+
712+
profile.partitionSpec.partition += f" (sample rows {row_count})"
713+
695714
columns_profiling_queue: List[_SingleColumnSpec] = []
696715
if columns_to_profile:
697716
for column in all_columns:
@@ -708,16 +727,6 @@ def generate_dataset_profile( # noqa: C901 (complexity)
708727
logger.debug(f"profiling {self.dataset_name}: flushing stage 2 queries")
709728
self.query_combiner.flush()
710729

711-
assert profile.rowCount is not None
712-
row_count: int # used for null counts calculation
713-
if profile.partitionSpec and "SAMPLE" in profile.partitionSpec.partition:
714-
# Querying exact row count of sample using `_get_dataset_rows`.
715-
# We are not using `self.config.sample_size` directly as actual row count
716-
# in sample may be slightly different (more or less) than configured `sample_size`.
717-
self._get_dataset_rows(profile)
718-
719-
row_count = profile.rowCount
720-
721730
for column_spec in columns_profiling_queue:
722731
column = column_spec.column
723732
column_profile = column_spec.column_profile
@@ -825,6 +834,10 @@ def generate_dataset_profile( # noqa: C901 (complexity)
825834

826835
logger.debug(f"profiling {self.dataset_name}: flushing stage 3 queries")
827836
self.query_combiner.flush()
837+
838+
# Reset the row count to the original value.
839+
profile.rowCount = full_row_count
840+
828841
return profile
829842

830843
def init_profile(self):
@@ -1274,6 +1287,7 @@ def create_bigquery_temp_table(
12741287
try:
12751288
cursor: "BigQueryCursor" = cast("BigQueryCursor", raw_connection.cursor())
12761289
try:
1290+
logger.debug(f"Creating temporary table for {table_pretty_name}: {bq_sql}")
12771291
cursor.execute(bq_sql)
12781292
except Exception as e:
12791293
if not instance.config.catch_exceptions:

metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -159,7 +159,7 @@ def get_profile_request(
159159
rows_count=table.rows_count,
160160
):
161161
logger.debug(
162-
f"Dataset {dataset_name} was not eliagable for profiling due to last_altered, size in bytes or count of rows limit"
162+
f"Dataset {dataset_name} was not eligible for profiling due to last_altered, size in bytes or count of rows limit"
163163
)
164164
# Profile only table level if dataset is filtered from profiling
165165
# due to size limits alone

0 commit comments

Comments
 (0)