@@ -689,9 +689,28 @@ def generate_dataset_profile(  # noqa: C901 (complexity)
         logger.debug(f"profiling {self.dataset_name}: flushing stage 1 queries")
         self.query_combiner.flush()
 
+        assert profile.rowCount is not None
+        full_row_count = profile.rowCount
+
         if self.config.use_sampling and not self.config.limit:
             self.update_dataset_batch_use_sampling(profile)
 
+        # Note that this row count may be different from the full_row_count if we are using sampling.
+        row_count: int = profile.rowCount
+        if profile.partitionSpec and "SAMPLE" in profile.partitionSpec.partition:
+            # Query the exact row count of the sample. We do not use
+            # `self.config.sample_size` directly because the actual row count in
+            # the sample may differ from the configured `sample_size`. For BigQuery,
+            # we've even seen 160k rows returned for a sample size of 10k.
+            logger.debug("Recomputing row count for the sample")
+
+            # Note that we can't just call `self._get_dataset_rows(profile)` here because
+            # there's some sort of caching happening that will return the full table row count
+            # instead of the sample row count.
+            row_count = self.dataset.get_row_count(str(self.dataset._table))
+
+            profile.partitionSpec.partition += f" (sample rows {row_count})"
+
         columns_profiling_queue: List[_SingleColumnSpec] = []
         if columns_to_profile:
             for column in all_columns:
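
The recomputation above is needed because block-based sampling (for example BigQuery's `TABLESAMPLE SYSTEM`) returns an approximate number of rows, so the configured `sample_size` cannot be used for later per-row math. Below is a minimal, self-contained sketch of the idea using SQLAlchemy; the helper names, table, and connection URL are illustrative assumptions rather than the profiler's actual API.

# Illustrative sketch (assumed helper names, table, and connection URL): measure
# the sample instead of trusting the configured sample_size, because block-based
# sampling returns an approximate number of rows.
import sqlalchemy as sa


def make_sample_query(table: str, percent: int) -> str:
    # TABLESAMPLE SYSTEM picks whole storage blocks, so the number of rows it
    # returns only approximates `percent` of the table.
    return f"SELECT * FROM {table} TABLESAMPLE SYSTEM ({percent} PERCENT)"


def count_rows(engine: sa.engine.Engine, select_sql: str) -> int:
    # Wrap the sampling query in COUNT(*) to get the exact sampled row count.
    query = f"SELECT COUNT(*) FROM ({select_sql}) AS sample_rows"
    with engine.connect() as conn:
        return conn.execute(sa.text(query)).scalar_one()


if __name__ == "__main__":
    # Hypothetical connection and table, included only to make the sketch runnable.
    engine = sa.create_engine("bigquery://my-project")
    sample_sql = make_sample_query("`my-project.my_dataset.events`", 10)
    print("actual sample rows:", count_rows(engine, sample_sql))

This roughly mirrors what the `self.dataset.get_row_count(str(self.dataset._table))` call in the patch achieves: count the sampled table directly instead of reusing a cached full-table figure.
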
@@ -708,16 +727,6 @@ def generate_dataset_profile(  # noqa: C901 (complexity)
         logger.debug(f"profiling {self.dataset_name}: flushing stage 2 queries")
         self.query_combiner.flush()
 
-        assert profile.rowCount is not None
-        row_count: int  # used for null counts calculation
-        if profile.partitionSpec and "SAMPLE" in profile.partitionSpec.partition:
-            # Querying exact row count of sample using `_get_dataset_rows`.
-            # We are not using `self.config.sample_size` directly as actual row count
-            # in sample may be slightly different (more or less) than configured `sample_size`.
-            self._get_dataset_rows(profile)
-
-        row_count = profile.rowCount
-
         for column_spec in columns_profiling_queue:
             column = column_spec.column
             column_profile = column_spec.column_profile
@@ -825,6 +834,10 @@ def generate_dataset_profile(  # noqa: C901 (complexity)
 
         logger.debug(f"profiling {self.dataset_name}: flushing stage 3 queries")
         self.query_combiner.flush()
+
+        # Reset the row count to the original value.
+        profile.rowCount = full_row_count
+
         return profile
 
     def init_profile(self):
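
The local `row_count` computed earlier feeds the per-column null-count math, while `profile.rowCount` is restored to `full_row_count` here so the finished profile still reports the full table size. A back-of-the-envelope illustration, with made-up numbers rather than values from the patch, of why the two counts must not be conflated:

# Made-up numbers showing why per-column ratios must be computed against the
# sample row count even though the finished profile reports the full count.
full_row_count = 10_000_000   # reported on the profile once profiling finishes
sample_row_count = 160_000    # rows the sample actually contained
null_count_in_sample = 4_000  # measured by the column-level queries

# Dividing by the full row count instead would understate the fraction ~62x.
null_fraction = null_count_in_sample / sample_row_count  # 0.025
estimated_nulls = round(null_fraction * full_row_count)  # 250,000

print(f"null fraction ~{null_fraction:.3f}, "
      f"~{estimated_nulls:,} nulls estimated across the full table")
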
@@ -1274,6 +1287,7 @@ def create_bigquery_temp_table(
     try:
         cursor: "BigQueryCursor" = cast("BigQueryCursor", raw_connection.cursor())
         try:
+            logger.debug(f"Creating temporary table for {table_pretty_name}: {bq_sql}")
             cursor.execute(bq_sql)
         except Exception as e:
             if not instance.config.catch_exceptions: