Skip to content

Commit 91ca9a2

Browse files
committed
Add support for database schema def in RedshiftSource
1 parent 8ef2053 commit 91ca9a2

4 files changed

Lines changed: 59 additions & 12 deletions

File tree

protos/feast/core/DataSource.proto

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,9 @@ message DataSource {
118118
// SQL query that returns a table containing feature data. Must contain an event_timestamp column, and respective
119119
// entity columns
120120
string query = 2;
121+
122+
// Redshift table schema name
123+
string schema = 3;
121124
}
122125

123126
// Defines configuration for custom third-party data sources.

sdk/python/feast/infra/offline_stores/redshift.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ class RedshiftOfflineStoreConfig(FeastConfigBaseModel):
3838
database: StrictStr
3939
""" Redshift database name """
4040

41+
temp_schema_name: StrictStr
42+
""" Redshift schema name to offload temporary tables """
43+
4144
s3_staging_location: StrictStr
4245
""" S3 path for importing & exporting data to Redshift """
4346

@@ -237,6 +240,7 @@ def to_arrow(self) -> pa.Table:
237240
self._config.offline_store.iam_role,
238241
query,
239242
self._drop_columns,
243+
self._config.offline_store.temp_schema_name,
240244
)
241245

242246
def to_s3(self) -> str:
@@ -254,13 +258,15 @@ def to_s3(self) -> str:
254258
)
255259
return self._s3_path
256260

257-
def to_redshift(self, table_name: str) -> None:
261+
def to_redshift(self, table_name: str, schema: Optional[str] = None) -> None:
258262
""" Save dataset as a new Redshift table """
259263
with self._query_generator() as query:
260-
query = f'CREATE TABLE "{table_name}" AS ({query});\n'
264+
schema_prefix = f'"{schema}".' if schema is not None else ''
265+
full_table_name = f'{schema_prefix}"{table_name}"'
266+
query = f'CREATE TABLE {full_table_name} AS ({query});\n'
261267
if self._drop_columns is not None:
262268
for column in self._drop_columns:
263-
query += f"ALTER TABLE {table_name} DROP COLUMN {column};\n"
269+
query += f"ALTER TABLE {full_table_name} DROP COLUMN {column};\n"
264270

265271
aws_utils.execute_redshift_statement(
266272
self._redshift_client,
@@ -291,20 +297,22 @@ def _upload_entity_df_and_get_entity_schema(
291297
config.offline_store.iam_role,
292298
table_name,
293299
entity_df,
300+
config.offline_store.temp_schema_name,
294301
)
295302
return dict(zip(entity_df.columns, entity_df.dtypes))
296303
elif isinstance(entity_df, str):
297304
# If the entity_df is a string (SQL query), create a Redshift table out of it,
298305
# get pandas dataframe consisting of 1 row (LIMIT 1) and generate the schema out of it
306+
full_table_name = f'{config.offline_store.temp_schema_name}.{table_name}'
299307
aws_utils.execute_redshift_statement(
300308
redshift_client,
301309
config.offline_store.cluster_id,
302310
config.offline_store.database,
303311
config.offline_store.user,
304-
f"CREATE TABLE {table_name} AS ({entity_df})",
312+
f"CREATE TABLE {full_table_name} AS ({entity_df})",
305313
)
306314
limited_entity_df = RedshiftRetrievalJob(
307-
f"SELECT * FROM {table_name} LIMIT 1", redshift_client, s3_resource, config
315+
f"SELECT * FROM {full_table_name} LIMIT 1", redshift_client, s3_resource, config
308316
).to_df()
309317
return dict(zip(limited_entity_df.columns, limited_entity_df.dtypes))
310318
else:

sdk/python/feast/infra/offline_stores/redshift_source.py

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ def __init__(
1313
self,
1414
event_timestamp_column: Optional[str] = "",
1515
table: Optional[str] = None,
16+
schema: Optional[str] = None,
1617
created_timestamp_column: Optional[str] = "",
1718
field_mapping: Optional[Dict[str, str]] = None,
1819
date_partition_column: Optional[str] = "",
@@ -25,7 +26,7 @@ def __init__(
2526
date_partition_column,
2627
)
2728

28-
self._redshift_options = RedshiftOptions(table=table, query=query)
29+
self._redshift_options = RedshiftOptions(table=table, schema=schema, query=query)
2930

3031
@staticmethod
3132
def from_proto(data_source: DataSourceProto):
@@ -95,7 +96,8 @@ def validate(self, config: RepoConfig):
9596
def get_table_query_string(self) -> str:
9697
"""Returns a string that can directly be used to reference this table in SQL"""
9798
if self.table:
98-
return f'"{self.table}"'
99+
schema_prefix = f'"{self.schema}".' if self.schema is not None else ''
100+
return f'{schema_prefix}"{self.table}"'
99101
else:
100102
return f"({self.query})"
101103

@@ -153,9 +155,19 @@ class RedshiftOptions:
153155
DataSource Redshift options used to source features from Redshift query
154156
"""
155157

156-
def __init__(self, table: Optional[str], query: Optional[str]):
158+
def __init__(self, table: Optional[str], query: Optional[str], schema: Optional[str]):
159+
"""Redshift options to encapsulate logic for parsing and working with 2 kinds of source creation
160+
table + schema or query
161+
162+
Args:
163+
table (Optional[str]): Redshift table to be looked for in redshift cluster to form datasource
164+
query (Optional[str]): Query to run to gather datasource
165+
schema (Optional[str]): Schema in the Redshift cluster in which to look up the table.
166+
Must be provided when tables with the same name exist in different schemas.
167+
"""
157168
self._table = table
158169
self._query = query
170+
self._schema = schema
159171

160172
@property
161173
def query(self):
@@ -185,6 +197,20 @@ def table(self, table_name):
185197
"""
186198
self._table = table_name
187199

200+
@property
201+
def schema(self):
202+
"""
203+
Returns the schema name of this Redshift table
204+
"""
205+
return self._schema
206+
207+
@schema.setter
208+
def schema(self, schema_name):
209+
"""
210+
Sets the schema name of this Redshift table
211+
"""
212+
self._schema = schema_name
213+
188214
@classmethod
189215
def from_proto(cls, redshift_options_proto: DataSourceProto.RedshiftOptions):
190216
"""
@@ -198,7 +224,9 @@ def from_proto(cls, redshift_options_proto: DataSourceProto.RedshiftOptions):
198224
"""
199225

200226
redshift_options = cls(
201-
table=redshift_options_proto.table, query=redshift_options_proto.query,
227+
table=redshift_options_proto.table,
228+
query=redshift_options_proto.query,
229+
schema=redshift_options_proto.schema
202230
)
203231

204232
return redshift_options
@@ -212,7 +240,7 @@ def to_proto(self) -> DataSourceProto.RedshiftOptions:
212240
"""
213241

214242
redshift_options_proto = DataSourceProto.RedshiftOptions(
215-
table=self.table, query=self.query,
243+
table=self.table, query=self.query, schema=self.schema
216244
)
217245

218246
return redshift_options_proto

sdk/python/feast/infra/utils/aws_utils.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ def upload_df_to_redshift(
146146
iam_role: str,
147147
table_name: str,
148148
df: pd.DataFrame,
149+
schema_name: Optional[str] = None,
149150
) -> None:
150151
"""Uploads a Pandas DataFrame to Redshift as a new table.
151152
@@ -204,9 +205,11 @@ def upload_df_to_redshift(
204205

205206
# Create the table with the desired schema and
206207
# copy the Parquet file contents to the Redshift table
208+
schema_prefix = f'{schema_name}.' if schema_name is not None else ''
209+
full_table_name = f'{schema_prefix}{table_name}'
207210
create_and_copy_query = (
208-
f"CREATE TABLE {table_name}({column_query_list}); "
209-
+ f"COPY {table_name} FROM '{s3_path}' IAM_ROLE '{iam_role}' FORMAT AS PARQUET"
211+
f"CREATE TABLE {full_table_name}({column_query_list}); "
212+
+ f"COPY {full_table_name} FROM '{s3_path}' IAM_ROLE '{iam_role}' FORMAT AS PARQUET"
210213
)
211214
execute_redshift_statement(
212215
redshift_data_client, cluster_id, database, user, create_and_copy_query
@@ -227,6 +230,7 @@ def temporarily_upload_df_to_redshift(
227230
iam_role: str,
228231
table_name: str,
229232
df: pd.DataFrame,
233+
schema_name: Optional[str] = None
230234
) -> Iterator[None]:
231235
"""Uploads a Pandas DataFrame to Redshift as a new table with cleanup logic.
232236
@@ -249,6 +253,7 @@ def temporarily_upload_df_to_redshift(
249253
iam_role,
250254
table_name,
251255
df,
256+
schema_name
252257
)
253258

254259
yield
@@ -325,6 +330,7 @@ def unload_redshift_query_to_pa(
325330
iam_role: str,
326331
query: str,
327332
drop_columns: Optional[List[str]] = None,
333+
temp_schema_name: Optional[str] = None,
328334
) -> pa.Table:
329335
""" Unload Redshift Query results to S3 and get the results in PyArrow Table format """
330336
bucket, key = get_bucket_and_key(s3_path)
@@ -356,6 +362,7 @@ def unload_redshift_query_to_df(
356362
iam_role: str,
357363
query: str,
358364
drop_columns: Optional[List[str]] = None,
365+
schema: Optional[str] = None,
359366
) -> pd.DataFrame:
360367
""" Unload Redshift Query results to S3 and get the results in Pandas DataFrame format """
361368
table = unload_redshift_query_to_pa(
@@ -368,5 +375,6 @@ def unload_redshift_query_to_df(
368375
iam_role,
369376
query,
370377
drop_columns,
378+
schema,
371379
)
372380
return table.to_pandas()

0 commit comments

Comments
 (0)