import os

from databricks import sql

"""
Databricks experimentally supports data ingestion of local files via a cloud staging location.
Ingestion commands will work on DBR >12. And you must include an uploads_base_path kwarg when

To run this script:

1. Set the INGESTION_USER constant to the account email address of the authenticated user
2. Set the FILEPATH constant to the path of a file that will be uploaded (this example assumes it's a CSV file)
3. Run this file
"""
3333
34- INGESTION_USER = "user.name @example.com"
34+ INGESTION_USER = "some.user @example.com"
3535FILEPATH = "example.csv"
3636
37+ # FILEPATH can be relative to the current directory.
38+ # Resolve it into an absolute path
3739_complete_path = os .path .realpath (FILEPATH )
38- uploads_base_path = os .path .split (_complete_path )[:- 1 ]
3940
41+ if not os .path .exists (_complete_path ):
42+
43+ # It's easiest to save a file in the same directory as this script. But any path to a file will work.
44+ raise Exception (
45+ "You need to set FILEPATH in this script to a file that actually exists."
46+ )
47+
48+ # Set uploads_base_path equal to the directory that contains FILEPATH
49+ uploads_base_path = os .path .split (_complete_path )[0 ]
50+
51+ with sql .connect (
52+ server_hostname = os .getenv ("DATABRICKS_SERVER_HOSTNAME" ),
53+ http_path = os .getenv ("DATABRICKS_HTTP_PATH" ),
54+ access_token = os .getenv ("DATABRICKS_TOKEN" ),
55+ uploads_base_path = uploads_base_path ,
56+ ) as connection :
57+
58+ with connection .cursor () as cursor :
59+
60+ # Ingestion commands are executed like any other SQL.
61+ # Here's a sample PUT query. You can remove OVERWRITE at the end to avoid silently overwriting data.
62+ query = f"PUT '{ _complete_path } ' INTO 'stage://tmp/{ INGESTION_USER } /pysql_examples/demo.csv' OVERWRITE"
63+
64+ print (f"Uploading { FILEPATH } to staging location" )
65+ cursor .execute (query )
66+ print ("Upload was successful" )
67+
68+ temp_fp = os .path .realpath ("temp.csv" )
69+
70+ # Here's a sample GET query. Note that `temp_fp` must also be contained within, or descended from,
71+ # the uploads_base_path.
72+ query = (
73+ f"GET 'stage://tmp/{ INGESTION_USER } /pysql_examples/demo.csv' TO '{ temp_fp } '"
74+ )
75+
76+ print (f"Fetching from staging location into new file called temp.csv" )
77+ cursor .execute (query )
78+ print ("Download was successful" )
4079
41- with sql .connect (server_hostname = os .getenv ("DATABRICKS_SERVER_HOSTNAME" ),
42- http_path = os .getenv ("DATABRICKS_HTTP_PATH" ),
43- access_token = os .getenv ("DATABRICKS_TOKEN" ),
44- uploads_base_path = uploads_base_path ) as connection :
80+ # Here's a sample REMOVE query. It cleans up the the demo.csv created in our first query
81+ query = f"REMOVE 'stage://tmp/{ INGESTION_USER } /pysql_examples/demo.csv'"
4582
46- with connection . cursor () as cursor :
47- query = f"PUT ' { _complete_path } ' INTO 'stage://tmp/ { INGESTION_USER } /pysql_examples/demo.csv' OVERWRITE"
48- cursor . execute ( query )
83+ print ( "Removing demo.csv from staging location" )
84+ cursor . execute ( query )
85+ print ( "Remove was successful" )
0 commit comments