Skip to content

Commit 10dd74a

Browse files
refactor(bigquery): update code samples of load table file and uri (#10175)
* refactor(bigquery): update code samples of load table file and uri
* refactor(bigquery): add uri for load orc and avro data
* refactor(bigquery): fix lint and docs
* refactor(bigquery): update copyright to 2020
1 parent 1ebe370 commit 10dd74a

17 files changed

+505
-268
lines changed

bigquery/docs/snippets.py

Lines changed: 0 additions & 263 deletions
Original file line numberDiff line numberDiff line change
@@ -581,269 +581,6 @@ def test_manage_views(client, to_delete):
581581
# [END bigquery_grant_view_access]
582582

583583

584-
def test_load_table_from_file(client, to_delete):
    """Upload table data from a CSV file.

    Creates a throwaway dataset (registered with the ``to_delete`` fixture
    for cleanup), loads ``people.csv`` into a new table, and verifies the
    two expected rows arrived.
    """
    dataset_id = "load_table_from_file_dataset_{}".format(_millis())
    table_id = "load_table_from_file_table_{}".format(_millis())

    # Scratch dataset in the US; cleaned up by the to_delete fixture.
    dataset = bigquery.Dataset(client.dataset(dataset_id))
    dataset.location = "US"
    client.create_dataset(dataset)
    to_delete.append(dataset)

    # Path to the sample CSV shipped with the repository test data.
    here = os.path.abspath(os.path.dirname(__file__))
    filename = os.path.join(
        here, "..", "..", "bigquery", "tests", "data", "people.csv"
    )

    # [START bigquery_load_from_file]
    # from google.cloud import bigquery
    # client = bigquery.Client()
    # filename = '/path/to/file.csv'
    # dataset_id = 'my_dataset'
    # table_id = 'my_table'

    dataset_ref = client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_id)
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.skip_leading_rows = 1
    job_config.autodetect = True

    with open(filename, "rb") as source_file:
        job = client.load_table_from_file(source_file, table_ref, job_config=job_config)

    job.result()  # Waits for table load to complete.

    print("Loaded {} rows into {}:{}.".format(job.output_rows, dataset_id, table_id))
    # [END bigquery_load_from_file]

    table = client.get_table(table_ref)
    rows = list(client.list_rows(table))  # API request

    assert len(rows) == 2
    # Order is not preserved, so compare individually
    row1 = bigquery.Row(("Wylma Phlyntstone", 29), {"full_name": 0, "age": 1})
    assert row1 in rows
    row2 = bigquery.Row(("Phred Phlyntstone", 32), {"full_name": 0, "age": 1})
    assert row2 in rows
628-
629-
630-
def test_load_table_from_uri_avro(client, to_delete, capsys):
    """Load an Avro file from Cloud Storage and confirm the row count."""
    dataset_id = "load_table_from_uri_avro_{}".format(_millis())

    # Scratch dataset; the to_delete fixture tears it down after the test.
    dataset = bigquery.Dataset(client.dataset(dataset_id))
    client.create_dataset(dataset)
    to_delete.append(dataset)

    # [START bigquery_load_table_gcs_avro]
    # from google.cloud import bigquery
    # client = bigquery.Client()
    # dataset_id = 'my_dataset'

    dataset_ref = client.dataset(dataset_id)
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.AVRO
    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.avro"

    load_job = client.load_table_from_uri(
        uri, dataset_ref.table("us_states"), job_config=job_config
    )  # API request
    print("Starting job {}".format(load_job.job_id))

    load_job.result()  # Waits for table load to complete.
    print("Job finished.")

    destination_table = client.get_table(dataset_ref.table("us_states"))
    print("Loaded {} rows.".format(destination_table.num_rows))
    # [END bigquery_load_table_gcs_avro]

    # The sample data has one row per US state.
    out, _ = capsys.readouterr()
    assert "Loaded 50 rows." in out
660-
661-
662-
def test_load_table_from_uri_csv(client, to_delete, capsys):
    """Load a CSV file from Cloud Storage with an explicit schema."""
    dataset_id = "load_table_from_uri_csv_{}".format(_millis())

    # Scratch dataset; the to_delete fixture tears it down after the test.
    dataset = bigquery.Dataset(client.dataset(dataset_id))
    client.create_dataset(dataset)
    to_delete.append(dataset)

    # [START bigquery_load_table_gcs_csv]
    # from google.cloud import bigquery
    # client = bigquery.Client()
    # dataset_id = 'my_dataset'

    dataset_ref = client.dataset(dataset_id)
    job_config = bigquery.LoadJobConfig()
    job_config.schema = [
        bigquery.SchemaField("name", "STRING"),
        bigquery.SchemaField("post_abbr", "STRING"),
    ]
    job_config.skip_leading_rows = 1
    # The source format defaults to CSV, so the line below is optional.
    job_config.source_format = bigquery.SourceFormat.CSV
    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.csv"

    load_job = client.load_table_from_uri(
        uri, dataset_ref.table("us_states"), job_config=job_config
    )  # API request
    print("Starting job {}".format(load_job.job_id))

    load_job.result()  # Waits for table load to complete.
    print("Job finished.")

    destination_table = client.get_table(dataset_ref.table("us_states"))
    print("Loaded {} rows.".format(destination_table.num_rows))
    # [END bigquery_load_table_gcs_csv]

    # The sample data has one row per US state.
    out, _ = capsys.readouterr()
    assert "Loaded 50 rows." in out
698-
699-
700-
def test_load_table_from_uri_json(client, to_delete, capsys):
    """Load a newline-delimited JSON file from Cloud Storage."""
    dataset_id = "load_table_from_uri_json_{}".format(_millis())

    # Scratch dataset pinned to the US so it matches the load job location.
    dataset = bigquery.Dataset(client.dataset(dataset_id))
    dataset.location = "US"
    client.create_dataset(dataset)
    to_delete.append(dataset)

    # [START bigquery_load_table_gcs_json]
    # from google.cloud import bigquery
    # client = bigquery.Client()
    # dataset_id = 'my_dataset'

    dataset_ref = client.dataset(dataset_id)
    job_config = bigquery.LoadJobConfig()
    job_config.schema = [
        bigquery.SchemaField("name", "STRING"),
        bigquery.SchemaField("post_abbr", "STRING"),
    ]
    job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.json"

    load_job = client.load_table_from_uri(
        uri,
        dataset_ref.table("us_states"),
        location="US",  # Location must match that of the destination dataset.
        job_config=job_config,
    )  # API request
    print("Starting job {}".format(load_job.job_id))

    load_job.result()  # Waits for table load to complete.
    print("Job finished.")

    destination_table = client.get_table(dataset_ref.table("us_states"))
    print("Loaded {} rows.".format(destination_table.num_rows))
    # [END bigquery_load_table_gcs_json]

    # The sample data has one row per US state.
    out, _ = capsys.readouterr()
    assert "Loaded 50 rows." in out
738-
739-
740-
def test_load_table_from_uri_cmek(client, to_delete):
    """Load JSON from Cloud Storage into a table encrypted with a CMEK.

    Verifies the load job type, that the job completes, and that the
    destination table reports the requested Cloud KMS key.
    """
    dataset_id = "load_table_from_uri_cmek_{}".format(_millis())

    # Scratch dataset pinned to the US so it matches the load job location.
    dataset = bigquery.Dataset(client.dataset(dataset_id))
    dataset.location = "US"
    client.create_dataset(dataset)
    to_delete.append(dataset)

    # [START bigquery_load_table_gcs_json_cmek]
    # from google.cloud import bigquery
    # client = bigquery.Client()
    # dataset_id = 'my_dataset'

    dataset_ref = client.dataset(dataset_id)
    job_config = bigquery.LoadJobConfig()
    job_config.autodetect = True
    job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON

    # Set the encryption key to use for the destination.
    # TODO: Replace this key with a key you have created in KMS.
    kms_key_name = "projects/{}/locations/{}/keyRings/{}/cryptoKeys/{}".format(
        "cloud-samples-tests", "us", "test", "test"
    )
    encryption_config = bigquery.EncryptionConfiguration(kms_key_name=kms_key_name)
    job_config.destination_encryption_configuration = encryption_config
    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.json"

    load_job = client.load_table_from_uri(
        uri,
        dataset_ref.table("us_states"),
        location="US",  # Location must match that of the destination dataset.
        job_config=job_config,
    )  # API request

    assert load_job.job_type == "load"

    load_job.result()  # Waits for table load to complete.

    assert load_job.state == "DONE"
    table = client.get_table(dataset_ref.table("us_states"))
    assert table.encryption_configuration.kms_key_name == kms_key_name
    # [END bigquery_load_table_gcs_json_cmek]
781-
782-
783-
def test_load_table_from_uri_parquet(client, to_delete, capsys):
    """Load a Parquet file from Cloud Storage and confirm the row count."""
    dataset_id = "load_table_from_uri_parquet_{}".format(_millis())

    # Scratch dataset; the to_delete fixture tears it down after the test.
    dataset = bigquery.Dataset(client.dataset(dataset_id))
    client.create_dataset(dataset)
    to_delete.append(dataset)

    # [START bigquery_load_table_gcs_parquet]
    # from google.cloud import bigquery
    # client = bigquery.Client()
    # dataset_id = 'my_dataset'

    dataset_ref = client.dataset(dataset_id)
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.PARQUET
    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.parquet"

    load_job = client.load_table_from_uri(
        uri, dataset_ref.table("us_states"), job_config=job_config
    )  # API request
    print("Starting job {}".format(load_job.job_id))

    load_job.result()  # Waits for table load to complete.
    print("Job finished.")

    destination_table = client.get_table(dataset_ref.table("us_states"))
    print("Loaded {} rows.".format(destination_table.num_rows))
    # [END bigquery_load_table_gcs_parquet]

    # The sample data has one row per US state.
    out, _ = capsys.readouterr()
    assert "Loaded 50 rows." in out
813-
814-
815-
def test_load_table_from_uri_orc(client, to_delete, capsys):
    """Load an ORC file from Cloud Storage and confirm the row count."""
    dataset_id = "load_table_from_uri_orc_{}".format(_millis())

    # Scratch dataset; the to_delete fixture tears it down after the test.
    dataset = bigquery.Dataset(client.dataset(dataset_id))
    client.create_dataset(dataset)
    to_delete.append(dataset)

    # [START bigquery_load_table_gcs_orc]
    # from google.cloud import bigquery
    # client = bigquery.Client()
    # dataset_id = 'my_dataset'

    dataset_ref = client.dataset(dataset_id)
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.ORC
    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.orc"

    load_job = client.load_table_from_uri(
        uri, dataset_ref.table("us_states"), job_config=job_config
    )  # API request
    print("Starting job {}".format(load_job.job_id))

    load_job.result()  # Waits for table load to complete.
    print("Job finished.")

    destination_table = client.get_table(dataset_ref.table("us_states"))
    print("Loaded {} rows.".format(destination_table.num_rows))
    # [END bigquery_load_table_gcs_orc]

    # The sample data has one row per US state.
    out, _ = capsys.readouterr()
    assert "Loaded 50 rows." in out
845-
846-
847584
def test_load_table_from_uri_autodetect(client, to_delete, capsys):
848585
"""Load table from a GCS URI using various formats and auto-detected schema
849586
Each file format has its own tested load from URI sample. Because most of

bigquery/docs/usage/encryption.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ Change the key used to encrypt a table.
2727
Load a file from Cloud Storage, using a customer-managed encryption key from
2828
Cloud KMS for the destination table.
2929

30-
.. literalinclude:: ../snippets.py
30+
.. literalinclude:: ../samples/load_table_uri_cmek.py
3131
:language: python
3232
:dedent: 4
3333
:start-after: [START bigquery_load_table_gcs_json_cmek]

bigquery/docs/usage/tables.rst

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ Create an integer range partitioned table with the
7070
Load table data from a file with the
7171
:func:`~google.cloud.bigquery.client.Client.load_table_from_file` method:
7272

73-
.. literalinclude:: ../snippets.py
73+
.. literalinclude:: ../samples/load_table_file.py
7474
:language: python
7575
:dedent: 4
7676
:start-after: [START bigquery_load_from_file]
@@ -79,7 +79,7 @@ Load table data from a file with the
7979
Load a CSV file from Cloud Storage with the
8080
:func:`~google.cloud.bigquery.client.Client.load_table_from_uri` method:
8181

82-
.. literalinclude:: ../snippets.py
82+
.. literalinclude:: ../samples/load_table_uri_csv.py
8383
:language: python
8484
:dedent: 4
8585
:start-after: [START bigquery_load_table_gcs_csv]
@@ -90,7 +90,7 @@ See also: `Loading CSV data from Cloud Storage
9090

9191
Load a JSON file from Cloud Storage:
9292

93-
.. literalinclude:: ../snippets.py
93+
.. literalinclude:: ../samples/load_table_uri_json.py
9494
:language: python
9595
:dedent: 4
9696
:start-after: [START bigquery_load_table_gcs_json]
@@ -101,7 +101,7 @@ See also: `Loading JSON data from Cloud Storage
101101

102102
Load a Parquet file from Cloud Storage:
103103

104-
.. literalinclude:: ../snippets.py
104+
.. literalinclude:: ../samples/load_table_uri_parquet.py
105105
:language: python
106106
:dedent: 4
107107
:start-after: [START bigquery_load_table_gcs_parquet]
@@ -110,6 +110,28 @@ Load a Parquet file from Cloud Storage:
110110
See also: `Loading Parquet data from Cloud Storage
111111
<https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-parquet>`_.
112112

113+
Load an Avro file from Cloud Storage:
114+
115+
.. literalinclude:: ../samples/load_table_uri_avro.py
116+
:language: python
117+
:dedent: 4
118+
:start-after: [START bigquery_load_table_gcs_avro]
119+
:end-before: [END bigquery_load_table_gcs_avro]
120+
121+
See also: `Loading Avro data from Cloud Storage
122+
<https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-avro>`_.
123+
124+
Load an ORC file from Cloud Storage:
125+
126+
.. literalinclude:: ../samples/load_table_uri_orc.py
127+
:language: python
128+
:dedent: 4
129+
:start-after: [START bigquery_load_table_gcs_orc]
130+
:end-before: [END bigquery_load_table_gcs_orc]
131+
132+
See also: `Loading ORC data from Cloud Storage
133+
<https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-orc>`_.
134+
113135
Updating a Table
114136
^^^^^^^^^^^^^^^^
115137

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
def load_table_file(file_path, table_id):
    """Load a local CSV file into a BigQuery table.

    Args:
        file_path: Path to the local CSV file to upload.
        table_id: Fully-qualified destination table ID
            (``project.dataset.table``); created or appended to by the job.

    Returns:
        The destination table, fetched after the load job completes.
    """
    # [START bigquery_load_from_file]
    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()

    # TODO(developer): Set table_id to the ID of the table to create.
    # table_id = "your-project.your_dataset.your_table_name"

    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV, skip_leading_rows=1, autodetect=True,
    )

    with open(file_path, "rb") as source_file:
        job = client.load_table_from_file(source_file, table_id, job_config=job_config)

    job.result()  # Waits for the job to complete.

    table = client.get_table(table_id)  # Make an API request.
    print(
        "Loaded {} rows and {} columns to {}".format(
            table.num_rows, len(table.schema), table_id
        )
    )
    # [END bigquery_load_from_file]
    return table

0 commit comments

Comments (0)