Skip to content

Commit 54af80f

Browse files
committed
Improvements discussed in PR conversation
1 parent 7bac092 commit 54af80f

File tree

4 files changed

+116
-11
lines changed

4 files changed

+116
-11
lines changed

docs/source/changelog.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
Changelog
22
=========
33

4+
0.1.5 / 2017-04-20
5+
------------------
6+
- When using ``to_gbq`` if ``if_exists`` is set to ``append``, dataframe needs to contain only a subset of the fields in the BigQuery schema. GH#24
7+
48
0.1.4 / 2017-03-17
59
------------------
610

docs/source/writing.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ a ``TableCreationError`` if the destination table already exists.
4040

4141
If the ``if_exists`` argument is set to ``'append'``, the destination dataframe will
4242
be written to the table using the defined table schema and column types. The
43-
dataframe must match the destination table in structure and data types.
43+
dataframe must contain fields (matching name and type) currently in the destination.
4444
If the ``if_exists`` argument is set to ``'replace'``, and the existing table has a
4545
different schema, a delay of 2 minutes will be forced to ensure that the new schema
4646
has propagated in the Google environment. See

pandas_gbq/gbq.py

Lines changed: 47 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -559,7 +559,17 @@ def load_data(self, dataframe, dataset_id, table_id, chunksize):
559559
self._print("\n")
560560

561561
def schema(self, dataset_id, table_id):
562-
"""Retrieve the schema of the table"""
562+
"""Retrieve the schema of the table
563+
564+
Obtain from BigQuery the field names and field types
565+
for the table defined by the parameters
566+
567+
:param str dataset_id: Name of the BigQuery dataset for the table
568+
:param str table_id: Name of the BigQuery table
569+
570+
:return: Fields representing the schema
571+
:rtype: list of dicts
572+
"""
563573

564574
try:
565575
from googleapiclient.errors import HttpError
@@ -581,20 +591,47 @@ def schema(self, dataset_id, table_id):
581591
self.process_http_error(ex)
582592

583593
def verify_schema(self, dataset_id, table_id, schema):
584-
fields_remote = set([json.dumps(field)
585-
for field in self.schema(dataset_id, table_id)])
586-
fields_local = set(json.dumps(field_local)
587-
for field_local in schema['fields'])
594+
"""Indicate whether schemas match exactly
595+
596+
Compare the BigQuery table identified in the parameters with
597+
the schema passed in and indicate whether the two contain the same
598+
fields. Order is not considered.
599+
600+
:param str dataset_id: Name of the BigQuery dataset for the table
601+
:param str table_id: Name of the BigQuery table
602+
:param list(dict) schema: Schema for comparison. Each item should have
603+
a 'name' and a 'type'
604+
605+
:return: Whether the schemas match
606+
:rtype: bool
607+
"""
608+
609+
fields_remote = sorted(self.schema(dataset_id, table_id),
610+
key=lambda x: x['name'])
611+
fields_local = sorted(schema['fields'], key=lambda x: x['name'])
588612

589613
return fields_remote == fields_local
590614

591615
def schema_is_subset(self, dataset_id, table_id, schema):
592-
fields_remote = set([json.dumps(field)
593-
for field in self.schema(dataset_id, table_id)])
594-
fields_local = set(json.dumps(field_local)
595-
for field_local in schema['fields'])
616+
"""Indicate whether the schema to be uploaded is a subset
617+
618+
Compare the BigQuery table identified in the parameters with
619+
the schema passed in and indicate whether all fields in the
620+
latter are present in the former. Order is not considered.
621+
622+
:param str dataset_id: Name of the BigQuery dataset for the table
623+
:param str table_id: Name of the BigQuery table
624+
:param list(dict) schema: Schema for comparison. Each item should have
625+
a 'name' and a 'type'
626+
627+
:return: Whether the passed schema is a subset
628+
:rtype: bool
629+
"""
630+
631+
fields_remote = self.schema(dataset_id, table_id)
632+
fields_local = schema['fields']
596633

597-
return fields_remote >= fields_local
634+
return all(field in fields_remote for field in fields_local)
598635

599636
def delete_and_recreate_table(self, dataset_id, table_id, table_schema):
600637
delay = 0

pandas_gbq/tests/test_gbq.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1083,6 +1083,7 @@ def test_upload_data_if_table_exists_append(self):
10831083
private_key=_get_private_key_path())
10841084

10851085
def test_upload_subset_columns_if_table_exists_append(self):
1086+
# For pull request #24
10861087
test_id = "16"
10871088
test_size = 10
10881089
df = make_mixed_dataframe_v2(test_size)
@@ -1301,6 +1302,7 @@ def test_verify_schema_ignores_field_mode(self):
13011302
'Expected schema to match')
13021303

13031304
def test_retrieve_schema(self):
1305+
# For pull request #24
13041306
test_id = "15"
13051307
test_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'},
13061308
{'name': 'B', 'type': 'FLOAT'},
@@ -1312,6 +1314,68 @@ def test_retrieve_schema(self):
13121314
expected = test_schema['fields']
13131315
assert expected == actual, 'Expected schema used to create table'
13141316

1317+
def test_verify_schema_fails_different_structure(self):
1318+
test_id = "12"
1319+
test_schema_1 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
1320+
{'name': 'B', 'type': 'FLOAT'},
1321+
{'name': 'C', 'type': 'STRING'},
1322+
{'name': 'D', 'type': 'TIMESTAMP'}]}
1323+
test_schema_2 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
1324+
{'name': 'B2', 'type': 'FLOAT'},
1325+
{'name': 'C', 'type': 'STRING'},
1326+
{'name': 'D', 'type': 'TIMESTAMP'}]}
1327+
1328+
self.table.create(TABLE_ID + test_id, test_schema_1)
1329+
self.assertFalse(self.sut.verify_schema(
1330+
self.dataset_prefix + "1", TABLE_ID + test_id, test_schema_2),
1331+
'Expected different schema')
1332+
1333+
def test_schema_is_subset_passes_if_subset(self):
1334+
# For pull request #24
1335+
test_id = '16'
1336+
1337+
table_name = TABLE_ID + test_id
1338+
dataset = self.dataset_prefix + '1'
1339+
1340+
table_schema = {'fields': [{'name': 'A',
1341+
'type': 'FLOAT'},
1342+
{'name': 'B',
1343+
'type': 'FLOAT'},
1344+
{'name': 'C',
1345+
'type': 'STRING'}]}
1346+
tested_schema = {'fields': [{'name': 'A',
1347+
'type': 'FLOAT'},
1348+
{'name': 'B',
1349+
'type': 'FLOAT'}]}
1350+
1351+
self.table.create(table_name, table_schema)
1352+
1353+
assert self.sut.schema_is_subset(
1354+
dataset, table_name, tested_schema) is True
1355+
1356+
def test_schema_is_subset_fails_if_not_subset(self):
1357+
# For pull request #24
1358+
test_id = '17'
1359+
1360+
table_name = TABLE_ID + test_id
1361+
dataset = self.dataset_prefix + '1'
1362+
1363+
table_schema = {'fields': [{'name': 'A',
1364+
'type': 'FLOAT'},
1365+
{'name': 'B',
1366+
'type': 'FLOAT'},
1367+
{'name': 'C',
1368+
'type': 'STRING'}]}
1369+
tested_schema = {'fields': [{'name': 'A',
1370+
'type': 'FLOAT'},
1371+
{'name': 'C',
1372+
'type': 'FLOAT'}]}
1373+
1374+
self.table.create(table_name, table_schema)
1375+
1376+
assert self.sut.schema_is_subset(
1377+
dataset, table_name, tested_schema) is False
1378+
13151379
def test_list_dataset(self):
13161380
dataset_id = self.dataset_prefix + "1"
13171381
self.assertTrue(dataset_id in self.dataset.datasets(),

0 commit comments

Comments
 (0)