From a62b19039cc48bab34d67d6393db4e6384a81ffd Mon Sep 17 00:00:00 2001
From: John Paton
Date: Tue, 12 Mar 2019 17:52:08 +0100
Subject: [PATCH 1/6] ENH: Allow partial table schema in to_gbq

---
 pandas_gbq/gbq.py         | 11 +++++++++--
 pandas_gbq/schema.py      | 30 ++++++++++++++++++++++++++++++
 tests/unit/test_schema.py | 37 +++++++++++++++++++++++++++++++++++++
 3 files changed, 76 insertions(+), 2 deletions(-)

diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index b59c3f94..1b803880 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -1023,10 +1023,11 @@ def to_gbq(
         credentials=connector.credentials,
     )
 
+    default_schema = _generate_bq_schema(dataframe)
     if not table_schema:
-        table_schema = _generate_bq_schema(dataframe)
+        table_schema = default_schema
     else:
-        table_schema = dict(fields=table_schema)
+        table_schema = _update_bq_schema(default_schema, dict(fields=table_schema))
 
     # If table exists, check if_exists parameter
     if table.exists(table_id):
@@ -1091,6 +1092,12 @@ def _generate_bq_schema(df, default_type="STRING"):
     return schema.generate_bq_schema(df, default_type=default_type)
 
 
+def _update_bq_schema(schema_old, schema_new):
+    from pandas_gbq import schema
+
+    return schema.update_schema(schema_old, schema_new)
+
+
 class _Table(GbqConnector):
     def __init__(
         self,
diff --git a/pandas_gbq/schema.py b/pandas_gbq/schema.py
index 3ca03025..e4e18b02 100644
--- a/pandas_gbq/schema.py
+++ b/pandas_gbq/schema.py
@@ -31,3 +31,33 @@ def generate_bq_schema(dataframe, default_type="STRING"):
         )
 
     return {"fields": fields}
+
+
+def update_schema(schema_old, schema_new):
+    """
+    Given an old BigQuery schema, update it with a new one.
+
+    Where a field name is the same, the new will replace the old. Any
+    new fields not present in the old schema will be added.
+
+    Arguments:
+        schema_old: the old schema to update
+        schema_new: the new schema which will overwrite/extend the old
+    """
+    old_fields = schema_old["fields"]
+    new_fields = schema_new["fields"]
+    output_fields = old_fields.copy()
+
+    field_indices = {field["name"]: i for i, field in enumerate(output_fields)}
+
+    for field in new_fields:
+        name = field["name"]
+        if name in field_indices:
+            # replace old field with new field of same name
+            output_fields[field_indices[name]] = field
+        else:
+            # add new field
+            output_fields.append(field)
+
+    return {"fields": output_fields}
+
diff --git a/tests/unit/test_schema.py b/tests/unit/test_schema.py
index 74f22f29..e2747580 100644
--- a/tests/unit/test_schema.py
+++ b/tests/unit/test_schema.py
@@ -54,3 +54,40 @@ def test_generate_bq_schema(dataframe, expected_schema):
     schema = pandas_gbq.schema.generate_bq_schema(dataframe)
 
     assert schema == expected_schema
+
+@pytest.mark.parametrize(
+    "schema_old,schema_new,expected_output",
+    [
+        (
+            {"fields": [{"name": "col1", "type": "INTEGER"}]},
+            {"fields": [{"name": "col2", "type": "TIMESTAMP"}]},
+            {"fields": [
+                {"name": "col1", "type": "INTEGER"},
+                {"name": "col2", "type": "TIMESTAMP"}
+            ]},
+        ),
+        (
+            {"fields": [{"name": "col1", "type": "INTEGER"}]},
+            {"fields": [{"name": "col1", "type": "BOOLEAN"}]},
+            {"fields": [{"name": "col1", "type": "BOOLEAN"}]},
+        ),
+        (
+            {"fields": [
+                {"name": "col1", "type": "INTEGER"},
+                {"name": "col2", "type": "INTEGER"}
+            ]},
+            {"fields": [
+                {"name": "col2", "type": "BOOLEAN"},
+                {"name": "col3", "type": "FLOAT"}
+            ]},
+            {"fields": [
+                {"name": "col1", "type": "INTEGER"},
+                {"name": "col2", "type": "BOOLEAN"},
+                {"name": "col3", "type": "FLOAT"}
+            ]},
+        )
+    ]
+)
+def test_update_schema(schema_old, schema_new, expected_output):
+    output = pandas_gbq.schema.update_schema(schema_old, schema_new)
+    assert output == expected_output
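For context on the behaviour patch 1 introduces: ``update_schema`` starts from the schema inferred from the DataFrame dtypes and lets a user-supplied schema override or extend it field by field. A minimal sketch mirroring the parametrized test cases above (the column names are illustrative only, not part of the patch):

    from pandas_gbq.schema import update_schema

    inferred = {
        "fields": [
            {"name": "col1", "type": "INTEGER"},
            {"name": "col2", "type": "INTEGER"},
        ]
    }
    user_supplied = {"fields": [{"name": "col2", "type": "BOOLEAN"}]}

    # Fields with matching names are replaced; everything else is kept as-is.
    merged = update_schema(inferred, user_supplied)
    assert merged == {
        "fields": [
            {"name": "col1", "type": "INTEGER"},
            {"name": "col2", "type": "BOOLEAN"},
        ]
    }
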
From 1d98d2bd79c3fa62182f8b0e2449d21e7b1f198e Mon Sep 17 00:00:00 2001
From: John Paton
Date: Tue, 12 Mar 2019 18:01:28 +0100
Subject: [PATCH 2/6] CLN: applied black

---
 pandas_gbq/gbq.py         |  4 +++-
 pandas_gbq/schema.py      |  1 -
 tests/unit/test_schema.py | 47 +++++++++++++++++++++++----------------
 3 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index 1b803880..113bfda3 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -1027,7 +1027,9 @@ def to_gbq(
     if not table_schema:
         table_schema = default_schema
     else:
-        table_schema = _update_bq_schema(default_schema, dict(fields=table_schema))
+        table_schema = _update_bq_schema(
+            default_schema, dict(fields=table_schema)
+        )
 
     # If table exists, check if_exists parameter
     if table.exists(table_id):
diff --git a/pandas_gbq/schema.py b/pandas_gbq/schema.py
index e4e18b02..e8246d17 100644
--- a/pandas_gbq/schema.py
+++ b/pandas_gbq/schema.py
@@ -60,4 +60,3 @@ def update_schema(schema_old, schema_new):
             output_fields.append(field)
 
     return {"fields": output_fields}
-
diff --git a/tests/unit/test_schema.py b/tests/unit/test_schema.py
index e2747580..af3b2043 100644
--- a/tests/unit/test_schema.py
+++ b/tests/unit/test_schema.py
@@ -55,16 +55,19 @@ def test_generate_bq_schema(dataframe, expected_schema):
     schema = pandas_gbq.schema.generate_bq_schema(dataframe)
 
     assert schema == expected_schema
 
+
 @pytest.mark.parametrize(
     "schema_old,schema_new,expected_output",
     [
         (
             {"fields": [{"name": "col1", "type": "INTEGER"}]},
             {"fields": [{"name": "col2", "type": "TIMESTAMP"}]},
-            {"fields": [
-                {"name": "col1", "type": "INTEGER"},
-                {"name": "col2", "type": "TIMESTAMP"}
-            ]},
+            {
+                "fields": [
+                    {"name": "col1", "type": "INTEGER"},
+                    {"name": "col2", "type": "TIMESTAMP"},
+                ]
+            },
         ),
         (
             {"fields": [{"name": "col1", "type": "INTEGER"}]},
@@ -72,21 +75,27 @@ def test_generate_bq_schema(dataframe, expected_schema):
             {"fields": [{"name": "col1", "type": "BOOLEAN"}]},
         ),
         (
-            {"fields": [
-                {"name": "col1", "type": "INTEGER"},
-                {"name": "col2", "type": "INTEGER"}
-            ]},
-            {"fields": [
-                {"name": "col2", "type": "BOOLEAN"},
-                {"name": "col3", "type": "FLOAT"}
-            ]},
-            {"fields": [
-                {"name": "col1", "type": "INTEGER"},
-                {"name": "col2", "type": "BOOLEAN"},
-                {"name": "col3", "type": "FLOAT"}
-            ]},
-        )
-    ]
+            {
+                "fields": [
+                    {"name": "col1", "type": "INTEGER"},
+                    {"name": "col2", "type": "INTEGER"},
+                ]
+            },
+            {
+                "fields": [
+                    {"name": "col2", "type": "BOOLEAN"},
+                    {"name": "col3", "type": "FLOAT"},
+                ]
+            },
+            {
+                "fields": [
+                    {"name": "col1", "type": "INTEGER"},
+                    {"name": "col2", "type": "BOOLEAN"},
+                    {"name": "col3", "type": "FLOAT"},
+                ]
+            },
+        ),
+    ],
 )
 def test_update_schema(schema_old, schema_new, expected_output):
     output = pandas_gbq.schema.update_schema(schema_old, schema_new)

From 34de9e5c540c4eb45cb933e75910fe7c07117b23 Mon Sep 17 00:00:00 2001
From: John Paton
Date: Tue, 12 Mar 2019 18:06:41 +0100
Subject: [PATCH 3/6] BUG: make update_schema python 2.7 compatible

---
 pandas_gbq/schema.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas_gbq/schema.py b/pandas_gbq/schema.py
index e8246d17..c59ed68e 100644
--- a/pandas_gbq/schema.py
+++ b/pandas_gbq/schema.py
@@ -46,7 +46,7 @@ def update_schema(schema_old, schema_new):
     """
     old_fields = schema_old["fields"]
     new_fields = schema_new["fields"]
-    output_fields = old_fields.copy()
+    output_fields = list(old_fields)
 
     field_indices = {field["name"]: i for i, field in enumerate(output_fields)}
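The one-line fix in patch 3 swaps ``old_fields.copy()`` for ``list(old_fields)`` because ``list.copy()`` only exists on Python 3.3+, while the list constructor produces the same shallow copy on both Python 2.7 and 3. A small illustration (not part of the patch, field names made up):

    fields = [{"name": "col1", "type": "INTEGER"}]

    copied = list(fields)  # shallow copy, works on Python 2.7 and 3
    # fields.copy() would raise AttributeError on Python 2.7
    copied.append({"name": "col2", "type": "STRING"})
    assert len(fields) == 1  # the original list is left untouched
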
{field["name"]: i for i, field in enumerate(output_fields)} From ef46a83bfafb6c66ec5b7e43352993a24e566bac Mon Sep 17 00:00:00 2001 From: John Paton Date: Tue, 12 Mar 2019 18:21:59 +0100 Subject: [PATCH 4/6] DOC: update docs to allow for a subset of columns in to_gbq table_schema --- pandas_gbq/gbq.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 113bfda3..2fa31e4f 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -939,9 +939,11 @@ def to_gbq( 'STRING'},...]``. If schema is not provided, it will be generated according to dtypes of DataFrame columns. - If schema is provided, it must contain all DataFrame columns. - pandas_gbq.gbq._generate_bq_schema() may be used to create an initial - schema, though it doesn't preserve column order. + If schema is provided, it may contain all or a subset of DataFrame + columns. If a subset is provided, the rest will be inferred from + the DataFrame dtypes. + pandas_gbq.gbq._generate_bq_schema() may be used to create an + initial schema, though it doesn't preserve column order. See BigQuery API documentation on available names of a field. .. versionadded:: 0.3.1 From 5a797a087bf96ad9d5893498359f14ddf4d239ec Mon Sep 17 00:00:00 2001 From: John Paton Date: Tue, 12 Mar 2019 18:43:10 +0100 Subject: [PATCH 5/6] DOC: what's new --- docs/source/changelog.rst | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 3b43ccd3..c8c34ea7 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -20,6 +20,11 @@ Internal changes - Use ``to_dataframe()`` from ``google-cloud-bigquery`` in the ``read_gbq()`` function. (:issue:`247`) +Enhancements +~~~~~~~~~~~~ +- Allow ``table_schema`` in :func:`to_gbq` to contain only a subset of columns, + with the rest being populated using the DataFrame dtypes (:issue:`218`, + contributed by @johnpaton) .. _changelog-0.9.0: @@ -237,4 +242,4 @@ Initial release of transfered code from `pandas `__ -- :func:`read_gbq` now stores ``INTEGER`` columns as ``dtype=object`` if they contain ``NULL`` values. Otherwise they are stored as ``int64``. This prevents precision lost for integers greather than 2**53. Furthermore ``FLOAT`` columns with values above 10**4 are no longer casted to ``int64`` which also caused precision loss `pandas-GH#14064 `__, and `pandas-GH#14305 `__ \ No newline at end of file +- :func:`read_gbq` now stores ``INTEGER`` columns as ``dtype=object`` if they contain ``NULL`` values. Otherwise they are stored as ``int64``. This prevents precision lost for integers greather than 2**53. 
From 6856d058e0fdb3b74b6d92908ea7b22bf3844ec9 Mon Sep 17 00:00:00 2001
From: John Paton
Date: Tue, 12 Mar 2019 18:52:52 +0100
Subject: [PATCH 6/6] DOC: close parens around issue in changelog

---
 docs/source/changelog.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index c8c34ea7..e3c0edd7 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -23,8 +23,8 @@ Internal changes
 Enhancements
 ~~~~~~~~~~~~
 - Allow ``table_schema`` in :func:`to_gbq` to contain only a subset of columns,
-  with the rest being populated using the DataFrame dtypes (:issue:`218`,
-  contributed by @johnpaton)
+  with the rest being populated using the DataFrame dtypes (:issue:`218`)
+  (contributed by @johnpaton)
 
 .. _changelog-0.9.0: