Skip to content

Commit 54af80f

Browse files
committed
Improvements discussed in PR conversation
1 parent 7bac092 commit 54af80f

File tree

4 files changed

+116
-11
lines changed

4 files changed

+116
-11
lines changed

docs/source/changelog.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
Changelog
22
=========
33

4+
0.1.5 / 2017-04-20
5+
------------------
6+
- When using ``to_gbq`` if ``if_exists`` is set to ``append``, dataframe needs to contain only a subset of the fields in the BigQuery schema. GH#24
7+
48
0.1.4 / 2017-03-17
59
------------------
610

docs/source/writing.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ a ``TableCreationError`` if the destination table already exists.
4040

4141
If the ``if_exists`` argument is set to ``'append'``, the destination dataframe will
4242
be written to the table using the defined table schema and column types. The
43-
dataframe must match the destination table in structure and data types.
43+
dataframe must contain fields (matching name and type) currently in the destination.
4444
If the ``if_exists`` argument is set to ``'replace'``, and the existing table has a
4545
different schema, a delay of 2 minutes will be forced to ensure that the new schema
4646
has propagated in the Google environment. See

pandas_gbq/gbq.py

Lines changed: 47 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -559,7 +559,17 @@ def load_data(self, dataframe, dataset_id, table_id, chunksize):
559559
self._print("\n")
560560

561561
def schema(self, dataset_id, table_id):
562-
"""Retrieve the schema of the table"""
562+
"""Retrieve the schema of the table
563+
564+
Obtain from BigQuery the field names and field types
565+
for the table defined by the parameters
566+
567+
:param str dataset_id: Name of the BigQuery dataset for the table
568+
:param str table_id: Name of the BigQuery table
569+
570+
:return: Fields representing the schema
571+
:rtype: list of dicts
572+
"""
563573

564574
try:
565575
from googleapiclient.errors import HttpError
@@ -581,20 +591,47 @@ def schema(self, dataset_id, table_id):
581591
self.process_http_error(ex)
582592

583593
def verify_schema(self, dataset_id, table_id, schema):
584-
fields_remote = set([json.dumps(field)
585-
for field in self.schema(dataset_id, table_id)])
586-
fields_local = set(json.dumps(field_local)
587-
for field_local in schema['fields'])
594+
"""Indicate whether schemas match exactly
595+
596+
Compare the BigQuery table identified in the parameters with
597+
the schema passed in and indicate whether the two contain the same
598+
fields. Order is not considered.
599+
600+
:param str dataset_id: Name of the BigQuery dataset for the table
601+
:param str table_id: Name of the BigQuery table
602+
:param list(dict) schema: Schema for comparison. Each item should have
603+
a 'name' and a 'type'
604+
605+
:return: Whether the schemas match
606+
:rtype: bool
607+
"""
608+
609+
fields_remote = sorted(self.schema(dataset_id, table_id),
610+
key=lambda x: x['name'])
611+
fields_local = sorted(schema['fields'], key=lambda x: x['name'])
588612

589613
return fields_remote == fields_local
590614

591615
def schema_is_subset(self, dataset_id, table_id, schema):
592-
fields_remote = set([json.dumps(field)
593-
for field in self.schema(dataset_id, table_id)])
594-
fields_local = set(json.dumps(field_local)
595-
for field_local in schema['fields'])
616+
"""Indicate whether the schema to be uploaded is a subset
617+
618+
Compare the BigQuery table identified in the parameters with
619+
the schema passed in and indicate whether all fields in the
620+
latter are present in the former. Order is not considered.
621+
622+
:param str dataset_id: Name of the BigQuery dataset for the table
623+
:param str table_id: Name of the BigQuery table
624+
:param list(dict) schema: Schema for comparison. Each item should have
625+
a 'name' and a 'type'
626+
627+
:return: Whether the passed schema is a subset
628+
:rtype: bool
629+
"""
630+
631+
fields_remote = self.schema(dataset_id, table_id)
632+
fields_local = schema['fields']
596633

597-
return fields_remote >= fields_local
634+
return all(field in fields_remote for field in fields_local)
598635

599636
def delete_and_recreate_table(self, dataset_id, table_id, table_schema):
600637
delay = 0

pandas_gbq/tests/test_gbq.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1083,6 +1083,7 @@ def test_upload_data_if_table_exists_append(self):
10831083
private_key=_get_private_key_path())
10841084

10851085
def test_upload_subset_columns_if_table_exists_append(self):
1086+
# For pull request #24
10861087
test_id = "16"
10871088
test_size = 10
10881089
df = make_mixed_dataframe_v2(test_size)
@@ -1301,6 +1302,7 @@ def test_verify_schema_ignores_field_mode(self):
13011302
'Expected schema to match')
13021303

13031304
def test_retrieve_schema(self):
1305+
# For pull request #24
13041306
test_id = "15"
13051307
test_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'},
13061308
{'name': 'B', 'type': 'FLOAT'},
@@ -1312,6 +1314,68 @@ def test_retrieve_schema(self):
13121314
expected = test_schema['fields']
13131315
assert expected == actual, 'Expected schema used to create table'
13141316

1317+
def test_verify_schema_fails_different_structure(self):
1318+
test_id = "12"
1319+
test_schema_1 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
1320+
{'name': 'B', 'type': 'FLOAT'},
1321+
{'name': 'C', 'type': 'STRING'},
1322+
{'name': 'D', 'type': 'TIMESTAMP'}]}
1323+
test_schema_2 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
1324+
{'name': 'B2', 'type': 'FLOAT'},
1325+
{'name': 'C', 'type': 'STRING'},
1326+
{'name': 'D', 'type': 'TIMESTAMP'}]}
1327+
1328+
self.table.create(TABLE_ID + test_id, test_schema_1)
1329+
self.assertFalse(self.sut.verify_schema(
1330+
self.dataset_prefix + "1", TABLE_ID + test_id, test_schema_2),
1331+
'Expected different schema')
1332+
1333+
def test_schema_is_subset_passes_if_subset(self):
1334+
# For pull request #24
1335+
test_id = '16'
1336+
1337+
table_name = TABLE_ID + test_id
1338+
dataset = self.dataset_prefix + '1'
1339+
1340+
table_schema = {'fields': [{'name': 'A',
1341+
'type': 'FLOAT'},
1342+
{'name': 'B',
1343+
'type': 'FLOAT'},
1344+
{'name': 'C',
1345+
'type': 'STRING'}]}
1346+
tested_schema = {'fields': [{'name': 'A',
1347+
'type': 'FLOAT'},
1348+
{'name': 'B',
1349+
'type': 'FLOAT'}]}
1350+
1351+
self.table.create(table_name, table_schema)
1352+
1353+
assert self.sut.schema_is_subset(
1354+
dataset, table_name, tested_schema) is True
1355+
1356+
def test_schema_is_subset_fails_if_not_subset(self):
1357+
# For pull request #24
1358+
test_id = '17'
1359+
1360+
table_name = TABLE_ID + test_id
1361+
dataset = self.dataset_prefix + '1'
1362+
1363+
table_schema = {'fields': [{'name': 'A',
1364+
'type': 'FLOAT'},
1365+
{'name': 'B',
1366+
'type': 'FLOAT'},
1367+
{'name': 'C',
1368+
'type': 'STRING'}]}
1369+
tested_schema = {'fields': [{'name': 'A',
1370+
'type': 'FLOAT'},
1371+
{'name': 'C',
1372+
'type': 'FLOAT'}]}
1373+
1374+
self.table.create(table_name, table_schema)
1375+
1376+
assert self.sut.schema_is_subset(
1377+
dataset, table_name, tested_schema) is False
1378+
13151379
def test_list_dataset(self):
13161380
dataset_id = self.dataset_prefix + "1"
13171381
self.assertTrue(dataset_id in self.dataset.datasets(),

0 commit comments

Comments
 (0)