Merge pull request #6596 from jacobschaer/GBQ_Unicode_Support

jreback · jreback · commit a576d74a3ab0 · 2014-03-13T22:32:54.000-04:00
Gbq unicode support
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
@@ -327,6 +327,7 @@ Enhancements
 - ``DataFrame.to_stata`` and ``StataWriter`` will accept keyword arguments time_stamp
   and data_label which allow the time stamp and dataset label to be set when creating a
   file. (:issue:`6545`)
+- ``pandas.io.gbq`` now handles reading unicode strings properly. (:issue:`5940`)
 
 Performance
 ~~~~~~~~~~~
diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py
@@ -9,6 +9,7 @@
 from datetime import datetime
 import pkg_resources
 from distutils.version import LooseVersion
+from pandas.compat import u
 
 import pandas as pd
 import numpy as np
@@ -117,9 +118,8 @@ def _parse_entry(field_value, field_type):
         field_value = np.datetime64(timestamp)
     elif field_type == 'BOOLEAN':
         field_value = field_value == 'true'
-    # Note that results are unicode, so this will
-    # fail for non-ASCII characters.. this probably
-    # functions differently in Python 3
+    elif field_type == 'STRING':
+        field_value = field_value
     else:
         field_value = str(field_value)
     return field_value
diff --git a/pandas/io/tests/test_gbq.py b/pandas/io/tests/test_gbq.py
@@ -11,6 +11,7 @@
 
 from pandas.core.frame import DataFrame
 from pandas.util.testing import with_connectivity_check
+from pandas.compat import u
 from pandas import NaT
 
 
@@ -193,9 +194,28 @@ def test_type_conversion(self):
                   np.bool(False),
                   np.int('2'),
                   np.float('3.14159'),
-                  'Hello World']
+                  u('Hello World')]
         self.assertEqual(actual_output, sample_output, 'A format conversion failed')
 
+    @with_connectivity_check
+    def test_unicode_string_conversion(self):
+        # A UTF-8 encoded string literal sent in a query should come back from read_gbq as the matching unicode value
+
+        if not os.path.exists(self.bq_token):
+            raise nose.SkipTest('Skipped because authentication information is not available.')
+
+        correct_test_datatype = DataFrame(
+            {'UNICODE_STRING' : [u("\xe9\xfc")]}
+        )
+
+        query = """SELECT '\xc3\xa9\xc3\xbc' as UNICODE_STRING"""  # NOTE(review): these escapes are the UTF-8 bytes of the expected value only in a Python 2 byte string; on Python 3 they are four code points -- confirm
+
+        client = gbq._authenticate()  # NOTE(review): the return value is never used below; presumably called for its side effect -- confirm
+        a = gbq.read_gbq(query)
+        tm.assert_frame_equal(a, correct_test_datatype)
+
+
+
     def test_data_small(self):
         # Parsing a fixed page of data should return the proper fixed np.array()
         result_frame = gbq._parse_page(self.test_data_small,