diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
index cfee48d62928b..932ed4e1672b7 100644
--- a/doc/source/v0.14.0.txt
+++ b/doc/source/v0.14.0.txt
@@ -327,6 +327,7 @@ Enhancements
 - ``DataFrame.to_stata`` and ``StataWriter`` will accept keyword arguments time_stamp
   and data_label which allow the time stamp and dataset label to be set when creating a
   file. (:issue:`6545`)
+- ``pandas.io.gbq`` now handles reading unicode strings properly. (:issue:`5940`)
 
 Performance
 ~~~~~~~~~~~
diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py
index ebf4f17ffb852..60381a2a628c2 100644
--- a/pandas/io/gbq.py
+++ b/pandas/io/gbq.py
@@ -9,6 +9,7 @@
 from datetime import datetime
 import pkg_resources
 from distutils.version import LooseVersion
+from pandas.compat import u
 
 import pandas as pd
 import numpy as np
@@ -117,9 +118,8 @@ def _parse_entry(field_value, field_type):
         field_value = np.datetime64(timestamp)
     elif field_type == 'BOOLEAN':
         field_value = field_value == 'true'
-    # Note that results are unicode, so this will
-    # fail for non-ASCII characters.. this probably
-    # functions differently in Python 3
+    elif field_type == 'STRING':
+        field_value = field_value
     else:
         field_value = str(field_value)
     return field_value
diff --git a/pandas/io/tests/test_gbq.py b/pandas/io/tests/test_gbq.py
index ec051d008b3f3..124658ac80234 100644
--- a/pandas/io/tests/test_gbq.py
+++ b/pandas/io/tests/test_gbq.py
@@ -11,6 +11,7 @@
 
 from pandas.core.frame import DataFrame
 from pandas.util.testing import with_connectivity_check
+from pandas.compat import u
 from pandas import NaT
 
 
@@ -193,9 +194,28 @@ def test_type_conversion(self):
                          np.bool(False),
                          np.int('2'),
                          np.float('3.14159'),
-                         'Hello World']
+                         u('Hello World')]
         self.assertEqual(actual_output, sample_output, 'A format conversion failed')
 
+    @with_connectivity_check
+    def test_unicode_string_conversion(self):
+        # Strings from BigQuery Should be converted to UTF-8 properly
+
+        if not os.path.exists(self.bq_token):
+            raise nose.SkipTest('Skipped because authentication information is not available.')
+
+        correct_test_datatype = DataFrame(
+            {'UNICODE_STRING' : [u("\xe9\xfc")]}
+        )
+
+        query = """SELECT '\xc3\xa9\xc3\xbc' as UNICODE_STRING"""
+
+        client = gbq._authenticate()
+        a = gbq.read_gbq(query)
+        tm.assert_frame_equal(a, correct_test_datatype)
+
+
+
     def test_data_small(self):
         # Parsing a fixed page of data should return the proper fixed np.array()
         result_frame = gbq._parse_page(self.test_data_small,
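
For context (not part of the patch): a minimal sketch of the behaviour the new STRING branch of _parse_entry is meant to provide, assuming the same pandas.compat.u helper the patch imports. The function name parse_string_entry is hypothetical and only illustrates the pass-through-vs-str() distinction.

# -*- coding: utf-8 -*-
# Illustrative sketch only -- not the patched pandas.io.gbq code. With the new
# 'STRING' branch, unicode values coming back from BigQuery are passed through
# unchanged, whereas str() on a non-ASCII unicode string raises
# UnicodeEncodeError under Python 2.
from pandas.compat import u  # same helper the patch imports


def parse_string_entry(field_value, field_type):
    # Hypothetical stand-in for the STRING / fallback branches of _parse_entry.
    if field_type == 'STRING':
        return field_value       # keep the unicode value as-is
    return str(field_value)      # other types retain the old str() coercion


assert parse_string_entry(u("\xe9\xfc"), 'STRING') == u("\xe9\xfc")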