Skip to content

Commit a576d74

Browse files
committed
Merge pull request #6596 from jacobschaer/GBQ_Unicode_Support
GBQ Unicode support
2 parents 403f778 + 8361ca6 commit a576d74

File tree

3 files changed

+25
-4
lines changed

3 files changed

+25
-4
lines changed

doc/source/v0.14.0.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,7 @@ Enhancements
327327
- ``DataFrame.to_stata`` and ``StataWriter`` will accept keyword arguments time_stamp
328328
and data_label which allow the time stamp and dataset label to be set when creating a
329329
file. (:issue:`6545`)
330+
- ``pandas.io.gbq`` now handles reading unicode strings properly. (:issue:`5940`)
330331

331332
Performance
332333
~~~~~~~~~~~

pandas/io/gbq.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from datetime import datetime
1010
import pkg_resources
1111
from distutils.version import LooseVersion
12+
from pandas.compat import u
1213

1314
import pandas as pd
1415
import numpy as np
@@ -117,9 +118,8 @@ def _parse_entry(field_value, field_type):
117118
field_value = np.datetime64(timestamp)
118119
elif field_type == 'BOOLEAN':
119120
field_value = field_value == 'true'
120-
# Note that results are unicode, so this will
121-
# fail for non-ASCII characters.. this probably
122-
# functions differently in Python 3
121+
elif field_type == 'STRING':
122+
field_value = field_value
123123
else:
124124
field_value = str(field_value)
125125
return field_value

pandas/io/tests/test_gbq.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
from pandas.core.frame import DataFrame
1313
from pandas.util.testing import with_connectivity_check
14+
from pandas.compat import u
1415
from pandas import NaT
1516

1617

@@ -193,9 +194,28 @@ def test_type_conversion(self):
193194
np.bool(False),
194195
np.int('2'),
195196
np.float('3.14159'),
196-
'Hello World']
197+
u('Hello World')]
197198
self.assertEqual(actual_output, sample_output, 'A format conversion failed')
198199

200+
@with_connectivity_check
201+
def test_unicode_string_conversion(self):
202+
# Strings from BigQuery should be decoded from UTF-8 into unicode properly
203+
204+
if not os.path.exists(self.bq_token):
205+
raise nose.SkipTest('Skipped because authentication information is not available.')
206+
207+
correct_test_datatype = DataFrame(
208+
{'UNICODE_STRING' : [u("\xe9\xfc")]}
209+
)
210+
211+
query = """SELECT '\xc3\xa9\xc3\xbc' as UNICODE_STRING"""
212+
213+
client = gbq._authenticate()
214+
a = gbq.read_gbq(query)
215+
tm.assert_frame_equal(a, correct_test_datatype)
216+
217+
218+
199219
def test_data_small(self):
200220
# Parsing a fixed page of data should return the proper fixed np.array()
201221
result_frame = gbq._parse_page(self.test_data_small,

0 commit comments

Comments
 (0)