Skip to content

Commit ff3c464

Browse files
necnec authored and jreback committed
ENH: Added configuration parameter to read_gbq
Now more complicated queries could be processed. Author: Dmitry L <lastovichekdv@yandex.ru> Author: necnec <lastovichekdv@yandex.ru> Author: Dmitry <lastovichekdv@condenast.ru> Closes #14742 from necnec/bigquery-udf-resources and squashes the following commits: 3a238a5 [necnec] config->configuration 82f4409 [necnec] Add some documentation & formatting b97a1be [Dmitry L] formatting e2f801f [Dmitry] hotfix 2e02d76 [Dmitry L] Merge remote-tracking branch 'pandas-dev/master' into bigquery-udf-resources ec590af [Dmitry L] Throw exception if more than 1 job type in config 8720b03 [Dmitry L] Delete trailing whitespaces df5dec6 [Dmitry L] configuration->config & formatting 99521aa [Dmitry L] Formatting, documentation, new unit test 0ac26a2 [Dmitry L] added pull request number in whitens 86ed96d [Dmitry L] Merge branch 'master' into bigquery-udf-resources 929ad1a [Dmitry L] formatting: delete whitespace 8a38650 [Dmitry L] Added example configuration & job_configuration refactoring 395c0e9 [Dmitry L] fix formatting c21588a [Dmitry L] Merge remote-tracking branch 'pandas-dev/master' into bigquery-udf-resources 8fe77b2 [necnec] Merge branch 'bigquery-udf-resources' 146f0f3 [necnec] Merge branch 'master' into bigquery-udf-resources ce8ebe4 [necnec] Merge branch 'bigquery-udf-resources' 028c8be [necnec] Solve formating problems c199935 [Dmitry L] Make query configuration more general 0b365da [Dmitry L] delete newlines a952710 [Dmitry L] Move whatsnew BQ Enhancements -> Enhancements b849300 [Dmitry L] Change whatsnew 0.19.2 -> 0.20.0 640be7a [Dmitry L] Change whatnew 0.19.0->0.19.2 834a2ff [necnec] Merge branch 'bigquery-udf-resources' d69ed7f [Dmitry L] check tests 94fa514 [necnec] test formatting ddb4fd1 [necnec] Merge branch 'bigquery-udf-resources' ad35a43 [necnec] fix whatsnew text a96811d [necnec] add unit tests read_gbq: query parameters, cache c66169d [necnec] add read_gbq tests: query parameters and cache 42dc9e6 [necnec] Merge remote-tracking branch 
'origin/bigquery-udf-resources' f9fae0c [necnec] Fix formatting 9a16a8c [necnec] Merge branch 'bigquery-udf-resources' dad9288 [necnec] Change parameter to kwargs 55bf05c [Dmitry L] Added udf_resource_uri parameter to read_gbq
1 parent 74e20a0 commit ff3c464

File tree

4 files changed

+144
-12
lines changed

4 files changed

+144
-12
lines changed

doc/source/io.rst

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4652,6 +4652,22 @@ destination DataFrame as well as a preferred column order as follows:
46524652
index_col='index_column_name',
46534653
col_order=['col1', 'col2', 'col3'], projectid)
46544654
4655+
4656+
Starting with 0.20.0, you can specify a query configuration as a parameter to use additional options for your job.
4657+
For more information about query configuration parameters see
4658+
`here <https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`__.
4659+
4660+
.. code-block:: python
4661+
4662+
configuration = {
4663+
'query': {
4664+
"useQueryCache": False
4665+
}
4666+
}
4667+
data_frame = pd.read_gbq('SELECT * FROM test_dataset.test_table',
4668+
configuration=configuration, projectid)
4669+
4670+
46554671
.. note::
46564672

46574673
You can find your project id in the `Google developers console <https://console.developers.google.com>`__.

doc/source/whatsnew/v0.20.0.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,8 @@ Other enhancements
100100
- ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`)
101101
- Multiple offset aliases with decimal points are now supported (e.g. '0.5min' is parsed as '30s') (:issue:`8419`)
102102

103+
- ``pd.read_gbq`` method now allows query configuration preferences (:issue:`14742`)
104+
103105
- New ``UnsortedIndexError`` (subclass of ``KeyError``) raised when indexing/slicing into an
104106
unsorted MultiIndex (:issue:`11897`). This allows differentiation between errors due to lack
105107
of sorting or an incorrect key. See :ref:`here <advanced.unsorted>`

pandas/io/gbq.py

Lines changed: 41 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -375,7 +375,7 @@ def process_insert_errors(self, insert_errors):
375375

376376
raise StreamingInsertError
377377

378-
def run_query(self, query):
378+
def run_query(self, query, **kwargs):
379379
try:
380380
from googleapiclient.errors import HttpError
381381
except:
@@ -385,16 +385,33 @@ def run_query(self, query):
385385
_check_google_client_version()
386386

387387
job_collection = self.service.jobs()
388-
job_data = {
389-
'configuration': {
390-
'query': {
391-
'query': query,
392-
'useLegacySql': self.dialect == 'legacy'
393-
# 'allowLargeResults', 'createDisposition',
394-
# 'preserveNulls', destinationTable, useQueryCache
395-
}
388+
389+
job_config = {
390+
'query': {
391+
'query': query,
392+
'useLegacySql': self.dialect == 'legacy'
393+
# 'allowLargeResults', 'createDisposition',
394+
# 'preserveNulls', destinationTable, useQueryCache
396395
}
397396
}
397+
config = kwargs.get('configuration')
398+
if config is not None:
399+
if len(config) != 1:
400+
raise ValueError("Only one job type must be specified, but "
401+
"given {}".format(','.join(config.keys())))
402+
if 'query' in config:
403+
if 'query' in config['query'] and query is not None:
404+
raise ValueError("Query statement can't be specified "
405+
"inside config while it is specified "
406+
"as parameter")
407+
408+
job_config['query'].update(config['query'])
409+
else:
410+
raise ValueError("Only 'query' job type is supported")
411+
412+
job_data = {
413+
'configuration': job_config
414+
}
398415

399416
self._start_timer()
400417
try:
@@ -622,8 +639,9 @@ def _parse_entry(field_value, field_type):
622639

623640

624641
def read_gbq(query, project_id=None, index_col=None, col_order=None,
625-
reauth=False, verbose=True, private_key=None, dialect='legacy'):
626-
"""Load data from Google BigQuery.
642+
reauth=False, verbose=True, private_key=None, dialect='legacy',
643+
**kwargs):
644+
r"""Load data from Google BigQuery.
627645
628646
THIS IS AN EXPERIMENTAL LIBRARY
629647
@@ -682,6 +700,17 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,
682700
683701
.. versionadded:: 0.19.0
684702
703+
**kwargs : Arbitrary keyword arguments
704+
configuration (dict): query config parameters for job processing.
705+
For example:
706+
707+
configuration = {'query': {'useQueryCache': False}}
708+
709+
For more information see `BigQuery SQL Reference
710+
<https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`
711+
712+
.. versionadded:: 0.20.0
713+
685714
Returns
686715
-------
687716
df: DataFrame
@@ -698,7 +727,7 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,
698727
connector = GbqConnector(project_id, reauth=reauth, verbose=verbose,
699728
private_key=private_key,
700729
dialect=dialect)
701-
schema, pages = connector.run_query(query)
730+
schema, pages = connector.run_query(query, **kwargs)
702731
dataframe_list = []
703732
while len(pages) > 0:
704733
page = pages.pop()

pandas/io/tests/test_gbq.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -711,6 +711,91 @@ def test_invalid_option_for_sql_dialect(self):
711711
gbq.read_gbq(sql_statement, project_id=_get_project_id(),
712712
dialect='standard', private_key=_get_private_key_path())
713713

714+
def test_query_with_parameters(self):
    """Named query parameters must be supplied via ``configuration``.

    Verifies that a parameterized standard-SQL query fails without a
    configuration, and succeeds (returning the computed value) when the
    parameters are passed through the 'query' job configuration.
    """
    sql_statement = "SELECT @param1 + @param2 as VALID_RESULT"
    # BigQuery 'query' job configuration carrying two named INTEGER
    # parameters; standard SQL is required for parameterized queries.
    config = {
        'query': {
            "useLegacySql": False,
            "parameterMode": "named",
            "queryParameters": [
                {
                    "name": "param1",
                    "parameterType": {
                        "type": "INTEGER"
                    },
                    "parameterValue": {
                        "value": 1
                    }
                },
                {
                    "name": "param2",
                    "parameterType": {
                        "type": "INTEGER"
                    },
                    "parameterValue": {
                        "value": 2
                    }
                }
            ]
        }
    }
    # A query that relies on parameters fails when parameters are
    # not supplied via configuration.
    with tm.assertRaises(ValueError):
        gbq.read_gbq(sql_statement, project_id=_get_project_id(),
                     private_key=_get_private_key_path())

    # The query succeeds once the parameters are supplied via the
    # 'configuration' keyword; 1 + 2 must evaluate to 3.
    df = gbq.read_gbq(sql_statement, project_id=_get_project_id(),
                      private_key=_get_private_key_path(),
                      configuration=config)
    tm.assert_frame_equal(df, DataFrame({'VALID_RESULT': [3]}))
def test_query_inside_configuration(self):
    """The query statement may live inside the configuration, but not both.

    Passing a query both as the positional argument and inside
    ``configuration['query']['query']`` must raise ``ValueError``;
    passing ``None`` positionally defers to the configured query.
    """
    query_no_use = 'SELECT "PI_WRONG" as VALID_STRING'
    query = 'SELECT "PI" as VALID_STRING'
    config = {
        'query': {
            "query": query,
            "useQueryCache": False,
        }
    }
    # Supplying the query both inside the configuration and as a
    # parameter is ambiguous and must be rejected.
    with tm.assertRaises(ValueError):
        gbq.read_gbq(query_no_use, project_id=_get_project_id(),
                     private_key=_get_private_key_path(),
                     configuration=config)

    # With no positional query, the configured statement is executed.
    df = gbq.read_gbq(None, project_id=_get_project_id(),
                      private_key=_get_private_key_path(),
                      configuration=config)
    tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']}))
def test_configuration_without_query(self):
    """Only the 'query' job type is accepted in ``configuration``.

    A configuration describing another BigQuery job type (here a
    'copy' job) must raise ``ValueError`` from ``read_gbq``.
    """
    sql_statement = 'SELECT 1'
    # A valid-looking 'copy' job configuration — unsupported by read_gbq.
    config = {
        'copy': {
            "sourceTable": {
                "projectId": _get_project_id(),
                "datasetId": "publicdata:samples",
                "tableId": "wikipedia"
            },
            "destinationTable": {
                "projectId": _get_project_id(),
                "datasetId": "publicdata:samples",
                "tableId": "wikipedia_copied"
            },
        }
    }
    # Only 'query' job configurations are supported; 'copy', 'load'
    # and 'extract' job types must be rejected.
    with tm.assertRaises(ValueError):
        gbq.read_gbq(sql_statement, project_id=_get_project_id(),
                     private_key=_get_private_key_path(),
                     configuration=config)
714799

715800
class TestToGBQIntegration(tm.TestCase):
716801
# Changes to BigQuery table schema may take up to 2 minutes as of May 2015

0 commit comments

Comments
 (0)