From 83334db397a9aba411155242d47c48602c10096d Mon Sep 17 00:00:00 2001 From: Python3pkg Date: Wed, 17 May 2017 23:08:40 -0700 Subject: [PATCH 1/6] Convert to python3 --- bigquery/__init__.py | 2 +- bigquery/__init__.py.bak | 21 + bigquery/client.py | 6 +- bigquery/client.py.bak | 1932 ++++++++++++++ bigquery/query_builder.py | 2 +- bigquery/query_builder.py.bak | 397 +++ bigquery/schema_builder.py | 2 +- bigquery/schema_builder.py.bak | 145 + bigquery/tests/test_client.py | 32 +- bigquery/tests/test_client.py.bak | 2902 +++++++++++++++++++++ bigquery/tests/test_schema_builder.py | 2 +- bigquery/tests/test_schema_builder.py.bak | 140 + 12 files changed, 5560 insertions(+), 23 deletions(-) create mode 100644 bigquery/__init__.py.bak create mode 100644 bigquery/client.py.bak create mode 100644 bigquery/query_builder.py.bak create mode 100644 bigquery/schema_builder.py.bak create mode 100644 bigquery/tests/test_client.py.bak create mode 100644 bigquery/tests/test_schema_builder.py.bak diff --git a/bigquery/__init__.py b/bigquery/__init__.py index b393875..beb89bb 100644 --- a/bigquery/__init__.py +++ b/bigquery/__init__.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import + from .version import __version__ diff --git a/bigquery/__init__.py.bak b/bigquery/__init__.py.bak new file mode 100644 index 0000000..b393875 --- /dev/null +++ b/bigquery/__init__.py.bak @@ -0,0 +1,21 @@ +from __future__ import absolute_import + +from .version import __version__ + +from .client import get_client +from .client import ( + BIGQUERY_SCOPE, + BIGQUERY_SCOPE_READ_ONLY, + JOB_CREATE_IF_NEEDED, + JOB_CREATE_NEVER, + JOB_SOURCE_FORMAT_NEWLINE_DELIMITED_JSON, + JOB_SOURCE_FORMAT_DATASTORE_BACKUP, + JOB_SOURCE_FORMAT_CSV, + JOB_WRITE_TRUNCATE, + JOB_WRITE_APPEND, + JOB_WRITE_EMPTY, + JOB_ENCODING_UTF_8, + JOB_ENCODING_ISO_8859_1 +) + +from .schema_builder import schema_from_record diff --git a/bigquery/client.py b/bigquery/client.py index 17a3a89..7e5d6f0 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -124,7 +124,7 @@ def get_client(project_id=None, credentials=None, if private_key: try: - if isinstance(private_key, basestring): + if isinstance(private_key, str): private_key = private_key.decode('utf-8') except NameError: # python3 -- private_key is already unicode @@ -1213,7 +1213,7 @@ def wait_for_job(self, job, interval=5, timeout=60): jobId=job_id) job_resource = request.execute() self._raise_executing_exception_if_error(job_resource) - complete = job_resource.get('status').get('state') == u'DONE' + complete = job_resource.get('status').get('state') == 'DONE' elapsed_time = time() - start_time # raise exceptions if timeout @@ -1489,7 +1489,7 @@ def _filter_tables_by_time(self, tables, start_time, end_time): Table names that are inside the time range """ - return [table_name for (table_name, unix_seconds) in tables.items() + return [table_name for (table_name, unix_seconds) in list(tables.items()) if self._in_range(start_time, end_time, unix_seconds)] def _in_range(self, start_time, end_time, time): diff --git a/bigquery/client.py.bak b/bigquery/client.py.bak new file mode 100644 index 0000000..17a3a89 --- /dev/null +++ b/bigquery/client.py.bak @@ -0,0 +1,1932 @@ +import calendar +import json +from logging import getLogger, NullHandler +from collections import defaultdict +from datetime import datetime, timedelta +from hashlib import sha256 +from io import StringIO +from time import sleep, time +from functools import reduce + +import six +from bigquery.errors import (BigQueryTimeoutException, 
JobExecutingException, + JobInsertException, UnfinishedQueryException) +from googleapiclient.discovery import build, DISCOVERY_URI +from googleapiclient.errors import HttpError +from httplib2 import Http + +BIGQUERY_SCOPE = [ + 'https://www.googleapis.com/auth/bigquery' +] + +BIGQUERY_SCOPE_READ_ONLY = [ + 'https://www.googleapis.com/auth/bigquery.readonly' +] + +CACHE_TIMEOUT = timedelta(seconds=30) + +JOB_CREATE_IF_NEEDED = 'CREATE_IF_NEEDED' +JOB_CREATE_NEVER = 'CREATE_NEVER' +JOB_WRITE_TRUNCATE = 'WRITE_TRUNCATE' +JOB_WRITE_APPEND = 'WRITE_APPEND' +JOB_WRITE_EMPTY = 'WRITE_EMPTY' +JOB_ENCODING_UTF_8 = 'UTF-8' +JOB_ENCODING_ISO_8859_1 = 'ISO-8859-1' +JOB_PRIORITY_INTERACTIVE = 'INTERACTIVE' +JOB_PRIORITY_BATCH = 'BATCH' +JOB_COMPRESSION_NONE = 'NONE' +JOB_COMPRESSION_GZIP = 'GZIP' + +JOB_FORMAT_CSV = 'CSV' +JOB_FORMAT_NEWLINE_DELIMITED_JSON = 'NEWLINE_DELIMITED_JSON' +JOB_SOURCE_FORMAT_DATASTORE_BACKUP = 'DATASTORE_BACKUP' +JOB_SOURCE_FORMAT_NEWLINE_DELIMITED_JSON = JOB_FORMAT_NEWLINE_DELIMITED_JSON +JOB_SOURCE_FORMAT_CSV = JOB_FORMAT_CSV +JOB_DESTINATION_FORMAT_AVRO = 'AVRO' +JOB_DESTINATION_FORMAT_NEWLINE_DELIMITED_JSON = \ + JOB_FORMAT_NEWLINE_DELIMITED_JSON +JOB_DESTINATION_FORMAT_CSV = JOB_FORMAT_CSV + +logger = getLogger(__name__) +logger.addHandler(NullHandler()) + + +def get_client(project_id=None, credentials=None, + service_url=None, service_account=None, + private_key=None, private_key_file=None, + json_key=None, json_key_file=None, + readonly=True, swallow_results=True): + """Return a singleton instance of BigQueryClient. Either + AssertionCredentials or a service account and private key combination need + to be provided in order to authenticate requests to BigQuery. + + Parameters + ---------- + project_id : str, optional + The BigQuery project id, required unless json_key or json_key_file is + provided. + credentials : oauth2client.client.SignedJwtAssertionCredentials, optional + AssertionCredentials instance to authenticate requests to BigQuery + (optional, must provide `service_account` and (`private_key` or + `private_key_file`) or (`json_key` or `json_key_file`) if not included + service_url : str, optional + A URI string template pointing to the location of Google's API + discovery service. Requires two parameters {api} and {apiVersion} that + when filled in produce an absolute URI to the discovery document for + that service. If not set then the default googleapiclient discovery URI + is used. See `credentials` + service_account : str, optional + The Google API service account name. See `credentials` + private_key : str, optional + The private key associated with the service account in PKCS12 or PEM + format. See `credentials` + private_key_file : str, optional + The name of the file containing the private key associated with the + service account in PKCS12 or PEM format. See `credentials` + json_key : dict, optional + The JSON key associated with the service account. See `credentials` + json_key_file : str, optional + The name of the JSON key file associated with the service account. See + `credentials`. + readonly : bool + Bool indicating if BigQuery access is read-only. Has no effect if + credentials are provided. Default True. + swallow_results : bool + If set to False, then return the actual response value instead of + converting to boolean. Default True. + + Returns + ------- + BigQueryClient + An instance of the BigQuery client. 
+ """ + + if not credentials: + assert (service_account and (private_key or private_key_file)) or ( + json_key or json_key_file), \ + 'Must provide AssertionCredentials or service account and P12 key\ + or JSON key' + + if not project_id: + assert json_key or json_key_file, \ + 'Must provide project_id unless json_key or json_key_file is\ + provided' + + if service_url is None: + service_url = DISCOVERY_URI + + scope = BIGQUERY_SCOPE_READ_ONLY if readonly else BIGQUERY_SCOPE + + if private_key_file: + credentials = _credentials().from_p12_keyfile(service_account, + private_key_file, + scopes=scope) + + if private_key: + try: + if isinstance(private_key, basestring): + private_key = private_key.decode('utf-8') + except NameError: + # python3 -- private_key is already unicode + pass + credentials = _credentials().from_p12_keyfile_buffer( + service_account, + StringIO(private_key), + scopes=scope) + + if json_key_file: + with open(json_key_file, 'r') as key_file: + json_key = json.load(key_file) + + if json_key: + credentials = _credentials().from_json_keyfile_dict(json_key, + scopes=scope) + if not project_id: + project_id = json_key['project_id'] + + bq_service = _get_bq_service(credentials=credentials, + service_url=service_url) + + return BigQueryClient(bq_service, project_id, swallow_results) + + +def get_projects(bq_service): + """Given the BigQuery service, return data about all projects.""" + projects_request = bq_service.projects().list().execute() + + projects = [] + for project in projects_request.get('projects', []): + project_data = { + 'id': project['id'], + 'name': project['friendlyName'] + } + projects.append(project_data) + return projects + + +def _get_bq_service(credentials=None, service_url=None): + """Construct an authorized BigQuery service object.""" + + assert credentials, 'Must provide ServiceAccountCredentials' + + http = credentials.authorize(Http()) + service = build('bigquery', 'v2', http=http, + discoveryServiceUrl=service_url) + + return service + + +def _credentials(): + """Import and return SignedJwtAssertionCredentials class""" + from oauth2client.service_account import ServiceAccountCredentials + + return ServiceAccountCredentials + + +class BigQueryClient(object): + + def __init__(self, bq_service, project_id, swallow_results=True): + self.bigquery = bq_service + self.project_id = project_id + self.swallow_results = swallow_results + self.cache = {} + + def _submit_query_job(self, query_data): + """ Submit a query job to BigQuery. + + This is similar to BigQueryClient.query, but gives the user + direct access to the query method on the offical BigQuery + python client. + + For fine-grained control over a query job, see: + https://google-api-client-libraries.appspot.com/documentation/bigquery/v2/python/latest/bigquery_v2.jobs.html#query + + Parameters + ---------- + query_data + query object as per "configuration.query" in + https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.query + + Returns + ------- + tuple + job id and query results if query completed. If dry_run is True, + job id will be None and results will be empty if the query is valid + or a dict containing the response if invalid. 
+ + Raises + ------ + BigQueryTimeoutException + On timeout + """ + + logger.debug('Submitting query job: %s' % query_data) + + job_collection = self.bigquery.jobs() + + try: + query_reply = job_collection.query( + projectId=self.project_id, body=query_data).execute() + except HttpError as e: + if query_data.get("dryRun", False): + return None, json.loads(e.content.decode('utf8')) + raise + + job_id = query_reply['jobReference'].get('jobId') + schema = query_reply.get('schema', {'fields': None})['fields'] + rows = query_reply.get('rows', []) + job_complete = query_reply.get('jobComplete', False) + + # raise exceptions if it's not an async query + # and job is not completed after timeout + if not job_complete and query_data.get("timeoutMs", False): + logger.error('BigQuery job %s timeout' % job_id) + raise BigQueryTimeoutException() + + return job_id, [self._transform_row(row, schema) for row in rows] + + def _insert_job(self, body_object): + """ Submit a job to BigQuery + + Direct proxy to the insert() method of the offical BigQuery + python client. + + Able to submit load, link, query, copy, or extract jobs. + + For more details, see: + https://google-api-client-libraries.appspot.com/documentation/bigquery/v2/python/latest/bigquery_v2.jobs.html#insert + + Parameters + ---------- + body_object : body object passed to bigquery.jobs().insert() + + Returns + ------- + response of the bigquery.jobs().insert().execute() call + + Raises + ------ + BigQueryTimeoutException on timeout + """ + + logger.debug('Submitting job: %s' % body_object) + + job_collection = self.bigquery.jobs() + + return job_collection.insert( + projectId=self.project_id, + body=body_object + ).execute() + + def query(self, query, max_results=None, timeout=0, dry_run=False, use_legacy_sql=None, external_udf_uris=None): + """Submit a query to BigQuery. + + Parameters + ---------- + query : str + BigQuery query string + max_results : int, optional + The maximum number of rows to return per page of results. + timeout : float, optional + How long to wait for the query to complete, in seconds before + the request times out and returns. + dry_run : bool, optional + If True, the query isn't actually run. A valid query will return an + empty response, while an invalid one will return the same error + message it would if it wasn't a dry run. + use_legacy_sql : bool, optional. Default True. + If False, the query will use BigQuery's standard SQL (https://cloud.google.com/bigquery/sql-reference/) + external_udf_uris : list, optional + Contains external UDF URIs. If given, URIs must be Google Cloud + Storage and have .js extensions. + + + Returns + ------- + tuple + (job id, query results) if the query completed. If dry_run is True, + job id will be None and results will be empty if the query is valid + or a ``dict`` containing the response if invalid. + + Raises + ------ + BigQueryTimeoutException + on timeout + """ + + logger.debug('Executing query: %s' % query) + + query_data = { + 'query': query, + 'timeoutMs': timeout * 1000, + 'dryRun': dry_run, + 'maxResults': max_results + } + + if use_legacy_sql is not None: + query_data['useLegacySql'] = use_legacy_sql + + if external_udf_uris: + query_data['userDefinedFunctionResources'] = \ + [ {'resourceUri': u} for u in external_udf_uris ] + + return self._submit_query_job(query_data) + + def get_query_schema(self, job_id): + """Retrieve the schema of a query by job id. 
+ + Parameters + ---------- + job_id : str + The job_id that references a BigQuery query + + Returns + ------- + list + A ``list`` of ``dict`` objects that represent the schema. + """ + + query_reply = self.get_query_results(job_id, offset=0, limit=0) + + if not query_reply['jobComplete']: + logger.warning('BigQuery job %s not complete' % job_id) + raise UnfinishedQueryException() + + return query_reply['schema']['fields'] + + def get_table_schema(self, dataset, table): + """Return the table schema. + + Parameters + ---------- + dataset : str + The dataset containing the `table`. + table : str + The table to get the schema for + + Returns + ------- + list + A ``list`` of ``dict`` objects that represent the table schema. If + the table doesn't exist, None is returned. + """ + + try: + result = self.bigquery.tables().get( + projectId=self.project_id, + tableId=table, + datasetId=dataset).execute() + except HttpError as e: + if int(e.resp['status']) == 404: + logger.warn('Table %s.%s does not exist', dataset, table) + return None + raise + + return result['schema']['fields'] + + def check_job(self, job_id): + """Return the state and number of results of a query by job id. + + Parameters + ---------- + job_id : str + The job id of the query to check. + + Returns + ------- + tuple + (``bool``, ``int``) Whether or not the query has completed and the + total number of rows included in the query table if it has + completed (else 0) + """ + + query_reply = self.get_query_results(job_id, offset=0, limit=0) + + return (query_reply.get('jobComplete', False), + int(query_reply.get('totalRows', 0))) + + def get_query_rows(self, job_id, offset=None, limit=None, timeout=0): + """Retrieve a list of rows from a query table by job id. + This method will append results from multiple pages together. If you + want to manually page through results, you can use `get_query_results` + method directly. + + Parameters + ---------- + job_id : str + The job id that references a BigQuery query. + offset : int, optional + The offset of the rows to pull from BigQuery + limit : int, optional + The number of rows to retrieve from a query table. + timeout : float, optional + Timeout in seconds. + + Returns + ------- + list + A ``list`` of ``dict`` objects that represent table rows. + """ + + # Get query results + query_reply = self.get_query_results(job_id, offset=offset, + limit=limit, timeout=timeout) + if not query_reply['jobComplete']: + logger.warning('BigQuery job %s not complete' % job_id) + raise UnfinishedQueryException() + + schema = query_reply["schema"]["fields"] + rows = query_reply.get('rows', []) + page_token = query_reply.get("pageToken") + records = [self._transform_row(row, schema) for row in rows] + + # Append to records if there are multiple pages for query results + while page_token and (not limit or len(records) < limit): + query_reply = self.get_query_results( + job_id, offset=offset, limit=limit, page_token=page_token, + timeout=timeout) + page_token = query_reply.get("pageToken") + rows = query_reply.get('rows', []) + records += [self._transform_row(row, schema) for row in rows] + return records[:limit] if limit else records + + def check_dataset(self, dataset_id): + """Check to see if a dataset exists. 
+ + Parameters + ---------- + dataset_id : str + Dataset unique id + + Returns + ------- + bool + True if dataset at `dataset_id` exists, else Fasle + """ + dataset = self.get_dataset(dataset_id) + return bool(dataset) + + def get_dataset(self, dataset_id): + """Retrieve a dataset if it exists, otherwise return an empty dict. + + Parameters + ---------- + dataset_id : str + Dataset unique id + + Returns + ------- + dict + Contains dataset object if it exists, else empty + """ + try: + dataset = self.bigquery.datasets().get( + projectId=self.project_id, datasetId=dataset_id).execute() + except HttpError: + dataset = {} + + return dataset + + def check_table(self, dataset, table): + """Check to see if a table exists. + + Parameters + ---------- + dataset : str + The dataset to check + table : str + The name of the table + + Returns + ------- + bool + True if table exists, else False + """ + table = self.get_table(dataset, table) + return bool(table) + + def get_table(self, dataset, table): + """ Retrieve a table if it exists, otherwise return an empty dict. + + Parameters + ---------- + dataset : str + The dataset that the table is in + table : str + The name of the table + + Returns + ------- + dict + Containing the table object if it exists, else empty + """ + try: + table = self.bigquery.tables().get( + projectId=self.project_id, datasetId=dataset, + tableId=table).execute() + except HttpError: + table = {} + + return table + + def create_table(self, dataset, table, schema, + expiration_time=None, time_partitioning=False): + """Create a new table in the dataset. + + Parameters + ---------- + dataset : str + The dataset to create the table in + table : str + The name of the table to create + schema : dict + The table schema + expiration_time : float, optional + The expiry time in milliseconds since the epoch. + time_partitioning : bool, optional + Create a time partitioning. + + Returns + ------- + Union[bool, dict] + If the table was successfully created, or response from BigQuery + if swallow_results is set to False + """ + + body = { + 'schema': {'fields': schema}, + 'tableReference': { + 'tableId': table, + 'projectId': self.project_id, + 'datasetId': dataset + } + } + + if expiration_time is not None: + body['expirationTime'] = expiration_time + + if time_partitioning: + body['timePartitioning'] = {'type': 'DAY'} + + try: + table = self.bigquery.tables().insert( + projectId=self.project_id, + datasetId=dataset, + body=body + ).execute() + if self.swallow_results: + return True + else: + return table + + except HttpError as e: + logger.error(('Cannot create table {0}.{1}\n' + 'Http Error: {2}').format(dataset, table, e.content)) + if self.swallow_results: + return False + else: + return {} + + def update_table(self, dataset, table, schema): + """Update an existing table in the dataset. + + Parameters + ---------- + dataset : str + The dataset to update the table in + table : str + The name of the table to update + schema : dict + Table schema + + Returns + ------- + Union[bool, dict] + bool indicating if the table was successfully updated or not, + or response from BigQuery if swallow_results is set to False. 
+ """ + + body = { + 'schema': {'fields': schema}, + 'tableReference': { + 'tableId': table, + 'projectId': self.project_id, + 'datasetId': dataset + } + } + + try: + result = self.bigquery.tables().update( + projectId=self.project_id, + datasetId=dataset, + body=body + ).execute() + if self.swallow_results: + return True + else: + return result + + except HttpError as e: + logger.error(('Cannot update table {0}.{1}\n' + 'Http Error: {2}').format(dataset, table, e.content)) + if self.swallow_results: + return False + else: + return {} + + def patch_table(self, dataset, table, schema): + """Patch an existing table in the dataset. + + Parameters + ---------- + dataset : str + The dataset to patch the table in + table : str + The name of the table to patch + schema : dict + The table schema + + Returns + ------- + Union[bool, dict] + Bool indicating if the table was successfully patched or not, + or response from BigQuery if swallow_results is set to False + """ + + body = { + 'schema': {'fields': schema}, + 'tableReference': { + 'tableId': table, + 'projectId': self.project_id, + 'datasetId': dataset + } + } + + try: + result = self.bigquery.tables().patch( + projectId=self.project_id, + datasetId=dataset, + body=body + ).execute() + if self.swallow_results: + return True + else: + return result + + except HttpError as e: + logger.error(('Cannot patch table {0}.{1}\n' + 'Http Error: {2}').format(dataset, table, e.content)) + if self.swallow_results: + return False + else: + return {} + + def create_view(self, dataset, view, query, use_legacy_sql=None): + """Create a new view in the dataset. + + Parameters + ---------- + dataset : str + The dataset to create the view in + view : str + The name of the view to create + query : dict + A query that BigQuery executes when the view is referenced. + use_legacy_sql : bool, optional + If False, the query will use BigQuery's standard SQL + (https://cloud.google.com/bigquery/sql-reference/) + + Returns + ------- + Union[bool, dict] + bool indicating if the view was successfully created or not, + or response from BigQuery if swallow_results is set to False. + """ + + body = { + 'tableReference': { + 'tableId': view, + 'projectId': self.project_id, + 'datasetId': dataset + }, + 'view': { + 'query': query + } + } + + if use_legacy_sql is not None: + body['view']['useLegacySql'] = use_legacy_sql + + try: + view = self.bigquery.tables().insert( + projectId=self.project_id, + datasetId=dataset, + body=body + ).execute() + if self.swallow_results: + return True + else: + return view + + except HttpError as e: + logger.error(('Cannot create view {0}.{1}\n' + 'Http Error: {2}').format(dataset, view, e.content)) + if self.swallow_results: + return False + else: + return {} + + def delete_table(self, dataset, table): + """Delete a table from the dataset. + + Parameters + ---------- + dataset : str + The dataset to delete the table from. + table : str + The name of the table to delete + + Returns + ------- + Union[bool, dict] + bool indicating if the table was successfully deleted or not, + or response from BigQuery if swallow_results is set for False. 
+ """ + + try: + response = self.bigquery.tables().delete( + projectId=self.project_id, + datasetId=dataset, + tableId=table + ).execute() + if self.swallow_results: + return True + else: + return response + + except HttpError as e: + logger.error(('Cannot delete table {0}.{1}\n' + 'Http Error: {2}').format(dataset, table, e.content)) + if self.swallow_results: + return False + else: + return {} + + def get_tables(self, dataset_id, app_id, start_time, end_time): + """Retrieve a list of tables that are related to the given app id + and are inside the range of start and end times. + + Parameters + ---------- + dataset_id : str + The BigQuery dataset id to consider. + app_id : str + The appspot name + start_time : Union[datetime, int] + The datetime or unix time after which records will be fetched. + end_time : Union[datetime, int] + The datetime or unix time up to which records will be fetched. + + Returns + ------- + list + A ``list`` of table names. + """ + + if isinstance(start_time, datetime): + start_time = calendar.timegm(start_time.utctimetuple()) + + if isinstance(end_time, datetime): + end_time = calendar.timegm(end_time.utctimetuple()) + + every_table = self._get_all_tables(dataset_id) + app_tables = every_table.get(app_id, {}) + + return self._filter_tables_by_time(app_tables, start_time, end_time) + + def import_data_from_uris( + self, + source_uris, + dataset, + table, + schema=None, + job=None, + source_format=None, + create_disposition=None, + write_disposition=None, + encoding=None, + ignore_unknown_values=None, + max_bad_records=None, + allow_jagged_rows=None, + allow_quoted_newlines=None, + field_delimiter=None, + quote=None, + skip_leading_rows=None, + ): + """ + Imports data into a BigQuery table from cloud storage. Optional + arguments that are not specified are determined by BigQuery as + described: + https://developers.google.com/bigquery/docs/reference/v2/jobs + + Parameters + ---------- + source_urls : list + A ``list`` of ``str`` objects representing the urls on cloud + storage of the form: gs://bucket/filename + dataset : str + String id of the dataset + table : str + String id of the table + job : str, optional + Identifies the job (a unique job id is automatically generated if + not provided) + schema : list, optional + Represents the BigQuery schema + source_format : str, optional + One of the JOB_SOURCE_FORMAT_* constants + create_disposition : str, optional + One of the JOB_CREATE_* constants + write_disposition : str, optional + One of the JOB_WRITE_* constants + encoding : str, optional + One of the JOB_ENCODING_* constants + ignore_unknown_values : bool, optional + Whether or not to ignore unknown values + max_bad_records : int, optional + Maximum number of bad records + allow_jagged_rows : bool, optional + For csv only + allow_quoted_newlines : bool, optional + For csv only + field_delimiter : str, optional + For csv only + quote : str, optional + Quote character for csv only + skip_leading_rows : int, optional + For csv only + + Returns + ------- + dict + A BigQuery job response + + Raises + ------ + JobInsertException + on http/auth failures or error in result + """ + source_uris = source_uris if isinstance(source_uris, list) \ + else [source_uris] + + configuration = { + "destinationTable": { + "projectId": self.project_id, + "tableId": table, + "datasetId": dataset + }, + "sourceUris": source_uris, + } + + if max_bad_records: + configuration['maxBadRecords'] = max_bad_records + + if ignore_unknown_values: + configuration['ignoreUnknownValues'] = 
ignore_unknown_values + + if create_disposition: + configuration['createDisposition'] = create_disposition + + if write_disposition: + configuration['writeDisposition'] = write_disposition + + if encoding: + configuration['encoding'] = encoding + + if schema: + configuration['schema'] = {'fields': schema} + + if source_format: + configuration['sourceFormat'] = source_format + + if not job: + hex = self._generate_hex_for_uris(source_uris) + job = "{dataset}-{table}-{digest}".format( + dataset=dataset, + table=table, + digest=hex + ) + + if source_format == JOB_SOURCE_FORMAT_CSV: + if field_delimiter: + configuration['fieldDelimiter'] = field_delimiter + + if allow_jagged_rows: + configuration['allowJaggedRows'] = allow_jagged_rows + + if allow_quoted_newlines: + configuration['allowQuotedNewlines'] = allow_quoted_newlines + + if quote: + configuration['quote'] = quote + + if skip_leading_rows: + configuration['skipLeadingRows'] = skip_leading_rows + + elif field_delimiter or allow_jagged_rows \ + or allow_quoted_newlines or quote or skip_leading_rows: + all_values = dict(field_delimiter=field_delimiter, + allow_jagged_rows=allow_jagged_rows, + allow_quoted_newlines=allow_quoted_newlines, + skip_leading_rows=skip_leading_rows, + quote=quote) + non_null_values = dict((k, v) for k, v + in list(all_values.items()) + if v) + raise Exception("Parameters field_delimiter, allow_jagged_rows, " + "allow_quoted_newlines, quote and " + "skip_leading_rows are only allowed when " + "source_format=JOB_SOURCE_FORMAT_CSV: %s" + % non_null_values) + + body = { + "configuration": { + 'load': configuration + }, + "jobReference": { + "projectId": self.project_id, + "jobId": job + } + } + + logger.debug("Creating load job %s" % body) + job_resource = self._insert_job(body) + self._raise_insert_exception_if_error(job_resource) + return job_resource + + def export_data_to_uris( + self, + destination_uris, + dataset, + table, + job=None, + compression=None, + destination_format=None, + print_header=None, + field_delimiter=None, + ): + """ + Export data from a BigQuery table to cloud storage. 
Optional arguments + that are not specified are determined by BigQuery as described: + https://developers.google.com/bigquery/docs/reference/v2/jobs + + Parameters + ---------- + destination_urls : Union[str, list] + ``str`` or ``list`` of ``str`` objects representing the URIs on + cloud storage of the form: gs://bucket/filename + dataset : str + String id of the dataset + table : str + String id of the table + job : str, optional + String identifying the job (a unique jobid is automatically + generated if not provided) + compression : str, optional + One of the JOB_COMPRESSION_* constants + destination_format : str, optional + One of the JOB_DESTination_FORMAT_* constants + print_header : bool, optional + Whether or not to print the header + field_delimiter : str, optional + Character separating fields in delimited file + + Returns + ------- + dict + A BigQuery job resource + + Raises + ------ + JobInsertException + On http/auth failures or error in result + """ + destination_uris = destination_uris \ + if isinstance(destination_uris, list) else [destination_uris] + + configuration = { + "sourceTable": { + "projectId": self.project_id, + "tableId": table, + "datasetId": dataset + }, + "destinationUris": destination_uris, + } + + if compression: + configuration['compression'] = compression + + if destination_format: + configuration['destinationFormat'] = destination_format + + if print_header is not None: + configuration['printHeader'] = print_header + + if field_delimiter: + configuration['fieldDelimiter'] = field_delimiter + + if not job: + hex = self._generate_hex_for_uris(destination_uris) + job = "{dataset}-{table}-{digest}".format( + dataset=dataset, + table=table, + digest=hex + ) + + body = { + "configuration": { + 'extract': configuration + }, + "jobReference": { + "projectId": self.project_id, + "jobId": job + } + } + + logger.info("Creating export job %s" % body) + job_resource = self._insert_job(body) + self._raise_insert_exception_if_error(job_resource) + return job_resource + + def write_to_table( + self, + query, + dataset=None, + table=None, + external_udf_uris=None, + allow_large_results=None, + use_query_cache=None, + priority=None, + create_disposition=None, + write_disposition=None, + use_legacy_sql=None, + maximum_billing_tier=None, + flatten=None + ): + """ + Write query result to table. If dataset or table is not provided, + Bigquery will write the result to temporary table. Optional arguments + that are not specified are determined by BigQuery as described: + https://developers.google.com/bigquery/docs/reference/v2/jobs + + Parameters + ---------- + query : str + BigQuery query string + dataset : str, optional + String id of the dataset + table : str, optional + String id of the table + external_udf_uris : list, optional + Contains external UDF URIs. If given, URIs must be Google Cloud + Storage and have .js extensions. + allow_large_results : bool, optional + Whether or not to allow large results + use_query_cache : bool, optional + Whether or not to use query cache + priority : str, optional + One of the JOB_PRIORITY_* constants + create_disposition : str, optional + One of the JOB_CREATE_* constants + write_disposition : str, optional + One of the JOB_WRITE_* constants + use_legacy_sql: bool, optional + If False, the query will use BigQuery's standard SQL + (https://cloud.google.com/bigquery/sql-reference/) + maximum_billing_tier : integer, optional + Limits the billing tier for this job. 
Queries that have resource + usage beyond this tier will fail (without incurring a charge). If + unspecified, this will be set to your project default. For more + information, + see https://cloud.google.com/bigquery/pricing#high-compute + flatten : bool, optional + Whether or not to flatten nested and repeated fields + in query results + + Returns + ------- + dict + A BigQuery job resource + + Raises + ------ + JobInsertException + On http/auth failures or error in result + """ + + configuration = { + "query": query, + } + + if dataset and table: + configuration['destinationTable'] = { + "projectId": self.project_id, + "tableId": table, + "datasetId": dataset + } + + if allow_large_results is not None: + configuration['allowLargeResults'] = allow_large_results + + if flatten is not None: + configuration['flattenResults'] = flatten + + if maximum_billing_tier is not None: + configuration['maximumBillingTier'] = maximum_billing_tier + + if use_query_cache is not None: + configuration['useQueryCache'] = use_query_cache + + if use_legacy_sql is not None: + configuration['useLegacySql'] = use_legacy_sql + + if priority: + configuration['priority'] = priority + + if create_disposition: + configuration['createDisposition'] = create_disposition + + if write_disposition: + configuration['writeDisposition'] = write_disposition + + if external_udf_uris: + configuration['userDefinedFunctionResources'] = \ + [ {'resourceUri': u} for u in external_udf_uris ] + + body = { + "configuration": { + 'query': configuration + } + } + + logger.info("Creating write to table job %s" % body) + job_resource = self._insert_job(body) + self._raise_insert_exception_if_error(job_resource) + return job_resource + + def wait_for_job(self, job, interval=5, timeout=60): + """ + Waits until the job indicated by job_resource is done or has failed + + Parameters + ---------- + job : Union[dict, str] + ``dict`` representing a BigQuery job resource, or a ``str`` + representing the BigQuery job id + interval : float, optional + Polling interval in seconds, default = 5 + timeout : float, optional + Timeout in seconds, default = 60 + + Returns + ------- + dict + Final state of the job resouce, as described here: + https://developers.google.com/resources/api-libraries/documentation/bigquery/v2/python/latest/bigquery_v2.jobs.html#get + + Raises + ------ + Union[JobExecutingException, BigQueryTimeoutException] + On http/auth failures or timeout + """ + complete = False + job_id = str(job if isinstance(job, + (six.binary_type, six.text_type, int)) + else job['jobReference']['jobId']) + job_resource = None + + start_time = time() + elapsed_time = 0 + while not (complete or elapsed_time > timeout): + sleep(interval) + request = self.bigquery.jobs().get(projectId=self.project_id, + jobId=job_id) + job_resource = request.execute() + self._raise_executing_exception_if_error(job_resource) + complete = job_resource.get('status').get('state') == u'DONE' + elapsed_time = time() - start_time + + # raise exceptions if timeout + if not complete: + logger.error('BigQuery job %s timeout' % job_id) + raise BigQueryTimeoutException() + + return job_resource + + def push_rows(self, dataset, table, rows, insert_id_key=None, + skip_invalid_rows=None, ignore_unknown_values=None, + template_suffix=None): + """Upload rows to BigQuery table. 
+ + Parameters + ---------- + dataset : str + The dataset to upload to + table : str + The name of the table to insert rows into + rows : list + A ``list`` of rows (``dict`` objects) to add to the table + insert_id_key : str, optional + Key for insertId in row. + You can use dot separated key for nested column. + skip_invalid_rows : bool, optional + Insert all valid rows of a request, even if invalid rows exist. + ignore_unknown_values : bool, optional + Accept rows that contain values that do not match the schema. + template_suffix : str, optional + Inserts the rows into an {table}{template_suffix}. + If table {table}{template_suffix} doesn't exist, create from {table}. + + Returns + ------- + Union[bool, dict] + bool indicating if insert succeeded or not, or response + from BigQuery if swallow_results is set for False. + """ + + table_data = self.bigquery.tabledata() + + rows_data = [] + for row in rows: + each_row = {} + each_row["json"] = row + if insert_id_key is not None: + keys = insert_id_key.split('.') + val = reduce(lambda d, key: d.get(key) if d else None, keys, row) + if val is not None: + each_row["insertId"] = val + rows_data.append(each_row) + + data = { + "kind": "bigquery#tableDataInsertAllRequest", + "rows": rows_data + } + + if skip_invalid_rows is not None: + data['skipInvalidRows'] = skip_invalid_rows + + if ignore_unknown_values is not None: + data['ignoreUnknownValues'] = ignore_unknown_values + + if template_suffix is not None: + data['templateSuffix'] = template_suffix + + try: + response = table_data.insertAll( + projectId=self.project_id, + datasetId=dataset, + tableId=table, + body=data + ).execute() + + if response.get('insertErrors'): + logger.error('BigQuery insert errors: %s' % response) + if self.swallow_results: + return False + else: + return response + + if self.swallow_results: + return True + else: + return response + + except HttpError as e: + logger.exception('Problem with BigQuery insertAll') + if self.swallow_results: + return False + else: + return { + 'insertErrors': [{ + 'errors': [{ + 'reason': 'httperror', + 'message': e + }] + }] + } + + def get_all_tables(self, dataset_id): + """Retrieve a list of tables for the dataset. + + Parameters + ---------- + dataset_id : str + The dataset to retrieve table data for. + + Returns + ------- + A ``list`` with all table names + """ + tables_data = self._get_all_tables_for_dataset(dataset_id) + + tables = [] + for table in tables_data.get('tables', []): + table_name = table.get('tableReference', {}).get('tableId') + if table_name: + tables.append(table_name) + return tables + + def _get_all_tables(self, dataset_id, cache=False): + """Retrieve the list of tables for dataset, that respect the formats: + * appid_YYYY_MM + * YYYY_MM_appid + + Parameters + ---------- + dataset_id : str + The dataset to retrieve table names for + cache : bool, optional + To use cached value or not (default False). Timeout value equals + CACHE_TIMEOUT. + + Returns + ------- + dict + A ``dict`` of app ids mapped to their table names + """ + do_fetch = True + if cache and self.cache.get(dataset_id): + time, result = self.cache.get(dataset_id) + if datetime.now() - time < CACHE_TIMEOUT: + do_fetch = False + + if do_fetch: + result = self._get_all_tables_for_dataset(dataset_id) + self.cache[dataset_id] = (datetime.now(), result) + + return self._parse_table_list_response(result) + + def _get_all_tables_for_dataset(self, dataset_id): + """Retrieve a list of all tables for the dataset. 
+ + Parameters + ---------- + dataset_id : str + The dataset to retrieve table names for + + Returns + ------- + dict + A ``dict`` containing tables key with all tables + """ + result = self.bigquery.tables().list( + projectId=self.project_id, + datasetId=dataset_id).execute() + + page_token = result.get('nextPageToken') + while page_token: + res = self.bigquery.tables().list( + projectId=self.project_id, + datasetId=dataset_id, + pageToken=page_token + ).execute() + page_token = res.get('nextPageToken') + result['tables'] += res.get('tables', []) + return result + + def _parse_table_list_response(self, list_response): + """Parse the response received from calling list on tables. + + Parameters + ---------- + list_response + The response found by calling list on a BigQuery table object. + + Returns + ------- + dict + Dates referenced by table names + """ + + tables = defaultdict(dict) + + for table in list_response.get('tables', []): + table_ref = table.get('tableReference') + + if not table_ref: + continue + + table_id = table_ref.get('tableId', '') + + year_month, app_id = self._parse_table_name(table_id) + + if not year_month: + continue + + table_date = datetime.strptime(year_month, '%Y-%m') + unix_seconds = calendar.timegm(table_date.timetuple()) + tables[app_id].update({table_id: unix_seconds}) + + # Turn off defualting + tables.default_factory = None + + return tables + + def _parse_table_name(self, table_id): + """Parse a table name in the form of appid_YYYY_MM or + YYYY_MM_appid and return a tuple consisting of YYYY-MM and the app id. + + Parameters + ---------- + table_id : str + The table id as listed by BigQuery + + Returns + ------- + tuple + (year/month, app id), or (None, None) if the table id cannot be + parsed. + """ + + # Prefix date + attributes = table_id.split('_') + year_month = "-".join(attributes[:2]) + app_id = "-".join(attributes[2:]) + + # Check if date parsed correctly + if year_month.count("-") == 1 and all( + [num.isdigit() for num in year_month.split('-')]): + return year_month, app_id + + # Postfix date + attributes = table_id.split('_') + year_month = "-".join(attributes[-2:]) + app_id = "-".join(attributes[:-2]) + + # Check if date parsed correctly + if year_month.count("-") == 1 and all( + [num.isdigit() for num in year_month.split('-')]): + return year_month, app_id + + return None, None + + def _filter_tables_by_time(self, tables, start_time, end_time): + """Filter a table dictionary and return table names based on the range + of start and end times in unix seconds. + + Parameters + ---------- + tables : dict + Dates referenced by table names + start_time : int + The unix time after which records will be fetched + end_time : int + The unix time up to which records will be fetched + + Returns + ------- + list + Table names that are inside the time range + """ + + return [table_name for (table_name, unix_seconds) in tables.items() + if self._in_range(start_time, end_time, unix_seconds)] + + def _in_range(self, start_time, end_time, time): + """Indicate if the given time falls inside of the given range. + + Parameters + ---------- + start_time : int + The unix time for the start of the range + end_time : int + The unix time for the end of the range + time : int + The unix time to check + + Returns + ------- + bool + True if the time falls within the range, False otherwise. 
+ """ + + ONE_MONTH = 2764800 # 32 days + + return start_time <= time <= end_time or \ + time <= start_time <= time + ONE_MONTH or \ + time <= end_time <= time + ONE_MONTH + + def get_query_results(self, job_id, offset=None, limit=None, + page_token=None, timeout=0): + """Execute the query job indicated by the given job id. This is direct + mapping to bigquery api + https://cloud.google.com/bigquery/docs/reference/v2/jobs/getQueryResults + + Parameters + ---------- + job_id : str + The job id of the query to check + offset : optional + The index the result set should start at. + limit : int, optional + The maximum number of results to retrieve. + page_token : optional + Page token, returned by previous call, to request the next page of + results. + timeout : float, optional + Timeout in seconds + + Returns + ------- + out + The query reply + """ + + job_collection = self.bigquery.jobs() + return job_collection.getQueryResults( + projectId=self.project_id, + jobId=job_id, + startIndex=offset, + maxResults=limit, + pageToken=page_token, + timeoutMs=timeout * 1000).execute() + + def _transform_row(self, row, schema): + """Apply the given schema to the given BigQuery data row. + + Parameters + ---------- + row + A single BigQuery row to transform + schema : list + The BigQuery table schema to apply to the row, specifically + the list of field dicts. + + Returns + ------- + dict + Mapping schema to row + """ + + log = {} + + # Match each schema column with its associated row value + for index, col_dict in enumerate(schema): + col_name = col_dict['name'] + row_value = row['f'][index]['v'] + + if row_value is None: + log[col_name] = None + continue + + # Recurse on nested records + if col_dict['type'] == 'RECORD': + row_value = self._recurse_on_row(col_dict, row_value) + + # Otherwise just cast the value + elif col_dict['type'] == 'INTEGER': + row_value = int(row_value) + + elif col_dict['type'] == 'FLOAT': + row_value = float(row_value) + + elif col_dict['type'] == 'BOOLEAN': + row_value = row_value in ('True', 'true', 'TRUE') + + elif col_dict['type'] == 'TIMESTAMP': + row_value = float(row_value) + + log[col_name] = row_value + + return log + + def _recurse_on_row(self, col_dict, nested_value): + """Apply the schema specified by the given dict to the nested value by + recursing on it. + + Parameters + ---------- + col_dict : dict + The schema to apply to the nested value. + nested_value : A value nested in a BigQuery row. + + Returns + ------- + Union[dict, list] + ``dict`` or ``list`` of ``dict`` objects from applied schema. 
+ """ + + row_value = None + + # Multiple nested records + if col_dict['mode'] == 'REPEATED' and isinstance(nested_value, list): + row_value = [self._transform_row(record['v'], col_dict['fields']) + for record in nested_value] + + # A single nested record + else: + row_value = self._transform_row(nested_value, col_dict['fields']) + + return row_value + + def _generate_hex_for_uris(self, uris): + """Given uris, generate and return hex version of it + + Parameters + ---------- + uris : list + Containing all uris + + Returns + ------- + str + Hexed uris + """ + return sha256((":".join(uris) + str(time())).encode()).hexdigest() + + def _raise_insert_exception_if_error(self, job): + error_http = job.get('error') + if error_http: + raise JobInsertException( + "Error in export job API request: {0}".format(error_http)) + # handle errorResult - API request is successful but error in result + error_result = job.get('status').get('errorResult') + if error_result: + raise JobInsertException( + "Reason:{reason}. Message:{message}".format(**error_result)) + + def _raise_executing_exception_if_error(self, job): + error_http = job.get('error') + if error_http: + raise JobExecutingException( + "Error in export job API request: {0}".format(error_http)) + # handle errorResult - API request is successful but error in result + error_result = job.get('status').get('errorResult') + if error_result: + raise JobExecutingException( + "Reason:{reason}. Message:{message}".format(**error_result)) + + # + # DataSet manipulation methods + # + def create_dataset(self, dataset_id, friendly_name=None, description=None, + access=None, location=None): + """Create a new BigQuery dataset. + + Parameters + ---------- + dataset_id : str + Unique ``str`` identifying the dataset with the project (the + referenceID of the dataset, not the integer id of the dataset) + friendly_name: str, optional + A human readable name + description: str, optional + Longer string providing a description + access : list, optional + Indicating access permissions (see + https://developers.google.com/bigquery/docs/reference/v2/datasets#resource) + location : str, optional + Indicating where dataset should be stored: EU or US (see + https://developers.google.com/bigquery/docs/reference/v2/datasets#resource) + + Returns + ------- + Union[bool, dict] + ``bool`` indicating if dataset was created or not, or response + from BigQuery if swallow_results is set for False + """ + try: + datasets = self.bigquery.datasets() + dataset_data = self.dataset_resource(dataset_id, + friendly_name=friendly_name, + description=description, + access=access, + location=location) + + response = datasets.insert(projectId=self.project_id, + body=dataset_data).execute() + if self.swallow_results: + return True + else: + return response + except HttpError as e: + logger.error( + 'Cannot create dataset {0}, {1}'.format(dataset_id, e)) + if self.swallow_results: + return False + else: + return {} + + def get_datasets(self): + """List all datasets in the project. + + Returns + ------- + list + Dataset resources + """ + try: + datasets = self.bigquery.datasets() + request = datasets.list(projectId=self.project_id) + result = request.execute() + return result.get('datasets', []) + except HttpError as e: + logger.error("Cannot list datasets: {0}".format(e)) + return None + + def delete_dataset(self, dataset_id, delete_contents=False): + """Delete a BigQuery dataset. 
+ + Parameters + ---------- + dataset_id : str + Unique ``str`` identifying the datset with the project (the + referenceId of the dataset) + delete_contents : bool, optional + If True, forces the deletion of the dataset even when the dataset + contains data (Default = False) + + Returns + ------- + Union[bool, dict[ + ool indicating if the delete was successful or not, or response + from BigQuery if swallow_results is set for False + + Raises + ------- + HttpError + 404 when dataset with dataset_id does not exist + """ + try: + datasets = self.bigquery.datasets() + request = datasets.delete(projectId=self.project_id, + datasetId=dataset_id, + deleteContents=delete_contents) + response = request.execute() + if self.swallow_results: + return True + else: + return response + except HttpError as e: + logger.error( + 'Cannot delete dataset {0}: {1}'.format(dataset_id, e)) + if self.swallow_results: + return False + else: + return {} + + def update_dataset(self, dataset_id, friendly_name=None, description=None, + access=None): + """Updates information in an existing dataset. The update method + replaces the entire dataset resource, whereas the patch method only + replaces fields that are provided in the submitted dataset resource. + + Parameters + ---------- + dataset_id : str + Unique ``str`` identifying the dataset with the project (the + referencedId of the dataset) + friendly_name : str, optional + An optional descriptive name for the dataset. + description : str, optional + An optional description of the dataset. + access : list, optional + Indicating access permissions + + Returns + ------- + Union[bool, dict] + ``bool`` indicating if the update was successful or not, or + response from BigQuery if swallow_results is set for False. + """ + try: + datasets = self.bigquery.datasets() + body = self.dataset_resource(dataset_id, friendly_name, + description, access) + request = datasets.update(projectId=self.project_id, + datasetId=dataset_id, + body=body) + response = request.execute() + if self.swallow_results: + return True + else: + return response + except HttpError as e: + logger.error( + 'Cannot update dataset {0}: {1}'.format(dataset_id, e)) + if self.swallow_results: + return False + else: + return {} + + def patch_dataset(self, dataset_id, friendly_name=None, description=None, + access=None): + """Updates information in an existing dataset. The update method + replaces the entire dataset resource, whereas the patch method only + replaces fields that are provided in the submitted dataset resource. + + Parameters + ---------- + dataset_id : str + Unique string idenfitying the dataset with the project (the + referenceId of the dataset) + friendly_name : str, optional + An optional descriptive name for the dataset. + description : str, optional + An optional description of the dataset. + access : list, optional + Indicating access permissions. + + Returns + ------- + Union[bool, dict] + ``bool`` indicating if the patch was successful or not, or response + from BigQuery if swallow_results is set for False. 
+ """ + try: + datasets = self.bigquery.datasets() + body = self.dataset_resource(dataset_id, friendly_name, + description, access) + request = datasets.patch(projectId=self.project_id, + datasetId=dataset_id, body=body) + response = request.execute() + if self.swallow_results: + return True + else: + return response + except HttpError as e: + logger.error('Cannot patch dataset {0}: {1}'.format(dataset_id, e)) + if self.swallow_results: + return False + else: + return {} + + def dataset_resource(self, ref_id, friendly_name=None, description=None, + access=None, location=None): + """See + https://developers.google.com/bigquery/docs/reference/v2/datasets#resource + + Parameters + ---------- + ref_id : str + Dataset id (the reference id, not the integer id) + friendly_name : str, optional + An optional descriptive name for the dataset + description : str, optional + An optional description for the dataset + access : list, optional + Indicating access permissions + location: str, optional, 'EU' or 'US' + An optional geographical location for the dataset(EU or US) + + Returns + ------- + dict + Representing BigQuery dataset resource + """ + data = { + "datasetReference": { + "datasetId": ref_id, + "projectId": self.project_id + } + } + if friendly_name: + data["friendlyName"] = friendly_name + if description: + data["description"] = description + if access: + data["access"] = access + if location: + data["location"] = location + + return data + + @classmethod + def schema_from_record(cls, record): + """Given a dict representing a record instance to be inserted into + BigQuery, calculate the schema. + + Parameters + ---------- + record : dict + representing a record to be inserted into big query, + where all keys are ``str`` objects (representing column names in + the record) and all values are of type ``int``, ``str``, + ``unicode``, ``float``, ``bool``, ``datetime``, or ``dict``. A + ``dict`` value represents a record, and must conform to the same + restrictions as record. + + Returns + ------- + list + BigQuery schema + + Notes + ----- + Results are undefined if a different value type is provided for a + repeated field: E.g. + + >>> { rfield: [ { x: 1}, {x: "a string"} ] } # undefined! + """ + from bigquery.schema_builder import schema_from_record + return schema_from_record(record) diff --git a/bigquery/query_builder.py b/bigquery/query_builder.py index 1054299..b28c1d9 100644 --- a/bigquery/query_builder.py +++ b/bigquery/query_builder.py @@ -81,7 +81,7 @@ def _render_select(selections): return 'SELECT *' rendered_selections = [] - for name, options in selections.items(): + for name, options in list(selections.items()): if not isinstance(options, list): options = [options] diff --git a/bigquery/query_builder.py.bak b/bigquery/query_builder.py.bak new file mode 100644 index 0000000..1054299 --- /dev/null +++ b/bigquery/query_builder.py.bak @@ -0,0 +1,397 @@ +from logging import getLogger, NullHandler + +logger = getLogger(__name__) +logger.addHandler(NullHandler()) + + +def render_query(dataset, tables, select=None, conditions=None, + groupings=None, having=None, order_by=None, limit=None): + """Render a query that will run over the given tables using the specified + parameters. + + Parameters + ---------- + dataset : str + The BigQuery dataset to query data from + tables : Union[dict, list] + The table in `dataset` to query. + select : dict, optional + The keys function as column names and the values function as options to + apply to the select field such as alias and format. 
For example, + select['start_time'] might have the form + {'alias': 'StartTime', 'format': 'INTEGER-FORMAT_UTC_USEC'}, which + would be represented as 'SEC_TO_TIMESTAMP(INTEGER(start_time)) as + StartTime' in a query. Pass `None` to select all. + conditions : list, optional + a ``list`` of ``dict`` objects to filter results by. Each dict should + have the keys 'field', 'type', and 'comparators'. The first two map to + strings representing the field (e.g. 'foo') and type (e.g. 'FLOAT'). + 'comparators' maps to another ``dict`` containing the keys 'condition', + 'negate', and 'value'. + If 'comparators' = {'condition': '>=', 'negate': False, 'value': 1}, + this example will be rdnered as 'foo >= FLOAT('1')' in the query. + ``list`` of field names to group by + order_by : dict, optional + Keys = {'field', 'direction'}. `dict` should be formatted as + {'field':'TimeStamp, 'direction':'desc'} or similar + limit : int, optional + Limit the amount of data needed to be returned. + + Returns + ------- + str + A rendered query + """ + + if None in (dataset, tables): + return None + + query = "%s %s %s %s %s %s %s" % ( + _render_select(select), + _render_sources(dataset, tables), + _render_conditions(conditions), + _render_groupings(groupings), + _render_having(having), + _render_order(order_by), + _render_limit(limit) + ) + + return query + + +def _render_select(selections): + """Render the selection part of a query. + + Parameters + ---------- + selections : dict + Selections for a table + + Returns + ------- + str + A string for the "select" part of a query + + See Also + -------- + render_query : Further clarification of `selections` dict formatting + """ + + if not selections: + return 'SELECT *' + + rendered_selections = [] + for name, options in selections.items(): + if not isinstance(options, list): + options = [options] + + original_name = name + for options_dict in options: + name = original_name + alias = options_dict.get('alias') + alias = "as %s" % alias if alias else "" + + formatter = options_dict.get('format') + if formatter: + name = _format_select(formatter, name) + + rendered_selections.append("%s %s" % (name, alias)) + + return "SELECT " + ", ".join(rendered_selections) + + +def _format_select(formatter, name): + """Modify the query selector by applying any formatters to it. + + Parameters + ---------- + formatter : str + Hyphen-delimited formatter string where formatters are + applied inside-out, e.g. the formatter string + SEC_TO_MICRO-INTEGER-FORMAT_UTC_USEC applied to the selector + foo would result in FORMAT_UTC_USEC(INTEGER(foo*1000000)). + name: str + The name of the selector to apply formatters to. + + Returns + ------- + str + The formatted selector + """ + + for caster in formatter.split('-'): + if caster == 'SEC_TO_MICRO': + name = "%s*1000000" % name + elif ':' in caster: + caster, args = caster.split(':') + name = "%s(%s,%s)" % (caster, name, args) + else: + name = "%s(%s)" % (caster, name) + + return name + + +def _render_sources(dataset, tables): + """Render the source part of a query. + + Parameters + ---------- + dataset : str + The data set to fetch log data from. + tables : Union[dict, list] + The tables to fetch log data from + + Returns + ------- + str + A string that represents the "from" part of a query. 
+ """ + + if isinstance(tables, dict): + if tables.get('date_range', False): + try: + dataset_table = '.'.join([dataset, tables['table']]) + return "FROM (TABLE_DATE_RANGE([{}], TIMESTAMP('{}'),"\ + " TIMESTAMP('{}'))) ".format(dataset_table, + tables['from_date'], + tables['to_date']) + except KeyError as exp: + logger.warn( + 'Missing parameter %s in selecting sources' % (exp)) + + else: + return "FROM " + ", ".join( + ["[%s.%s]" % (dataset, table) for table in tables]) + + +def _render_conditions(conditions): + """Render the conditions part of a query. + + Parameters + ---------- + conditions : list + A list of dictionay items to filter a table. + + Returns + ------- + str + A string that represents the "where" part of a query + + See Also + -------- + render_query : Further clarification of `conditions` formatting. + """ + + if not conditions: + return "" + + rendered_conditions = [] + + for condition in conditions: + field = condition.get('field') + field_type = condition.get('type') + comparators = condition.get('comparators') + + if None in (field, field_type, comparators) or not comparators: + logger.warn('Invalid condition passed in: %s' % condition) + continue + + rendered_conditions.append( + _render_condition(field, field_type, comparators)) + + if not rendered_conditions: + return "" + + return "WHERE %s" % (" AND ".join(rendered_conditions)) + + +def _render_condition(field, field_type, comparators): + """Render a single query condition. + + Parameters + ---------- + field : str + The field the condition applies to + field_type : str + The data type of the field. + comparators : array_like + An iterable of logic operators to use. + + Returns + ------- + str + a condition string. + """ + + field_type = field_type.upper() + + negated_conditions, normal_conditions = [], [] + + for comparator in comparators: + condition = comparator.get("condition").upper() + negated = "NOT " if comparator.get("negate") else "" + value = comparator.get("value") + + if condition == "IN": + if isinstance(value, (list, tuple, set)): + value = ', '.join( + sorted([_render_condition_value(v, field_type) + for v in value]) + ) + else: + value = _render_condition_value(value, field_type) + value = "(" + value + ")" + elif condition == "BETWEEN": + if isinstance(value, (tuple, list, set)) and len(value) == 2: + value = ' AND '.join( + sorted([_render_condition_value(v, field_type) + for v in value]) + ) + elif isinstance(value, (tuple, list, set)) and len(value) != 2: + logger.warn('Invalid condition passed in: %s' % condition) + + else: + value = _render_condition_value(value, field_type) + + rendered_sub_condition = "%s%s %s %s" % ( + negated, field, condition, value) + + if comparator.get("negate"): + negated_conditions.append(rendered_sub_condition) + else: + normal_conditions.append(rendered_sub_condition) + + rendered_normal = " AND ".join(normal_conditions) + rendered_negated = " AND ".join(negated_conditions) + + if rendered_normal and rendered_negated: + return "((%s) AND (%s))" % (rendered_normal, rendered_negated) + + return "(%s)" % (rendered_normal or rendered_negated) + + +def _render_condition_value(value, field_type): + """Render a query condition value. + + Parameters + ---------- + value : Union[bool, int, float, str, datetime] + The value of the condition + field_type : str + The data type of the field + + Returns + ------- + str + A value string. 
+ """ + + # BigQuery cannot cast strings to booleans, convert to ints + if field_type == "BOOLEAN": + value = 1 if value else 0 + elif field_type in ("STRING", "INTEGER", "FLOAT"): + value = "'%s'" % (value) + elif field_type in ("TIMESTAMP"): + value = "'%s'" % (str(value)) + return "%s(%s)" % (field_type, value) + + +def _render_groupings(fields): + """Render the group by part of a query. + + Parameters + ---------- + fields : list + A list of fields to group by. + + Returns + ------- + str + A string that represents the "group by" part of a query. + """ + + if not fields: + return "" + + return "GROUP BY " + ", ".join(fields) + + +def _render_having(having_conditions): + """Render the having part of a query. + + Parameters + ---------- + having_conditions : list + A ``list`` of ``dict``s to filter the rows + + Returns + ------- + str + A string that represents the "having" part of a query. + + See Also + -------- + render_query : Further clarification of `conditions` formatting. + """ + if not having_conditions: + return "" + + rendered_conditions = [] + + for condition in having_conditions: + field = condition.get('field') + field_type = condition.get('type') + comparators = condition.get('comparators') + + if None in (field, field_type, comparators) or not comparators: + logger.warn('Invalid condition passed in: %s' % condition) + continue + + rendered_conditions.append( + _render_condition(field, field_type, comparators)) + + if not rendered_conditions: + return "" + + return "HAVING %s" % (" AND ".join(rendered_conditions)) + + +def _render_order(order): + """Render the order by part of a query. + + Parameters + ---------- + order : dict + A dictionary with two keys, fields and direction. + Such that the dictionary should be formatted as + {'fields': ['TimeStamp'], 'direction':'desc'}. + + Returns + ------- + str + A string that represents the "order by" part of a query. + """ + + if not order or 'fields' not in order or 'direction' not in order: + return '' + + return "ORDER BY %s %s" % (", ".join(order['fields']), order['direction']) + + +def _render_limit(limit): + """Render the limit part of a query. + + Parameters + ---------- + limit : int, optional + Limit the amount of data needed to be returned. + + Returns + ------- + str + A string that represents the "limit" part of a query. + """ + if not limit: + return '' + + return "LIMIT %s" % limit diff --git a/bigquery/schema_builder.py b/bigquery/schema_builder.py index 65027b8..dafda39 100644 --- a/bigquery/schema_builder.py +++ b/bigquery/schema_builder.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import + __author__ = 'Aneil Mallavarapu (http://github.com/aneilbaboo)' from datetime import datetime diff --git a/bigquery/schema_builder.py.bak b/bigquery/schema_builder.py.bak new file mode 100644 index 0000000..65027b8 --- /dev/null +++ b/bigquery/schema_builder.py.bak @@ -0,0 +1,145 @@ +from __future__ import absolute_import +__author__ = 'Aneil Mallavarapu (http://github.com/aneilbaboo)' + +from datetime import datetime + +import six +import dateutil.parser + +from .errors import InvalidTypeException + + +def default_timestamp_parser(s): + try: + if dateutil.parser.parse(s): + return True + else: + return False + except: + return False + + +def schema_from_record(record, timestamp_parser=default_timestamp_parser): + """Generate a BigQuery schema given an example of a record that is to be + inserted into BigQuery. 
+ + Parameters + ---------- + record : dict + Example of a record that is to be inserted into BigQuery + timestamp_parser : function, optional + Unary function taking a ``str`` and returning and ``bool`` that is + True if the string represents a date + + Returns + ------- + Schema: list + """ + return [describe_field(k, v, timestamp_parser=timestamp_parser) + for k, v in list(record.items())] + + +def describe_field(k, v, timestamp_parser=default_timestamp_parser): + """Given a key representing a column name and value representing the value + stored in the column, return a representation of the BigQuery schema + element describing that field. Raise errors if invalid value types are + provided. + + Parameters + ---------- + k : Union[str, unicode] + Key representing the column + v : Union[str, unicode, int, float, datetime, object] + Value mapped to by `k` + + Returns + ------- + object + Describing the field + + Raises + ------ + Exception + If invalid value types are provided. + + Examples + -------- + >>> describe_field("username", "Bob") + {"name": "username", "type": "string", "mode": "nullable"} + >>> describe_field("users", [{"username": "Bob"}]) + {"name": "users", "type": "record", "mode": "repeated", + "fields": [{"name":"username","type":"string","mode":"nullable"}]} + """ + + def bq_schema_field(name, bq_type, mode): + return {"name": name, "type": bq_type, "mode": mode} + + if isinstance(v, list): + if len(v) == 0: + raise Exception( + "Can't describe schema because of empty list {0}:[]".format(k)) + v = v[0] + mode = "repeated" + else: + mode = "nullable" + + bq_type = bigquery_type(v, timestamp_parser=timestamp_parser) + if not bq_type: + raise InvalidTypeException(k, v) + + field = bq_schema_field(k, bq_type, mode) + if bq_type == "record": + try: + field['fields'] = schema_from_record(v, timestamp_parser) + except InvalidTypeException as e: + # recursively construct the key causing the error + raise InvalidTypeException("%s.%s" % (k, e.key), e.value) + + return field + + +def bigquery_type(o, timestamp_parser=default_timestamp_parser): + """Given a value, return the matching BigQuery type of that value. Must be + one of str/unicode/int/float/datetime/record, where record is a dict + containing value which have matching BigQuery types. 
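One behaviour worth spelling out: a plain string is promoted to 'timestamp' whenever the optional timestamp parser accepts it, and passing timestamp_parser=None switches that detection off. A hedged sketch with invented values:

    # Hedged sketch: values are invented for illustration.
    from bigquery.schema_builder import bigquery_type

    bigquery_type(123)                    # 'integer'
    bigquery_type(1.5)                    # 'float'
    bigquery_type(True)                   # 'boolean'
    bigquery_type('just text')            # 'string'
    bigquery_type('2013-05-01 12:00:00')  # 'timestamp' (dateutil accepts it)
    bigquery_type('2013-05-01 12:00:00',
                  timestamp_parser=None)  # 'string' (detection disabled)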
+ + Parameters + ---------- + o : object + A Python object + time_stamp_parser : function, optional + Unary function taking a ``str`` and returning and ``bool`` that is + True if the string represents a date + + Returns + ------- + Union[str, None] + Name of the corresponding BigQuery type for `o`, or None if no type + could be found + + Examples + -------- + >>> bigquery_type("abc") + "string" + >>> bigquery_type(123) + "integer" + """ + + t = type(o) + if t in six.integer_types: + return "integer" + elif (t == six.binary_type and six.PY2) or t == six.text_type: + if timestamp_parser and timestamp_parser(o): + return "timestamp" + else: + return "string" + elif t == float: + return "float" + elif t == bool: + return "boolean" + elif t == dict: + return "record" + elif t == datetime: + return "timestamp" + else: + return None # failed to find a type diff --git a/bigquery/tests/test_client.py b/bigquery/tests/test_client.py index 1315147..bfbe800 100644 --- a/bigquery/tests/test_client.py +++ b/bigquery/tests/test_client.py @@ -655,9 +655,9 @@ def setUp(self): def test_completed_jobs(self): """Ensure we can detect completed jobs""" - return_values = [{'status': {'state': u'RUNNING'}, + return_values = [{'status': {'state': 'RUNNING'}, 'jobReference': {'jobId': "testJob"}}, - {'status': {'state': u'DONE'}, + {'status': {'state': 'DONE'}, 'jobReference': {'jobId': "testJob"}}] def side_effect(*args, **kwargs): @@ -667,7 +667,7 @@ def side_effect(*args, **kwargs): job_resource = self.client.wait_for_job( {'jobReference': {'jobId': "testJob"}, - 'status': {'state': u'RUNNING'}}, + 'status': {'state': 'RUNNING'}}, interval=.01, timeout=.05) @@ -676,7 +676,7 @@ def side_effect(*args, **kwargs): def test_timeout_error(self): """Ensure that timeout raise exceptions""" - incomplete_job = {'status': {'state': u'RUNNING'}, + incomplete_job = {'status': {'state': 'RUNNING'}, 'jobReference': {'jobId': "testJob"}} self.api_mock.jobs().get().execute.return_value = incomplete_job @@ -685,7 +685,7 @@ def test_timeout_error(self): def test_wait_job_http_error(self): """ Test wait job with http error""" - job = {'status': {'state': u'RUNNING'}, + job = {'status': {'state': 'RUNNING'}, 'jobReference': {'jobId': "testJob"}} expected_result = { @@ -709,7 +709,7 @@ def test_wait_job_http_error(self): def test_wait_job_error_result(self): """ Test wait job with error result""" - job = {'status': {'state': u'RUNNING'}, + job = {'status': {'state': 'RUNNING'}, 'jobReference': {'jobId': "testJob"}} expected_result = { @@ -733,9 +733,9 @@ def test_wait_job_error_result(self): def test_accepts_job_id(self): """Ensure it accepts a job Id rather than a full job resource""" - return_values = [{'status': {'state': u'RUNNING'}, + return_values = [{'status': {'state': 'RUNNING'}, 'jobReference': {'jobId': "testJob"}}, - {'status': {'state': u'DONE'}, + {'status': {'state': 'DONE'}, 'jobReference': {'jobId': "testJob"}}] def side_effect(*args, **kwargs): @@ -751,9 +751,9 @@ def side_effect(*args, **kwargs): self.assertIsInstance(job_resource, dict) def test_accepts_integer_job_id(self): - return_values = [{'status': {'state': u'RUNNING'}, + return_values = [{'status': {'state': 'RUNNING'}, 'jobReference': {'jobId': "testJob"}}, - {'status': {'state': u'DONE'}, + {'status': {'state': 'DONE'}, 'jobReference': {'jobId': "testJob"}}] def side_effect(*args, **kwargs): @@ -784,7 +784,7 @@ def setUp(self): def test_csv_job_body_constructed_correctly(self): expected_result = { - 'status': {'state': u'RUNNING'}, + 'status': {'state': 
'RUNNING'}, } body = { @@ -843,7 +843,7 @@ def test_csv_job_body_constructed_correctly(self): def test_json_job_body_constructed_correctly(self): expected_result = { - 'status': {'state': u'RUNNING'}, + 'status': {'state': 'RUNNING'}, } body = { @@ -938,7 +938,7 @@ def test_skip_leading_rows_exception_if_not_csv(self): def test_accepts_single_source_uri(self): """Ensure that a source_uri accepts a non-list""" expected_result = { - 'status': {'state': u'RUNNING'}, + 'status': {'state': 'RUNNING'}, } body = { @@ -1033,7 +1033,7 @@ def setUp(self): def test_export(self, mock_generate_hex): """ Ensure that export is working in normal circumstances """ expected_result = { - 'status': {'state': u'RUNNING'}, + 'status': {'state': 'RUNNING'}, } body = { @@ -1134,7 +1134,7 @@ def setUp(self): def test_write(self): """ Ensure that write is working in normal circumstances.""" expected_result = { - 'status': {'state': u'RUNNING'}, + 'status': {'state': 'RUNNING'}, } body = { @@ -1175,7 +1175,7 @@ def test_write(self): def test_write_maxbilltier(self): """ Ensure that write is working when maximumBillingTier is set""" expected_result = { - 'status': {'state': u'RUNNING'}, + 'status': {'state': 'RUNNING'}, } body = { diff --git a/bigquery/tests/test_client.py.bak b/bigquery/tests/test_client.py.bak new file mode 100644 index 0000000..1315147 --- /dev/null +++ b/bigquery/tests/test_client.py.bak @@ -0,0 +1,2902 @@ +import unittest + +import mock +import six +from bigquery import client +from bigquery.errors import ( + JobInsertException, JobExecutingException, + BigQueryTimeoutException +) +from googleapiclient.errors import HttpError +from nose.tools import raises + + +class HttpResponse(object): + def __init__(self, status, reason='There was an error'): + """ + Args: + :param int status: Integer HTTP response status + """ + self.status = status + self.reason = reason + + +class TestGetClient(unittest.TestCase): + def setUp(self): + client._bq_client = None + + self.mock_bq_service = mock.Mock() + self.mock_job_collection = mock.Mock() + + self.mock_bq_service.jobs.return_value = self.mock_job_collection + + self.client = client.BigQueryClient(self.mock_bq_service, 'project') + + def test_no_credentials(self): + """Ensure an Exception is raised when no credentials are provided.""" + + self.assertRaises(AssertionError, client.get_client, 'foo') + + @mock.patch('bigquery.client._credentials') + @mock.patch('bigquery.client.build') + def test_initialize_readonly(self, mock_build, mock_return_cred): + """Ensure that a BigQueryClient is initialized and returned with + read-only permissions. 
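The scenarios exercised by these tests correspond to client construction calls like the following; a hedged sketch in which the key paths and the service-account name are placeholders, not working credentials.

    # Hedged sketch: paths and account names are placeholders.
    from bigquery import get_client

    # Read-only client from a PKCS12/PEM service-account key.
    ro_client = get_client(
        'my-project',
        service_account='account@my-project.iam.gserviceaccount.com',
        private_key_file='key.pem',
        readonly=True,
    )

    # Read/write client from a JSON key file; the project id can be
    # omitted when the key file itself carries one.
    rw_client = get_client(json_key_file='key.json', readonly=False)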
+ """ + from bigquery.client import BIGQUERY_SCOPE_READ_ONLY + + mock_cred = mock.Mock() + mock_http = mock.Mock() + mock_service_url = mock.Mock() + mock_cred.from_p12_keyfile_buffer.return_value.authorize.return_value = mock_http + mock_bq = mock.Mock() + mock_build.return_value = mock_bq + key = 'key' + service_account = 'account' + project_id = 'project' + mock_return_cred.return_value = mock_cred + + bq_client = client.get_client( + project_id, service_url=mock_service_url, + service_account=service_account, private_key=key, + readonly=True) + + mock_return_cred.assert_called_once_with() + mock_cred.from_p12_keyfile_buffer.assert_called_once_with( + service_account, mock.ANY, + scopes=BIGQUERY_SCOPE_READ_ONLY) + self.assertTrue( + mock_cred.from_p12_keyfile_buffer.return_value.authorize.called) + mock_build.assert_called_once_with('bigquery', 'v2', http=mock_http, + discoveryServiceUrl=mock_service_url) + self.assertEquals(mock_bq, bq_client.bigquery) + self.assertEquals(project_id, bq_client.project_id) + + @mock.patch('bigquery.client._credentials') + @mock.patch('bigquery.client.build') + def test_initialize_read_write(self, mock_build, mock_return_cred): + """Ensure that a BigQueryClient is initialized and returned with + read/write permissions. + """ + from bigquery.client import BIGQUERY_SCOPE + + mock_cred = mock.Mock() + mock_http = mock.Mock() + mock_service_url = mock.Mock() + mock_cred.from_p12_keyfile_buffer.return_value.authorize.return_value = mock_http + mock_bq = mock.Mock() + mock_build.return_value = mock_bq + key = 'key' + service_account = 'account' + project_id = 'project' + mock_return_cred.return_value = mock_cred + + bq_client = client.get_client( + project_id, service_url=mock_service_url, + service_account=service_account, private_key=key, + readonly=False) + + mock_return_cred.assert_called_once_with() + mock_cred.from_p12_keyfile_buffer.assert_called_once_with( + service_account, mock.ANY, scopes=BIGQUERY_SCOPE) + self.assertTrue( + mock_cred.from_p12_keyfile_buffer.return_value.authorize.called) + mock_build.assert_called_once_with('bigquery', 'v2', http=mock_http, + discoveryServiceUrl=mock_service_url) + self.assertEquals(mock_bq, bq_client.bigquery) + self.assertEquals(project_id, bq_client.project_id) + + @mock.patch('bigquery.client._credentials') + @mock.patch('bigquery.client.build') + def test_initialize_key_file(self, mock_build, mock_return_cred): + """Ensure that a BigQueryClient is initialized and returned with + read/write permissions using a private key file. 
+ """ + from bigquery.client import BIGQUERY_SCOPE + + mock_cred = mock.Mock() + mock_http = mock.Mock() + mock_service_url = mock.Mock() + mock_cred.from_p12_keyfile.return_value.authorize.return_value = mock_http + mock_bq = mock.Mock() + mock_build.return_value = mock_bq + key_file = 'key.pem' + service_account = 'account' + project_id = 'project' + mock_return_cred.return_value = mock_cred + + bq_client = client.get_client( + project_id, service_url=mock_service_url, + service_account=service_account, + private_key_file=key_file, readonly=False) + + mock_return_cred.assert_called_once_with() + mock_cred.from_p12_keyfile.assert_called_once_with(service_account, + key_file, + scopes=BIGQUERY_SCOPE) + self.assertTrue( + mock_cred.from_p12_keyfile.return_value.authorize.called) + mock_build.assert_called_once_with('bigquery', 'v2', http=mock_http, + discoveryServiceUrl=mock_service_url) + self.assertEquals(mock_bq, bq_client.bigquery) + self.assertEquals(project_id, bq_client.project_id) + + @mock.patch('bigquery.client._credentials') + @mock.patch('bigquery.client.build') + @mock.patch('__builtin__.open' if six.PY2 else 'builtins.open') + def test_initialize_json_key_file(self, mock_open, mock_build, mock_return_cred): + """Ensure that a BigQueryClient is initialized and returned with + read/write permissions using a JSON key file. + """ + from bigquery.client import BIGQUERY_SCOPE + import json + + mock_cred = mock.Mock() + mock_http = mock.Mock() + mock_service_url = mock.Mock() + mock_cred.from_json_keyfile_dict.return_value.authorize.return_value = mock_http + mock_bq = mock.Mock() + mock_build.return_value = mock_bq + json_key_file = 'key.json' + json_key = {'client_email': 'mail', 'private_key': 'pkey'} + mock_open.return_value.__enter__.return_value.read.return_value = json.dumps(json_key) + project_id = 'project' + mock_return_cred.return_value = mock_cred + + bq_client = client.get_client( + project_id, service_url=mock_service_url, + json_key_file=json_key_file, readonly=False) + + mock_return_cred.assert_called_once_with() + mock_cred.from_json_keyfile_dict.assert_called_once_with(json_key, + scopes=BIGQUERY_SCOPE) + self.assertTrue( + mock_cred.from_json_keyfile_dict.return_value.authorize.called) + mock_build.assert_called_once_with('bigquery', 'v2', http=mock_http, + discoveryServiceUrl=mock_service_url) + self.assertEquals(mock_bq, bq_client.bigquery) + self.assertEquals(project_id, bq_client.project_id) + + @mock.patch('bigquery.client._credentials') + @mock.patch('bigquery.client.build') + @mock.patch('__builtin__.open' if six.PY2 else 'builtins.open') + def test_initialize_json_key_file_without_project_id(self, mock_open, mock_build, + mock_return_cred): + """Ensure that a BigQueryClient is initialized and returned with + read/write permissions using a JSON key file without project_id. 
+ """ + from bigquery.client import BIGQUERY_SCOPE + import json + + mock_cred = mock.Mock() + mock_http = mock.Mock() + mock_service_url = mock.Mock() + mock_cred.from_json_keyfile_dict.return_value.authorize.return_value = mock_http + mock_bq = mock.Mock() + mock_build.return_value = mock_bq + json_key_file = 'key.json' + json_key = {'client_email': 'mail', 'private_key': 'pkey', 'project_id': 'project'} + mock_open.return_value.__enter__.return_value.read.return_value = json.dumps(json_key) + mock_return_cred.return_value = mock_cred + + bq_client = client.get_client( + service_url=mock_service_url, json_key_file=json_key_file, readonly=False) + + mock_open.assert_called_once_with(json_key_file, 'r') + mock_return_cred.assert_called_once_with() + mock_cred.from_json_keyfile_dict.assert_called_once_with(json_key, + scopes=BIGQUERY_SCOPE) + self.assertTrue( + mock_cred.from_json_keyfile_dict.return_value.authorize.called) + mock_build.assert_called_once_with('bigquery', 'v2', http=mock_http, + discoveryServiceUrl=mock_service_url) + self.assertEquals(mock_bq, bq_client.bigquery) + self.assertEquals(json_key['project_id'], bq_client.project_id) + + +class TestGetProjectIds(unittest.TestCase): + + def test_get_project_ids(self): + mock_bq_service = mock.Mock() + mock_bq_service.projects().list().execute.return_value = { + 'kind': 'bigquery#projectList', + 'projects': [ + { + 'friendlyName': 'Big Query Test', + 'id': 'big-query-test', + 'kind': 'bigquery#project', + 'numericId': '1435372465', + 'projectReference': {'projectId': 'big-query-test'} + }, + { + 'friendlyName': 'BQ Company project', + 'id': 'bq-project', + 'kind': 'bigquery#project', + 'numericId': '4263574685796', + 'projectReference': {'projectId': 'bq-project'} + } + ], + 'totalItems': 2 + } + + projects = client.get_projects(mock_bq_service) + expected_projects_data = [ + {'id': 'big-query-test', 'name': 'Big Query Test'}, + {'id': 'bq-project', 'name': 'BQ Company project'} + ] + self.assertEqual(projects, expected_projects_data) + + +class TestQuery(unittest.TestCase): + + def setUp(self): + client._bq_client = None + + self.mock_bq_service = mock.Mock() + self.mock_job_collection = mock.Mock() + + self.mock_bq_service.jobs.return_value = self.mock_job_collection + + self.query = 'foo' + self.project_id = 'project' + self.external_udf_uris = ['gs://bucket/external_udf.js'] + self.client = client.BigQueryClient(self.mock_bq_service, + self.project_id) + + def test_query(self): + """Ensure that we retrieve the job id from the query.""" + + mock_query_job = mock.Mock() + expected_job_id = 'spiderman' + expected_job_ref = {'jobId': expected_job_id} + + mock_query_job.execute.return_value = { + 'jobReference': expected_job_ref, + 'jobComplete': True + } + + self.mock_job_collection.query.return_value = mock_query_job + + job_id, results = self.client.query(self.query, external_udf_uris=self.external_udf_uris) + + self.mock_job_collection.query.assert_called_once_with( + projectId=self.project_id, + body={ + 'query': self.query, + 'userDefinedFunctionResources': [ {'resourceUri': u} for u in self.external_udf_uris ], + 'timeoutMs': 0, + 'dryRun': False, + 'maxResults': None + } + ) + self.assertEquals(job_id, 'spiderman') + self.assertEquals(results, []) + + + def test_query_max_results_set(self): + """Ensure that we retrieve the job id from the query and the maxResults + parameter is set. 
+ """ + + mock_query_job = mock.Mock() + expected_job_id = 'spiderman' + expected_job_ref = {'jobId': expected_job_id} + + mock_query_job.execute.return_value = { + 'jobReference': expected_job_ref, + 'jobComplete': True, + } + + self.mock_job_collection.query.return_value = mock_query_job + max_results = 10 + + job_id, results = self.client.query(self.query, + max_results=max_results) + + self.mock_job_collection.query.assert_called_once_with( + projectId=self.project_id, + body={'query': self.query, 'timeoutMs': 0, + 'maxResults': max_results, 'dryRun': False} + ) + self.assertEquals(job_id, 'spiderman') + self.assertEquals(results, []) + + def test_query_timeout_set(self): + """Ensure that we retrieve the job id from the query and the timeoutMs + parameter is set correctly. + """ + + mock_query_job = mock.Mock() + expected_job_id = 'spiderman' + expected_job_ref = {'jobId': expected_job_id} + + mock_query_job.execute.return_value = { + 'jobReference': expected_job_ref, + 'jobComplete': True, + } + + self.mock_job_collection.query.return_value = mock_query_job + timeout = 5 + + job_id, results = self.client.query(self.query, timeout=timeout) + + self.mock_job_collection.query.assert_called_once_with( + projectId=self.project_id, + body={'query': self.query, 'timeoutMs': timeout * 1000, + 'dryRun': False, 'maxResults': None} + ) + self.assertEquals(job_id, 'spiderman') + self.assertEquals(results, []) + + def test_sync_query_timeout(self): + """Ensure that exception is raise on timeout for synchronous query""" + + mock_query_job = mock.Mock() + expected_job_id = 'spiderman' + expected_job_ref = {'jobId': expected_job_id} + + mock_query_job.execute.return_value = { + 'jobReference': expected_job_ref, + 'jobComplete': False, + } + + self.mock_job_collection.query.return_value = mock_query_job + timeout = 5 + self.assertRaises(BigQueryTimeoutException, self.client.query, + self.query, None, timeout) + + def test_async_query_timeout(self): + """Ensure that exception is not raise on timeout + for asynchronous query""" + + mock_query_job = mock.Mock() + expected_job_id = 'spiderman' + expected_job_ref = {'jobId': expected_job_id} + + mock_query_job.execute.return_value = { + 'jobReference': expected_job_ref, + 'jobComplete': False, + } + + self.mock_job_collection.query.return_value = mock_query_job + + job_id, results = self.client.query(self.query) + self.assertEquals(job_id, 'spiderman') + self.assertEquals(results, []) + + def test_query_dry_run_valid(self): + """Ensure that None and an empty list is returned from the query when + dry_run is True and the query is valid. + """ + + mock_query_job = mock.Mock() + + mock_query_job.execute.return_value = {'jobReference': {}, + 'jobComplete': True} + + self.mock_job_collection.query.return_value = mock_query_job + + job_id, results = self.client.query(self.query, dry_run=True) + + self.mock_job_collection.query.assert_called_once_with( + projectId=self.project_id, + body={'query': self.query, 'timeoutMs': 0, 'maxResults': None, + 'dryRun': True} + ) + self.assertIsNone(job_id) + self.assertEqual([], results) + + def test_query_dry_run_invalid(self): + """Ensure that None and a dict is returned from the query when dry_run + is True and the query is invalid. 
+ """ + + mock_query_job = mock.Mock() + + mock_query_job.execute.side_effect = HttpError( + 'crap', '{"message": "Bad query"}'.encode('utf8')) + + self.mock_job_collection.query.return_value = mock_query_job + + job_id, results = self.client.query('%s blah' % self.query, + dry_run=True) + + self.mock_job_collection.query.assert_called_once_with( + projectId=self.project_id, + body={'query': '%s blah' % self.query, 'timeoutMs': 0, + 'maxResults': None, + 'dryRun': True} + ) + self.assertIsNone(job_id) + self.assertEqual({'message': 'Bad query'}, results) + + def test_query_with_results(self): + """Ensure that we retrieve the job id from the query and results if + they are available. + """ + + mock_query_job = mock.Mock() + expected_job_id = 'spiderman' + expected_job_ref = {'jobId': expected_job_id} + + mock_query_job.execute.return_value = { + 'jobReference': expected_job_ref, + 'schema': {'fields': [{'name': 'foo', 'type': 'INTEGER'}]}, + 'rows': [{'f': [{'v': 10}]}], + 'jobComplete': True, + } + + self.mock_job_collection.query.return_value = mock_query_job + + job_id, results = self.client.query(self.query) + + self.mock_job_collection.query.assert_called_once_with( + projectId=self.project_id, + body={'query': self.query, 'timeoutMs': 0, 'dryRun': False, + 'maxResults': None} + ) + self.assertEquals(job_id, 'spiderman') + self.assertEquals(results, [{'foo': 10}]) + + def test_query_with_using_legacy_sql(self): + """Ensure that use_legacy_sql bool gets used""" + + mock_query_job = mock.Mock() + expected_job_id = 'spiderman' + expected_job_ref = {'jobId': expected_job_id} + + mock_query_job.execute.return_value = { + 'jobReference': expected_job_ref, + 'jobComplete': True + } + + self.mock_job_collection.query.return_value = mock_query_job + + job_id, results = self.client.query(self.query, use_legacy_sql=False) + + self.mock_job_collection.query.assert_called_once_with( + projectId=self.project_id, + body={'query': self.query, 'timeoutMs': 0, 'dryRun': False, + 'maxResults': None, 'useLegacySql': False} + ) + self.assertEquals(job_id, 'spiderman') + self.assertEquals(results, []) + + +class TestGetQueryResults(unittest.TestCase): + + def setUp(self): + client._bq_client = None + + self.mock_bq_service = mock.Mock() + self.mock_job_collection = mock.Mock() + + self.mock_bq_service.jobs.return_value = self.mock_job_collection + + self.project_id = 'project' + self.client = client.BigQueryClient(self.mock_bq_service, + self.project_id) + + def test_get_response(self): + """Ensure that the query is executed and the query reply is returned. 
+ """ + + job_id = 'bar' + + mock_query_job = mock.Mock() + mock_query_reply = mock.Mock() + mock_query_job.execute.return_value = mock_query_reply + self.mock_job_collection.getQueryResults.return_value = mock_query_job + + offset = 5 + limit = 10 + page_token = "token" + timeout = 1 + + actual = self.client.get_query_results(job_id, offset, limit, + page_token, timeout) + + self.mock_job_collection.getQueryResults.assert_called_once_with( + projectId=self.project_id, jobId=job_id, startIndex=offset, + maxResults=limit, pageToken=page_token, timeoutMs=1000) + + mock_query_job.execute.assert_called_once_with() + self.assertEquals(actual, mock_query_reply) + + +class TestTransformRow(unittest.TestCase): + + def setUp(self): + client._bq_client = None + + self.mock_bq_service = mock.Mock() + self.mock_job_collection = mock.Mock() + + self.mock_bq_service.jobs.return_value = self.mock_job_collection + + self.project_id = 'project' + self.client = client.BigQueryClient(self.mock_bq_service, + self.project_id) + + def test_transform_row(self): + """Ensure that the row dict is correctly transformed to a log dict.""" + + schema = [{'name': 'foo', 'type': 'INTEGER'}, + {'name': 'bar', 'type': 'FLOAT'}, + {'name': 'baz', 'type': 'STRING'}, + {'name': 'qux', 'type': 'BOOLEAN'}, + {'name': 'timestamp', 'type': 'TIMESTAMP'}] + + row = {'f': [{'v': '42'}, {'v': None}, {'v': 'batman'}, + {'v': 'True'}, {'v': '1.371145650319132E9'}]} + + expected = {'foo': 42, 'bar': None, 'baz': 'batman', 'qux': True, + 'timestamp': 1371145650.319132} + + actual = self.client._transform_row(row, schema) + + self.assertEquals(actual, expected) + + def test_transform_row_with_nested(self): + """Ensure that the row dict with nested records is correctly + transformed to a log dict. + """ + + schema = [{'name': 'foo', 'type': 'INTEGER'}, + {'name': 'bar', 'type': 'FLOAT'}, + {'name': 'baz', 'type': 'STRING'}, + {'name': 'qux', 'type': 'RECORD', 'mode': 'SINGLE', + 'fields': [{'name': 'foobar', 'type': 'INTEGER'}, + {'name': 'bazqux', 'type': 'STRING'}]}] + + row = {'f': [{'v': '42'}, {'v': '36.98'}, {'v': 'batman'}, + {'v': {'f': [{'v': '120'}, {'v': 'robin'}]}}]} + expected = {'foo': 42, 'bar': 36.98, 'baz': 'batman', + 'qux': {'foobar': 120, 'bazqux': 'robin'}} + + actual = self.client._transform_row(row, schema) + + self.assertEquals(actual, expected) + + def test_transform_row_with_nested_repeated(self): + """Ensure that the row dict with nested repeated records is correctly + transformed to a log dict. 
+ """ + + schema = [{'name': 'foo', 'type': 'INTEGER'}, + {'name': 'bar', 'type': 'FLOAT'}, + {'name': 'baz', 'type': 'STRING'}, + {'name': 'qux', 'type': 'RECORD', 'mode': 'REPEATED', + 'fields': [{'name': 'foobar', 'type': 'INTEGER'}, + {'name': 'bazqux', 'type': 'STRING'}]}] + + row = {'f': [{'v': '42'}, {'v': '36.98'}, {'v': 'batman'}, + {'v': [{'v': {'f': [{'v': '120'}, {'v': 'robin'}]}}, + {'v': {'f': [{'v': '300'}, {'v': 'joker'}]}}]}]} + expected = {'foo': 42, 'bar': 36.98, 'baz': 'batman', + 'qux': [{'foobar': 120, 'bazqux': 'robin'}, + {'foobar': 300, 'bazqux': 'joker'}]} + + actual = self.client._transform_row(row, schema) + + self.assertEquals(actual, expected) + + +@mock.patch('bigquery.client.BigQueryClient.get_query_results') +class TestCheckJob(unittest.TestCase): + + def setUp(self): + client._bq_client = None + self.project_id = 'project' + self.client = client.BigQueryClient(mock.Mock(), self.project_id) + + def test_job_incomplete(self, mock_exec): + """Ensure that we return None if the job is not yet complete.""" + + mock_exec.return_value = {'jobComplete': False} + + is_completed, total_rows = self.client.check_job(1) + + self.assertFalse(is_completed) + self.assertEquals(total_rows, 0) + + def test_query_complete(self, mock_exec): + """Ensure that we can handle a normal query result.""" + + mock_exec.return_value = { + 'jobComplete': True, + 'rows': [ + {'f': [{'v': 'bar'}, {'v': 'man'}]}, + {'f': [{'v': 'abc'}, {'v': 'xyz'}]} + ], + 'schema': { + 'fields': [ + {'name': 'foo', 'type': 'STRING'}, + {'name': 'spider', 'type': 'STRING'} + ] + }, + 'totalRows': '2' + } + + is_completed, total_rows = self.client.check_job(1) + + self.assertTrue(is_completed) + self.assertEquals(total_rows, 2) + + +class TestWaitForJob(unittest.TestCase): + + def setUp(self): + client._bq_client = None + self.project_id = 'project' + self.api_mock = mock.Mock() + self.client = client.BigQueryClient(self.api_mock, self.project_id) + + def test_completed_jobs(self): + """Ensure we can detect completed jobs""" + + return_values = [{'status': {'state': u'RUNNING'}, + 'jobReference': {'jobId': "testJob"}}, + {'status': {'state': u'DONE'}, + 'jobReference': {'jobId': "testJob"}}] + + def side_effect(*args, **kwargs): + return return_values.pop(0) + + self.api_mock.jobs().get().execute.side_effect = side_effect + + job_resource = self.client.wait_for_job( + {'jobReference': {'jobId': "testJob"}, + 'status': {'state': u'RUNNING'}}, + interval=.01, + timeout=.05) + + self.assertEqual(self.api_mock.jobs().get().execute.call_count, 2) + self.assertIsInstance(job_resource, dict) + + def test_timeout_error(self): + """Ensure that timeout raise exceptions""" + incomplete_job = {'status': {'state': u'RUNNING'}, + 'jobReference': {'jobId': "testJob"}} + + self.api_mock.jobs().get().execute.return_value = incomplete_job + self.assertRaises(BigQueryTimeoutException, self.client.wait_for_job, + incomplete_job, .1, .25) + + def test_wait_job_http_error(self): + """ Test wait job with http error""" + job = {'status': {'state': u'RUNNING'}, + 'jobReference': {'jobId': "testJob"}} + + expected_result = { + "error": { + "errors": [{ + "domain": "global", + "reason": "required", + "message": "Required parameter is missing" + }], + "code": 400, + "message": "Required parameter is missing" + } + } + + self.api_mock.jobs().insert().execute.return_value = expected_result + self.assertRaises(JobExecutingException, + self.client.wait_for_job, + job, + interval=.01, + timeout=.01) + + def 
test_wait_job_error_result(self): + """ Test wait job with error result""" + job = {'status': {'state': u'RUNNING'}, + 'jobReference': {'jobId': "testJob"}} + + expected_result = { + "status": { + "state": "DONE", + "errorResult": { + "reason": "invalidQuery", + "location": "query", + "message": "Your Error Message Here " + }, + }, + } + + self.api_mock.jobs().insert().execute.return_value = expected_result + self.assertRaises(JobExecutingException, + self.client.wait_for_job, + job, + interval=.01, + timeout=.01) + + def test_accepts_job_id(self): + """Ensure it accepts a job Id rather than a full job resource""" + + return_values = [{'status': {'state': u'RUNNING'}, + 'jobReference': {'jobId': "testJob"}}, + {'status': {'state': u'DONE'}, + 'jobReference': {'jobId': "testJob"}}] + + def side_effect(*args, **kwargs): + return return_values.pop(0) + + self.api_mock.jobs().get().execute.side_effect = side_effect + + job_resource = self.client.wait_for_job("testJob", + interval=.01, + timeout=5) + + self.assertEqual(self.api_mock.jobs().get().execute.call_count, 2) + self.assertIsInstance(job_resource, dict) + + def test_accepts_integer_job_id(self): + return_values = [{'status': {'state': u'RUNNING'}, + 'jobReference': {'jobId': "testJob"}}, + {'status': {'state': u'DONE'}, + 'jobReference': {'jobId': "testJob"}}] + + def side_effect(*args, **kwargs): + return return_values.pop(0) + + self.api_mock.jobs().get().execute.side_effect = side_effect + + job_resource = self.client.wait_for_job(1234567, + interval=.01, + timeout=600) + + self.assertEqual(self.api_mock.jobs().get().execute.call_count, 2) + self.assertIsInstance(job_resource, dict) + + +class TestImportDataFromURIs(unittest.TestCase): + + def setUp(self): + client._bq_client = None + self.mock_api = mock.Mock() + + self.query = 'foo' + self.project_id = 'project' + self.dataset_id = 'dataset' + self.table_id = 'table' + self.client = client.BigQueryClient(self.mock_api, + self.project_id) + + def test_csv_job_body_constructed_correctly(self): + expected_result = { + 'status': {'state': u'RUNNING'}, + } + + body = { + "jobReference": { + "projectId": self.project_id, + "jobId": "job" + }, + "configuration": { + "load": { + "sourceUris": ["sourceuri"], + "schema": {"fields": ["schema"]}, + "destinationTable": { + "projectId": self.project_id, + "datasetId": self.dataset_id, + "tableId": self.table_id + }, + "createDisposition": "a", + "writeDisposition": "b", + "fieldDelimiter": "c", + "skipLeadingRows": "d", + "encoding": "e", + "quote": "f", + "maxBadRecords": "g", + "allowQuotedNewlines": "h", + "sourceFormat": "CSV", + "allowJaggedRows": "j", + "ignoreUnknownValues": "k" + } + } + } + + self.mock_api.jobs().insert().execute.return_value = expected_result + result = self.client.import_data_from_uris(["sourceuri"], + self.dataset_id, + self.table_id, + ["schema"], + job="job", + create_disposition="a", + write_disposition="b", + field_delimiter="c", + skip_leading_rows="d", + encoding="e", + quote="f", + max_bad_records="g", + allow_quoted_newlines="h", + source_format="CSV", + allow_jagged_rows="j", + ignore_unknown_values="k") + + self.mock_api.jobs().insert.assert_called_with( + projectId=self.project_id, + body=body + ) + + self.assertEqual(result, expected_result) + + def test_json_job_body_constructed_correctly(self): + expected_result = { + 'status': {'state': u'RUNNING'}, + } + + body = { + "jobReference": { + "projectId": self.project_id, + "jobId": "job" + }, + "configuration": { + "load": { + "sourceUris": ["sourceuri"], 
+ "schema": {"fields": ["schema"]}, + "destinationTable": { + "projectId": self.project_id, + "datasetId": self.dataset_id, + "tableId": self.table_id + }, + "sourceFormat": "JSON" + } + } + } + + self.mock_api.jobs().insert().execute.return_value = expected_result + result = self.client.import_data_from_uris(["sourceuri"], + self.dataset_id, + self.table_id, + ["schema"], + job="job", + source_format="JSON") + + self.mock_api.jobs().insert.assert_called_with( + projectId=self.project_id, + body=body + ) + + self.assertEqual(result, expected_result) + + @raises(Exception) + def test_field_delimiter_exception_if_not_csv(self): + """Raise exception if csv-only parameter is set inappropriately""" + self.client.import_data_from_uris(["sourceuri"], + self.dataset_id, + self.table_id, + ["schema"], + job="job", + source_format="JSON", + field_delimiter=",") + + @raises(Exception) + def test_allow_jagged_rows_exception_if_not_csv(self): + """Raise exception if csv-only parameter is set inappropriately""" + self.client.import_data_from_uris(["sourceuri"], + self.dataset_id, + self.table_id, + ["schema"], + job="job", + source_format="JSON", + allow_jagged_rows=True) + + @raises(Exception) + def test_allow_quoted_newlines_exception_if_not_csv(self): + """Raise exception if csv-only parameter is set inappropriately""" + self.client.import_data_from_uris(["sourceuri"], + self.dataset_id, + self.table_id, + ["schema"], + job="job", + source_format="JSON", + allow_quoted_newlines=True) + + @raises(Exception) + def test_quote_exception_if_not_csv(self): + """Raise exception if csv-only parameter is set inappropriately""" + self.client.import_data_from_uris(["sourceuri"], + self.dataset_id, + self.table_id, + ["schema"], + job="job", + source_format="JSON", + quote="'") + + @raises(Exception) + def test_skip_leading_rows_exception_if_not_csv(self): + """Raise exception if csv-only parameter is set inappropriately""" + self.client.import_data_from_uris(["sourceuri"], + self.dataset_id, + self.table_id, + ["schema"], + "job", + source_format="JSON", + skip_leading_rows=10) + + def test_accepts_single_source_uri(self): + """Ensure that a source_uri accepts a non-list""" + expected_result = { + 'status': {'state': u'RUNNING'}, + } + + body = { + "jobReference": { + "projectId": self.project_id, + "jobId": "job" + }, + "configuration": { + "load": { + "sourceUris": ["sourceuri"], + "schema": {"fields": ["schema"]}, + "destinationTable": { + "projectId": self.project_id, + "datasetId": self.dataset_id, + "tableId": self.table_id + } + } + } + } + + self.mock_api.jobs().insert().execute.return_value = expected_result + result = self.client.import_data_from_uris("sourceuri", # not a list! 
+ self.dataset_id, + self.table_id, + schema=["schema"], + job="job") + + self.mock_api.jobs().insert.assert_called_with( + projectId=self.project_id, + body=body + ) + + self.assertEqual(result, expected_result) + + def test_import_http_error(self): + """ Test import with http error""" + expected_result = { + "error": { + "errors": [{ + "domain": "global", + "reason": "required", + "message": "Required parameter is missing" + }], + "code": 400, + "message": "Required parameter is missing" + } + } + + self.mock_api.jobs().insert().execute.return_value = expected_result + self.assertRaises(JobInsertException, + self.client.import_data_from_uris, + ["sourceuri"], + self.dataset_id, + self.table_id) + + def test_import_error_result(self): + """ Test import with error result""" + expected_result = { + "status": { + "state": "DONE", + "errorResult": { + "reason": "invalidQuery", + "location": "query", + "message": "Your Error Message Here " + }, + }, + } + + self.mock_api.jobs().insert().execute.return_value = expected_result + self.assertRaises(JobInsertException, + self.client.import_data_from_uris, + ["sourceuri"], + self.dataset_id, + self.table_id) + + +class TestExportDataToURIs(unittest.TestCase): + + def setUp(self): + client._bq_client = None + self.mock_api = mock.Mock() + + self.project_id = 'project' + self.dataset_id = 'dataset' + self.table_id = 'table' + self.destination_format = "CSV" + self.print_header = False + self.client = client.BigQueryClient(self.mock_api, + self.project_id) + + @mock.patch('bigquery.client.BigQueryClient._generate_hex_for_uris') + def test_export(self, mock_generate_hex): + """ Ensure that export is working in normal circumstances """ + expected_result = { + 'status': {'state': u'RUNNING'}, + } + + body = { + "jobReference": { + "projectId": self.project_id, + "jobId": "%s-%s-destinationuri" % (self.dataset_id, + self.table_id) + }, + "configuration": { + "extract": { + "destinationUris": ["destinationuri"], + "sourceTable": { + "projectId": self.project_id, + "datasetId": self.dataset_id, + "tableId": self.table_id + }, + "destinationFormat": self.destination_format, + "printHeader": self.print_header, + } + } + } + + self.mock_api.jobs().insert().execute.return_value = expected_result + mock_generate_hex.return_value = "destinationuri" + result = self.client.export_data_to_uris( + ["destinationuri"], self.dataset_id, self.table_id, + destination_format=self.destination_format, + print_header=self.print_header + ) + + self.mock_api.jobs().insert.assert_called_with( + projectId=self.project_id, + body=body + ) + + self.assertEqual(result, expected_result) + + def test_export_http_error(self): + """ Test export with http error""" + expected_result = { + "error": { + "errors": [{ + "domain": "global", + "reason": "required", + "message": "Required parameter is missing" + }], + "code": 400, + "message": "Required parameter is missing" + } + } + + self.mock_api.jobs().insert().execute.return_value = expected_result + self.assertRaises(JobInsertException, + self.client.export_data_to_uris, + ["destinationuri"], + self.dataset_id, + self.table_id) + + def test_export_error_result(self): + """ Test export with error result""" + expected_result = { + "status": { + "state": "DONE", + "errorResult": { + "reason": "invalidQuery", + "location": "query", + "message": "Your Error Message Here " + }, + }, + } + + self.mock_api.jobs().insert().execute.return_value = expected_result + self.assertRaises(JobInsertException, + self.client.export_data_to_uris, + 
["destinationuri"], + self.dataset_id, + self.table_id) + + +class TestWriteToTable(unittest.TestCase): + + def setUp(self): + client._bq_client = None + self.mock_api = mock.Mock() + + self.query = 'foo' + self.project_id = 'project' + self.dataset_id = 'dataset' + self.table_id = 'table' + self.maximum_billing_tier = 1000 + self.external_udf_uris = ['gs://bucket/external_udf.js'] + self.use_query_cache = False + self.priority = "INTERACTIVE" + self.flatten_results = False + self.client = client.BigQueryClient(self.mock_api, + self.project_id) + + def test_write(self): + """ Ensure that write is working in normal circumstances.""" + expected_result = { + 'status': {'state': u'RUNNING'}, + } + + body = { + "configuration": { + "query": { + "destinationTable": { + "projectId": self.project_id, + "datasetId": self.dataset_id, + "tableId": self.table_id + }, + "query": self.query, + "userDefinedFunctionResources": [{ + "resourceUri": self.external_udf_uris[0] + }], + "useQueryCache": self.use_query_cache, + "priority": self.priority, + "flattenResults": self.flatten_results, + } + } + } + + self.mock_api.jobs().insert().execute.return_value = expected_result + result = self.client.write_to_table(self.query, + self.dataset_id, + self.table_id, + external_udf_uris=self.external_udf_uris, + use_query_cache=False, + flatten=False, + priority=self.priority) + + self.mock_api.jobs().insert.assert_called_with( + projectId=self.project_id, + body=body + ) + + self.assertEqual(result, expected_result) + + def test_write_maxbilltier(self): + """ Ensure that write is working when maximumBillingTier is set""" + expected_result = { + 'status': {'state': u'RUNNING'}, + } + + body = { + "configuration": { + "query": { + "destinationTable": { + "projectId": self.project_id, + "datasetId": self.dataset_id, + "tableId": self.table_id + }, + "query": self.query, + "userDefinedFunctionResources": [{ + "resourceUri": self.external_udf_uris[0] + }], + "useQueryCache": self.use_query_cache, + "priority": self.priority, + "maximumBillingTier": self.maximum_billing_tier + } + } + } + + self.mock_api.jobs().insert().execute.return_value = expected_result + result = self.client.write_to_table( + self.query, self.dataset_id, self.table_id, priority=self.priority, + external_udf_uris=self.external_udf_uris, use_query_cache=False, + maximum_billing_tier=self.maximum_billing_tier) + + self.mock_api.jobs().insert.assert_called_with( + projectId=self.project_id, + body=body + ) + + self.assertEqual(result, expected_result) + + def test_write_http_error(self): + """ Test write with http error""" + expected_result = { + "error": { + "errors": [{ + "domain": "global", + "reason": "required", + "message": "Required parameter is missing" + }], + "code": 400, + "message": "Required parameter is missing" + } + } + + self.mock_api.jobs().insert().execute.return_value = expected_result + self.assertRaises(JobInsertException, self.client.write_to_table, + self.query) + + def test_write_error_result(self): + """ Test write with error result""" + expected_result = { + "status": { + "state": "DONE", + "errorResult": { + "reason": "invalidQuery", + "location": "query", + "message": "Your Error Message Here " + }, + }, + } + + self.mock_api.jobs().insert().execute.return_value = expected_result + self.assertRaises(JobInsertException, self.client.write_to_table, + self.query) + + +class TestFilterTablesByTime(unittest.TestCase): + + def test_empty_tables(self): + """Ensure we can handle filtering an empty dictionary""" + + bq = 
client.BigQueryClient(None, 'project') + + tables = bq._filter_tables_by_time({}, 1370000000, 0) + + self.assertEqual([], tables) + + def test_multi_inside_range(self): + """Ensure we can correctly filter several application ids""" + + bq = client.BigQueryClient(None, 'project') + + tables = bq._filter_tables_by_time({ + 'Spider-Man': 1370002001, + 'Daenerys Targaryen': 1370001999, + 'Gordon Freeman': 1369999999, + 'William Shatner': 1370001000, + 'Heavy Weapons Guy': 0 + }, 1370002000, 1370000000) + + self.assertEqual( + sorted( + ['Daenerys Targaryen', 'William Shatner', 'Gordon Freeman']), + sorted(tables) + ) + + def test_not_inside_range(self): + """Ensure we can correctly filter several application ids outside the + range we are searching for. + """ + + bq = client.BigQueryClient(None, 'project') + + tables = bq._filter_tables_by_time({ + 'John Snow': 9001, + 'Adam West': 100000000000000, + 'Glados': -1, + 'Potato': 0, + }, 1370002000, 1370000000) + + self.assertEqual([], tables) + + +NEXT_TABLE_LIST_RESPONSE = { + "kind": "bigquery#tableList", + "etag": "\"t_UlB9a9mrx5sjQInRGzeDrLrS0/TsIP_i4gAeLegj84WzkPzBPIkjo\"", + "nextPageToken": "2013_05_appspot_1", + "tables": [ + { + "kind": "bigquery#table", + "id": "project:dataset.2013_06_appspot_10", + "tableReference": { + "projectId": "project", + "datasetId": "dataset", + "tableId": "2013_06_appspot_10" + } + }, + { + "kind": "bigquery#table", + "id": "project:dataset.2013_06_appspot_11", + "tableReference": { + "projectId": "project", + "datasetId": "dataset", + "tableId": "2013_06_appspot_11" + } + }, + { + "kind": "bigquery#table", + "id": "project:dataset.2013_06_appspot_12", + "tableReference": { + "projectId": "project", + "datasetId": "dataset", + "tableId": "2013_06_appspot_12" + } + }, + ], + "totalItems": 3 +} + +FULL_TABLE_LIST_RESPONSE = { + "kind": "bigquery#tableList", + "etag": "\"GSclnjk0zID1ucM3F-xYinOm1oE/cn58Rpu8v8pB4eoJQaiTe11lPQc\"", + "tables": [ + { + "kind": "bigquery#table", + "id": "project:dataset.2013_05_appspot_1", + "tableReference": { + "projectId": "project", + "datasetId": "dataset", + "tableId": "2013_05_appspot" + } + }, + { + "kind": "bigquery#table", + "id": "project:dataset.2013_06_appspot_1", + "tableReference": { + "projectId": "project", + "datasetId": "dataset", + "tableId": "2013_06_appspot_1" + } + }, + { + "kind": "bigquery#table", + "id": "project:dataset.2013_06_appspot_2", + "tableReference": { + "projectId": "project", + "datasetId": "dataset", + "tableId": "2013_06_appspot_2" + } + }, + { + "kind": "bigquery#table", + "id": "project:dataset.2013_06_appspot_3", + "tableReference": { + "projectId": "project", + "datasetId": "dataset", + "tableId": "2013_06_appspot_3" + } + }, + { + "kind": "bigquery#table", + "id": "project:dataset.2013_06_appspot_4", + "tableReference": { + "projectId": "project", + "datasetId": "dataset", + "tableId": "2013_06_appspot_4" + } + }, + { + "kind": "bigquery#table", + "id": "project:dataset.2013_06_appspot_5", + "tableReference": { + "projectId": "project", + "datasetId": "dataset", + "tableId": "2013_06_appspot_5" + } + }, + { + "kind": "bigquery#table", + "id": "project:dataset.appspot_6_2013_06", + "tableReference": { + "projectId": "project", + "datasetId": "dataset", + "tableId": "appspot_6_2013_06" + } + }, + { + "kind": "bigquery#table", + "id": "project:dataset.table_not_matching_naming", + "tableReference": { + "projectId": "project", + "datasetId": "dataset", + "tableId": "table_not_matching_naming" + } + }, + { + "kind": "bigquery#table", + "id": 
"bad table data" + }, + ], + "totalItems": 9 +} + + +@mock.patch('bigquery.client.BigQueryClient.get_query_results') +class TestGetQuerySchema(unittest.TestCase): + + def test_query_complete(self, get_query_mock): + """Ensure that get_query_schema works when a query is complete.""" + from bigquery.client import BigQueryClient + + bq = BigQueryClient(mock.Mock(), 'project') + + get_query_mock.return_value = { + 'jobComplete': True, + 'schema': {'fields': 'This is our schema'} + } + + result_schema = bq.get_query_schema(job_id=123) + + self.assertEquals(result_schema, 'This is our schema') + + def test_query_incomplete(self, get_query_mock): + """Ensure that get_query_schema handles scenarios where the query + is not finished. + """ + from bigquery.client import BigQueryClient + + bq = BigQueryClient(mock.Mock(), 'project') + + get_query_mock.return_value = { + 'jobComplete': False, + 'schema': {'fields': 'This is our schema'} + } + + self.assertRaises(client.UnfinishedQueryException, bq.get_query_schema, + job_id=123) + + +class TestGetTableSchema(unittest.TestCase): + + def setUp(self): + self.mock_bq_service = mock.Mock() + self.mock_tables = mock.Mock() + self.mock_bq_service.tables.return_value = self.mock_tables + self.table = 'table' + self.project = 'project' + self.dataset = 'dataset' + self.client = client.BigQueryClient(self.mock_bq_service, self.project) + + def test_table_exists(self): + """Ensure that the table schema is returned if the table exists.""" + + expected = [ + {'type': 'FLOAT', 'name': 'foo', 'mode': 'NULLABLE'}, + {'type': 'INTEGER', 'name': 'bar', 'mode': 'NULLABLE'}, + {'type': 'INTEGER', 'name': 'baz', 'mode': 'NULLABLE'}, + ] + + self.mock_tables.get.return_value.execute.return_value = \ + {'schema': {'fields': expected}} + + self.assertEqual( + expected, self.client.get_table_schema(self.dataset, self.table)) + self.mock_tables.get.assert_called_once_with( + projectId=self.project, tableId=self.table, datasetId=self.dataset) + self.mock_tables.get.return_value.execute.assert_called_once_with() + + def test_table_does_not_exist(self): + """Ensure that None is returned if the table doesn't exist.""" + self.mock_tables.get.return_value.execute.side_effect = \ + HttpError({'status': "404"}, '{}'.encode('utf8')) + + self.assertIsNone( + self.client.get_table_schema(self.dataset, self.table)) + self.mock_tables.get.assert_called_once_with( + projectId=self.project, tableId=self.table, datasetId=self.dataset) + self.mock_tables.get.return_value.execute.assert_called_once_with() + + +@mock.patch('bigquery.client.BigQueryClient.get_query_results') +class TestGetQueryRows(unittest.TestCase): + + def test_query_complete(self, get_query_mock): + """Ensure that get_query_rows works when a query is complete.""" + from bigquery.client import BigQueryClient + + bq = BigQueryClient(mock.Mock(), 'project') + + get_query_mock.return_value = { + 'jobComplete': True, + 'rows': [ + {'f': [{'v': 'bar'}, {'v': 'man'}]}, + {'f': [{'v': 'abc'}, {'v': 'xyz'}]} + ], + 'schema': { + 'fields': [ + {'name': 'foo', 'type': 'STRING'}, + {'name': 'spider', 'type': 'STRING'} + ] + }, + 'totalRows': 2 + } + + result_rows = bq.get_query_rows(job_id=123, offset=0, limit=0) + + expected_rows = [{'foo': 'bar', 'spider': 'man'}, + {'foo': 'abc', 'spider': 'xyz'}] + self.assertEquals(result_rows, expected_rows) + + def test_query_complete_with_page_token(self, get_query_mock): + """Ensure that get_query_rows works with page token.""" + from bigquery.client import BigQueryClient + + page_one_resp = { + 
"jobComplete": True, + "kind": "bigquery#getQueryResultsResponse", + "pageToken": "TOKEN_TO_PAGE_2", + "schema": { + "fields": [{ + "name": "first_name", + "type": "STRING", + }, { + "name": "last_name", + "type": "STRING", + }] + }, + "rows": [{ + "f": [{ + "v": "foo", + }, { + "v": "bar" + }] + }, { + "f": [{ + "v": "abc", + }, { + "v": "xyz" + }] + }], + "totalRows": "4" + } + + page_two_resp = { + "jobComplete": True, + "kind": "bigquery#getQueryResultsResponse", + "schema": { + "fields": [{ + "name": "first_name", + "type": "STRING", + }, { + "name": "last_name", + "type": "STRING", + }] + }, + "rows": [{ + "f": [{ + "v": "the", + }, { + "v": "beatles" + }] + }, { + "f": [{ + "v": "monty", + }, { + "v": "python" + }] + }], + "totalRows": "4" + } + + bq = BigQueryClient(mock.Mock(), 'project') + get_query_mock.side_effect = [page_one_resp, page_two_resp] + result_rows = bq.get_query_rows(job_id=123, offset=0, limit=0) + + expected_rows = [{'first_name': 'foo', 'last_name': 'bar'}, + {'first_name': 'abc', 'last_name': 'xyz'}, + {'first_name': 'the', 'last_name': 'beatles'}, + {'first_name': 'monty', 'last_name': 'python'}] + self.assertEquals(result_rows, expected_rows) + + def test_query_incomplete(self, get_query_mock): + """Ensure that get_query_rows handles scenarios where the query is not + finished. + """ + from bigquery.client import BigQueryClient + + bq = BigQueryClient(mock.Mock(), 'project') + + get_query_mock.return_value = { + 'jobComplete': False, + 'rows': [ + {'f': [{'v': 'bar'}, {'v': 'man'}]}, + {'f': [{'v': 'abc'}, {'v': 'xyz'}]} + ], + 'schema': { + 'fields': [ + {'name': 'foo', 'type': 'STRING'}, + {'name': 'spider', 'type': 'STRING'} + ] + }, + 'totalRows': 2 + } + + self.assertRaises(client.UnfinishedQueryException, bq.get_query_rows, + job_id=123, offset=0, limit=0) + + +class TestCheckTable(unittest.TestCase): + + def setUp(self): + self.mock_bq_service = mock.Mock() + self.mock_tables = mock.Mock() + self.mock_bq_service.tables.return_value = self.mock_tables + self.table = 'table' + self.project = 'project' + self.dataset = 'dataset' + self.client = client.BigQueryClient(self.mock_bq_service, self.project) + + def test_table_does_not_exist(self): + """Ensure that if the table does not exist, False is returned.""" + + self.mock_tables.get.return_value.execute.side_effect = ( + HttpError(HttpResponse(404), 'There was an error'.encode('utf8'))) + + actual = self.client.check_table(self.dataset, self.table) + + self.assertFalse(actual) + + self.mock_tables.get.assert_called_once_with( + projectId=self.project, datasetId=self.dataset, tableId=self.table) + + self.mock_tables.get.return_value.execute.assert_called_once_with() + + def test_table_does_exist(self): + """Ensure that if the table does exist, True is returned.""" + + self.mock_tables.get.return_value.execute.side_effect = { + 'status': 'foo'} + + actual = self.client.check_table(self.dataset, self.table) + + self.assertTrue(actual) + + self.mock_tables.get.assert_called_once_with( + projectId=self.project, datasetId=self.dataset, tableId=self.table) + + self.mock_tables.get.return_value.execute.assert_called_once_with() + + +class TestCreateTable(unittest.TestCase): + + def setUp(self): + self.mock_bq_service = mock.Mock() + self.mock_tables = mock.Mock() + self.mock_bq_service.tables.return_value = self.mock_tables + self.table = 'table' + self.schema = [ + {'name': 'foo', 'type': 'STRING', 'mode': 'nullable'}, + {'name': 'bar', 'type': 'FLOAT', 'mode': 'nullable'} + ] + self.project = 'project' + 
self.dataset = 'dataset' + self.client = client.BigQueryClient(self.mock_bq_service, self.project) + self.body = { + 'schema': {'fields': self.schema}, + 'tableReference': { + 'tableId': self.table, 'projectId': self.project, + 'datasetId': self.dataset} + } + self.expiration_time = 1437513693000 + self.time_partitioning = True + + def test_table_create_failed(self): + """Ensure that if creating the table fails, False is returned, + or if swallow_results is False an empty dict is returned.""" + + self.mock_tables.insert.return_value.execute.side_effect = ( + HttpError(HttpResponse(404), 'There was an error'.encode('utf8'))) + + actual = self.client.create_table(self.dataset, self.table, + self.schema) + + self.assertFalse(actual) + + self.client.swallow_results = False + + actual = self.client.create_table(self.dataset, self.table, + self.schema) + + self.assertEqual(actual, {}) + + self.client.swallow_results = True + + self.mock_tables.insert.assert_called_with( + projectId=self.project, datasetId=self.dataset, body=self.body) + + self.mock_tables.insert.return_value.execute.assert_called_with() + + def test_table_create_success(self): + """Ensure that if creating the table succeeds, True is returned, + or if swallow_results is False the actual response is returned.""" + + self.mock_tables.insert.return_value.execute.side_effect = [{ + 'status': 'foo'}, {'status': 'bar'}] + + actual = self.client.create_table(self.dataset, self.table, + self.schema) + + self.assertTrue(actual) + + self.client.swallow_results = False + + actual = self.client.create_table(self.dataset, self.table, + self.schema) + + self.assertEqual(actual, {'status': 'bar'}) + + self.client.swallow_results = True + + self.mock_tables.insert.assert_called_with( + projectId=self.project, datasetId=self.dataset, body=self.body) + + self.mock_tables.insert.return_value.execute.assert_called_with() + + def test_table_create_body_with_expiration_time(self): + """Ensure that if expiration_time has specified, + it passed to the body.""" + + self.mock_tables.insert.return_value.execute.side_effect = [{ + 'status': 'foo'}, {'status': 'bar'}] + + self.client.create_table(self.dataset, self.table, + self.schema, self.expiration_time) + + body = self.body.copy() + body.update({ + 'expirationTime': self.expiration_time + }) + + self.mock_tables.insert.assert_called_with( + projectId=self.project, datasetId=self.dataset, body=body) + + self.mock_tables.insert.return_value.execute.assert_called_with() + + def test_table_create_body_with_time_partitioning(self): + """Ensure that if time_partitioning has specified, + it passed to the body.""" + + self.mock_tables.insert.return_value.execute.side_effect = [{ + 'status': 'foo'}, {'status': 'bar'}] + + self.client.create_table(self.dataset, self.table, + self.schema, + time_partitioning=self.time_partitioning) + + body = self.body.copy() + body.update({ + 'timePartitioning': {'type': 'DAY'} + }) + + self.mock_tables.insert.assert_called_with( + projectId=self.project, datasetId=self.dataset, body=body) + + self.mock_tables.insert.return_value.execute.assert_called_with() + + +class TestUpdateTable(unittest.TestCase): + + def setUp(self): + self.mock_bq_service = mock.Mock() + self.mock_tables = mock.Mock() + self.mock_bq_service.tables.return_value = self.mock_tables + self.table = 'table' + self.schema = [ + {'name': 'foo', 'type': 'STRING', 'mode': 'nullable'}, + {'name': 'bar', 'type': 'FLOAT', 'mode': 'nullable'} + ] + self.project = 'project' + self.dataset = 'dataset' + self.client = 
client.BigQueryClient(self.mock_bq_service, self.project) + self.body = { + 'schema': {'fields': self.schema}, + 'tableReference': { + 'tableId': self.table, 'projectId': self.project, + 'datasetId': self.dataset} + } + self.expiration_time = 1437513693000 + + def test_table_update_failed(self): + """Ensure that if updating the table fails, False is returned, + or if swallow_results is False an empty dict is returned.""" + + self.mock_tables.update.return_value.execute.side_effect = ( + HttpError(HttpResponse(404), 'There was an error'.encode('utf8'))) + + actual = self.client.update_table(self.dataset, self.table, + self.schema) + + self.assertFalse(actual) + + self.client.swallow_results = False + + actual = self.client.update_table(self.dataset, self.table, + self.schema) + + self.assertEqual(actual, {}) + + self.client.swallow_results = True + + self.mock_tables.update.assert_called_with( + projectId=self.project, datasetId=self.dataset, body=self.body) + + self.mock_tables.update.return_value.execute.assert_called_with() + + def test_table_update_success(self): + """Ensure that if updating the table succeeds, True is returned, + or if swallow_results is False the actual response is returned.""" + + self.mock_tables.update.return_value.execute.side_effect = [{ + 'status': 'foo'}, {'status': 'bar'}] + + actual = self.client.update_table(self.dataset, self.table, + self.schema) + + self.assertTrue(actual) + + self.client.swallow_results = False + + actual = self.client.update_table(self.dataset, self.table, + self.schema) + + self.assertEqual(actual, {'status': 'bar'}) + + self.client.swallow_results = True + + self.mock_tables.update.assert_called_with( + projectId=self.project, datasetId=self.dataset, body=self.body) + + self.mock_tables.update.return_value.execute.assert_called_with() + + +class TestPatchTable(unittest.TestCase): + + def setUp(self): + self.mock_bq_service = mock.Mock() + self.mock_tables = mock.Mock() + self.mock_bq_service.tables.return_value = self.mock_tables + self.table = 'table' + self.schema = [ + {'name': 'foo', 'type': 'STRING', 'mode': 'nullable'}, + {'name': 'bar', 'type': 'FLOAT', 'mode': 'nullable'} + ] + self.project = 'project' + self.dataset = 'dataset' + self.client = client.BigQueryClient(self.mock_bq_service, self.project) + self.body = { + 'schema': {'fields': self.schema}, + 'tableReference': { + 'tableId': self.table, 'projectId': self.project, + 'datasetId': self.dataset} + } + self.expiration_time = 1437513693000 + + def test_table_patch_failed(self): + """Ensure that if patching the table fails, False is returned, + or if swallow_results is False an empty dict is returned.""" + + self.mock_tables.patch.return_value.execute.side_effect = ( + HttpError(HttpResponse(404), 'There was an error'.encode('utf8'))) + + actual = self.client.patch_table(self.dataset, self.table, + self.schema) + + self.assertFalse(actual) + + self.client.swallow_results = False + + actual = self.client.patch_table(self.dataset, self.table, + self.schema) + + self.assertEqual(actual, {}) + + self.client.swallow_results = True + + self.mock_tables.patch.assert_called_with( + projectId=self.project, datasetId=self.dataset, body=self.body) + + self.mock_tables.patch.return_value.execute.assert_called_with() + + def test_table_patch_success(self): + """Ensure that if patching the table succeeds, True is returned, + or if swallow_results is False the actual response is returned.""" + + self.mock_tables.patch.return_value.execute.side_effect = [{ + 'status': 'foo'}, {'status': 
'bar'}] + + actual = self.client.patch_table(self.dataset, self.table, + self.schema) + + self.assertTrue(actual) + + self.client.swallow_results = False + + actual = self.client.patch_table(self.dataset, self.table, + self.schema) + + self.assertEqual(actual, {'status': 'bar'}) + + self.client.swallow_results = True + + self.mock_tables.patch.assert_called_with( + projectId=self.project, datasetId=self.dataset, body=self.body) + + self.mock_tables.patch.return_value.execute.assert_called_with() + + +class TestCreateView(unittest.TestCase): + + def setUp(self): + self.mock_bq_service = mock.Mock() + self.mock_tables = mock.Mock() + self.mock_bq_service.tables.return_value = self.mock_tables + self.table = 'table' + self.project = 'project' + self.dataset = 'dataset' + self.query = 'SELECT "bar" foo, "foo" bar' + self.client = client.BigQueryClient(self.mock_bq_service, self.project) + self.body = { + 'view': {'query': self.query}, + 'tableReference': { + 'tableId': self.table, 'projectId': self.project, + 'datasetId': self.dataset} + } + + def test_view_create_failed(self): + """Ensure that if creating the table fails, False is returned, + or if swallow_results is False an empty dict is returned.""" + + self.mock_tables.insert.return_value.execute.side_effect = ( + HttpError(HttpResponse(404), 'There was an error'.encode('utf8'))) + + actual = self.client.create_view(self.dataset, self.table, + self.query) + + self.assertFalse(actual) + + self.client.swallow_results = False + + actual = self.client.create_view(self.dataset, self.table, + self.query) + + self.assertEqual(actual, {}) + + self.client.swallow_results = True + + self.mock_tables.insert.assert_called_with( + projectId=self.project, datasetId=self.dataset, body=self.body) + + self.mock_tables.insert.return_value.execute.assert_called_with() + + def test_view_create_success(self): + """Ensure that if creating the table succeeds, True is returned, + or if swallow_results is False the actual response is returned.""" + + self.mock_tables.insert.return_value.execute.side_effect = [{ + 'status': 'foo'}, {'status': 'bar'}] + + actual = self.client.create_view(self.dataset, self.table, + self.query) + + self.assertTrue(actual) + + self.client.swallow_results = False + + actual = self.client.create_view(self.dataset, self.table, + self.query) + + self.assertEqual(actual, {'status': 'bar'}) + + self.client.swallow_results = True + + self.mock_tables.insert.assert_called_with( + projectId=self.project, datasetId=self.dataset, body=self.body) + + self.mock_tables.insert.return_value.execute.assert_called_with() + + +class TestDeleteTable(unittest.TestCase): + + def setUp(self): + self.mock_bq_service = mock.Mock() + self.mock_tables = mock.Mock() + self.mock_bq_service.tables.return_value = self.mock_tables + self.table = 'table' + self.project = 'project' + self.dataset = 'dataset' + self.client = client.BigQueryClient(self.mock_bq_service, self.project) + + def test_delete_table_fail(self): + """Ensure that if deleting table fails, False is returned, + or the actual response is swallow_results is False.""" + + self.mock_tables.delete.return_value.execute.side_effect = ( + HttpError(HttpResponse(404), 'There was an error'.encode('utf8'))) + + actual = self.client.delete_table(self.dataset, self.table) + + self.assertFalse(actual) + + self.client.swallow_results = False + + actual = self.client.delete_table(self.dataset, self.table) + + self.assertEqual(actual, {}) + + self.client.swallow_results = True + + 
self.mock_tables.delete.assert_called_with( + projectId=self.project, datasetId=self.dataset, tableId=self.table) + + self.mock_tables.delete.return_value.execute.assert_called_with() + + def test_delete_table_success(self): + """Ensure that if deleting table succeeds, True is returned, + or the actual response if swallow_results is False.""" + + self.mock_tables.delete.return_value.execute.side_effect = [{ + 'status': 'foo'}, {'status': 'bar'}] + + actual = self.client.delete_table(self.dataset, self.table) + + self.assertTrue(actual) + + self.client.swallow_results = False + + actual = self.client.delete_table(self.dataset, self.table) + + self.assertEqual(actual, {'status': 'bar'}) + + self.client.swallow_results = True + + self.mock_tables.delete.assert_called_with( + projectId=self.project, datasetId=self.dataset, tableId=self.table) + + self.mock_tables.delete.return_value.execute.assert_called_with() + + +class TestParseTableListReponse(unittest.TestCase): + + def test_full_parse(self): + """Ensures we can parse a full list response.""" + + bq = client.BigQueryClient(None, 'project') + + tables = bq._parse_table_list_response(FULL_TABLE_LIST_RESPONSE) + + expected_result = { + 'appspot-3': {'2013_06_appspot_3': 1370044800}, + 'appspot-2': {'2013_06_appspot_2': 1370044800}, + 'appspot-1': {'2013_06_appspot_1': 1370044800}, + 'appspot-6': {'appspot_6_2013_06': 1370044800}, + 'appspot-5': {'2013_06_appspot_5': 1370044800}, + 'appspot-4': {'2013_06_appspot_4': 1370044800}, + 'appspot': {'2013_05_appspot': 1367366400} + } + + self.assertEquals(expected_result, tables) + + def test_empty_parse(self): + """Ensures we can parse an empty dictionary.""" + + bq = client.BigQueryClient(None, 'project') + + tables = bq._parse_table_list_response({}) + + self.assertEquals(tables, {}) + + def test_error(self): + """Ensures we can handle parsing a response error.""" + + error_response = { + "error": { + "errors": [ + { + "domain": "global", + "reason": "required", + "message": "Login Required", + "locationType": "header", + "location": "Authorization" + } + ], + "code": 401, + "message": "Login Required" + } + } + bq = client.BigQueryClient(None, 'project') + + tables = bq._parse_table_list_response(error_response) + + self.assertEquals(tables, {}) + + def test_incorrect_table_formats(self): + """Ensures we can parse incorrectly formatted table ids.""" + + list_response = { + "tables": [ + { + "tableReference": { + "tableId": "somethingwrong" + } + }, + { + "tableReference": { + "tableId": "john-snow" + } + }, + { + "tableReference": { + "tableId": "'------'," + } + }, + { + "tableReference": { + "tableId": "" + } + }, + { + "tableReference": { + "tableId": "adam_west" + } + } + ], + } + bq = client.BigQueryClient(None, 'project') + + tables = bq._parse_table_list_response(list_response) + + self.assertEquals(tables, {}) + + +class TestPushRows(unittest.TestCase): + + def setUp(self): + self.mock_bq_service = mock.Mock() + self.mock_table_data = mock.Mock() + self.mock_bq_service.tabledata.return_value = self.mock_table_data + self.table = 'table' + self.project = 'project' + self.dataset = 'dataset' + self.client = client.BigQueryClient(self.mock_bq_service, self.project) + self.rows = [ + {'one': 'uno', 'two': 'dos'}, {'one': 'ein', 'two': 'zwei'}, + {'two': 'kiwi'}] + self.data = { + "kind": "bigquery#tableDataInsertAllRequest", + "rows": [{'insertId': "uno", 'json': {'one': 'uno', 'two': 'dos'}}, + {'insertId': "ein", 'json': + {'one': 'ein', 'two': 'zwei'}}, + {'json': {'two': 'kiwi'}}] + } + 
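The setUp fixtures above pin down the transformation push_rows is expected to perform on its input: every row becomes a {'json': row} entry, and when the insert_id_key ('one' here) resolves to a value it is copied into insertId. A minimal sketch of that mapping, written to match these fixtures and the dot-separated-key case asserted further down in test_insert_id_key_with_nested_column; it is an illustration, not the library's actual implementation:

from functools import reduce

def build_insert_all_body(rows, insert_id_key=None):
    """Sketch: build a tabledata().insertAll() body from plain row dicts."""
    body_rows = []
    for row in rows:
        entry = {'json': row}
        if insert_id_key is not None:
            # Walk dot-separated keys such as 'nested.col'; if any segment
            # is missing, the insertId is simply omitted, as the tests expect.
            value = reduce(
                lambda d, k: d.get(k, {}) if isinstance(d, dict) else {},
                insert_id_key.split('.'), row)
            if value != {}:
                entry['insertId'] = value
        body_rows.append(entry)
    return {'kind': 'bigquery#tableDataInsertAllRequest', 'rows': body_rows}

# build_insert_all_body(self.rows, 'one') reproduces the self.data body above.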
+ def test_push_failed(self): + """Ensure that if insertAll does not raise an exception, but returns + insertion errors, False is returned. + """ + + self.mock_table_data.insertAll.return_value.execute.return_value = { + 'insertErrors': 'foo'} + + actual = self.client.push_rows(self.dataset, self.table, self.rows, + 'one') + + self.assertFalse(actual) + + self.mock_bq_service.tabledata.assert_called_once_with() + + self.mock_table_data.insertAll.assert_called_once_with( + projectId=self.project, datasetId=self.dataset, tableId=self.table, + body=self.data) + + execute_calls = [mock.call()] + self.mock_table_data.insertAll.return_value.execute.assert_has_calls( + execute_calls) + + def test_push_failed_swallow_results_false(self): + """ + Ensure that if insertAll returns insertion errors and swallow_results + is false that you get an empty dictionary. + """ + self.mock_table_data.insertAll.return_value.execute.return_value = { + 'insertErrors': 'foo'} + self.client.swallow_results = False + + actual = self.client.push_rows(self.dataset, self.table, self.rows, + 'one') + + self.client.swallow_results = True # Reset for other tests + + self.assertEqual( + actual, + self.mock_table_data.insertAll.return_value.execute.return_value) + + def test_push_exception(self): + """Ensure that if insertAll raises an exception, False is returned.""" + + e = HttpError(HttpResponse(404), 'There was an error'.encode('utf8')) + self.mock_table_data.insertAll.return_value.execute.side_effect = e + + actual = self.client.push_rows(self.dataset, self.table, self.rows, + 'one') + + self.assertFalse(actual) + + self.client.swallow_results = False + + actual = self.client.push_rows(self.dataset, self.table, self.rows, + 'one') + + self.assertEqual(actual, { + 'insertErrors': [{ + 'errors': [{ + 'reason': 'httperror', + 'message': e + }] + }]}) + + self.client.swallow_results = True + + self.mock_bq_service.tabledata.assert_called_with() + + self.mock_table_data.insertAll.assert_called_with( + projectId=self.project, datasetId=self.dataset, tableId=self.table, + body=self.data) + + execute_calls = [mock.call()] + self.mock_table_data.insertAll.return_value.execute.assert_has_calls( + execute_calls) + + def test_push_success(self): + """Ensure that if insertAll does not raise an exception, but returns + insertion errors, False is returned. + """ + + self.mock_table_data.insertAll.return_value.execute.return_value = { + 'status': 'foo'} + + actual = self.client.push_rows(self.dataset, self.table, self.rows, + 'one') + + self.assertTrue(actual) + + self.client.swallow_results = False + + actual = self.client.push_rows(self.dataset, self.table, self.rows, + 'one') + + self.assertEqual(actual, {'status': 'foo'}) + + self.client.swallow_results = True + + self.mock_bq_service.tabledata.assert_called_with() + + self.mock_table_data.insertAll.assert_called_with( + projectId=self.project, datasetId=self.dataset, tableId=self.table, + body=self.data) + + execute_calls = [mock.call()] + self.mock_table_data.insertAll.return_value.execute.assert_has_calls( + execute_calls) + + def test_request_data_with_options(self): + """Ensure that insertAll body has optional property only when + the optional parameter of push_rows passed. 
+ """ + expected_body = self.data.copy() + + self.client.push_rows( + self.dataset, self.table, self.rows, + insert_id_key='one') + self.mock_table_data.insertAll.assert_called_with( + projectId=self.project, + datasetId=self.dataset, + tableId=self.table, + body=expected_body) + + self.client.push_rows( + self.dataset, self.table, self.rows, + insert_id_key='one', + ignore_unknown_values=False, + skip_invalid_rows=False) + expected_body['ignoreUnknownValues'] = False + expected_body['skipInvalidRows'] = False + self.mock_table_data.insertAll.assert_called_with( + projectId=self.project, + datasetId=self.dataset, + tableId=self.table, + body=expected_body) + + self.client.push_rows( + self.dataset, self.table, self.rows, + insert_id_key='one', + ignore_unknown_values=True, + skip_invalid_rows=True, + template_suffix='20160428' + ) + expected_body['ignoreUnknownValues'] = True + expected_body['skipInvalidRows'] = True + expected_body['templateSuffix'] = '20160428' + self.mock_table_data.insertAll.assert_called_with( + projectId=self.project, + datasetId=self.dataset, + tableId=self.table, + body=expected_body) + + def test_insert_id_key_with_nested_column(self): + """Ensure that dot separated insert_id_key properly extracted with nested column value.""" + rows = [ + {'nested': {'col': 'nested_col1'}, 'val': 1}, + {'nested': {'col': 'nested_col2'}, 'val': 2}, + ] + expected_body = self.data.copy() + expected_body['rows'] = [ + {'insertId': 'nested_col1', 'json': {'nested': {'col': 'nested_col1'}, 'val': 1}}, + {'insertId': 'nested_col2', 'json': {'nested': {'col': 'nested_col2'}, 'val': 2}}, + ] + + self.client.push_rows(self.dataset, self.table, rows, + insert_id_key='nested.col') + self.mock_table_data.insertAll.assert_called_with( + projectId=self.project, + datasetId=self.dataset, + tableId=self.table, + body=expected_body) + + expected_body = self.data.copy() + expected_body['rows'] = [ + {'insertId': 1, 'json': {'nested': {'col': 'nested_col1'}, 'val': 1}}, + {'insertId': 2, 'json': {'nested': {'col': 'nested_col2'}, 'val': 2}}, + ] + self.client.push_rows(self.dataset, self.table, rows, + insert_id_key='val') + self.mock_table_data.insertAll.assert_called_with( + projectId=self.project, + datasetId=self.dataset, + tableId=self.table, + body=expected_body) + + expected_body = self.data.copy() + expected_body['rows'] = [ + {'json': {'nested': {'col': 'nested_col1'}, 'val': 1}}, + {'json': {'nested': {'col': 'nested_col2'}, 'val': 2}}, + ] + self.client.push_rows(self.dataset, self.table, rows, + insert_id_key='no_such.column') + self.mock_table_data.insertAll.assert_called_with( + projectId=self.project, + datasetId=self.dataset, + tableId=self.table, + body=expected_body) + + +class TestGetAllTables(unittest.TestCase): + + def test_get_all_tables(self): + """Ensure get_all_tables fetches table names from BigQuery.""" + + mock_execute = mock.Mock() + mock_execute.execute.return_value = FULL_TABLE_LIST_RESPONSE + + mock_tables = mock.Mock() + mock_tables.list.return_value = mock_execute + + mock_bq_service = mock.Mock() + mock_bq_service.tables.return_value = mock_tables + + bq = client.BigQueryClient(mock_bq_service, 'project') + + expected_result = [ + '2013_05_appspot', '2013_06_appspot_1', '2013_06_appspot_2', + '2013_06_appspot_3', '2013_06_appspot_4', '2013_06_appspot_5', + 'appspot_6_2013_06', 'table_not_matching_naming' + ] + + tables = bq.get_all_tables('dataset') + self.assertEquals(expected_result, tables) + + def test_get_tables(self): + """Ensure _get_all_tables fetches 
table names from BigQuery.""" + + mock_execute = mock.Mock() + mock_execute.execute.return_value = FULL_TABLE_LIST_RESPONSE + + mock_tables = mock.Mock() + mock_tables.list.return_value = mock_execute + + mock_bq_service = mock.Mock() + mock_bq_service.tables.return_value = mock_tables + + bq = client.BigQueryClient(mock_bq_service, 'project') + + expected_result = { + 'appspot-3': {'2013_06_appspot_3': 1370044800}, + 'appspot-2': {'2013_06_appspot_2': 1370044800}, + 'appspot-1': {'2013_06_appspot_1': 1370044800}, + 'appspot-6': {'appspot_6_2013_06': 1370044800}, + 'appspot-5': {'2013_06_appspot_5': 1370044800}, + 'appspot-4': {'2013_06_appspot_4': 1370044800}, + 'appspot': {'2013_05_appspot': 1367366400} + } + + tables = bq._get_all_tables('dataset', cache=False) + self.assertEquals(expected_result, tables) + + def test_get_all_tables_with_page_token(self): + """Ensure get_all_tables fetches all tables names from BigQuery""" + + mock_execute = mock.Mock() + mock_execute.execute.side_effect = [NEXT_TABLE_LIST_RESPONSE, + FULL_TABLE_LIST_RESPONSE] + + mock_tables = mock.Mock() + mock_tables.list.return_value = mock_execute + + mock_bq_service = mock.Mock() + mock_bq_service.tables.return_value = mock_tables + + bq = client.BigQueryClient(mock_bq_service, 'project') + + expected_result = { + 'appspot-3': {'2013_06_appspot_3': 1370044800}, + 'appspot-2': {'2013_06_appspot_2': 1370044800}, + 'appspot-1': {'2013_06_appspot_1': 1370044800}, + 'appspot-6': {'appspot_6_2013_06': 1370044800}, + 'appspot-5': {'2013_06_appspot_5': 1370044800}, + 'appspot-4': {'2013_06_appspot_4': 1370044800}, + 'appspot': {'2013_05_appspot': 1367366400}, + 'appspot-10': {'2013_06_appspot_10': 1370044800}, + 'appspot-12': {'2013_06_appspot_12': 1370044800}, + 'appspot-11': {'2013_06_appspot_11': 1370044800}, + } + tables = bq._get_all_tables('dataset', cache=False) + self.assertEquals(expected_result, tables) + + def test_get_all_tables_with_cache(self): + """Ensure get_all_tables uses cache when fetching""" + mock_execute = mock.Mock() + mock_execute.execute.return_value = FULL_TABLE_LIST_RESPONSE + + mock_tables = mock.Mock() + mock_tables.list.return_value = mock_execute + + mock_bq_service = mock.Mock() + mock_bq_service.tables.return_value = mock_tables + + bq = client.BigQueryClient(mock_bq_service, 'project') + + expected_result = { + 'appspot-3': {'2013_06_appspot_3': 1370044800}, + 'appspot-2': {'2013_06_appspot_2': 1370044800}, + 'appspot-1': {'2013_06_appspot_1': 1370044800}, + 'appspot-6': {'appspot_6_2013_06': 1370044800}, + 'appspot-5': {'2013_06_appspot_5': 1370044800}, + 'appspot-4': {'2013_06_appspot_4': 1370044800}, + 'appspot': {'2013_05_appspot': 1367366400} + } + + tables = bq._get_all_tables('dataset', cache=True) + self.assertEquals(expected_result, tables) + + mock_execute.execute.side_effect = [NEXT_TABLE_LIST_RESPONSE, + FULL_TABLE_LIST_RESPONSE] + tables = bq._get_all_tables('dataset', cache=True) + self.assertEquals(expected_result, tables) + + expected_result = { + 'appspot-3': {'2013_06_appspot_3': 1370044800}, + 'appspot-2': {'2013_06_appspot_2': 1370044800}, + 'appspot-1': {'2013_06_appspot_1': 1370044800}, + 'appspot-6': {'appspot_6_2013_06': 1370044800}, + 'appspot-5': {'2013_06_appspot_5': 1370044800}, + 'appspot-4': {'2013_06_appspot_4': 1370044800}, + 'appspot': {'2013_05_appspot': 1367366400}, + 'appspot-10': {'2013_06_appspot_10': 1370044800}, + 'appspot-12': {'2013_06_appspot_12': 1370044800}, + 'appspot-11': {'2013_06_appspot_11': 1370044800}, + } + tables = 
bq._get_all_tables('dataset', cache=False) + self.assertEquals(expected_result, tables) + + +class TestGetTables(unittest.TestCase): + + def test_get_tables(self): + """Ensure tables falling in the time window are returned.""" + + mock_execute = mock.Mock() + mock_execute.execute.return_value = FULL_TABLE_LIST_RESPONSE + + mock_tables = mock.Mock() + mock_tables.list.return_value = mock_execute + + mock_bq_service = mock.Mock() + mock_bq_service.tables.return_value = mock_tables + + bq = client.BigQueryClient(mock_bq_service, 'project') + + tables = bq.get_tables('dataset', 'appspot-1', 0, 10000000000) + six.assertCountEqual(self, tables, ['2013_06_appspot_1']) + + def test_get_tables_from_datetimes(self): + """Ensure tables falling in the time window, specified with datetimes, + are returned. + """ + from datetime import datetime + + mock_execute = mock.Mock() + mock_execute.execute.return_value = FULL_TABLE_LIST_RESPONSE + + mock_tables = mock.Mock() + mock_tables.list.return_value = mock_execute + + mock_bq_service = mock.Mock() + mock_bq_service.tables.return_value = mock_tables + + bq = client.BigQueryClient(mock_bq_service, 'project') + + start = datetime(2013, 5, 10) + end = datetime(2013, 7, 10) + + tables = bq.get_tables('dataset', 'appspot-1', start, end) + six.assertCountEqual(self, tables, ['2013_06_appspot_1']) + + +# +# Dataset tests +# +class TestCreateDataset(unittest.TestCase): + + def setUp(self): + self.mock_bq_service = mock.Mock() + self.mock_datasets = mock.Mock() + self.mock_bq_service.datasets.return_value = self.mock_datasets + self.dataset = 'dataset' + self.project = 'project' + self.client = client.BigQueryClient(self.mock_bq_service, self.project) + self.friendly_name = "friendly name" + self.description = "description" + self.access = [{'userByEmail': "bob@gmail.com"}] + self.body = { + 'datasetReference': { + 'datasetId': self.dataset, + 'projectId': self.project}, + 'friendlyName': self.friendly_name, + 'description': self.description, + 'access': self.access + } + + def test_dataset_create_failed(self): + """Ensure that if creating the table fails, False is returned.""" + + self.mock_datasets.insert.return_value.execute.side_effect = \ + HttpError(HttpResponse(404), 'There was an error'.encode('utf8')) + + actual = self.client.create_dataset(self.dataset, + friendly_name=self.friendly_name, + description=self.description, + access=self.access) + self.assertFalse(actual) + + self.client.swallow_results = False + + actual = self.client.create_dataset(self.dataset, + friendly_name=self.friendly_name, + description=self.description, + access=self.access) + + self.assertEqual(actual, {}) + + self.client.swallow_results = True + + self.mock_datasets.insert.assert_called_with( + projectId=self.project, body=self.body) + + self.mock_datasets.insert.return_value.execute. 
\ + assert_called_with() + + def test_dataset_create_success(self): + """Ensure that if creating the table fails, False is returned.""" + + self.mock_datasets.insert.return_value.execute.side_effect = [{ + 'status': 'foo'}, {'status': 'bar'}] + + actual = self.client.create_dataset(self.dataset, + self.friendly_name, + self.description, + self.access) + self.assertTrue(actual) + + self.client.swallow_results = False + + actual = self.client.create_dataset(self.dataset, + self.friendly_name, + self.description, + self.access) + + self.assertEqual(actual, {'status': 'bar'}) + + self.client.swallow_results = True + + self.mock_datasets.insert.assert_called_with( + projectId=self.project, body=self.body) + + self.mock_datasets.insert.return_value.execute. \ + assert_called_with() + + +class TestDeleteDataset(unittest.TestCase): + + def setUp(self): + self.mock_bq_service = mock.Mock() + self.mock_datasets = mock.Mock() + self.mock_bq_service.datasets.return_value = self.mock_datasets + self.project = 'project' + self.dataset = 'dataset' + self.client = client.BigQueryClient(self.mock_bq_service, self.project) + + def test_delete_datasets_fail(self): + """Ensure that if deleting table fails, False is returned.""" + + self.mock_datasets.delete.return_value.execute.side_effect = \ + HttpError(HttpResponse(404), 'There was an error'.encode('utf8')) + + actual = self.client.delete_dataset(self.dataset) + + self.assertFalse(actual) + + self.mock_datasets.delete.assert_called_once_with( + projectId=self.project, datasetId=self.dataset, + deleteContents=False) + + self.client.swallow_results = False + + actual = self.client.delete_dataset(self.dataset) + + self.assertEqual(actual, {}) + + self.client.swallow_results = True + + self.mock_datasets.delete.return_value.execute. \ + assert_called_with() + + def test_delete_datasets_success(self): + """Ensure that if deleting table succeeds, True is returned.""" + + self.mock_datasets.delete.return_value.execute.side_effect = [{ + 'status': 'foo'}, {'status': 'bar'}] + + actual = self.client.delete_dataset(self.dataset) + + self.assertTrue(actual) + + self.client.swallow_results = False + + actual = self.client.delete_dataset(self.dataset) + + self.assertEqual(actual, {'status': 'bar'}) + + self.client.swallow_results = True + + self.mock_datasets.delete.assert_called_with( + projectId=self.project, datasetId=self.dataset, + deleteContents=False) + + self.mock_datasets.delete.return_value.execute. \ + assert_called_with() + + def test_delete_datasets_delete_contents_success(self): + """Ensure that if deleting table succeeds, True is returned.""" + + self.mock_datasets.delete.return_value.execute.side_effect = [{ + 'status': 'foo'}, {'status': 'bar'}] + + actual = self.client.delete_dataset(self.dataset, True) + + self.assertTrue(actual) + + self.client.swallow_results = False + + actual = self.client.delete_dataset(self.dataset, True) + + self.assertEqual(actual, {'status': 'bar'}) + + self.client.swallow_results = True + + self.mock_datasets.delete.assert_called_with( + projectId=self.project, datasetId=self.dataset, + deleteContents=True) + + self.mock_datasets.delete.return_value.execute. 
\ + assert_called_with() + + +FULL_DATASET_LIST_RESPONSE = { + "kind": "bigquery#dataseteList", + "etag": "\"GSclnjk0zID1ucM3F-xYinOm1oE/cn58Rpu8v8pB4eoJQaiTe11lPQc\"", + "datasets": [ + { + "kind": "bigquery#dataset", + "id": "project:dataset1", + "datasetReference": { + "projectId": "project", + "datasetId": "dataset1" + } + }, + { + "kind": "bigquery#dataset", + "id": "project:dataset2", + "datasetReference": { + "projectId": "project", + "datasetId": "dataset2", + } + }, + { + "kind": "bigquery#dataset", + "id": "project:dataset3", + "datasetReference": { + "projectId": "project", + "datasetId": "dataset3" + } + }, + { + "kind": "bigquery#dataset", + "id": "project:dataset4", + "datasetReference": { + "projectId": "project", + "datasetId": "dataset4" + } + }, + { + "kind": "bigquery#dataset", + "id": "project:dataset5", + "datasetReference": { + "projectId": "project", + "datasetId": "dataset5" + } + }, + { + "kind": "bigquery#dataset", + "id": "project:dataset6", + "datasetReference": { + "projectId": "project", + "datasetId": "dataset6" + } + }, + { + "kind": "bigquery#dataset", + "id": "project:dataset7", + "datasetReference": { + "projectId": "project", + "datasetId": "dataset7" + } + }, + { + "kind": "bigquery#dataset", + "id": "bad dataset data" + } + ], + "totalItems": 8 +} + + +class TestGetDatasets(unittest.TestCase): + + def test_get_datasets(self): + """Ensure datasets are returned.""" + + mock_execute = mock.Mock() + mock_execute.execute.return_value = FULL_DATASET_LIST_RESPONSE + + mock_datasets = mock.Mock() + mock_datasets.list.return_value = mock_execute + + mock_bq_service = mock.Mock() + mock_bq_service.datasets.return_value = mock_datasets + + bq = client.BigQueryClient(mock_bq_service, 'project') + + datasets = bq.get_datasets() + six.assertCountEqual(self, datasets, + FULL_DATASET_LIST_RESPONSE['datasets']) + + def test_get_datasets_returns_no_list(self): + """Ensure we handle the no datasets case""" + mock_execute = mock.Mock() + mock_execute.execute.return_value = { + "kind": "bigquery#dataseteList", + "etag": "\"GSclnjk0zID1ucM3F-xYinOm1oE/cn58Rpu8v8pB4eoJQaiTe11lP\"" + } + + mock_datasets = mock.Mock() + mock_datasets.list.return_value = mock_execute + + mock_bq_service = mock.Mock() + mock_bq_service.datasets.return_value = mock_datasets + + bq = client.BigQueryClient(mock_bq_service, 'project') + + datasets = bq.get_datasets() + six.assertCountEqual(self, datasets, []) + + +class TestUpdateDataset(unittest.TestCase): + + def setUp(self): + self.mock_bq_service = mock.Mock() + self.mock_datasets = mock.Mock() + self.mock_bq_service.datasets.return_value = self.mock_datasets + self.dataset = 'dataset' + self.project = 'project' + self.client = client.BigQueryClient(self.mock_bq_service, self.project) + self.friendly_name = "friendly name" + self.description = "description" + self.access = [{'userByEmail': "bob@gmail.com"}] + self.body = { + 'datasetReference': { + 'datasetId': self.dataset, + 'projectId': self.project}, + 'friendlyName': self.friendly_name, + 'description': self.description, + 'access': self.access + } + + def test_dataset_update_failed(self): + """Ensure that if creating the table fails, False is returned.""" + + self.mock_datasets.update.return_value.execute.side_effect = \ + HttpError(HttpResponse(404), 'There was an error'.encode('utf8')) + + actual = self.client.update_dataset(self.dataset, + friendly_name=self.friendly_name, + description=self.description, + access=self.access) + self.assertFalse(actual) + + self.client.swallow_results = 
False + + actual = self.client.update_dataset(self.dataset, + friendly_name=self.friendly_name, + description=self.description, + access=self.access) + + self.assertEqual(actual, {}) + + self.client.swallow_results = True + + self.mock_datasets.update.assert_called_with( + projectId=self.project, datasetId=self.dataset, body=self.body) + + self.mock_datasets.update.return_value.execute. \ + assert_called_with() + + def test_dataset_update_success(self): + """Ensure that if creating the table fails, False is returned.""" + + self.mock_datasets.update.return_value.execute.side_effect = [{ + 'status': 'foo'}, {'status': 'bar'}] + + actual = self.client.update_dataset(self.dataset, + self.friendly_name, + self.description, + self.access) + self.assertTrue(actual) + + self.client.swallow_results = False + + actual = self.client.update_dataset(self.dataset, + self.friendly_name, + self.description, + self.access) + + self.assertEqual(actual, {'status': 'bar'}) + + self.client.swallow_results = True + + self.mock_datasets.update.assert_called_with( + projectId=self.project, datasetId=self.dataset, body=self.body) + + self.mock_datasets.update.return_value.execute. \ + assert_called_with() diff --git a/bigquery/tests/test_schema_builder.py b/bigquery/tests/test_schema_builder.py index 060162b..80023d4 100644 --- a/bigquery/tests/test_schema_builder.py +++ b/bigquery/tests/test_schema_builder.py @@ -15,7 +15,7 @@ def test_str_is_string(self): six.assertCountEqual(self, bigquery_type("Bob"), 'string') def test_unicode_is_string(self): - six.assertCountEqual(self, bigquery_type(u"Here is a happy face \u263A"), + six.assertCountEqual(self, bigquery_type("Here is a happy face \u263A"), 'string') def test_int_is_integer(self): diff --git a/bigquery/tests/test_schema_builder.py.bak b/bigquery/tests/test_schema_builder.py.bak new file mode 100644 index 0000000..060162b --- /dev/null +++ b/bigquery/tests/test_schema_builder.py.bak @@ -0,0 +1,140 @@ +from six.moves.builtins import object +from datetime import datetime +import unittest + +import six +from bigquery.schema_builder import schema_from_record +from bigquery.schema_builder import describe_field +from bigquery.schema_builder import bigquery_type +from bigquery.schema_builder import InvalidTypeException + + +class TestBigQueryTypes(unittest.TestCase): + + def test_str_is_string(self): + six.assertCountEqual(self, bigquery_type("Bob"), 'string') + + def test_unicode_is_string(self): + six.assertCountEqual(self, bigquery_type(u"Here is a happy face \u263A"), + 'string') + + def test_int_is_integer(self): + six.assertCountEqual(self, bigquery_type(123), 'integer') + + def test_datetime_is_timestamp(self): + six.assertCountEqual(self, bigquery_type(datetime.now()), 'timestamp') + + def test_isoformat_timestring(self): + six.assertCountEqual(self, bigquery_type(datetime.now().isoformat()), + 'timestamp') + + def test_timestring_feb_20_1973(self): + six.assertCountEqual(self, bigquery_type("February 20th 1973"), + 'timestamp') + + def test_timestring_thu_1_july_2004_22_30_00(self): + six.assertCountEqual(self, bigquery_type("Thu, 1 July 2004 22:30:00"), + 'timestamp') + + def test_today_is_not_timestring(self): + six.assertCountEqual(self, bigquery_type("today"), 'string') + + def test_timestring_next_thursday(self): + six.assertCountEqual(self, bigquery_type("February 20th 1973"), 'timestamp') + + def test_timestring_arbitrary_fn_success(self): + six.assertCountEqual( + self, bigquery_type("whatever", timestamp_parser=lambda x: True), + 'timestamp') + + 
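The arbitrary-parser tests here and just below show that the timestamp heuristic is pluggable; schema_from_record accepts the same timestamp_parser keyword (see TestSchemaGenerator further down), so a stricter parser can keep free-form strings from being typed as timestamps. A hedged usage sketch, with an ISO-8601-only parser that is purely illustrative and not the library's default heuristic:

from datetime import datetime

from bigquery.schema_builder import schema_from_record

def iso8601_only(value):
    """Treat only 'YYYY-MM-DDTHH:MM:SS' strings as timestamps."""
    try:
        datetime.strptime(value, '%Y-%m-%dT%H:%M:%S')
        return True
    except (TypeError, ValueError):
        return False

record = {'created_at': '2013-06-01T00:00:00', 'note': 'February 20th 1973'}
schema = schema_from_record(record, timestamp_parser=iso8601_only)
# 'created_at' is typed as a timestamp field; 'note' stays a string, even
# though the default heuristic accepts it (see test_timestring_feb_20_1973).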
def test_timestring_arbitrary_fn_fail(self): + six.assertCountEqual( + self, bigquery_type("February 20th 1973", + timestamp_parser=lambda x: False), + 'string') + + def test_class_instance_is_invalid_type(self): + class SomeClass(object): + pass + + self.assertIsNone(bigquery_type(SomeClass())) + + def test_list_is_invalid_type(self): + self.assertIsNone(bigquery_type([1, 2, 3])) + + def test_dict_is_record(self): + six.assertCountEqual(self, bigquery_type({"a": 1}), 'record') + + +class TestFieldDescription(unittest.TestCase): + + def test_simple_string_field(self): + six.assertCountEqual(self, describe_field("user", "Bob"), + {"name": "user", "type": "string", "mode": + "nullable"}) + + +class TestSchemaGenerator(unittest.TestCase): + + def test_simple_record(self): + record = {"username": "Bob", "id": 123} + schema = [{"name": "username", "type": "string", "mode": "nullable"}, + {"name": "id", "type": "integer", "mode": "nullable"}] + + six.assertCountEqual(self, schema_from_record(record), schema) + + def test_hierarchical_record(self): + record = {"user": {"username": "Bob", "id": 123}} + schema = [{"name": "user", "type": "record", "mode": "nullable", + "fields": [{"name": "username", "type": "string", "mode": + "nullable"}, {"name": "id", "type": "integer", + "mode": "nullable"}]}] + generated_schema = schema_from_record(record) + schema_fields = schema[0].pop('fields') + generated_fields = generated_schema[0].pop('fields') + six.assertCountEqual(self, schema_fields, generated_fields) + six.assertCountEqual(self, generated_schema, schema) + + def test_hierarchical_record_with_timestamps(self): + record = {"global": "2001-01-01", "user": {"local": "2001-01-01"}} + + schema_with_ts = [ + {"name": "global", "type": "timestamp", "mode": "nullable"}, + {"name": "user", "type": "record", "mode": "nullable", + "fields": [{ + "name": "local", + "type": "timestamp", + "mode": "nullable"}]}] + + schema_without_ts = [ + {"name": "global", "type": "string", "mode": "nullable"}, + {"name": "user", "type": "record", "mode": "nullable", + "fields": [{ + "name": "local", + "type": "string", + "mode": "nullable"}]}] + + six.assertCountEqual(self, schema_from_record(record), schema_with_ts) + + six.assertCountEqual( + self, schema_from_record(record, timestamp_parser=lambda x: False), + schema_without_ts) + + def test_repeated_field(self): + record = {"ids": [1, 2, 3, 4, 5]} + schema = [{"name": "ids", "type": "integer", "mode": "repeated"}] + + six.assertCountEqual(self, schema_from_record(record), schema) + + def test_nested_invalid_type_reported_correctly(self): + key = "wrong answer" + value = "wrong answer" + + try: + schema_from_record({"a": {"b": [{"c": None}]}}) + except InvalidTypeException as e: + key = e.key + value = e.value + + self.assertEqual(key, "a.b.c") + self.assertEqual(value, None) From f54113842f199650d733d791c2d278e255e90ae8 Mon Sep 17 00:00:00 2001 From: Python3pkg Date: Thu, 18 May 2017 10:18:37 -0700 Subject: [PATCH 2/6] Remove .bak --- bigquery/__init__.py.bak | 21 - bigquery/client.py.bak | 1932 -------------- bigquery/query_builder.py.bak | 397 --- bigquery/schema_builder.py.bak | 145 - bigquery/tests/test_client.py.bak | 2902 --------------------- bigquery/tests/test_schema_builder.py.bak | 140 - 6 files changed, 5537 deletions(-) delete mode 100644 bigquery/__init__.py.bak delete mode 100644 bigquery/client.py.bak delete mode 100644 bigquery/query_builder.py.bak delete mode 100644 bigquery/schema_builder.py.bak delete mode 100644 bigquery/tests/test_client.py.bak 
delete mode 100644 bigquery/tests/test_schema_builder.py.bak diff --git a/bigquery/__init__.py.bak b/bigquery/__init__.py.bak deleted file mode 100644 index b393875..0000000 --- a/bigquery/__init__.py.bak +++ /dev/null @@ -1,21 +0,0 @@ -from __future__ import absolute_import - -from .version import __version__ - -from .client import get_client -from .client import ( - BIGQUERY_SCOPE, - BIGQUERY_SCOPE_READ_ONLY, - JOB_CREATE_IF_NEEDED, - JOB_CREATE_NEVER, - JOB_SOURCE_FORMAT_NEWLINE_DELIMITED_JSON, - JOB_SOURCE_FORMAT_DATASTORE_BACKUP, - JOB_SOURCE_FORMAT_CSV, - JOB_WRITE_TRUNCATE, - JOB_WRITE_APPEND, - JOB_WRITE_EMPTY, - JOB_ENCODING_UTF_8, - JOB_ENCODING_ISO_8859_1 -) - -from .schema_builder import schema_from_record diff --git a/bigquery/client.py.bak b/bigquery/client.py.bak deleted file mode 100644 index 17a3a89..0000000 --- a/bigquery/client.py.bak +++ /dev/null @@ -1,1932 +0,0 @@ -import calendar -import json -from logging import getLogger, NullHandler -from collections import defaultdict -from datetime import datetime, timedelta -from hashlib import sha256 -from io import StringIO -from time import sleep, time -from functools import reduce - -import six -from bigquery.errors import (BigQueryTimeoutException, JobExecutingException, - JobInsertException, UnfinishedQueryException) -from googleapiclient.discovery import build, DISCOVERY_URI -from googleapiclient.errors import HttpError -from httplib2 import Http - -BIGQUERY_SCOPE = [ - 'https://www.googleapis.com/auth/bigquery' -] - -BIGQUERY_SCOPE_READ_ONLY = [ - 'https://www.googleapis.com/auth/bigquery.readonly' -] - -CACHE_TIMEOUT = timedelta(seconds=30) - -JOB_CREATE_IF_NEEDED = 'CREATE_IF_NEEDED' -JOB_CREATE_NEVER = 'CREATE_NEVER' -JOB_WRITE_TRUNCATE = 'WRITE_TRUNCATE' -JOB_WRITE_APPEND = 'WRITE_APPEND' -JOB_WRITE_EMPTY = 'WRITE_EMPTY' -JOB_ENCODING_UTF_8 = 'UTF-8' -JOB_ENCODING_ISO_8859_1 = 'ISO-8859-1' -JOB_PRIORITY_INTERACTIVE = 'INTERACTIVE' -JOB_PRIORITY_BATCH = 'BATCH' -JOB_COMPRESSION_NONE = 'NONE' -JOB_COMPRESSION_GZIP = 'GZIP' - -JOB_FORMAT_CSV = 'CSV' -JOB_FORMAT_NEWLINE_DELIMITED_JSON = 'NEWLINE_DELIMITED_JSON' -JOB_SOURCE_FORMAT_DATASTORE_BACKUP = 'DATASTORE_BACKUP' -JOB_SOURCE_FORMAT_NEWLINE_DELIMITED_JSON = JOB_FORMAT_NEWLINE_DELIMITED_JSON -JOB_SOURCE_FORMAT_CSV = JOB_FORMAT_CSV -JOB_DESTINATION_FORMAT_AVRO = 'AVRO' -JOB_DESTINATION_FORMAT_NEWLINE_DELIMITED_JSON = \ - JOB_FORMAT_NEWLINE_DELIMITED_JSON -JOB_DESTINATION_FORMAT_CSV = JOB_FORMAT_CSV - -logger = getLogger(__name__) -logger.addHandler(NullHandler()) - - -def get_client(project_id=None, credentials=None, - service_url=None, service_account=None, - private_key=None, private_key_file=None, - json_key=None, json_key_file=None, - readonly=True, swallow_results=True): - """Return a singleton instance of BigQueryClient. Either - AssertionCredentials or a service account and private key combination need - to be provided in order to authenticate requests to BigQuery. - - Parameters - ---------- - project_id : str, optional - The BigQuery project id, required unless json_key or json_key_file is - provided. - credentials : oauth2client.client.SignedJwtAssertionCredentials, optional - AssertionCredentials instance to authenticate requests to BigQuery - (optional, must provide `service_account` and (`private_key` or - `private_key_file`) or (`json_key` or `json_key_file`) if not included - service_url : str, optional - A URI string template pointing to the location of Google's API - discovery service. 
Requires two parameters {api} and {apiVersion} that - when filled in produce an absolute URI to the discovery document for - that service. If not set then the default googleapiclient discovery URI - is used. See `credentials` - service_account : str, optional - The Google API service account name. See `credentials` - private_key : str, optional - The private key associated with the service account in PKCS12 or PEM - format. See `credentials` - private_key_file : str, optional - The name of the file containing the private key associated with the - service account in PKCS12 or PEM format. See `credentials` - json_key : dict, optional - The JSON key associated with the service account. See `credentials` - json_key_file : str, optional - The name of the JSON key file associated with the service account. See - `credentials`. - readonly : bool - Bool indicating if BigQuery access is read-only. Has no effect if - credentials are provided. Default True. - swallow_results : bool - If set to False, then return the actual response value instead of - converting to boolean. Default True. - - Returns - ------- - BigQueryClient - An instance of the BigQuery client. - """ - - if not credentials: - assert (service_account and (private_key or private_key_file)) or ( - json_key or json_key_file), \ - 'Must provide AssertionCredentials or service account and P12 key\ - or JSON key' - - if not project_id: - assert json_key or json_key_file, \ - 'Must provide project_id unless json_key or json_key_file is\ - provided' - - if service_url is None: - service_url = DISCOVERY_URI - - scope = BIGQUERY_SCOPE_READ_ONLY if readonly else BIGQUERY_SCOPE - - if private_key_file: - credentials = _credentials().from_p12_keyfile(service_account, - private_key_file, - scopes=scope) - - if private_key: - try: - if isinstance(private_key, basestring): - private_key = private_key.decode('utf-8') - except NameError: - # python3 -- private_key is already unicode - pass - credentials = _credentials().from_p12_keyfile_buffer( - service_account, - StringIO(private_key), - scopes=scope) - - if json_key_file: - with open(json_key_file, 'r') as key_file: - json_key = json.load(key_file) - - if json_key: - credentials = _credentials().from_json_keyfile_dict(json_key, - scopes=scope) - if not project_id: - project_id = json_key['project_id'] - - bq_service = _get_bq_service(credentials=credentials, - service_url=service_url) - - return BigQueryClient(bq_service, project_id, swallow_results) - - -def get_projects(bq_service): - """Given the BigQuery service, return data about all projects.""" - projects_request = bq_service.projects().list().execute() - - projects = [] - for project in projects_request.get('projects', []): - project_data = { - 'id': project['id'], - 'name': project['friendlyName'] - } - projects.append(project_data) - return projects - - -def _get_bq_service(credentials=None, service_url=None): - """Construct an authorized BigQuery service object.""" - - assert credentials, 'Must provide ServiceAccountCredentials' - - http = credentials.authorize(Http()) - service = build('bigquery', 'v2', http=http, - discoveryServiceUrl=service_url) - - return service - - -def _credentials(): - """Import and return SignedJwtAssertionCredentials class""" - from oauth2client.service_account import ServiceAccountCredentials - - return ServiceAccountCredentials - - -class BigQueryClient(object): - - def __init__(self, bq_service, project_id, swallow_results=True): - self.bigquery = bq_service - self.project_id = project_id - 
self.swallow_results = swallow_results - self.cache = {} - - def _submit_query_job(self, query_data): - """ Submit a query job to BigQuery. - - This is similar to BigQueryClient.query, but gives the user - direct access to the query method on the offical BigQuery - python client. - - For fine-grained control over a query job, see: - https://google-api-client-libraries.appspot.com/documentation/bigquery/v2/python/latest/bigquery_v2.jobs.html#query - - Parameters - ---------- - query_data - query object as per "configuration.query" in - https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.query - - Returns - ------- - tuple - job id and query results if query completed. If dry_run is True, - job id will be None and results will be empty if the query is valid - or a dict containing the response if invalid. - - Raises - ------ - BigQueryTimeoutException - On timeout - """ - - logger.debug('Submitting query job: %s' % query_data) - - job_collection = self.bigquery.jobs() - - try: - query_reply = job_collection.query( - projectId=self.project_id, body=query_data).execute() - except HttpError as e: - if query_data.get("dryRun", False): - return None, json.loads(e.content.decode('utf8')) - raise - - job_id = query_reply['jobReference'].get('jobId') - schema = query_reply.get('schema', {'fields': None})['fields'] - rows = query_reply.get('rows', []) - job_complete = query_reply.get('jobComplete', False) - - # raise exceptions if it's not an async query - # and job is not completed after timeout - if not job_complete and query_data.get("timeoutMs", False): - logger.error('BigQuery job %s timeout' % job_id) - raise BigQueryTimeoutException() - - return job_id, [self._transform_row(row, schema) for row in rows] - - def _insert_job(self, body_object): - """ Submit a job to BigQuery - - Direct proxy to the insert() method of the offical BigQuery - python client. - - Able to submit load, link, query, copy, or extract jobs. - - For more details, see: - https://google-api-client-libraries.appspot.com/documentation/bigquery/v2/python/latest/bigquery_v2.jobs.html#insert - - Parameters - ---------- - body_object : body object passed to bigquery.jobs().insert() - - Returns - ------- - response of the bigquery.jobs().insert().execute() call - - Raises - ------ - BigQueryTimeoutException on timeout - """ - - logger.debug('Submitting job: %s' % body_object) - - job_collection = self.bigquery.jobs() - - return job_collection.insert( - projectId=self.project_id, - body=body_object - ).execute() - - def query(self, query, max_results=None, timeout=0, dry_run=False, use_legacy_sql=None, external_udf_uris=None): - """Submit a query to BigQuery. - - Parameters - ---------- - query : str - BigQuery query string - max_results : int, optional - The maximum number of rows to return per page of results. - timeout : float, optional - How long to wait for the query to complete, in seconds before - the request times out and returns. - dry_run : bool, optional - If True, the query isn't actually run. A valid query will return an - empty response, while an invalid one will return the same error - message it would if it wasn't a dry run. - use_legacy_sql : bool, optional. Default True. - If False, the query will use BigQuery's standard SQL (https://cloud.google.com/bigquery/sql-reference/) - external_udf_uris : list, optional - Contains external UDF URIs. If given, URIs must be Google Cloud - Storage and have .js extensions. - - - Returns - ------- - tuple - (job id, query results) if the query completed. 
If dry_run is True, - job id will be None and results will be empty if the query is valid - or a ``dict`` containing the response if invalid. - - Raises - ------ - BigQueryTimeoutException - on timeout - """ - - logger.debug('Executing query: %s' % query) - - query_data = { - 'query': query, - 'timeoutMs': timeout * 1000, - 'dryRun': dry_run, - 'maxResults': max_results - } - - if use_legacy_sql is not None: - query_data['useLegacySql'] = use_legacy_sql - - if external_udf_uris: - query_data['userDefinedFunctionResources'] = \ - [ {'resourceUri': u} for u in external_udf_uris ] - - return self._submit_query_job(query_data) - - def get_query_schema(self, job_id): - """Retrieve the schema of a query by job id. - - Parameters - ---------- - job_id : str - The job_id that references a BigQuery query - - Returns - ------- - list - A ``list`` of ``dict`` objects that represent the schema. - """ - - query_reply = self.get_query_results(job_id, offset=0, limit=0) - - if not query_reply['jobComplete']: - logger.warning('BigQuery job %s not complete' % job_id) - raise UnfinishedQueryException() - - return query_reply['schema']['fields'] - - def get_table_schema(self, dataset, table): - """Return the table schema. - - Parameters - ---------- - dataset : str - The dataset containing the `table`. - table : str - The table to get the schema for - - Returns - ------- - list - A ``list`` of ``dict`` objects that represent the table schema. If - the table doesn't exist, None is returned. - """ - - try: - result = self.bigquery.tables().get( - projectId=self.project_id, - tableId=table, - datasetId=dataset).execute() - except HttpError as e: - if int(e.resp['status']) == 404: - logger.warn('Table %s.%s does not exist', dataset, table) - return None - raise - - return result['schema']['fields'] - - def check_job(self, job_id): - """Return the state and number of results of a query by job id. - - Parameters - ---------- - job_id : str - The job id of the query to check. - - Returns - ------- - tuple - (``bool``, ``int``) Whether or not the query has completed and the - total number of rows included in the query table if it has - completed (else 0) - """ - - query_reply = self.get_query_results(job_id, offset=0, limit=0) - - return (query_reply.get('jobComplete', False), - int(query_reply.get('totalRows', 0))) - - def get_query_rows(self, job_id, offset=None, limit=None, timeout=0): - """Retrieve a list of rows from a query table by job id. - This method will append results from multiple pages together. If you - want to manually page through results, you can use `get_query_results` - method directly. - - Parameters - ---------- - job_id : str - The job id that references a BigQuery query. - offset : int, optional - The offset of the rows to pull from BigQuery - limit : int, optional - The number of rows to retrieve from a query table. - timeout : float, optional - Timeout in seconds. - - Returns - ------- - list - A ``list`` of ``dict`` objects that represent table rows. 
- """ - - # Get query results - query_reply = self.get_query_results(job_id, offset=offset, - limit=limit, timeout=timeout) - if not query_reply['jobComplete']: - logger.warning('BigQuery job %s not complete' % job_id) - raise UnfinishedQueryException() - - schema = query_reply["schema"]["fields"] - rows = query_reply.get('rows', []) - page_token = query_reply.get("pageToken") - records = [self._transform_row(row, schema) for row in rows] - - # Append to records if there are multiple pages for query results - while page_token and (not limit or len(records) < limit): - query_reply = self.get_query_results( - job_id, offset=offset, limit=limit, page_token=page_token, - timeout=timeout) - page_token = query_reply.get("pageToken") - rows = query_reply.get('rows', []) - records += [self._transform_row(row, schema) for row in rows] - return records[:limit] if limit else records - - def check_dataset(self, dataset_id): - """Check to see if a dataset exists. - - Parameters - ---------- - dataset_id : str - Dataset unique id - - Returns - ------- - bool - True if dataset at `dataset_id` exists, else Fasle - """ - dataset = self.get_dataset(dataset_id) - return bool(dataset) - - def get_dataset(self, dataset_id): - """Retrieve a dataset if it exists, otherwise return an empty dict. - - Parameters - ---------- - dataset_id : str - Dataset unique id - - Returns - ------- - dict - Contains dataset object if it exists, else empty - """ - try: - dataset = self.bigquery.datasets().get( - projectId=self.project_id, datasetId=dataset_id).execute() - except HttpError: - dataset = {} - - return dataset - - def check_table(self, dataset, table): - """Check to see if a table exists. - - Parameters - ---------- - dataset : str - The dataset to check - table : str - The name of the table - - Returns - ------- - bool - True if table exists, else False - """ - table = self.get_table(dataset, table) - return bool(table) - - def get_table(self, dataset, table): - """ Retrieve a table if it exists, otherwise return an empty dict. - - Parameters - ---------- - dataset : str - The dataset that the table is in - table : str - The name of the table - - Returns - ------- - dict - Containing the table object if it exists, else empty - """ - try: - table = self.bigquery.tables().get( - projectId=self.project_id, datasetId=dataset, - tableId=table).execute() - except HttpError: - table = {} - - return table - - def create_table(self, dataset, table, schema, - expiration_time=None, time_partitioning=False): - """Create a new table in the dataset. - - Parameters - ---------- - dataset : str - The dataset to create the table in - table : str - The name of the table to create - schema : dict - The table schema - expiration_time : float, optional - The expiry time in milliseconds since the epoch. - time_partitioning : bool, optional - Create a time partitioning. 
- - Returns - ------- - Union[bool, dict] - If the table was successfully created, or response from BigQuery - if swallow_results is set to False - """ - - body = { - 'schema': {'fields': schema}, - 'tableReference': { - 'tableId': table, - 'projectId': self.project_id, - 'datasetId': dataset - } - } - - if expiration_time is not None: - body['expirationTime'] = expiration_time - - if time_partitioning: - body['timePartitioning'] = {'type': 'DAY'} - - try: - table = self.bigquery.tables().insert( - projectId=self.project_id, - datasetId=dataset, - body=body - ).execute() - if self.swallow_results: - return True - else: - return table - - except HttpError as e: - logger.error(('Cannot create table {0}.{1}\n' - 'Http Error: {2}').format(dataset, table, e.content)) - if self.swallow_results: - return False - else: - return {} - - def update_table(self, dataset, table, schema): - """Update an existing table in the dataset. - - Parameters - ---------- - dataset : str - The dataset to update the table in - table : str - The name of the table to update - schema : dict - Table schema - - Returns - ------- - Union[bool, dict] - bool indicating if the table was successfully updated or not, - or response from BigQuery if swallow_results is set to False. - """ - - body = { - 'schema': {'fields': schema}, - 'tableReference': { - 'tableId': table, - 'projectId': self.project_id, - 'datasetId': dataset - } - } - - try: - result = self.bigquery.tables().update( - projectId=self.project_id, - datasetId=dataset, - body=body - ).execute() - if self.swallow_results: - return True - else: - return result - - except HttpError as e: - logger.error(('Cannot update table {0}.{1}\n' - 'Http Error: {2}').format(dataset, table, e.content)) - if self.swallow_results: - return False - else: - return {} - - def patch_table(self, dataset, table, schema): - """Patch an existing table in the dataset. - - Parameters - ---------- - dataset : str - The dataset to patch the table in - table : str - The name of the table to patch - schema : dict - The table schema - - Returns - ------- - Union[bool, dict] - Bool indicating if the table was successfully patched or not, - or response from BigQuery if swallow_results is set to False - """ - - body = { - 'schema': {'fields': schema}, - 'tableReference': { - 'tableId': table, - 'projectId': self.project_id, - 'datasetId': dataset - } - } - - try: - result = self.bigquery.tables().patch( - projectId=self.project_id, - datasetId=dataset, - body=body - ).execute() - if self.swallow_results: - return True - else: - return result - - except HttpError as e: - logger.error(('Cannot patch table {0}.{1}\n' - 'Http Error: {2}').format(dataset, table, e.content)) - if self.swallow_results: - return False - else: - return {} - - def create_view(self, dataset, view, query, use_legacy_sql=None): - """Create a new view in the dataset. - - Parameters - ---------- - dataset : str - The dataset to create the view in - view : str - The name of the view to create - query : dict - A query that BigQuery executes when the view is referenced. - use_legacy_sql : bool, optional - If False, the query will use BigQuery's standard SQL - (https://cloud.google.com/bigquery/sql-reference/) - - Returns - ------- - Union[bool, dict] - bool indicating if the view was successfully created or not, - or response from BigQuery if swallow_results is set to False. 
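# Illustrative sketch: creating a day-partitioned table with an explicit
# schema through create_table() as defined above.  Names are placeholders and
# the key file is assumed to grant write access.
from bigquery import get_client

client = get_client(json_key_file='key.json', readonly=False)

schema = [
    {'name': 'ts', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'},
    {'name': 'event', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'count', 'type': 'INTEGER', 'mode': 'NULLABLE'},
]
created = client.create_table('mydataset', 'events', schema,
                              time_partitioning=True)
# True/False with the default swallow_results=True, otherwise the raw table
# resource (or {} on failure).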
- """ - - body = { - 'tableReference': { - 'tableId': view, - 'projectId': self.project_id, - 'datasetId': dataset - }, - 'view': { - 'query': query - } - } - - if use_legacy_sql is not None: - body['view']['useLegacySql'] = use_legacy_sql - - try: - view = self.bigquery.tables().insert( - projectId=self.project_id, - datasetId=dataset, - body=body - ).execute() - if self.swallow_results: - return True - else: - return view - - except HttpError as e: - logger.error(('Cannot create view {0}.{1}\n' - 'Http Error: {2}').format(dataset, view, e.content)) - if self.swallow_results: - return False - else: - return {} - - def delete_table(self, dataset, table): - """Delete a table from the dataset. - - Parameters - ---------- - dataset : str - The dataset to delete the table from. - table : str - The name of the table to delete - - Returns - ------- - Union[bool, dict] - bool indicating if the table was successfully deleted or not, - or response from BigQuery if swallow_results is set for False. - """ - - try: - response = self.bigquery.tables().delete( - projectId=self.project_id, - datasetId=dataset, - tableId=table - ).execute() - if self.swallow_results: - return True - else: - return response - - except HttpError as e: - logger.error(('Cannot delete table {0}.{1}\n' - 'Http Error: {2}').format(dataset, table, e.content)) - if self.swallow_results: - return False - else: - return {} - - def get_tables(self, dataset_id, app_id, start_time, end_time): - """Retrieve a list of tables that are related to the given app id - and are inside the range of start and end times. - - Parameters - ---------- - dataset_id : str - The BigQuery dataset id to consider. - app_id : str - The appspot name - start_time : Union[datetime, int] - The datetime or unix time after which records will be fetched. - end_time : Union[datetime, int] - The datetime or unix time up to which records will be fetched. - - Returns - ------- - list - A ``list`` of table names. - """ - - if isinstance(start_time, datetime): - start_time = calendar.timegm(start_time.utctimetuple()) - - if isinstance(end_time, datetime): - end_time = calendar.timegm(end_time.utctimetuple()) - - every_table = self._get_all_tables(dataset_id) - app_tables = every_table.get(app_id, {}) - - return self._filter_tables_by_time(app_tables, start_time, end_time) - - def import_data_from_uris( - self, - source_uris, - dataset, - table, - schema=None, - job=None, - source_format=None, - create_disposition=None, - write_disposition=None, - encoding=None, - ignore_unknown_values=None, - max_bad_records=None, - allow_jagged_rows=None, - allow_quoted_newlines=None, - field_delimiter=None, - quote=None, - skip_leading_rows=None, - ): - """ - Imports data into a BigQuery table from cloud storage. 
Optional - arguments that are not specified are determined by BigQuery as - described: - https://developers.google.com/bigquery/docs/reference/v2/jobs - - Parameters - ---------- - source_urls : list - A ``list`` of ``str`` objects representing the urls on cloud - storage of the form: gs://bucket/filename - dataset : str - String id of the dataset - table : str - String id of the table - job : str, optional - Identifies the job (a unique job id is automatically generated if - not provided) - schema : list, optional - Represents the BigQuery schema - source_format : str, optional - One of the JOB_SOURCE_FORMAT_* constants - create_disposition : str, optional - One of the JOB_CREATE_* constants - write_disposition : str, optional - One of the JOB_WRITE_* constants - encoding : str, optional - One of the JOB_ENCODING_* constants - ignore_unknown_values : bool, optional - Whether or not to ignore unknown values - max_bad_records : int, optional - Maximum number of bad records - allow_jagged_rows : bool, optional - For csv only - allow_quoted_newlines : bool, optional - For csv only - field_delimiter : str, optional - For csv only - quote : str, optional - Quote character for csv only - skip_leading_rows : int, optional - For csv only - - Returns - ------- - dict - A BigQuery job response - - Raises - ------ - JobInsertException - on http/auth failures or error in result - """ - source_uris = source_uris if isinstance(source_uris, list) \ - else [source_uris] - - configuration = { - "destinationTable": { - "projectId": self.project_id, - "tableId": table, - "datasetId": dataset - }, - "sourceUris": source_uris, - } - - if max_bad_records: - configuration['maxBadRecords'] = max_bad_records - - if ignore_unknown_values: - configuration['ignoreUnknownValues'] = ignore_unknown_values - - if create_disposition: - configuration['createDisposition'] = create_disposition - - if write_disposition: - configuration['writeDisposition'] = write_disposition - - if encoding: - configuration['encoding'] = encoding - - if schema: - configuration['schema'] = {'fields': schema} - - if source_format: - configuration['sourceFormat'] = source_format - - if not job: - hex = self._generate_hex_for_uris(source_uris) - job = "{dataset}-{table}-{digest}".format( - dataset=dataset, - table=table, - digest=hex - ) - - if source_format == JOB_SOURCE_FORMAT_CSV: - if field_delimiter: - configuration['fieldDelimiter'] = field_delimiter - - if allow_jagged_rows: - configuration['allowJaggedRows'] = allow_jagged_rows - - if allow_quoted_newlines: - configuration['allowQuotedNewlines'] = allow_quoted_newlines - - if quote: - configuration['quote'] = quote - - if skip_leading_rows: - configuration['skipLeadingRows'] = skip_leading_rows - - elif field_delimiter or allow_jagged_rows \ - or allow_quoted_newlines or quote or skip_leading_rows: - all_values = dict(field_delimiter=field_delimiter, - allow_jagged_rows=allow_jagged_rows, - allow_quoted_newlines=allow_quoted_newlines, - skip_leading_rows=skip_leading_rows, - quote=quote) - non_null_values = dict((k, v) for k, v - in list(all_values.items()) - if v) - raise Exception("Parameters field_delimiter, allow_jagged_rows, " - "allow_quoted_newlines, quote and " - "skip_leading_rows are only allowed when " - "source_format=JOB_SOURCE_FORMAT_CSV: %s" - % non_null_values) - - body = { - "configuration": { - 'load': configuration - }, - "jobReference": { - "projectId": self.project_id, - "jobId": job - } - } - - logger.debug("Creating load job %s" % body) - job_resource = 
self._insert_job(body) - self._raise_insert_exception_if_error(job_resource) - return job_resource - - def export_data_to_uris( - self, - destination_uris, - dataset, - table, - job=None, - compression=None, - destination_format=None, - print_header=None, - field_delimiter=None, - ): - """ - Export data from a BigQuery table to cloud storage. Optional arguments - that are not specified are determined by BigQuery as described: - https://developers.google.com/bigquery/docs/reference/v2/jobs - - Parameters - ---------- - destination_urls : Union[str, list] - ``str`` or ``list`` of ``str`` objects representing the URIs on - cloud storage of the form: gs://bucket/filename - dataset : str - String id of the dataset - table : str - String id of the table - job : str, optional - String identifying the job (a unique jobid is automatically - generated if not provided) - compression : str, optional - One of the JOB_COMPRESSION_* constants - destination_format : str, optional - One of the JOB_DESTination_FORMAT_* constants - print_header : bool, optional - Whether or not to print the header - field_delimiter : str, optional - Character separating fields in delimited file - - Returns - ------- - dict - A BigQuery job resource - - Raises - ------ - JobInsertException - On http/auth failures or error in result - """ - destination_uris = destination_uris \ - if isinstance(destination_uris, list) else [destination_uris] - - configuration = { - "sourceTable": { - "projectId": self.project_id, - "tableId": table, - "datasetId": dataset - }, - "destinationUris": destination_uris, - } - - if compression: - configuration['compression'] = compression - - if destination_format: - configuration['destinationFormat'] = destination_format - - if print_header is not None: - configuration['printHeader'] = print_header - - if field_delimiter: - configuration['fieldDelimiter'] = field_delimiter - - if not job: - hex = self._generate_hex_for_uris(destination_uris) - job = "{dataset}-{table}-{digest}".format( - dataset=dataset, - table=table, - digest=hex - ) - - body = { - "configuration": { - 'extract': configuration - }, - "jobReference": { - "projectId": self.project_id, - "jobId": job - } - } - - logger.info("Creating export job %s" % body) - job_resource = self._insert_job(body) - self._raise_insert_exception_if_error(job_resource) - return job_resource - - def write_to_table( - self, - query, - dataset=None, - table=None, - external_udf_uris=None, - allow_large_results=None, - use_query_cache=None, - priority=None, - create_disposition=None, - write_disposition=None, - use_legacy_sql=None, - maximum_billing_tier=None, - flatten=None - ): - """ - Write query result to table. If dataset or table is not provided, - Bigquery will write the result to temporary table. Optional arguments - that are not specified are determined by BigQuery as described: - https://developers.google.com/bigquery/docs/reference/v2/jobs - - Parameters - ---------- - query : str - BigQuery query string - dataset : str, optional - String id of the dataset - table : str, optional - String id of the table - external_udf_uris : list, optional - Contains external UDF URIs. If given, URIs must be Google Cloud - Storage and have .js extensions. 
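# Illustrative sketch: loading newline-delimited JSON from Cloud Storage and
# exporting the resulting table back out as gzipped CSV, waiting on each job
# with wait_for_job().  Bucket paths, names and timeouts are placeholders.
from bigquery import (get_client, JOB_SOURCE_FORMAT_NEWLINE_DELIMITED_JSON,
                      JOB_WRITE_TRUNCATE)
from bigquery.client import JOB_COMPRESSION_GZIP, JOB_DESTINATION_FORMAT_CSV

client = get_client(json_key_file='key.json', readonly=False)

load_job = client.import_data_from_uris(
    ['gs://mybucket/events/*.json'], 'mydataset', 'events',
    source_format=JOB_SOURCE_FORMAT_NEWLINE_DELIMITED_JSON,
    write_disposition=JOB_WRITE_TRUNCATE)
client.wait_for_job(load_job, interval=5, timeout=300)

export_job = client.export_data_to_uris(
    ['gs://mybucket/exports/events-*.csv.gz'], 'mydataset', 'events',
    destination_format=JOB_DESTINATION_FORMAT_CSV,
    compression=JOB_COMPRESSION_GZIP)
client.wait_for_job(export_job, interval=5, timeout=300)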
- allow_large_results : bool, optional - Whether or not to allow large results - use_query_cache : bool, optional - Whether or not to use query cache - priority : str, optional - One of the JOB_PRIORITY_* constants - create_disposition : str, optional - One of the JOB_CREATE_* constants - write_disposition : str, optional - One of the JOB_WRITE_* constants - use_legacy_sql: bool, optional - If False, the query will use BigQuery's standard SQL - (https://cloud.google.com/bigquery/sql-reference/) - maximum_billing_tier : integer, optional - Limits the billing tier for this job. Queries that have resource - usage beyond this tier will fail (without incurring a charge). If - unspecified, this will be set to your project default. For more - information, - see https://cloud.google.com/bigquery/pricing#high-compute - flatten : bool, optional - Whether or not to flatten nested and repeated fields - in query results - - Returns - ------- - dict - A BigQuery job resource - - Raises - ------ - JobInsertException - On http/auth failures or error in result - """ - - configuration = { - "query": query, - } - - if dataset and table: - configuration['destinationTable'] = { - "projectId": self.project_id, - "tableId": table, - "datasetId": dataset - } - - if allow_large_results is not None: - configuration['allowLargeResults'] = allow_large_results - - if flatten is not None: - configuration['flattenResults'] = flatten - - if maximum_billing_tier is not None: - configuration['maximumBillingTier'] = maximum_billing_tier - - if use_query_cache is not None: - configuration['useQueryCache'] = use_query_cache - - if use_legacy_sql is not None: - configuration['useLegacySql'] = use_legacy_sql - - if priority: - configuration['priority'] = priority - - if create_disposition: - configuration['createDisposition'] = create_disposition - - if write_disposition: - configuration['writeDisposition'] = write_disposition - - if external_udf_uris: - configuration['userDefinedFunctionResources'] = \ - [ {'resourceUri': u} for u in external_udf_uris ] - - body = { - "configuration": { - 'query': configuration - } - } - - logger.info("Creating write to table job %s" % body) - job_resource = self._insert_job(body) - self._raise_insert_exception_if_error(job_resource) - return job_resource - - def wait_for_job(self, job, interval=5, timeout=60): - """ - Waits until the job indicated by job_resource is done or has failed - - Parameters - ---------- - job : Union[dict, str] - ``dict`` representing a BigQuery job resource, or a ``str`` - representing the BigQuery job id - interval : float, optional - Polling interval in seconds, default = 5 - timeout : float, optional - Timeout in seconds, default = 60 - - Returns - ------- - dict - Final state of the job resouce, as described here: - https://developers.google.com/resources/api-libraries/documentation/bigquery/v2/python/latest/bigquery_v2.jobs.html#get - - Raises - ------ - Union[JobExecutingException, BigQueryTimeoutException] - On http/auth failures or timeout - """ - complete = False - job_id = str(job if isinstance(job, - (six.binary_type, six.text_type, int)) - else job['jobReference']['jobId']) - job_resource = None - - start_time = time() - elapsed_time = 0 - while not (complete or elapsed_time > timeout): - sleep(interval) - request = self.bigquery.jobs().get(projectId=self.project_id, - jobId=job_id) - job_resource = request.execute() - self._raise_executing_exception_if_error(job_resource) - complete = job_resource.get('status').get('state') == u'DONE' - elapsed_time = 
time() - start_time - - # raise exceptions if timeout - if not complete: - logger.error('BigQuery job %s timeout' % job_id) - raise BigQueryTimeoutException() - - return job_resource - - def push_rows(self, dataset, table, rows, insert_id_key=None, - skip_invalid_rows=None, ignore_unknown_values=None, - template_suffix=None): - """Upload rows to BigQuery table. - - Parameters - ---------- - dataset : str - The dataset to upload to - table : str - The name of the table to insert rows into - rows : list - A ``list`` of rows (``dict`` objects) to add to the table - insert_id_key : str, optional - Key for insertId in row. - You can use dot separated key for nested column. - skip_invalid_rows : bool, optional - Insert all valid rows of a request, even if invalid rows exist. - ignore_unknown_values : bool, optional - Accept rows that contain values that do not match the schema. - template_suffix : str, optional - Inserts the rows into an {table}{template_suffix}. - If table {table}{template_suffix} doesn't exist, create from {table}. - - Returns - ------- - Union[bool, dict] - bool indicating if insert succeeded or not, or response - from BigQuery if swallow_results is set for False. - """ - - table_data = self.bigquery.tabledata() - - rows_data = [] - for row in rows: - each_row = {} - each_row["json"] = row - if insert_id_key is not None: - keys = insert_id_key.split('.') - val = reduce(lambda d, key: d.get(key) if d else None, keys, row) - if val is not None: - each_row["insertId"] = val - rows_data.append(each_row) - - data = { - "kind": "bigquery#tableDataInsertAllRequest", - "rows": rows_data - } - - if skip_invalid_rows is not None: - data['skipInvalidRows'] = skip_invalid_rows - - if ignore_unknown_values is not None: - data['ignoreUnknownValues'] = ignore_unknown_values - - if template_suffix is not None: - data['templateSuffix'] = template_suffix - - try: - response = table_data.insertAll( - projectId=self.project_id, - datasetId=dataset, - tableId=table, - body=data - ).execute() - - if response.get('insertErrors'): - logger.error('BigQuery insert errors: %s' % response) - if self.swallow_results: - return False - else: - return response - - if self.swallow_results: - return True - else: - return response - - except HttpError as e: - logger.exception('Problem with BigQuery insertAll') - if self.swallow_results: - return False - else: - return { - 'insertErrors': [{ - 'errors': [{ - 'reason': 'httperror', - 'message': e - }] - }] - } - - def get_all_tables(self, dataset_id): - """Retrieve a list of tables for the dataset. - - Parameters - ---------- - dataset_id : str - The dataset to retrieve table data for. - - Returns - ------- - A ``list`` with all table names - """ - tables_data = self._get_all_tables_for_dataset(dataset_id) - - tables = [] - for table in tables_data.get('tables', []): - table_name = table.get('tableReference', {}).get('tableId') - if table_name: - tables.append(table_name) - return tables - - def _get_all_tables(self, dataset_id, cache=False): - """Retrieve the list of tables for dataset, that respect the formats: - * appid_YYYY_MM - * YYYY_MM_appid - - Parameters - ---------- - dataset_id : str - The dataset to retrieve table names for - cache : bool, optional - To use cached value or not (default False). Timeout value equals - CACHE_TIMEOUT. 
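# Illustrative sketch: streaming rows into a table with push_rows(), deriving
# insertId values from a nested column via a dotted insert_id_key.  The rows
# and identifiers are placeholders.
from bigquery import get_client

client = get_client(json_key_file='key.json', readonly=False)

rows = [
    {'event': 'signup', 'meta': {'id': 'evt-001'}, 'count': 1},
    {'event': 'login',  'meta': {'id': 'evt-002'}, 'count': 3},
]
ok = client.push_rows('mydataset', 'events', rows,
                      insert_id_key='meta.id',
                      ignore_unknown_values=True)
# ok is False (or, with swallow_results=False, the raw insertAll response)
# whenever BigQuery reports insertErrors.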
- - Returns - ------- - dict - A ``dict`` of app ids mapped to their table names - """ - do_fetch = True - if cache and self.cache.get(dataset_id): - time, result = self.cache.get(dataset_id) - if datetime.now() - time < CACHE_TIMEOUT: - do_fetch = False - - if do_fetch: - result = self._get_all_tables_for_dataset(dataset_id) - self.cache[dataset_id] = (datetime.now(), result) - - return self._parse_table_list_response(result) - - def _get_all_tables_for_dataset(self, dataset_id): - """Retrieve a list of all tables for the dataset. - - Parameters - ---------- - dataset_id : str - The dataset to retrieve table names for - - Returns - ------- - dict - A ``dict`` containing tables key with all tables - """ - result = self.bigquery.tables().list( - projectId=self.project_id, - datasetId=dataset_id).execute() - - page_token = result.get('nextPageToken') - while page_token: - res = self.bigquery.tables().list( - projectId=self.project_id, - datasetId=dataset_id, - pageToken=page_token - ).execute() - page_token = res.get('nextPageToken') - result['tables'] += res.get('tables', []) - return result - - def _parse_table_list_response(self, list_response): - """Parse the response received from calling list on tables. - - Parameters - ---------- - list_response - The response found by calling list on a BigQuery table object. - - Returns - ------- - dict - Dates referenced by table names - """ - - tables = defaultdict(dict) - - for table in list_response.get('tables', []): - table_ref = table.get('tableReference') - - if not table_ref: - continue - - table_id = table_ref.get('tableId', '') - - year_month, app_id = self._parse_table_name(table_id) - - if not year_month: - continue - - table_date = datetime.strptime(year_month, '%Y-%m') - unix_seconds = calendar.timegm(table_date.timetuple()) - tables[app_id].update({table_id: unix_seconds}) - - # Turn off defualting - tables.default_factory = None - - return tables - - def _parse_table_name(self, table_id): - """Parse a table name in the form of appid_YYYY_MM or - YYYY_MM_appid and return a tuple consisting of YYYY-MM and the app id. - - Parameters - ---------- - table_id : str - The table id as listed by BigQuery - - Returns - ------- - tuple - (year/month, app id), or (None, None) if the table id cannot be - parsed. - """ - - # Prefix date - attributes = table_id.split('_') - year_month = "-".join(attributes[:2]) - app_id = "-".join(attributes[2:]) - - # Check if date parsed correctly - if year_month.count("-") == 1 and all( - [num.isdigit() for num in year_month.split('-')]): - return year_month, app_id - - # Postfix date - attributes = table_id.split('_') - year_month = "-".join(attributes[-2:]) - app_id = "-".join(attributes[:-2]) - - # Check if date parsed correctly - if year_month.count("-") == 1 and all( - [num.isdigit() for num in year_month.split('-')]): - return year_month, app_id - - return None, None - - def _filter_tables_by_time(self, tables, start_time, end_time): - """Filter a table dictionary and return table names based on the range - of start and end times in unix seconds. 
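# Illustrative sketch: listing tables, and filtering the appid_YYYY_MM /
# YYYY_MM_appid style table names handled by the parsing helpers above.
# Dataset and app ids are placeholders.
from datetime import datetime
from bigquery import get_client

client = get_client(json_key_file='key.json', readonly=True)

all_tables = client.get_all_tables('mydataset')          # every table name
january_tables = client.get_tables('mydataset', 'myapp',
                                   start_time=datetime(2017, 1, 1),
                                   end_time=datetime(2017, 2, 1))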
- - Parameters - ---------- - tables : dict - Dates referenced by table names - start_time : int - The unix time after which records will be fetched - end_time : int - The unix time up to which records will be fetched - - Returns - ------- - list - Table names that are inside the time range - """ - - return [table_name for (table_name, unix_seconds) in tables.items() - if self._in_range(start_time, end_time, unix_seconds)] - - def _in_range(self, start_time, end_time, time): - """Indicate if the given time falls inside of the given range. - - Parameters - ---------- - start_time : int - The unix time for the start of the range - end_time : int - The unix time for the end of the range - time : int - The unix time to check - - Returns - ------- - bool - True if the time falls within the range, False otherwise. - """ - - ONE_MONTH = 2764800 # 32 days - - return start_time <= time <= end_time or \ - time <= start_time <= time + ONE_MONTH or \ - time <= end_time <= time + ONE_MONTH - - def get_query_results(self, job_id, offset=None, limit=None, - page_token=None, timeout=0): - """Execute the query job indicated by the given job id. This is direct - mapping to bigquery api - https://cloud.google.com/bigquery/docs/reference/v2/jobs/getQueryResults - - Parameters - ---------- - job_id : str - The job id of the query to check - offset : optional - The index the result set should start at. - limit : int, optional - The maximum number of results to retrieve. - page_token : optional - Page token, returned by previous call, to request the next page of - results. - timeout : float, optional - Timeout in seconds - - Returns - ------- - out - The query reply - """ - - job_collection = self.bigquery.jobs() - return job_collection.getQueryResults( - projectId=self.project_id, - jobId=job_id, - startIndex=offset, - maxResults=limit, - pageToken=page_token, - timeoutMs=timeout * 1000).execute() - - def _transform_row(self, row, schema): - """Apply the given schema to the given BigQuery data row. - - Parameters - ---------- - row - A single BigQuery row to transform - schema : list - The BigQuery table schema to apply to the row, specifically - the list of field dicts. - - Returns - ------- - dict - Mapping schema to row - """ - - log = {} - - # Match each schema column with its associated row value - for index, col_dict in enumerate(schema): - col_name = col_dict['name'] - row_value = row['f'][index]['v'] - - if row_value is None: - log[col_name] = None - continue - - # Recurse on nested records - if col_dict['type'] == 'RECORD': - row_value = self._recurse_on_row(col_dict, row_value) - - # Otherwise just cast the value - elif col_dict['type'] == 'INTEGER': - row_value = int(row_value) - - elif col_dict['type'] == 'FLOAT': - row_value = float(row_value) - - elif col_dict['type'] == 'BOOLEAN': - row_value = row_value in ('True', 'true', 'TRUE') - - elif col_dict['type'] == 'TIMESTAMP': - row_value = float(row_value) - - log[col_name] = row_value - - return log - - def _recurse_on_row(self, col_dict, nested_value): - """Apply the schema specified by the given dict to the nested value by - recursing on it. - - Parameters - ---------- - col_dict : dict - The schema to apply to the nested value. - nested_value : A value nested in a BigQuery row. - - Returns - ------- - Union[dict, list] - ``dict`` or ``list`` of ``dict`` objects from applied schema. 
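# Illustrative sketch of the raw row shape that _transform_row() above deals
# with: the REST API returns each row as {'f': [{'v': ...}, ...]} in schema
# order, and the client casts every value to its declared field type.  The
# values here are made up.
schema = [{'name': 'word', 'type': 'STRING'},
          {'name': 'word_count', 'type': 'INTEGER'}]
raw_row = {'f': [{'v': 'hamlet'}, {'v': '32'}]}
# After transformation the row comes back from get_query_rows() as:
#     {'word': 'hamlet', 'word_count': 32}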
- """ - - row_value = None - - # Multiple nested records - if col_dict['mode'] == 'REPEATED' and isinstance(nested_value, list): - row_value = [self._transform_row(record['v'], col_dict['fields']) - for record in nested_value] - - # A single nested record - else: - row_value = self._transform_row(nested_value, col_dict['fields']) - - return row_value - - def _generate_hex_for_uris(self, uris): - """Given uris, generate and return hex version of it - - Parameters - ---------- - uris : list - Containing all uris - - Returns - ------- - str - Hexed uris - """ - return sha256((":".join(uris) + str(time())).encode()).hexdigest() - - def _raise_insert_exception_if_error(self, job): - error_http = job.get('error') - if error_http: - raise JobInsertException( - "Error in export job API request: {0}".format(error_http)) - # handle errorResult - API request is successful but error in result - error_result = job.get('status').get('errorResult') - if error_result: - raise JobInsertException( - "Reason:{reason}. Message:{message}".format(**error_result)) - - def _raise_executing_exception_if_error(self, job): - error_http = job.get('error') - if error_http: - raise JobExecutingException( - "Error in export job API request: {0}".format(error_http)) - # handle errorResult - API request is successful but error in result - error_result = job.get('status').get('errorResult') - if error_result: - raise JobExecutingException( - "Reason:{reason}. Message:{message}".format(**error_result)) - - # - # DataSet manipulation methods - # - def create_dataset(self, dataset_id, friendly_name=None, description=None, - access=None, location=None): - """Create a new BigQuery dataset. - - Parameters - ---------- - dataset_id : str - Unique ``str`` identifying the dataset with the project (the - referenceID of the dataset, not the integer id of the dataset) - friendly_name: str, optional - A human readable name - description: str, optional - Longer string providing a description - access : list, optional - Indicating access permissions (see - https://developers.google.com/bigquery/docs/reference/v2/datasets#resource) - location : str, optional - Indicating where dataset should be stored: EU or US (see - https://developers.google.com/bigquery/docs/reference/v2/datasets#resource) - - Returns - ------- - Union[bool, dict] - ``bool`` indicating if dataset was created or not, or response - from BigQuery if swallow_results is set for False - """ - try: - datasets = self.bigquery.datasets() - dataset_data = self.dataset_resource(dataset_id, - friendly_name=friendly_name, - description=description, - access=access, - location=location) - - response = datasets.insert(projectId=self.project_id, - body=dataset_data).execute() - if self.swallow_results: - return True - else: - return response - except HttpError as e: - logger.error( - 'Cannot create dataset {0}, {1}'.format(dataset_id, e)) - if self.swallow_results: - return False - else: - return {} - - def get_datasets(self): - """List all datasets in the project. - - Returns - ------- - list - Dataset resources - """ - try: - datasets = self.bigquery.datasets() - request = datasets.list(projectId=self.project_id) - result = request.execute() - return result.get('datasets', []) - except HttpError as e: - logger.error("Cannot list datasets: {0}".format(e)) - return None - - def delete_dataset(self, dataset_id, delete_contents=False): - """Delete a BigQuery dataset. 
- - Parameters - ---------- - dataset_id : str - Unique ``str`` identifying the datset with the project (the - referenceId of the dataset) - delete_contents : bool, optional - If True, forces the deletion of the dataset even when the dataset - contains data (Default = False) - - Returns - ------- - Union[bool, dict[ - ool indicating if the delete was successful or not, or response - from BigQuery if swallow_results is set for False - - Raises - ------- - HttpError - 404 when dataset with dataset_id does not exist - """ - try: - datasets = self.bigquery.datasets() - request = datasets.delete(projectId=self.project_id, - datasetId=dataset_id, - deleteContents=delete_contents) - response = request.execute() - if self.swallow_results: - return True - else: - return response - except HttpError as e: - logger.error( - 'Cannot delete dataset {0}: {1}'.format(dataset_id, e)) - if self.swallow_results: - return False - else: - return {} - - def update_dataset(self, dataset_id, friendly_name=None, description=None, - access=None): - """Updates information in an existing dataset. The update method - replaces the entire dataset resource, whereas the patch method only - replaces fields that are provided in the submitted dataset resource. - - Parameters - ---------- - dataset_id : str - Unique ``str`` identifying the dataset with the project (the - referencedId of the dataset) - friendly_name : str, optional - An optional descriptive name for the dataset. - description : str, optional - An optional description of the dataset. - access : list, optional - Indicating access permissions - - Returns - ------- - Union[bool, dict] - ``bool`` indicating if the update was successful or not, or - response from BigQuery if swallow_results is set for False. - """ - try: - datasets = self.bigquery.datasets() - body = self.dataset_resource(dataset_id, friendly_name, - description, access) - request = datasets.update(projectId=self.project_id, - datasetId=dataset_id, - body=body) - response = request.execute() - if self.swallow_results: - return True - else: - return response - except HttpError as e: - logger.error( - 'Cannot update dataset {0}: {1}'.format(dataset_id, e)) - if self.swallow_results: - return False - else: - return {} - - def patch_dataset(self, dataset_id, friendly_name=None, description=None, - access=None): - """Updates information in an existing dataset. The update method - replaces the entire dataset resource, whereas the patch method only - replaces fields that are provided in the submitted dataset resource. - - Parameters - ---------- - dataset_id : str - Unique string idenfitying the dataset with the project (the - referenceId of the dataset) - friendly_name : str, optional - An optional descriptive name for the dataset. - description : str, optional - An optional description of the dataset. - access : list, optional - Indicating access permissions. - - Returns - ------- - Union[bool, dict] - ``bool`` indicating if the patch was successful or not, or response - from BigQuery if swallow_results is set for False. 
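# Illustrative sketch: patch_dataset() only replaces the fields that are
# submitted, update_dataset() replaces the whole dataset resource, and
# delete_dataset(delete_contents=True) force-drops a non-empty dataset, as
# described above.  Dataset ids are placeholders.
from bigquery import get_client

client = get_client(json_key_file='key.json', readonly=False)

client.patch_dataset('analytics', description='Event analytics tables (v2)')
client.update_dataset('analytics',
                      friendly_name='Analytics',
                      description='Event analytics tables (v2)')
client.delete_dataset('scratch', delete_contents=True)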
- """ - try: - datasets = self.bigquery.datasets() - body = self.dataset_resource(dataset_id, friendly_name, - description, access) - request = datasets.patch(projectId=self.project_id, - datasetId=dataset_id, body=body) - response = request.execute() - if self.swallow_results: - return True - else: - return response - except HttpError as e: - logger.error('Cannot patch dataset {0}: {1}'.format(dataset_id, e)) - if self.swallow_results: - return False - else: - return {} - - def dataset_resource(self, ref_id, friendly_name=None, description=None, - access=None, location=None): - """See - https://developers.google.com/bigquery/docs/reference/v2/datasets#resource - - Parameters - ---------- - ref_id : str - Dataset id (the reference id, not the integer id) - friendly_name : str, optional - An optional descriptive name for the dataset - description : str, optional - An optional description for the dataset - access : list, optional - Indicating access permissions - location: str, optional, 'EU' or 'US' - An optional geographical location for the dataset(EU or US) - - Returns - ------- - dict - Representing BigQuery dataset resource - """ - data = { - "datasetReference": { - "datasetId": ref_id, - "projectId": self.project_id - } - } - if friendly_name: - data["friendlyName"] = friendly_name - if description: - data["description"] = description - if access: - data["access"] = access - if location: - data["location"] = location - - return data - - @classmethod - def schema_from_record(cls, record): - """Given a dict representing a record instance to be inserted into - BigQuery, calculate the schema. - - Parameters - ---------- - record : dict - representing a record to be inserted into big query, - where all keys are ``str`` objects (representing column names in - the record) and all values are of type ``int``, ``str``, - ``unicode``, ``float``, ``bool``, ``datetime``, or ``dict``. A - ``dict`` value represents a record, and must conform to the same - restrictions as record. - - Returns - ------- - list - BigQuery schema - - Notes - ----- - Results are undefined if a different value type is provided for a - repeated field: E.g. - - >>> { rfield: [ { x: 1}, {x: "a string"} ] } # undefined! - """ - from bigquery.schema_builder import schema_from_record - return schema_from_record(record) diff --git a/bigquery/query_builder.py.bak b/bigquery/query_builder.py.bak deleted file mode 100644 index 1054299..0000000 --- a/bigquery/query_builder.py.bak +++ /dev/null @@ -1,397 +0,0 @@ -from logging import getLogger, NullHandler - -logger = getLogger(__name__) -logger.addHandler(NullHandler()) - - -def render_query(dataset, tables, select=None, conditions=None, - groupings=None, having=None, order_by=None, limit=None): - """Render a query that will run over the given tables using the specified - parameters. - - Parameters - ---------- - dataset : str - The BigQuery dataset to query data from - tables : Union[dict, list] - The table in `dataset` to query. - select : dict, optional - The keys function as column names and the values function as options to - apply to the select field such as alias and format. For example, - select['start_time'] might have the form - {'alias': 'StartTime', 'format': 'INTEGER-FORMAT_UTC_USEC'}, which - would be represented as 'SEC_TO_TIMESTAMP(INTEGER(start_time)) as - StartTime' in a query. Pass `None` to select all. - conditions : list, optional - a ``list`` of ``dict`` objects to filter results by. Each dict should - have the keys 'field', 'type', and 'comparators'. 
The first two map to - strings representing the field (e.g. 'foo') and type (e.g. 'FLOAT'). - 'comparators' maps to another ``dict`` containing the keys 'condition', - 'negate', and 'value'. - If 'comparators' = {'condition': '>=', 'negate': False, 'value': 1}, - this example will be rdnered as 'foo >= FLOAT('1')' in the query. - ``list`` of field names to group by - order_by : dict, optional - Keys = {'field', 'direction'}. `dict` should be formatted as - {'field':'TimeStamp, 'direction':'desc'} or similar - limit : int, optional - Limit the amount of data needed to be returned. - - Returns - ------- - str - A rendered query - """ - - if None in (dataset, tables): - return None - - query = "%s %s %s %s %s %s %s" % ( - _render_select(select), - _render_sources(dataset, tables), - _render_conditions(conditions), - _render_groupings(groupings), - _render_having(having), - _render_order(order_by), - _render_limit(limit) - ) - - return query - - -def _render_select(selections): - """Render the selection part of a query. - - Parameters - ---------- - selections : dict - Selections for a table - - Returns - ------- - str - A string for the "select" part of a query - - See Also - -------- - render_query : Further clarification of `selections` dict formatting - """ - - if not selections: - return 'SELECT *' - - rendered_selections = [] - for name, options in selections.items(): - if not isinstance(options, list): - options = [options] - - original_name = name - for options_dict in options: - name = original_name - alias = options_dict.get('alias') - alias = "as %s" % alias if alias else "" - - formatter = options_dict.get('format') - if formatter: - name = _format_select(formatter, name) - - rendered_selections.append("%s %s" % (name, alias)) - - return "SELECT " + ", ".join(rendered_selections) - - -def _format_select(formatter, name): - """Modify the query selector by applying any formatters to it. - - Parameters - ---------- - formatter : str - Hyphen-delimited formatter string where formatters are - applied inside-out, e.g. the formatter string - SEC_TO_MICRO-INTEGER-FORMAT_UTC_USEC applied to the selector - foo would result in FORMAT_UTC_USEC(INTEGER(foo*1000000)). - name: str - The name of the selector to apply formatters to. - - Returns - ------- - str - The formatted selector - """ - - for caster in formatter.split('-'): - if caster == 'SEC_TO_MICRO': - name = "%s*1000000" % name - elif ':' in caster: - caster, args = caster.split(':') - name = "%s(%s,%s)" % (caster, name, args) - else: - name = "%s(%s)" % (caster, name) - - return name - - -def _render_sources(dataset, tables): - """Render the source part of a query. - - Parameters - ---------- - dataset : str - The data set to fetch log data from. - tables : Union[dict, list] - The tables to fetch log data from - - Returns - ------- - str - A string that represents the "from" part of a query. - """ - - if isinstance(tables, dict): - if tables.get('date_range', False): - try: - dataset_table = '.'.join([dataset, tables['table']]) - return "FROM (TABLE_DATE_RANGE([{}], TIMESTAMP('{}'),"\ - " TIMESTAMP('{}'))) ".format(dataset_table, - tables['from_date'], - tables['to_date']) - except KeyError as exp: - logger.warn( - 'Missing parameter %s in selecting sources' % (exp)) - - else: - return "FROM " + ", ".join( - ["[%s.%s]" % (dataset, table) for table in tables]) - - -def _render_conditions(conditions): - """Render the conditions part of a query. - - Parameters - ---------- - conditions : list - A list of dictionay items to filter a table. 
- - Returns - ------- - str - A string that represents the "where" part of a query - - See Also - -------- - render_query : Further clarification of `conditions` formatting. - """ - - if not conditions: - return "" - - rendered_conditions = [] - - for condition in conditions: - field = condition.get('field') - field_type = condition.get('type') - comparators = condition.get('comparators') - - if None in (field, field_type, comparators) or not comparators: - logger.warn('Invalid condition passed in: %s' % condition) - continue - - rendered_conditions.append( - _render_condition(field, field_type, comparators)) - - if not rendered_conditions: - return "" - - return "WHERE %s" % (" AND ".join(rendered_conditions)) - - -def _render_condition(field, field_type, comparators): - """Render a single query condition. - - Parameters - ---------- - field : str - The field the condition applies to - field_type : str - The data type of the field. - comparators : array_like - An iterable of logic operators to use. - - Returns - ------- - str - a condition string. - """ - - field_type = field_type.upper() - - negated_conditions, normal_conditions = [], [] - - for comparator in comparators: - condition = comparator.get("condition").upper() - negated = "NOT " if comparator.get("negate") else "" - value = comparator.get("value") - - if condition == "IN": - if isinstance(value, (list, tuple, set)): - value = ', '.join( - sorted([_render_condition_value(v, field_type) - for v in value]) - ) - else: - value = _render_condition_value(value, field_type) - value = "(" + value + ")" - elif condition == "BETWEEN": - if isinstance(value, (tuple, list, set)) and len(value) == 2: - value = ' AND '.join( - sorted([_render_condition_value(v, field_type) - for v in value]) - ) - elif isinstance(value, (tuple, list, set)) and len(value) != 2: - logger.warn('Invalid condition passed in: %s' % condition) - - else: - value = _render_condition_value(value, field_type) - - rendered_sub_condition = "%s%s %s %s" % ( - negated, field, condition, value) - - if comparator.get("negate"): - negated_conditions.append(rendered_sub_condition) - else: - normal_conditions.append(rendered_sub_condition) - - rendered_normal = " AND ".join(normal_conditions) - rendered_negated = " AND ".join(negated_conditions) - - if rendered_normal and rendered_negated: - return "((%s) AND (%s))" % (rendered_normal, rendered_negated) - - return "(%s)" % (rendered_normal or rendered_negated) - - -def _render_condition_value(value, field_type): - """Render a query condition value. - - Parameters - ---------- - value : Union[bool, int, float, str, datetime] - The value of the condition - field_type : str - The data type of the field - - Returns - ------- - str - A value string. - """ - - # BigQuery cannot cast strings to booleans, convert to ints - if field_type == "BOOLEAN": - value = 1 if value else 0 - elif field_type in ("STRING", "INTEGER", "FLOAT"): - value = "'%s'" % (value) - elif field_type in ("TIMESTAMP"): - value = "'%s'" % (str(value)) - return "%s(%s)" % (field_type, value) - - -def _render_groupings(fields): - """Render the group by part of a query. - - Parameters - ---------- - fields : list - A list of fields to group by. - - Returns - ------- - str - A string that represents the "group by" part of a query. - """ - - if not fields: - return "" - - return "GROUP BY " + ", ".join(fields) - - -def _render_having(having_conditions): - """Render the having part of a query. 
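# Illustrative sketch: composing a legacy-SQL query string with render_query()
# and the select/condition/grouping helpers above.  The dataset, table and
# field names are placeholders.
from bigquery.query_builder import render_query

query = render_query(
    'mydataset', ['events_2017_01'],
    select={'event': {'alias': 'Event'}},
    conditions=[{'field': 'count', 'type': 'INTEGER',
                 'comparators': [{'condition': '>=', 'negate': False,
                                  'value': 1}]}],
    groupings=['event'],
    order_by={'fields': ['event'], 'direction': 'desc'},
    limit=100)
# The resulting string can be passed straight to BigQueryClient.query().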
- - Parameters - ---------- - having_conditions : list - A ``list`` of ``dict``s to filter the rows - - Returns - ------- - str - A string that represents the "having" part of a query. - - See Also - -------- - render_query : Further clarification of `conditions` formatting. - """ - if not having_conditions: - return "" - - rendered_conditions = [] - - for condition in having_conditions: - field = condition.get('field') - field_type = condition.get('type') - comparators = condition.get('comparators') - - if None in (field, field_type, comparators) or not comparators: - logger.warn('Invalid condition passed in: %s' % condition) - continue - - rendered_conditions.append( - _render_condition(field, field_type, comparators)) - - if not rendered_conditions: - return "" - - return "HAVING %s" % (" AND ".join(rendered_conditions)) - - -def _render_order(order): - """Render the order by part of a query. - - Parameters - ---------- - order : dict - A dictionary with two keys, fields and direction. - Such that the dictionary should be formatted as - {'fields': ['TimeStamp'], 'direction':'desc'}. - - Returns - ------- - str - A string that represents the "order by" part of a query. - """ - - if not order or 'fields' not in order or 'direction' not in order: - return '' - - return "ORDER BY %s %s" % (", ".join(order['fields']), order['direction']) - - -def _render_limit(limit): - """Render the limit part of a query. - - Parameters - ---------- - limit : int, optional - Limit the amount of data needed to be returned. - - Returns - ------- - str - A string that represents the "limit" part of a query. - """ - if not limit: - return '' - - return "LIMIT %s" % limit diff --git a/bigquery/schema_builder.py.bak b/bigquery/schema_builder.py.bak deleted file mode 100644 index 65027b8..0000000 --- a/bigquery/schema_builder.py.bak +++ /dev/null @@ -1,145 +0,0 @@ -from __future__ import absolute_import -__author__ = 'Aneil Mallavarapu (http://github.com/aneilbaboo)' - -from datetime import datetime - -import six -import dateutil.parser - -from .errors import InvalidTypeException - - -def default_timestamp_parser(s): - try: - if dateutil.parser.parse(s): - return True - else: - return False - except: - return False - - -def schema_from_record(record, timestamp_parser=default_timestamp_parser): - """Generate a BigQuery schema given an example of a record that is to be - inserted into BigQuery. - - Parameters - ---------- - record : dict - Example of a record that is to be inserted into BigQuery - timestamp_parser : function, optional - Unary function taking a ``str`` and returning and ``bool`` that is - True if the string represents a date - - Returns - ------- - Schema: list - """ - return [describe_field(k, v, timestamp_parser=timestamp_parser) - for k, v in list(record.items())] - - -def describe_field(k, v, timestamp_parser=default_timestamp_parser): - """Given a key representing a column name and value representing the value - stored in the column, return a representation of the BigQuery schema - element describing that field. Raise errors if invalid value types are - provided. - - Parameters - ---------- - k : Union[str, unicode] - Key representing the column - v : Union[str, unicode, int, float, datetime, object] - Value mapped to by `k` - - Returns - ------- - object - Describing the field - - Raises - ------ - Exception - If invalid value types are provided. 
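# Illustrative sketch: the schema_builder helpers above used directly.  The
# sample values are arbitrary.
from bigquery.schema_builder import bigquery_type, describe_field

assert bigquery_type(123) == 'integer'
assert bigquery_type('2017-05-17 23:08:40') == 'timestamp'  # parses as a date
assert describe_field('username', 'Bob') == {
    'name': 'username', 'type': 'string', 'mode': 'nullable'}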
- - Examples - -------- - >>> describe_field("username", "Bob") - {"name": "username", "type": "string", "mode": "nullable"} - >>> describe_field("users", [{"username": "Bob"}]) - {"name": "users", "type": "record", "mode": "repeated", - "fields": [{"name":"username","type":"string","mode":"nullable"}]} - """ - - def bq_schema_field(name, bq_type, mode): - return {"name": name, "type": bq_type, "mode": mode} - - if isinstance(v, list): - if len(v) == 0: - raise Exception( - "Can't describe schema because of empty list {0}:[]".format(k)) - v = v[0] - mode = "repeated" - else: - mode = "nullable" - - bq_type = bigquery_type(v, timestamp_parser=timestamp_parser) - if not bq_type: - raise InvalidTypeException(k, v) - - field = bq_schema_field(k, bq_type, mode) - if bq_type == "record": - try: - field['fields'] = schema_from_record(v, timestamp_parser) - except InvalidTypeException as e: - # recursively construct the key causing the error - raise InvalidTypeException("%s.%s" % (k, e.key), e.value) - - return field - - -def bigquery_type(o, timestamp_parser=default_timestamp_parser): - """Given a value, return the matching BigQuery type of that value. Must be - one of str/unicode/int/float/datetime/record, where record is a dict - containing value which have matching BigQuery types. - - Parameters - ---------- - o : object - A Python object - time_stamp_parser : function, optional - Unary function taking a ``str`` and returning and ``bool`` that is - True if the string represents a date - - Returns - ------- - Union[str, None] - Name of the corresponding BigQuery type for `o`, or None if no type - could be found - - Examples - -------- - >>> bigquery_type("abc") - "string" - >>> bigquery_type(123) - "integer" - """ - - t = type(o) - if t in six.integer_types: - return "integer" - elif (t == six.binary_type and six.PY2) or t == six.text_type: - if timestamp_parser and timestamp_parser(o): - return "timestamp" - else: - return "string" - elif t == float: - return "float" - elif t == bool: - return "boolean" - elif t == dict: - return "record" - elif t == datetime: - return "timestamp" - else: - return None # failed to find a type diff --git a/bigquery/tests/test_client.py.bak b/bigquery/tests/test_client.py.bak deleted file mode 100644 index 1315147..0000000 --- a/bigquery/tests/test_client.py.bak +++ /dev/null @@ -1,2902 +0,0 @@ -import unittest - -import mock -import six -from bigquery import client -from bigquery.errors import ( - JobInsertException, JobExecutingException, - BigQueryTimeoutException -) -from googleapiclient.errors import HttpError -from nose.tools import raises - - -class HttpResponse(object): - def __init__(self, status, reason='There was an error'): - """ - Args: - :param int status: Integer HTTP response status - """ - self.status = status - self.reason = reason - - -class TestGetClient(unittest.TestCase): - def setUp(self): - client._bq_client = None - - self.mock_bq_service = mock.Mock() - self.mock_job_collection = mock.Mock() - - self.mock_bq_service.jobs.return_value = self.mock_job_collection - - self.client = client.BigQueryClient(self.mock_bq_service, 'project') - - def test_no_credentials(self): - """Ensure an Exception is raised when no credentials are provided.""" - - self.assertRaises(AssertionError, client.get_client, 'foo') - - @mock.patch('bigquery.client._credentials') - @mock.patch('bigquery.client.build') - def test_initialize_readonly(self, mock_build, mock_return_cred): - """Ensure that a BigQueryClient is initialized and returned with - read-only 
permissions. - """ - from bigquery.client import BIGQUERY_SCOPE_READ_ONLY - - mock_cred = mock.Mock() - mock_http = mock.Mock() - mock_service_url = mock.Mock() - mock_cred.from_p12_keyfile_buffer.return_value.authorize.return_value = mock_http - mock_bq = mock.Mock() - mock_build.return_value = mock_bq - key = 'key' - service_account = 'account' - project_id = 'project' - mock_return_cred.return_value = mock_cred - - bq_client = client.get_client( - project_id, service_url=mock_service_url, - service_account=service_account, private_key=key, - readonly=True) - - mock_return_cred.assert_called_once_with() - mock_cred.from_p12_keyfile_buffer.assert_called_once_with( - service_account, mock.ANY, - scopes=BIGQUERY_SCOPE_READ_ONLY) - self.assertTrue( - mock_cred.from_p12_keyfile_buffer.return_value.authorize.called) - mock_build.assert_called_once_with('bigquery', 'v2', http=mock_http, - discoveryServiceUrl=mock_service_url) - self.assertEquals(mock_bq, bq_client.bigquery) - self.assertEquals(project_id, bq_client.project_id) - - @mock.patch('bigquery.client._credentials') - @mock.patch('bigquery.client.build') - def test_initialize_read_write(self, mock_build, mock_return_cred): - """Ensure that a BigQueryClient is initialized and returned with - read/write permissions. - """ - from bigquery.client import BIGQUERY_SCOPE - - mock_cred = mock.Mock() - mock_http = mock.Mock() - mock_service_url = mock.Mock() - mock_cred.from_p12_keyfile_buffer.return_value.authorize.return_value = mock_http - mock_bq = mock.Mock() - mock_build.return_value = mock_bq - key = 'key' - service_account = 'account' - project_id = 'project' - mock_return_cred.return_value = mock_cred - - bq_client = client.get_client( - project_id, service_url=mock_service_url, - service_account=service_account, private_key=key, - readonly=False) - - mock_return_cred.assert_called_once_with() - mock_cred.from_p12_keyfile_buffer.assert_called_once_with( - service_account, mock.ANY, scopes=BIGQUERY_SCOPE) - self.assertTrue( - mock_cred.from_p12_keyfile_buffer.return_value.authorize.called) - mock_build.assert_called_once_with('bigquery', 'v2', http=mock_http, - discoveryServiceUrl=mock_service_url) - self.assertEquals(mock_bq, bq_client.bigquery) - self.assertEquals(project_id, bq_client.project_id) - - @mock.patch('bigquery.client._credentials') - @mock.patch('bigquery.client.build') - def test_initialize_key_file(self, mock_build, mock_return_cred): - """Ensure that a BigQueryClient is initialized and returned with - read/write permissions using a private key file. 
- """ - from bigquery.client import BIGQUERY_SCOPE - - mock_cred = mock.Mock() - mock_http = mock.Mock() - mock_service_url = mock.Mock() - mock_cred.from_p12_keyfile.return_value.authorize.return_value = mock_http - mock_bq = mock.Mock() - mock_build.return_value = mock_bq - key_file = 'key.pem' - service_account = 'account' - project_id = 'project' - mock_return_cred.return_value = mock_cred - - bq_client = client.get_client( - project_id, service_url=mock_service_url, - service_account=service_account, - private_key_file=key_file, readonly=False) - - mock_return_cred.assert_called_once_with() - mock_cred.from_p12_keyfile.assert_called_once_with(service_account, - key_file, - scopes=BIGQUERY_SCOPE) - self.assertTrue( - mock_cred.from_p12_keyfile.return_value.authorize.called) - mock_build.assert_called_once_with('bigquery', 'v2', http=mock_http, - discoveryServiceUrl=mock_service_url) - self.assertEquals(mock_bq, bq_client.bigquery) - self.assertEquals(project_id, bq_client.project_id) - - @mock.patch('bigquery.client._credentials') - @mock.patch('bigquery.client.build') - @mock.patch('__builtin__.open' if six.PY2 else 'builtins.open') - def test_initialize_json_key_file(self, mock_open, mock_build, mock_return_cred): - """Ensure that a BigQueryClient is initialized and returned with - read/write permissions using a JSON key file. - """ - from bigquery.client import BIGQUERY_SCOPE - import json - - mock_cred = mock.Mock() - mock_http = mock.Mock() - mock_service_url = mock.Mock() - mock_cred.from_json_keyfile_dict.return_value.authorize.return_value = mock_http - mock_bq = mock.Mock() - mock_build.return_value = mock_bq - json_key_file = 'key.json' - json_key = {'client_email': 'mail', 'private_key': 'pkey'} - mock_open.return_value.__enter__.return_value.read.return_value = json.dumps(json_key) - project_id = 'project' - mock_return_cred.return_value = mock_cred - - bq_client = client.get_client( - project_id, service_url=mock_service_url, - json_key_file=json_key_file, readonly=False) - - mock_return_cred.assert_called_once_with() - mock_cred.from_json_keyfile_dict.assert_called_once_with(json_key, - scopes=BIGQUERY_SCOPE) - self.assertTrue( - mock_cred.from_json_keyfile_dict.return_value.authorize.called) - mock_build.assert_called_once_with('bigquery', 'v2', http=mock_http, - discoveryServiceUrl=mock_service_url) - self.assertEquals(mock_bq, bq_client.bigquery) - self.assertEquals(project_id, bq_client.project_id) - - @mock.patch('bigquery.client._credentials') - @mock.patch('bigquery.client.build') - @mock.patch('__builtin__.open' if six.PY2 else 'builtins.open') - def test_initialize_json_key_file_without_project_id(self, mock_open, mock_build, - mock_return_cred): - """Ensure that a BigQueryClient is initialized and returned with - read/write permissions using a JSON key file without project_id. 
- """ - from bigquery.client import BIGQUERY_SCOPE - import json - - mock_cred = mock.Mock() - mock_http = mock.Mock() - mock_service_url = mock.Mock() - mock_cred.from_json_keyfile_dict.return_value.authorize.return_value = mock_http - mock_bq = mock.Mock() - mock_build.return_value = mock_bq - json_key_file = 'key.json' - json_key = {'client_email': 'mail', 'private_key': 'pkey', 'project_id': 'project'} - mock_open.return_value.__enter__.return_value.read.return_value = json.dumps(json_key) - mock_return_cred.return_value = mock_cred - - bq_client = client.get_client( - service_url=mock_service_url, json_key_file=json_key_file, readonly=False) - - mock_open.assert_called_once_with(json_key_file, 'r') - mock_return_cred.assert_called_once_with() - mock_cred.from_json_keyfile_dict.assert_called_once_with(json_key, - scopes=BIGQUERY_SCOPE) - self.assertTrue( - mock_cred.from_json_keyfile_dict.return_value.authorize.called) - mock_build.assert_called_once_with('bigquery', 'v2', http=mock_http, - discoveryServiceUrl=mock_service_url) - self.assertEquals(mock_bq, bq_client.bigquery) - self.assertEquals(json_key['project_id'], bq_client.project_id) - - -class TestGetProjectIds(unittest.TestCase): - - def test_get_project_ids(self): - mock_bq_service = mock.Mock() - mock_bq_service.projects().list().execute.return_value = { - 'kind': 'bigquery#projectList', - 'projects': [ - { - 'friendlyName': 'Big Query Test', - 'id': 'big-query-test', - 'kind': 'bigquery#project', - 'numericId': '1435372465', - 'projectReference': {'projectId': 'big-query-test'} - }, - { - 'friendlyName': 'BQ Company project', - 'id': 'bq-project', - 'kind': 'bigquery#project', - 'numericId': '4263574685796', - 'projectReference': {'projectId': 'bq-project'} - } - ], - 'totalItems': 2 - } - - projects = client.get_projects(mock_bq_service) - expected_projects_data = [ - {'id': 'big-query-test', 'name': 'Big Query Test'}, - {'id': 'bq-project', 'name': 'BQ Company project'} - ] - self.assertEqual(projects, expected_projects_data) - - -class TestQuery(unittest.TestCase): - - def setUp(self): - client._bq_client = None - - self.mock_bq_service = mock.Mock() - self.mock_job_collection = mock.Mock() - - self.mock_bq_service.jobs.return_value = self.mock_job_collection - - self.query = 'foo' - self.project_id = 'project' - self.external_udf_uris = ['gs://bucket/external_udf.js'] - self.client = client.BigQueryClient(self.mock_bq_service, - self.project_id) - - def test_query(self): - """Ensure that we retrieve the job id from the query.""" - - mock_query_job = mock.Mock() - expected_job_id = 'spiderman' - expected_job_ref = {'jobId': expected_job_id} - - mock_query_job.execute.return_value = { - 'jobReference': expected_job_ref, - 'jobComplete': True - } - - self.mock_job_collection.query.return_value = mock_query_job - - job_id, results = self.client.query(self.query, external_udf_uris=self.external_udf_uris) - - self.mock_job_collection.query.assert_called_once_with( - projectId=self.project_id, - body={ - 'query': self.query, - 'userDefinedFunctionResources': [ {'resourceUri': u} for u in self.external_udf_uris ], - 'timeoutMs': 0, - 'dryRun': False, - 'maxResults': None - } - ) - self.assertEquals(job_id, 'spiderman') - self.assertEquals(results, []) - - - def test_query_max_results_set(self): - """Ensure that we retrieve the job id from the query and the maxResults - parameter is set. 
- """ - - mock_query_job = mock.Mock() - expected_job_id = 'spiderman' - expected_job_ref = {'jobId': expected_job_id} - - mock_query_job.execute.return_value = { - 'jobReference': expected_job_ref, - 'jobComplete': True, - } - - self.mock_job_collection.query.return_value = mock_query_job - max_results = 10 - - job_id, results = self.client.query(self.query, - max_results=max_results) - - self.mock_job_collection.query.assert_called_once_with( - projectId=self.project_id, - body={'query': self.query, 'timeoutMs': 0, - 'maxResults': max_results, 'dryRun': False} - ) - self.assertEquals(job_id, 'spiderman') - self.assertEquals(results, []) - - def test_query_timeout_set(self): - """Ensure that we retrieve the job id from the query and the timeoutMs - parameter is set correctly. - """ - - mock_query_job = mock.Mock() - expected_job_id = 'spiderman' - expected_job_ref = {'jobId': expected_job_id} - - mock_query_job.execute.return_value = { - 'jobReference': expected_job_ref, - 'jobComplete': True, - } - - self.mock_job_collection.query.return_value = mock_query_job - timeout = 5 - - job_id, results = self.client.query(self.query, timeout=timeout) - - self.mock_job_collection.query.assert_called_once_with( - projectId=self.project_id, - body={'query': self.query, 'timeoutMs': timeout * 1000, - 'dryRun': False, 'maxResults': None} - ) - self.assertEquals(job_id, 'spiderman') - self.assertEquals(results, []) - - def test_sync_query_timeout(self): - """Ensure that exception is raise on timeout for synchronous query""" - - mock_query_job = mock.Mock() - expected_job_id = 'spiderman' - expected_job_ref = {'jobId': expected_job_id} - - mock_query_job.execute.return_value = { - 'jobReference': expected_job_ref, - 'jobComplete': False, - } - - self.mock_job_collection.query.return_value = mock_query_job - timeout = 5 - self.assertRaises(BigQueryTimeoutException, self.client.query, - self.query, None, timeout) - - def test_async_query_timeout(self): - """Ensure that exception is not raise on timeout - for asynchronous query""" - - mock_query_job = mock.Mock() - expected_job_id = 'spiderman' - expected_job_ref = {'jobId': expected_job_id} - - mock_query_job.execute.return_value = { - 'jobReference': expected_job_ref, - 'jobComplete': False, - } - - self.mock_job_collection.query.return_value = mock_query_job - - job_id, results = self.client.query(self.query) - self.assertEquals(job_id, 'spiderman') - self.assertEquals(results, []) - - def test_query_dry_run_valid(self): - """Ensure that None and an empty list is returned from the query when - dry_run is True and the query is valid. - """ - - mock_query_job = mock.Mock() - - mock_query_job.execute.return_value = {'jobReference': {}, - 'jobComplete': True} - - self.mock_job_collection.query.return_value = mock_query_job - - job_id, results = self.client.query(self.query, dry_run=True) - - self.mock_job_collection.query.assert_called_once_with( - projectId=self.project_id, - body={'query': self.query, 'timeoutMs': 0, 'maxResults': None, - 'dryRun': True} - ) - self.assertIsNone(job_id) - self.assertEqual([], results) - - def test_query_dry_run_invalid(self): - """Ensure that None and a dict is returned from the query when dry_run - is True and the query is invalid. 
- """ - - mock_query_job = mock.Mock() - - mock_query_job.execute.side_effect = HttpError( - 'crap', '{"message": "Bad query"}'.encode('utf8')) - - self.mock_job_collection.query.return_value = mock_query_job - - job_id, results = self.client.query('%s blah' % self.query, - dry_run=True) - - self.mock_job_collection.query.assert_called_once_with( - projectId=self.project_id, - body={'query': '%s blah' % self.query, 'timeoutMs': 0, - 'maxResults': None, - 'dryRun': True} - ) - self.assertIsNone(job_id) - self.assertEqual({'message': 'Bad query'}, results) - - def test_query_with_results(self): - """Ensure that we retrieve the job id from the query and results if - they are available. - """ - - mock_query_job = mock.Mock() - expected_job_id = 'spiderman' - expected_job_ref = {'jobId': expected_job_id} - - mock_query_job.execute.return_value = { - 'jobReference': expected_job_ref, - 'schema': {'fields': [{'name': 'foo', 'type': 'INTEGER'}]}, - 'rows': [{'f': [{'v': 10}]}], - 'jobComplete': True, - } - - self.mock_job_collection.query.return_value = mock_query_job - - job_id, results = self.client.query(self.query) - - self.mock_job_collection.query.assert_called_once_with( - projectId=self.project_id, - body={'query': self.query, 'timeoutMs': 0, 'dryRun': False, - 'maxResults': None} - ) - self.assertEquals(job_id, 'spiderman') - self.assertEquals(results, [{'foo': 10}]) - - def test_query_with_using_legacy_sql(self): - """Ensure that use_legacy_sql bool gets used""" - - mock_query_job = mock.Mock() - expected_job_id = 'spiderman' - expected_job_ref = {'jobId': expected_job_id} - - mock_query_job.execute.return_value = { - 'jobReference': expected_job_ref, - 'jobComplete': True - } - - self.mock_job_collection.query.return_value = mock_query_job - - job_id, results = self.client.query(self.query, use_legacy_sql=False) - - self.mock_job_collection.query.assert_called_once_with( - projectId=self.project_id, - body={'query': self.query, 'timeoutMs': 0, 'dryRun': False, - 'maxResults': None, 'useLegacySql': False} - ) - self.assertEquals(job_id, 'spiderman') - self.assertEquals(results, []) - - -class TestGetQueryResults(unittest.TestCase): - - def setUp(self): - client._bq_client = None - - self.mock_bq_service = mock.Mock() - self.mock_job_collection = mock.Mock() - - self.mock_bq_service.jobs.return_value = self.mock_job_collection - - self.project_id = 'project' - self.client = client.BigQueryClient(self.mock_bq_service, - self.project_id) - - def test_get_response(self): - """Ensure that the query is executed and the query reply is returned. 
- """ - - job_id = 'bar' - - mock_query_job = mock.Mock() - mock_query_reply = mock.Mock() - mock_query_job.execute.return_value = mock_query_reply - self.mock_job_collection.getQueryResults.return_value = mock_query_job - - offset = 5 - limit = 10 - page_token = "token" - timeout = 1 - - actual = self.client.get_query_results(job_id, offset, limit, - page_token, timeout) - - self.mock_job_collection.getQueryResults.assert_called_once_with( - projectId=self.project_id, jobId=job_id, startIndex=offset, - maxResults=limit, pageToken=page_token, timeoutMs=1000) - - mock_query_job.execute.assert_called_once_with() - self.assertEquals(actual, mock_query_reply) - - -class TestTransformRow(unittest.TestCase): - - def setUp(self): - client._bq_client = None - - self.mock_bq_service = mock.Mock() - self.mock_job_collection = mock.Mock() - - self.mock_bq_service.jobs.return_value = self.mock_job_collection - - self.project_id = 'project' - self.client = client.BigQueryClient(self.mock_bq_service, - self.project_id) - - def test_transform_row(self): - """Ensure that the row dict is correctly transformed to a log dict.""" - - schema = [{'name': 'foo', 'type': 'INTEGER'}, - {'name': 'bar', 'type': 'FLOAT'}, - {'name': 'baz', 'type': 'STRING'}, - {'name': 'qux', 'type': 'BOOLEAN'}, - {'name': 'timestamp', 'type': 'TIMESTAMP'}] - - row = {'f': [{'v': '42'}, {'v': None}, {'v': 'batman'}, - {'v': 'True'}, {'v': '1.371145650319132E9'}]} - - expected = {'foo': 42, 'bar': None, 'baz': 'batman', 'qux': True, - 'timestamp': 1371145650.319132} - - actual = self.client._transform_row(row, schema) - - self.assertEquals(actual, expected) - - def test_transform_row_with_nested(self): - """Ensure that the row dict with nested records is correctly - transformed to a log dict. - """ - - schema = [{'name': 'foo', 'type': 'INTEGER'}, - {'name': 'bar', 'type': 'FLOAT'}, - {'name': 'baz', 'type': 'STRING'}, - {'name': 'qux', 'type': 'RECORD', 'mode': 'SINGLE', - 'fields': [{'name': 'foobar', 'type': 'INTEGER'}, - {'name': 'bazqux', 'type': 'STRING'}]}] - - row = {'f': [{'v': '42'}, {'v': '36.98'}, {'v': 'batman'}, - {'v': {'f': [{'v': '120'}, {'v': 'robin'}]}}]} - expected = {'foo': 42, 'bar': 36.98, 'baz': 'batman', - 'qux': {'foobar': 120, 'bazqux': 'robin'}} - - actual = self.client._transform_row(row, schema) - - self.assertEquals(actual, expected) - - def test_transform_row_with_nested_repeated(self): - """Ensure that the row dict with nested repeated records is correctly - transformed to a log dict. 
- """ - - schema = [{'name': 'foo', 'type': 'INTEGER'}, - {'name': 'bar', 'type': 'FLOAT'}, - {'name': 'baz', 'type': 'STRING'}, - {'name': 'qux', 'type': 'RECORD', 'mode': 'REPEATED', - 'fields': [{'name': 'foobar', 'type': 'INTEGER'}, - {'name': 'bazqux', 'type': 'STRING'}]}] - - row = {'f': [{'v': '42'}, {'v': '36.98'}, {'v': 'batman'}, - {'v': [{'v': {'f': [{'v': '120'}, {'v': 'robin'}]}}, - {'v': {'f': [{'v': '300'}, {'v': 'joker'}]}}]}]} - expected = {'foo': 42, 'bar': 36.98, 'baz': 'batman', - 'qux': [{'foobar': 120, 'bazqux': 'robin'}, - {'foobar': 300, 'bazqux': 'joker'}]} - - actual = self.client._transform_row(row, schema) - - self.assertEquals(actual, expected) - - -@mock.patch('bigquery.client.BigQueryClient.get_query_results') -class TestCheckJob(unittest.TestCase): - - def setUp(self): - client._bq_client = None - self.project_id = 'project' - self.client = client.BigQueryClient(mock.Mock(), self.project_id) - - def test_job_incomplete(self, mock_exec): - """Ensure that we return None if the job is not yet complete.""" - - mock_exec.return_value = {'jobComplete': False} - - is_completed, total_rows = self.client.check_job(1) - - self.assertFalse(is_completed) - self.assertEquals(total_rows, 0) - - def test_query_complete(self, mock_exec): - """Ensure that we can handle a normal query result.""" - - mock_exec.return_value = { - 'jobComplete': True, - 'rows': [ - {'f': [{'v': 'bar'}, {'v': 'man'}]}, - {'f': [{'v': 'abc'}, {'v': 'xyz'}]} - ], - 'schema': { - 'fields': [ - {'name': 'foo', 'type': 'STRING'}, - {'name': 'spider', 'type': 'STRING'} - ] - }, - 'totalRows': '2' - } - - is_completed, total_rows = self.client.check_job(1) - - self.assertTrue(is_completed) - self.assertEquals(total_rows, 2) - - -class TestWaitForJob(unittest.TestCase): - - def setUp(self): - client._bq_client = None - self.project_id = 'project' - self.api_mock = mock.Mock() - self.client = client.BigQueryClient(self.api_mock, self.project_id) - - def test_completed_jobs(self): - """Ensure we can detect completed jobs""" - - return_values = [{'status': {'state': u'RUNNING'}, - 'jobReference': {'jobId': "testJob"}}, - {'status': {'state': u'DONE'}, - 'jobReference': {'jobId': "testJob"}}] - - def side_effect(*args, **kwargs): - return return_values.pop(0) - - self.api_mock.jobs().get().execute.side_effect = side_effect - - job_resource = self.client.wait_for_job( - {'jobReference': {'jobId': "testJob"}, - 'status': {'state': u'RUNNING'}}, - interval=.01, - timeout=.05) - - self.assertEqual(self.api_mock.jobs().get().execute.call_count, 2) - self.assertIsInstance(job_resource, dict) - - def test_timeout_error(self): - """Ensure that timeout raise exceptions""" - incomplete_job = {'status': {'state': u'RUNNING'}, - 'jobReference': {'jobId': "testJob"}} - - self.api_mock.jobs().get().execute.return_value = incomplete_job - self.assertRaises(BigQueryTimeoutException, self.client.wait_for_job, - incomplete_job, .1, .25) - - def test_wait_job_http_error(self): - """ Test wait job with http error""" - job = {'status': {'state': u'RUNNING'}, - 'jobReference': {'jobId': "testJob"}} - - expected_result = { - "error": { - "errors": [{ - "domain": "global", - "reason": "required", - "message": "Required parameter is missing" - }], - "code": 400, - "message": "Required parameter is missing" - } - } - - self.api_mock.jobs().insert().execute.return_value = expected_result - self.assertRaises(JobExecutingException, - self.client.wait_for_job, - job, - interval=.01, - timeout=.01) - - def 
test_wait_job_error_result(self): - """ Test wait job with error result""" - job = {'status': {'state': u'RUNNING'}, - 'jobReference': {'jobId': "testJob"}} - - expected_result = { - "status": { - "state": "DONE", - "errorResult": { - "reason": "invalidQuery", - "location": "query", - "message": "Your Error Message Here " - }, - }, - } - - self.api_mock.jobs().insert().execute.return_value = expected_result - self.assertRaises(JobExecutingException, - self.client.wait_for_job, - job, - interval=.01, - timeout=.01) - - def test_accepts_job_id(self): - """Ensure it accepts a job Id rather than a full job resource""" - - return_values = [{'status': {'state': u'RUNNING'}, - 'jobReference': {'jobId': "testJob"}}, - {'status': {'state': u'DONE'}, - 'jobReference': {'jobId': "testJob"}}] - - def side_effect(*args, **kwargs): - return return_values.pop(0) - - self.api_mock.jobs().get().execute.side_effect = side_effect - - job_resource = self.client.wait_for_job("testJob", - interval=.01, - timeout=5) - - self.assertEqual(self.api_mock.jobs().get().execute.call_count, 2) - self.assertIsInstance(job_resource, dict) - - def test_accepts_integer_job_id(self): - return_values = [{'status': {'state': u'RUNNING'}, - 'jobReference': {'jobId': "testJob"}}, - {'status': {'state': u'DONE'}, - 'jobReference': {'jobId': "testJob"}}] - - def side_effect(*args, **kwargs): - return return_values.pop(0) - - self.api_mock.jobs().get().execute.side_effect = side_effect - - job_resource = self.client.wait_for_job(1234567, - interval=.01, - timeout=600) - - self.assertEqual(self.api_mock.jobs().get().execute.call_count, 2) - self.assertIsInstance(job_resource, dict) - - -class TestImportDataFromURIs(unittest.TestCase): - - def setUp(self): - client._bq_client = None - self.mock_api = mock.Mock() - - self.query = 'foo' - self.project_id = 'project' - self.dataset_id = 'dataset' - self.table_id = 'table' - self.client = client.BigQueryClient(self.mock_api, - self.project_id) - - def test_csv_job_body_constructed_correctly(self): - expected_result = { - 'status': {'state': u'RUNNING'}, - } - - body = { - "jobReference": { - "projectId": self.project_id, - "jobId": "job" - }, - "configuration": { - "load": { - "sourceUris": ["sourceuri"], - "schema": {"fields": ["schema"]}, - "destinationTable": { - "projectId": self.project_id, - "datasetId": self.dataset_id, - "tableId": self.table_id - }, - "createDisposition": "a", - "writeDisposition": "b", - "fieldDelimiter": "c", - "skipLeadingRows": "d", - "encoding": "e", - "quote": "f", - "maxBadRecords": "g", - "allowQuotedNewlines": "h", - "sourceFormat": "CSV", - "allowJaggedRows": "j", - "ignoreUnknownValues": "k" - } - } - } - - self.mock_api.jobs().insert().execute.return_value = expected_result - result = self.client.import_data_from_uris(["sourceuri"], - self.dataset_id, - self.table_id, - ["schema"], - job="job", - create_disposition="a", - write_disposition="b", - field_delimiter="c", - skip_leading_rows="d", - encoding="e", - quote="f", - max_bad_records="g", - allow_quoted_newlines="h", - source_format="CSV", - allow_jagged_rows="j", - ignore_unknown_values="k") - - self.mock_api.jobs().insert.assert_called_with( - projectId=self.project_id, - body=body - ) - - self.assertEqual(result, expected_result) - - def test_json_job_body_constructed_correctly(self): - expected_result = { - 'status': {'state': u'RUNNING'}, - } - - body = { - "jobReference": { - "projectId": self.project_id, - "jobId": "job" - }, - "configuration": { - "load": { - "sourceUris": ["sourceuri"], 
- "schema": {"fields": ["schema"]}, - "destinationTable": { - "projectId": self.project_id, - "datasetId": self.dataset_id, - "tableId": self.table_id - }, - "sourceFormat": "JSON" - } - } - } - - self.mock_api.jobs().insert().execute.return_value = expected_result - result = self.client.import_data_from_uris(["sourceuri"], - self.dataset_id, - self.table_id, - ["schema"], - job="job", - source_format="JSON") - - self.mock_api.jobs().insert.assert_called_with( - projectId=self.project_id, - body=body - ) - - self.assertEqual(result, expected_result) - - @raises(Exception) - def test_field_delimiter_exception_if_not_csv(self): - """Raise exception if csv-only parameter is set inappropriately""" - self.client.import_data_from_uris(["sourceuri"], - self.dataset_id, - self.table_id, - ["schema"], - job="job", - source_format="JSON", - field_delimiter=",") - - @raises(Exception) - def test_allow_jagged_rows_exception_if_not_csv(self): - """Raise exception if csv-only parameter is set inappropriately""" - self.client.import_data_from_uris(["sourceuri"], - self.dataset_id, - self.table_id, - ["schema"], - job="job", - source_format="JSON", - allow_jagged_rows=True) - - @raises(Exception) - def test_allow_quoted_newlines_exception_if_not_csv(self): - """Raise exception if csv-only parameter is set inappropriately""" - self.client.import_data_from_uris(["sourceuri"], - self.dataset_id, - self.table_id, - ["schema"], - job="job", - source_format="JSON", - allow_quoted_newlines=True) - - @raises(Exception) - def test_quote_exception_if_not_csv(self): - """Raise exception if csv-only parameter is set inappropriately""" - self.client.import_data_from_uris(["sourceuri"], - self.dataset_id, - self.table_id, - ["schema"], - job="job", - source_format="JSON", - quote="'") - - @raises(Exception) - def test_skip_leading_rows_exception_if_not_csv(self): - """Raise exception if csv-only parameter is set inappropriately""" - self.client.import_data_from_uris(["sourceuri"], - self.dataset_id, - self.table_id, - ["schema"], - "job", - source_format="JSON", - skip_leading_rows=10) - - def test_accepts_single_source_uri(self): - """Ensure that a source_uri accepts a non-list""" - expected_result = { - 'status': {'state': u'RUNNING'}, - } - - body = { - "jobReference": { - "projectId": self.project_id, - "jobId": "job" - }, - "configuration": { - "load": { - "sourceUris": ["sourceuri"], - "schema": {"fields": ["schema"]}, - "destinationTable": { - "projectId": self.project_id, - "datasetId": self.dataset_id, - "tableId": self.table_id - } - } - } - } - - self.mock_api.jobs().insert().execute.return_value = expected_result - result = self.client.import_data_from_uris("sourceuri", # not a list! 
- self.dataset_id, - self.table_id, - schema=["schema"], - job="job") - - self.mock_api.jobs().insert.assert_called_with( - projectId=self.project_id, - body=body - ) - - self.assertEqual(result, expected_result) - - def test_import_http_error(self): - """ Test import with http error""" - expected_result = { - "error": { - "errors": [{ - "domain": "global", - "reason": "required", - "message": "Required parameter is missing" - }], - "code": 400, - "message": "Required parameter is missing" - } - } - - self.mock_api.jobs().insert().execute.return_value = expected_result - self.assertRaises(JobInsertException, - self.client.import_data_from_uris, - ["sourceuri"], - self.dataset_id, - self.table_id) - - def test_import_error_result(self): - """ Test import with error result""" - expected_result = { - "status": { - "state": "DONE", - "errorResult": { - "reason": "invalidQuery", - "location": "query", - "message": "Your Error Message Here " - }, - }, - } - - self.mock_api.jobs().insert().execute.return_value = expected_result - self.assertRaises(JobInsertException, - self.client.import_data_from_uris, - ["sourceuri"], - self.dataset_id, - self.table_id) - - -class TestExportDataToURIs(unittest.TestCase): - - def setUp(self): - client._bq_client = None - self.mock_api = mock.Mock() - - self.project_id = 'project' - self.dataset_id = 'dataset' - self.table_id = 'table' - self.destination_format = "CSV" - self.print_header = False - self.client = client.BigQueryClient(self.mock_api, - self.project_id) - - @mock.patch('bigquery.client.BigQueryClient._generate_hex_for_uris') - def test_export(self, mock_generate_hex): - """ Ensure that export is working in normal circumstances """ - expected_result = { - 'status': {'state': u'RUNNING'}, - } - - body = { - "jobReference": { - "projectId": self.project_id, - "jobId": "%s-%s-destinationuri" % (self.dataset_id, - self.table_id) - }, - "configuration": { - "extract": { - "destinationUris": ["destinationuri"], - "sourceTable": { - "projectId": self.project_id, - "datasetId": self.dataset_id, - "tableId": self.table_id - }, - "destinationFormat": self.destination_format, - "printHeader": self.print_header, - } - } - } - - self.mock_api.jobs().insert().execute.return_value = expected_result - mock_generate_hex.return_value = "destinationuri" - result = self.client.export_data_to_uris( - ["destinationuri"], self.dataset_id, self.table_id, - destination_format=self.destination_format, - print_header=self.print_header - ) - - self.mock_api.jobs().insert.assert_called_with( - projectId=self.project_id, - body=body - ) - - self.assertEqual(result, expected_result) - - def test_export_http_error(self): - """ Test export with http error""" - expected_result = { - "error": { - "errors": [{ - "domain": "global", - "reason": "required", - "message": "Required parameter is missing" - }], - "code": 400, - "message": "Required parameter is missing" - } - } - - self.mock_api.jobs().insert().execute.return_value = expected_result - self.assertRaises(JobInsertException, - self.client.export_data_to_uris, - ["destinationuri"], - self.dataset_id, - self.table_id) - - def test_export_error_result(self): - """ Test export with error result""" - expected_result = { - "status": { - "state": "DONE", - "errorResult": { - "reason": "invalidQuery", - "location": "query", - "message": "Your Error Message Here " - }, - }, - } - - self.mock_api.jobs().insert().execute.return_value = expected_result - self.assertRaises(JobInsertException, - self.client.export_data_to_uris, - 
["destinationuri"], - self.dataset_id, - self.table_id) - - -class TestWriteToTable(unittest.TestCase): - - def setUp(self): - client._bq_client = None - self.mock_api = mock.Mock() - - self.query = 'foo' - self.project_id = 'project' - self.dataset_id = 'dataset' - self.table_id = 'table' - self.maximum_billing_tier = 1000 - self.external_udf_uris = ['gs://bucket/external_udf.js'] - self.use_query_cache = False - self.priority = "INTERACTIVE" - self.flatten_results = False - self.client = client.BigQueryClient(self.mock_api, - self.project_id) - - def test_write(self): - """ Ensure that write is working in normal circumstances.""" - expected_result = { - 'status': {'state': u'RUNNING'}, - } - - body = { - "configuration": { - "query": { - "destinationTable": { - "projectId": self.project_id, - "datasetId": self.dataset_id, - "tableId": self.table_id - }, - "query": self.query, - "userDefinedFunctionResources": [{ - "resourceUri": self.external_udf_uris[0] - }], - "useQueryCache": self.use_query_cache, - "priority": self.priority, - "flattenResults": self.flatten_results, - } - } - } - - self.mock_api.jobs().insert().execute.return_value = expected_result - result = self.client.write_to_table(self.query, - self.dataset_id, - self.table_id, - external_udf_uris=self.external_udf_uris, - use_query_cache=False, - flatten=False, - priority=self.priority) - - self.mock_api.jobs().insert.assert_called_with( - projectId=self.project_id, - body=body - ) - - self.assertEqual(result, expected_result) - - def test_write_maxbilltier(self): - """ Ensure that write is working when maximumBillingTier is set""" - expected_result = { - 'status': {'state': u'RUNNING'}, - } - - body = { - "configuration": { - "query": { - "destinationTable": { - "projectId": self.project_id, - "datasetId": self.dataset_id, - "tableId": self.table_id - }, - "query": self.query, - "userDefinedFunctionResources": [{ - "resourceUri": self.external_udf_uris[0] - }], - "useQueryCache": self.use_query_cache, - "priority": self.priority, - "maximumBillingTier": self.maximum_billing_tier - } - } - } - - self.mock_api.jobs().insert().execute.return_value = expected_result - result = self.client.write_to_table( - self.query, self.dataset_id, self.table_id, priority=self.priority, - external_udf_uris=self.external_udf_uris, use_query_cache=False, - maximum_billing_tier=self.maximum_billing_tier) - - self.mock_api.jobs().insert.assert_called_with( - projectId=self.project_id, - body=body - ) - - self.assertEqual(result, expected_result) - - def test_write_http_error(self): - """ Test write with http error""" - expected_result = { - "error": { - "errors": [{ - "domain": "global", - "reason": "required", - "message": "Required parameter is missing" - }], - "code": 400, - "message": "Required parameter is missing" - } - } - - self.mock_api.jobs().insert().execute.return_value = expected_result - self.assertRaises(JobInsertException, self.client.write_to_table, - self.query) - - def test_write_error_result(self): - """ Test write with error result""" - expected_result = { - "status": { - "state": "DONE", - "errorResult": { - "reason": "invalidQuery", - "location": "query", - "message": "Your Error Message Here " - }, - }, - } - - self.mock_api.jobs().insert().execute.return_value = expected_result - self.assertRaises(JobInsertException, self.client.write_to_table, - self.query) - - -class TestFilterTablesByTime(unittest.TestCase): - - def test_empty_tables(self): - """Ensure we can handle filtering an empty dictionary""" - - bq = 
client.BigQueryClient(None, 'project') - - tables = bq._filter_tables_by_time({}, 1370000000, 0) - - self.assertEqual([], tables) - - def test_multi_inside_range(self): - """Ensure we can correctly filter several application ids""" - - bq = client.BigQueryClient(None, 'project') - - tables = bq._filter_tables_by_time({ - 'Spider-Man': 1370002001, - 'Daenerys Targaryen': 1370001999, - 'Gordon Freeman': 1369999999, - 'William Shatner': 1370001000, - 'Heavy Weapons Guy': 0 - }, 1370002000, 1370000000) - - self.assertEqual( - sorted( - ['Daenerys Targaryen', 'William Shatner', 'Gordon Freeman']), - sorted(tables) - ) - - def test_not_inside_range(self): - """Ensure we can correctly filter several application ids outside the - range we are searching for. - """ - - bq = client.BigQueryClient(None, 'project') - - tables = bq._filter_tables_by_time({ - 'John Snow': 9001, - 'Adam West': 100000000000000, - 'Glados': -1, - 'Potato': 0, - }, 1370002000, 1370000000) - - self.assertEqual([], tables) - - -NEXT_TABLE_LIST_RESPONSE = { - "kind": "bigquery#tableList", - "etag": "\"t_UlB9a9mrx5sjQInRGzeDrLrS0/TsIP_i4gAeLegj84WzkPzBPIkjo\"", - "nextPageToken": "2013_05_appspot_1", - "tables": [ - { - "kind": "bigquery#table", - "id": "project:dataset.2013_06_appspot_10", - "tableReference": { - "projectId": "project", - "datasetId": "dataset", - "tableId": "2013_06_appspot_10" - } - }, - { - "kind": "bigquery#table", - "id": "project:dataset.2013_06_appspot_11", - "tableReference": { - "projectId": "project", - "datasetId": "dataset", - "tableId": "2013_06_appspot_11" - } - }, - { - "kind": "bigquery#table", - "id": "project:dataset.2013_06_appspot_12", - "tableReference": { - "projectId": "project", - "datasetId": "dataset", - "tableId": "2013_06_appspot_12" - } - }, - ], - "totalItems": 3 -} - -FULL_TABLE_LIST_RESPONSE = { - "kind": "bigquery#tableList", - "etag": "\"GSclnjk0zID1ucM3F-xYinOm1oE/cn58Rpu8v8pB4eoJQaiTe11lPQc\"", - "tables": [ - { - "kind": "bigquery#table", - "id": "project:dataset.2013_05_appspot_1", - "tableReference": { - "projectId": "project", - "datasetId": "dataset", - "tableId": "2013_05_appspot" - } - }, - { - "kind": "bigquery#table", - "id": "project:dataset.2013_06_appspot_1", - "tableReference": { - "projectId": "project", - "datasetId": "dataset", - "tableId": "2013_06_appspot_1" - } - }, - { - "kind": "bigquery#table", - "id": "project:dataset.2013_06_appspot_2", - "tableReference": { - "projectId": "project", - "datasetId": "dataset", - "tableId": "2013_06_appspot_2" - } - }, - { - "kind": "bigquery#table", - "id": "project:dataset.2013_06_appspot_3", - "tableReference": { - "projectId": "project", - "datasetId": "dataset", - "tableId": "2013_06_appspot_3" - } - }, - { - "kind": "bigquery#table", - "id": "project:dataset.2013_06_appspot_4", - "tableReference": { - "projectId": "project", - "datasetId": "dataset", - "tableId": "2013_06_appspot_4" - } - }, - { - "kind": "bigquery#table", - "id": "project:dataset.2013_06_appspot_5", - "tableReference": { - "projectId": "project", - "datasetId": "dataset", - "tableId": "2013_06_appspot_5" - } - }, - { - "kind": "bigquery#table", - "id": "project:dataset.appspot_6_2013_06", - "tableReference": { - "projectId": "project", - "datasetId": "dataset", - "tableId": "appspot_6_2013_06" - } - }, - { - "kind": "bigquery#table", - "id": "project:dataset.table_not_matching_naming", - "tableReference": { - "projectId": "project", - "datasetId": "dataset", - "tableId": "table_not_matching_naming" - } - }, - { - "kind": "bigquery#table", - "id": 
"bad table data" - }, - ], - "totalItems": 9 -} - - -@mock.patch('bigquery.client.BigQueryClient.get_query_results') -class TestGetQuerySchema(unittest.TestCase): - - def test_query_complete(self, get_query_mock): - """Ensure that get_query_schema works when a query is complete.""" - from bigquery.client import BigQueryClient - - bq = BigQueryClient(mock.Mock(), 'project') - - get_query_mock.return_value = { - 'jobComplete': True, - 'schema': {'fields': 'This is our schema'} - } - - result_schema = bq.get_query_schema(job_id=123) - - self.assertEquals(result_schema, 'This is our schema') - - def test_query_incomplete(self, get_query_mock): - """Ensure that get_query_schema handles scenarios where the query - is not finished. - """ - from bigquery.client import BigQueryClient - - bq = BigQueryClient(mock.Mock(), 'project') - - get_query_mock.return_value = { - 'jobComplete': False, - 'schema': {'fields': 'This is our schema'} - } - - self.assertRaises(client.UnfinishedQueryException, bq.get_query_schema, - job_id=123) - - -class TestGetTableSchema(unittest.TestCase): - - def setUp(self): - self.mock_bq_service = mock.Mock() - self.mock_tables = mock.Mock() - self.mock_bq_service.tables.return_value = self.mock_tables - self.table = 'table' - self.project = 'project' - self.dataset = 'dataset' - self.client = client.BigQueryClient(self.mock_bq_service, self.project) - - def test_table_exists(self): - """Ensure that the table schema is returned if the table exists.""" - - expected = [ - {'type': 'FLOAT', 'name': 'foo', 'mode': 'NULLABLE'}, - {'type': 'INTEGER', 'name': 'bar', 'mode': 'NULLABLE'}, - {'type': 'INTEGER', 'name': 'baz', 'mode': 'NULLABLE'}, - ] - - self.mock_tables.get.return_value.execute.return_value = \ - {'schema': {'fields': expected}} - - self.assertEqual( - expected, self.client.get_table_schema(self.dataset, self.table)) - self.mock_tables.get.assert_called_once_with( - projectId=self.project, tableId=self.table, datasetId=self.dataset) - self.mock_tables.get.return_value.execute.assert_called_once_with() - - def test_table_does_not_exist(self): - """Ensure that None is returned if the table doesn't exist.""" - self.mock_tables.get.return_value.execute.side_effect = \ - HttpError({'status': "404"}, '{}'.encode('utf8')) - - self.assertIsNone( - self.client.get_table_schema(self.dataset, self.table)) - self.mock_tables.get.assert_called_once_with( - projectId=self.project, tableId=self.table, datasetId=self.dataset) - self.mock_tables.get.return_value.execute.assert_called_once_with() - - -@mock.patch('bigquery.client.BigQueryClient.get_query_results') -class TestGetQueryRows(unittest.TestCase): - - def test_query_complete(self, get_query_mock): - """Ensure that get_query_rows works when a query is complete.""" - from bigquery.client import BigQueryClient - - bq = BigQueryClient(mock.Mock(), 'project') - - get_query_mock.return_value = { - 'jobComplete': True, - 'rows': [ - {'f': [{'v': 'bar'}, {'v': 'man'}]}, - {'f': [{'v': 'abc'}, {'v': 'xyz'}]} - ], - 'schema': { - 'fields': [ - {'name': 'foo', 'type': 'STRING'}, - {'name': 'spider', 'type': 'STRING'} - ] - }, - 'totalRows': 2 - } - - result_rows = bq.get_query_rows(job_id=123, offset=0, limit=0) - - expected_rows = [{'foo': 'bar', 'spider': 'man'}, - {'foo': 'abc', 'spider': 'xyz'}] - self.assertEquals(result_rows, expected_rows) - - def test_query_complete_with_page_token(self, get_query_mock): - """Ensure that get_query_rows works with page token.""" - from bigquery.client import BigQueryClient - - page_one_resp = { - 
"jobComplete": True, - "kind": "bigquery#getQueryResultsResponse", - "pageToken": "TOKEN_TO_PAGE_2", - "schema": { - "fields": [{ - "name": "first_name", - "type": "STRING", - }, { - "name": "last_name", - "type": "STRING", - }] - }, - "rows": [{ - "f": [{ - "v": "foo", - }, { - "v": "bar" - }] - }, { - "f": [{ - "v": "abc", - }, { - "v": "xyz" - }] - }], - "totalRows": "4" - } - - page_two_resp = { - "jobComplete": True, - "kind": "bigquery#getQueryResultsResponse", - "schema": { - "fields": [{ - "name": "first_name", - "type": "STRING", - }, { - "name": "last_name", - "type": "STRING", - }] - }, - "rows": [{ - "f": [{ - "v": "the", - }, { - "v": "beatles" - }] - }, { - "f": [{ - "v": "monty", - }, { - "v": "python" - }] - }], - "totalRows": "4" - } - - bq = BigQueryClient(mock.Mock(), 'project') - get_query_mock.side_effect = [page_one_resp, page_two_resp] - result_rows = bq.get_query_rows(job_id=123, offset=0, limit=0) - - expected_rows = [{'first_name': 'foo', 'last_name': 'bar'}, - {'first_name': 'abc', 'last_name': 'xyz'}, - {'first_name': 'the', 'last_name': 'beatles'}, - {'first_name': 'monty', 'last_name': 'python'}] - self.assertEquals(result_rows, expected_rows) - - def test_query_incomplete(self, get_query_mock): - """Ensure that get_query_rows handles scenarios where the query is not - finished. - """ - from bigquery.client import BigQueryClient - - bq = BigQueryClient(mock.Mock(), 'project') - - get_query_mock.return_value = { - 'jobComplete': False, - 'rows': [ - {'f': [{'v': 'bar'}, {'v': 'man'}]}, - {'f': [{'v': 'abc'}, {'v': 'xyz'}]} - ], - 'schema': { - 'fields': [ - {'name': 'foo', 'type': 'STRING'}, - {'name': 'spider', 'type': 'STRING'} - ] - }, - 'totalRows': 2 - } - - self.assertRaises(client.UnfinishedQueryException, bq.get_query_rows, - job_id=123, offset=0, limit=0) - - -class TestCheckTable(unittest.TestCase): - - def setUp(self): - self.mock_bq_service = mock.Mock() - self.mock_tables = mock.Mock() - self.mock_bq_service.tables.return_value = self.mock_tables - self.table = 'table' - self.project = 'project' - self.dataset = 'dataset' - self.client = client.BigQueryClient(self.mock_bq_service, self.project) - - def test_table_does_not_exist(self): - """Ensure that if the table does not exist, False is returned.""" - - self.mock_tables.get.return_value.execute.side_effect = ( - HttpError(HttpResponse(404), 'There was an error'.encode('utf8'))) - - actual = self.client.check_table(self.dataset, self.table) - - self.assertFalse(actual) - - self.mock_tables.get.assert_called_once_with( - projectId=self.project, datasetId=self.dataset, tableId=self.table) - - self.mock_tables.get.return_value.execute.assert_called_once_with() - - def test_table_does_exist(self): - """Ensure that if the table does exist, True is returned.""" - - self.mock_tables.get.return_value.execute.side_effect = { - 'status': 'foo'} - - actual = self.client.check_table(self.dataset, self.table) - - self.assertTrue(actual) - - self.mock_tables.get.assert_called_once_with( - projectId=self.project, datasetId=self.dataset, tableId=self.table) - - self.mock_tables.get.return_value.execute.assert_called_once_with() - - -class TestCreateTable(unittest.TestCase): - - def setUp(self): - self.mock_bq_service = mock.Mock() - self.mock_tables = mock.Mock() - self.mock_bq_service.tables.return_value = self.mock_tables - self.table = 'table' - self.schema = [ - {'name': 'foo', 'type': 'STRING', 'mode': 'nullable'}, - {'name': 'bar', 'type': 'FLOAT', 'mode': 'nullable'} - ] - self.project = 'project' - 
self.dataset = 'dataset' - self.client = client.BigQueryClient(self.mock_bq_service, self.project) - self.body = { - 'schema': {'fields': self.schema}, - 'tableReference': { - 'tableId': self.table, 'projectId': self.project, - 'datasetId': self.dataset} - } - self.expiration_time = 1437513693000 - self.time_partitioning = True - - def test_table_create_failed(self): - """Ensure that if creating the table fails, False is returned, - or if swallow_results is False an empty dict is returned.""" - - self.mock_tables.insert.return_value.execute.side_effect = ( - HttpError(HttpResponse(404), 'There was an error'.encode('utf8'))) - - actual = self.client.create_table(self.dataset, self.table, - self.schema) - - self.assertFalse(actual) - - self.client.swallow_results = False - - actual = self.client.create_table(self.dataset, self.table, - self.schema) - - self.assertEqual(actual, {}) - - self.client.swallow_results = True - - self.mock_tables.insert.assert_called_with( - projectId=self.project, datasetId=self.dataset, body=self.body) - - self.mock_tables.insert.return_value.execute.assert_called_with() - - def test_table_create_success(self): - """Ensure that if creating the table succeeds, True is returned, - or if swallow_results is False the actual response is returned.""" - - self.mock_tables.insert.return_value.execute.side_effect = [{ - 'status': 'foo'}, {'status': 'bar'}] - - actual = self.client.create_table(self.dataset, self.table, - self.schema) - - self.assertTrue(actual) - - self.client.swallow_results = False - - actual = self.client.create_table(self.dataset, self.table, - self.schema) - - self.assertEqual(actual, {'status': 'bar'}) - - self.client.swallow_results = True - - self.mock_tables.insert.assert_called_with( - projectId=self.project, datasetId=self.dataset, body=self.body) - - self.mock_tables.insert.return_value.execute.assert_called_with() - - def test_table_create_body_with_expiration_time(self): - """Ensure that if expiration_time has specified, - it passed to the body.""" - - self.mock_tables.insert.return_value.execute.side_effect = [{ - 'status': 'foo'}, {'status': 'bar'}] - - self.client.create_table(self.dataset, self.table, - self.schema, self.expiration_time) - - body = self.body.copy() - body.update({ - 'expirationTime': self.expiration_time - }) - - self.mock_tables.insert.assert_called_with( - projectId=self.project, datasetId=self.dataset, body=body) - - self.mock_tables.insert.return_value.execute.assert_called_with() - - def test_table_create_body_with_time_partitioning(self): - """Ensure that if time_partitioning has specified, - it passed to the body.""" - - self.mock_tables.insert.return_value.execute.side_effect = [{ - 'status': 'foo'}, {'status': 'bar'}] - - self.client.create_table(self.dataset, self.table, - self.schema, - time_partitioning=self.time_partitioning) - - body = self.body.copy() - body.update({ - 'timePartitioning': {'type': 'DAY'} - }) - - self.mock_tables.insert.assert_called_with( - projectId=self.project, datasetId=self.dataset, body=body) - - self.mock_tables.insert.return_value.execute.assert_called_with() - - -class TestUpdateTable(unittest.TestCase): - - def setUp(self): - self.mock_bq_service = mock.Mock() - self.mock_tables = mock.Mock() - self.mock_bq_service.tables.return_value = self.mock_tables - self.table = 'table' - self.schema = [ - {'name': 'foo', 'type': 'STRING', 'mode': 'nullable'}, - {'name': 'bar', 'type': 'FLOAT', 'mode': 'nullable'} - ] - self.project = 'project' - self.dataset = 'dataset' - self.client = 
client.BigQueryClient(self.mock_bq_service, self.project) - self.body = { - 'schema': {'fields': self.schema}, - 'tableReference': { - 'tableId': self.table, 'projectId': self.project, - 'datasetId': self.dataset} - } - self.expiration_time = 1437513693000 - - def test_table_update_failed(self): - """Ensure that if updating the table fails, False is returned, - or if swallow_results is False an empty dict is returned.""" - - self.mock_tables.update.return_value.execute.side_effect = ( - HttpError(HttpResponse(404), 'There was an error'.encode('utf8'))) - - actual = self.client.update_table(self.dataset, self.table, - self.schema) - - self.assertFalse(actual) - - self.client.swallow_results = False - - actual = self.client.update_table(self.dataset, self.table, - self.schema) - - self.assertEqual(actual, {}) - - self.client.swallow_results = True - - self.mock_tables.update.assert_called_with( - projectId=self.project, datasetId=self.dataset, body=self.body) - - self.mock_tables.update.return_value.execute.assert_called_with() - - def test_table_update_success(self): - """Ensure that if updating the table succeeds, True is returned, - or if swallow_results is False the actual response is returned.""" - - self.mock_tables.update.return_value.execute.side_effect = [{ - 'status': 'foo'}, {'status': 'bar'}] - - actual = self.client.update_table(self.dataset, self.table, - self.schema) - - self.assertTrue(actual) - - self.client.swallow_results = False - - actual = self.client.update_table(self.dataset, self.table, - self.schema) - - self.assertEqual(actual, {'status': 'bar'}) - - self.client.swallow_results = True - - self.mock_tables.update.assert_called_with( - projectId=self.project, datasetId=self.dataset, body=self.body) - - self.mock_tables.update.return_value.execute.assert_called_with() - - -class TestPatchTable(unittest.TestCase): - - def setUp(self): - self.mock_bq_service = mock.Mock() - self.mock_tables = mock.Mock() - self.mock_bq_service.tables.return_value = self.mock_tables - self.table = 'table' - self.schema = [ - {'name': 'foo', 'type': 'STRING', 'mode': 'nullable'}, - {'name': 'bar', 'type': 'FLOAT', 'mode': 'nullable'} - ] - self.project = 'project' - self.dataset = 'dataset' - self.client = client.BigQueryClient(self.mock_bq_service, self.project) - self.body = { - 'schema': {'fields': self.schema}, - 'tableReference': { - 'tableId': self.table, 'projectId': self.project, - 'datasetId': self.dataset} - } - self.expiration_time = 1437513693000 - - def test_table_patch_failed(self): - """Ensure that if patching the table fails, False is returned, - or if swallow_results is False an empty dict is returned.""" - - self.mock_tables.patch.return_value.execute.side_effect = ( - HttpError(HttpResponse(404), 'There was an error'.encode('utf8'))) - - actual = self.client.patch_table(self.dataset, self.table, - self.schema) - - self.assertFalse(actual) - - self.client.swallow_results = False - - actual = self.client.patch_table(self.dataset, self.table, - self.schema) - - self.assertEqual(actual, {}) - - self.client.swallow_results = True - - self.mock_tables.patch.assert_called_with( - projectId=self.project, datasetId=self.dataset, body=self.body) - - self.mock_tables.patch.return_value.execute.assert_called_with() - - def test_table_patch_success(self): - """Ensure that if patching the table succeeds, True is returned, - or if swallow_results is False the actual response is returned.""" - - self.mock_tables.patch.return_value.execute.side_effect = [{ - 'status': 'foo'}, {'status': 
'bar'}] - - actual = self.client.patch_table(self.dataset, self.table, - self.schema) - - self.assertTrue(actual) - - self.client.swallow_results = False - - actual = self.client.patch_table(self.dataset, self.table, - self.schema) - - self.assertEqual(actual, {'status': 'bar'}) - - self.client.swallow_results = True - - self.mock_tables.patch.assert_called_with( - projectId=self.project, datasetId=self.dataset, body=self.body) - - self.mock_tables.patch.return_value.execute.assert_called_with() - - -class TestCreateView(unittest.TestCase): - - def setUp(self): - self.mock_bq_service = mock.Mock() - self.mock_tables = mock.Mock() - self.mock_bq_service.tables.return_value = self.mock_tables - self.table = 'table' - self.project = 'project' - self.dataset = 'dataset' - self.query = 'SELECT "bar" foo, "foo" bar' - self.client = client.BigQueryClient(self.mock_bq_service, self.project) - self.body = { - 'view': {'query': self.query}, - 'tableReference': { - 'tableId': self.table, 'projectId': self.project, - 'datasetId': self.dataset} - } - - def test_view_create_failed(self): - """Ensure that if creating the table fails, False is returned, - or if swallow_results is False an empty dict is returned.""" - - self.mock_tables.insert.return_value.execute.side_effect = ( - HttpError(HttpResponse(404), 'There was an error'.encode('utf8'))) - - actual = self.client.create_view(self.dataset, self.table, - self.query) - - self.assertFalse(actual) - - self.client.swallow_results = False - - actual = self.client.create_view(self.dataset, self.table, - self.query) - - self.assertEqual(actual, {}) - - self.client.swallow_results = True - - self.mock_tables.insert.assert_called_with( - projectId=self.project, datasetId=self.dataset, body=self.body) - - self.mock_tables.insert.return_value.execute.assert_called_with() - - def test_view_create_success(self): - """Ensure that if creating the table succeeds, True is returned, - or if swallow_results is False the actual response is returned.""" - - self.mock_tables.insert.return_value.execute.side_effect = [{ - 'status': 'foo'}, {'status': 'bar'}] - - actual = self.client.create_view(self.dataset, self.table, - self.query) - - self.assertTrue(actual) - - self.client.swallow_results = False - - actual = self.client.create_view(self.dataset, self.table, - self.query) - - self.assertEqual(actual, {'status': 'bar'}) - - self.client.swallow_results = True - - self.mock_tables.insert.assert_called_with( - projectId=self.project, datasetId=self.dataset, body=self.body) - - self.mock_tables.insert.return_value.execute.assert_called_with() - - -class TestDeleteTable(unittest.TestCase): - - def setUp(self): - self.mock_bq_service = mock.Mock() - self.mock_tables = mock.Mock() - self.mock_bq_service.tables.return_value = self.mock_tables - self.table = 'table' - self.project = 'project' - self.dataset = 'dataset' - self.client = client.BigQueryClient(self.mock_bq_service, self.project) - - def test_delete_table_fail(self): - """Ensure that if deleting table fails, False is returned, - or the actual response is swallow_results is False.""" - - self.mock_tables.delete.return_value.execute.side_effect = ( - HttpError(HttpResponse(404), 'There was an error'.encode('utf8'))) - - actual = self.client.delete_table(self.dataset, self.table) - - self.assertFalse(actual) - - self.client.swallow_results = False - - actual = self.client.delete_table(self.dataset, self.table) - - self.assertEqual(actual, {}) - - self.client.swallow_results = True - - 
self.mock_tables.delete.assert_called_with( - projectId=self.project, datasetId=self.dataset, tableId=self.table) - - self.mock_tables.delete.return_value.execute.assert_called_with() - - def test_delete_table_success(self): - """Ensure that if deleting table succeeds, True is returned, - or the actual response if swallow_results is False.""" - - self.mock_tables.delete.return_value.execute.side_effect = [{ - 'status': 'foo'}, {'status': 'bar'}] - - actual = self.client.delete_table(self.dataset, self.table) - - self.assertTrue(actual) - - self.client.swallow_results = False - - actual = self.client.delete_table(self.dataset, self.table) - - self.assertEqual(actual, {'status': 'bar'}) - - self.client.swallow_results = True - - self.mock_tables.delete.assert_called_with( - projectId=self.project, datasetId=self.dataset, tableId=self.table) - - self.mock_tables.delete.return_value.execute.assert_called_with() - - -class TestParseTableListReponse(unittest.TestCase): - - def test_full_parse(self): - """Ensures we can parse a full list response.""" - - bq = client.BigQueryClient(None, 'project') - - tables = bq._parse_table_list_response(FULL_TABLE_LIST_RESPONSE) - - expected_result = { - 'appspot-3': {'2013_06_appspot_3': 1370044800}, - 'appspot-2': {'2013_06_appspot_2': 1370044800}, - 'appspot-1': {'2013_06_appspot_1': 1370044800}, - 'appspot-6': {'appspot_6_2013_06': 1370044800}, - 'appspot-5': {'2013_06_appspot_5': 1370044800}, - 'appspot-4': {'2013_06_appspot_4': 1370044800}, - 'appspot': {'2013_05_appspot': 1367366400} - } - - self.assertEquals(expected_result, tables) - - def test_empty_parse(self): - """Ensures we can parse an empty dictionary.""" - - bq = client.BigQueryClient(None, 'project') - - tables = bq._parse_table_list_response({}) - - self.assertEquals(tables, {}) - - def test_error(self): - """Ensures we can handle parsing a response error.""" - - error_response = { - "error": { - "errors": [ - { - "domain": "global", - "reason": "required", - "message": "Login Required", - "locationType": "header", - "location": "Authorization" - } - ], - "code": 401, - "message": "Login Required" - } - } - bq = client.BigQueryClient(None, 'project') - - tables = bq._parse_table_list_response(error_response) - - self.assertEquals(tables, {}) - - def test_incorrect_table_formats(self): - """Ensures we can parse incorrectly formatted table ids.""" - - list_response = { - "tables": [ - { - "tableReference": { - "tableId": "somethingwrong" - } - }, - { - "tableReference": { - "tableId": "john-snow" - } - }, - { - "tableReference": { - "tableId": "'------'," - } - }, - { - "tableReference": { - "tableId": "" - } - }, - { - "tableReference": { - "tableId": "adam_west" - } - } - ], - } - bq = client.BigQueryClient(None, 'project') - - tables = bq._parse_table_list_response(list_response) - - self.assertEquals(tables, {}) - - -class TestPushRows(unittest.TestCase): - - def setUp(self): - self.mock_bq_service = mock.Mock() - self.mock_table_data = mock.Mock() - self.mock_bq_service.tabledata.return_value = self.mock_table_data - self.table = 'table' - self.project = 'project' - self.dataset = 'dataset' - self.client = client.BigQueryClient(self.mock_bq_service, self.project) - self.rows = [ - {'one': 'uno', 'two': 'dos'}, {'one': 'ein', 'two': 'zwei'}, - {'two': 'kiwi'}] - self.data = { - "kind": "bigquery#tableDataInsertAllRequest", - "rows": [{'insertId': "uno", 'json': {'one': 'uno', 'two': 'dos'}}, - {'insertId': "ein", 'json': - {'one': 'ein', 'two': 'zwei'}}, - {'json': {'two': 'kiwi'}}] - } - 
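# --- Illustrative sketch (not part of the patch) ---------------------------------
# Streaming-insert sketch mirroring the TestPushRows fixture above (assumed client
# 'bq'; the dataset and table names are placeholders): push_rows() builds a
# bigquery#tableDataInsertAllRequest body, deriving each row's insertId from the
# insert_id_key field (rows lacking that field are sent without an insertId) and,
# with the default swallow_results, reduces the response to True or False.
rows = [{'one': 'uno', 'two': 'dos'},
        {'one': 'ein', 'two': 'zwei'},
        {'two': 'kiwi'}]
ok = bq.push_rows('dataset', 'table', rows, insert_id_key='one')
# ----------------------------------------------------------------------------------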
- def test_push_failed(self): - """Ensure that if insertAll does not raise an exception, but returns - insertion errors, False is returned. - """ - - self.mock_table_data.insertAll.return_value.execute.return_value = { - 'insertErrors': 'foo'} - - actual = self.client.push_rows(self.dataset, self.table, self.rows, - 'one') - - self.assertFalse(actual) - - self.mock_bq_service.tabledata.assert_called_once_with() - - self.mock_table_data.insertAll.assert_called_once_with( - projectId=self.project, datasetId=self.dataset, tableId=self.table, - body=self.data) - - execute_calls = [mock.call()] - self.mock_table_data.insertAll.return_value.execute.assert_has_calls( - execute_calls) - - def test_push_failed_swallow_results_false(self): - """ - Ensure that if insertAll returns insertion errors and swallow_results - is false that you get an empty dictionary. - """ - self.mock_table_data.insertAll.return_value.execute.return_value = { - 'insertErrors': 'foo'} - self.client.swallow_results = False - - actual = self.client.push_rows(self.dataset, self.table, self.rows, - 'one') - - self.client.swallow_results = True # Reset for other tests - - self.assertEqual( - actual, - self.mock_table_data.insertAll.return_value.execute.return_value) - - def test_push_exception(self): - """Ensure that if insertAll raises an exception, False is returned.""" - - e = HttpError(HttpResponse(404), 'There was an error'.encode('utf8')) - self.mock_table_data.insertAll.return_value.execute.side_effect = e - - actual = self.client.push_rows(self.dataset, self.table, self.rows, - 'one') - - self.assertFalse(actual) - - self.client.swallow_results = False - - actual = self.client.push_rows(self.dataset, self.table, self.rows, - 'one') - - self.assertEqual(actual, { - 'insertErrors': [{ - 'errors': [{ - 'reason': 'httperror', - 'message': e - }] - }]}) - - self.client.swallow_results = True - - self.mock_bq_service.tabledata.assert_called_with() - - self.mock_table_data.insertAll.assert_called_with( - projectId=self.project, datasetId=self.dataset, tableId=self.table, - body=self.data) - - execute_calls = [mock.call()] - self.mock_table_data.insertAll.return_value.execute.assert_has_calls( - execute_calls) - - def test_push_success(self): - """Ensure that if insertAll does not raise an exception, but returns - insertion errors, False is returned. - """ - - self.mock_table_data.insertAll.return_value.execute.return_value = { - 'status': 'foo'} - - actual = self.client.push_rows(self.dataset, self.table, self.rows, - 'one') - - self.assertTrue(actual) - - self.client.swallow_results = False - - actual = self.client.push_rows(self.dataset, self.table, self.rows, - 'one') - - self.assertEqual(actual, {'status': 'foo'}) - - self.client.swallow_results = True - - self.mock_bq_service.tabledata.assert_called_with() - - self.mock_table_data.insertAll.assert_called_with( - projectId=self.project, datasetId=self.dataset, tableId=self.table, - body=self.data) - - execute_calls = [mock.call()] - self.mock_table_data.insertAll.return_value.execute.assert_has_calls( - execute_calls) - - def test_request_data_with_options(self): - """Ensure that insertAll body has optional property only when - the optional parameter of push_rows passed. 
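# --- Illustrative sketch (not part of the patch) ---------------------------------
# Error-handling sketch for the push_rows tests above (assumed client 'bq',
# placeholder names): by default push_rows() swallows the response and returns False
# when insertAll reports 'insertErrors' or raises an HttpError; setting
# swallow_results = False returns the raw response (or a synthesized insertErrors
# entry for the HTTP failure) so the caller can inspect the rejected rows.
bq.swallow_results = False
response = bq.push_rows('dataset', 'table', [{'one': 'uno', 'two': 'dos'}],
                        insert_id_key='one')
for error in response.get('insertErrors', []):
    print('rejected row:', error)
bq.swallow_results = True
# ----------------------------------------------------------------------------------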
- """ - expected_body = self.data.copy() - - self.client.push_rows( - self.dataset, self.table, self.rows, - insert_id_key='one') - self.mock_table_data.insertAll.assert_called_with( - projectId=self.project, - datasetId=self.dataset, - tableId=self.table, - body=expected_body) - - self.client.push_rows( - self.dataset, self.table, self.rows, - insert_id_key='one', - ignore_unknown_values=False, - skip_invalid_rows=False) - expected_body['ignoreUnknownValues'] = False - expected_body['skipInvalidRows'] = False - self.mock_table_data.insertAll.assert_called_with( - projectId=self.project, - datasetId=self.dataset, - tableId=self.table, - body=expected_body) - - self.client.push_rows( - self.dataset, self.table, self.rows, - insert_id_key='one', - ignore_unknown_values=True, - skip_invalid_rows=True, - template_suffix='20160428' - ) - expected_body['ignoreUnknownValues'] = True - expected_body['skipInvalidRows'] = True - expected_body['templateSuffix'] = '20160428' - self.mock_table_data.insertAll.assert_called_with( - projectId=self.project, - datasetId=self.dataset, - tableId=self.table, - body=expected_body) - - def test_insert_id_key_with_nested_column(self): - """Ensure that dot separated insert_id_key properly extracted with nested column value.""" - rows = [ - {'nested': {'col': 'nested_col1'}, 'val': 1}, - {'nested': {'col': 'nested_col2'}, 'val': 2}, - ] - expected_body = self.data.copy() - expected_body['rows'] = [ - {'insertId': 'nested_col1', 'json': {'nested': {'col': 'nested_col1'}, 'val': 1}}, - {'insertId': 'nested_col2', 'json': {'nested': {'col': 'nested_col2'}, 'val': 2}}, - ] - - self.client.push_rows(self.dataset, self.table, rows, - insert_id_key='nested.col') - self.mock_table_data.insertAll.assert_called_with( - projectId=self.project, - datasetId=self.dataset, - tableId=self.table, - body=expected_body) - - expected_body = self.data.copy() - expected_body['rows'] = [ - {'insertId': 1, 'json': {'nested': {'col': 'nested_col1'}, 'val': 1}}, - {'insertId': 2, 'json': {'nested': {'col': 'nested_col2'}, 'val': 2}}, - ] - self.client.push_rows(self.dataset, self.table, rows, - insert_id_key='val') - self.mock_table_data.insertAll.assert_called_with( - projectId=self.project, - datasetId=self.dataset, - tableId=self.table, - body=expected_body) - - expected_body = self.data.copy() - expected_body['rows'] = [ - {'json': {'nested': {'col': 'nested_col1'}, 'val': 1}}, - {'json': {'nested': {'col': 'nested_col2'}, 'val': 2}}, - ] - self.client.push_rows(self.dataset, self.table, rows, - insert_id_key='no_such.column') - self.mock_table_data.insertAll.assert_called_with( - projectId=self.project, - datasetId=self.dataset, - tableId=self.table, - body=expected_body) - - -class TestGetAllTables(unittest.TestCase): - - def test_get_all_tables(self): - """Ensure get_all_tables fetches table names from BigQuery.""" - - mock_execute = mock.Mock() - mock_execute.execute.return_value = FULL_TABLE_LIST_RESPONSE - - mock_tables = mock.Mock() - mock_tables.list.return_value = mock_execute - - mock_bq_service = mock.Mock() - mock_bq_service.tables.return_value = mock_tables - - bq = client.BigQueryClient(mock_bq_service, 'project') - - expected_result = [ - '2013_05_appspot', '2013_06_appspot_1', '2013_06_appspot_2', - '2013_06_appspot_3', '2013_06_appspot_4', '2013_06_appspot_5', - 'appspot_6_2013_06', 'table_not_matching_naming' - ] - - tables = bq.get_all_tables('dataset') - self.assertEquals(expected_result, tables) - - def test_get_tables(self): - """Ensure _get_all_tables fetches 
table names from BigQuery.""" - - mock_execute = mock.Mock() - mock_execute.execute.return_value = FULL_TABLE_LIST_RESPONSE - - mock_tables = mock.Mock() - mock_tables.list.return_value = mock_execute - - mock_bq_service = mock.Mock() - mock_bq_service.tables.return_value = mock_tables - - bq = client.BigQueryClient(mock_bq_service, 'project') - - expected_result = { - 'appspot-3': {'2013_06_appspot_3': 1370044800}, - 'appspot-2': {'2013_06_appspot_2': 1370044800}, - 'appspot-1': {'2013_06_appspot_1': 1370044800}, - 'appspot-6': {'appspot_6_2013_06': 1370044800}, - 'appspot-5': {'2013_06_appspot_5': 1370044800}, - 'appspot-4': {'2013_06_appspot_4': 1370044800}, - 'appspot': {'2013_05_appspot': 1367366400} - } - - tables = bq._get_all_tables('dataset', cache=False) - self.assertEquals(expected_result, tables) - - def test_get_all_tables_with_page_token(self): - """Ensure get_all_tables fetches all tables names from BigQuery""" - - mock_execute = mock.Mock() - mock_execute.execute.side_effect = [NEXT_TABLE_LIST_RESPONSE, - FULL_TABLE_LIST_RESPONSE] - - mock_tables = mock.Mock() - mock_tables.list.return_value = mock_execute - - mock_bq_service = mock.Mock() - mock_bq_service.tables.return_value = mock_tables - - bq = client.BigQueryClient(mock_bq_service, 'project') - - expected_result = { - 'appspot-3': {'2013_06_appspot_3': 1370044800}, - 'appspot-2': {'2013_06_appspot_2': 1370044800}, - 'appspot-1': {'2013_06_appspot_1': 1370044800}, - 'appspot-6': {'appspot_6_2013_06': 1370044800}, - 'appspot-5': {'2013_06_appspot_5': 1370044800}, - 'appspot-4': {'2013_06_appspot_4': 1370044800}, - 'appspot': {'2013_05_appspot': 1367366400}, - 'appspot-10': {'2013_06_appspot_10': 1370044800}, - 'appspot-12': {'2013_06_appspot_12': 1370044800}, - 'appspot-11': {'2013_06_appspot_11': 1370044800}, - } - tables = bq._get_all_tables('dataset', cache=False) - self.assertEquals(expected_result, tables) - - def test_get_all_tables_with_cache(self): - """Ensure get_all_tables uses cache when fetching""" - mock_execute = mock.Mock() - mock_execute.execute.return_value = FULL_TABLE_LIST_RESPONSE - - mock_tables = mock.Mock() - mock_tables.list.return_value = mock_execute - - mock_bq_service = mock.Mock() - mock_bq_service.tables.return_value = mock_tables - - bq = client.BigQueryClient(mock_bq_service, 'project') - - expected_result = { - 'appspot-3': {'2013_06_appspot_3': 1370044800}, - 'appspot-2': {'2013_06_appspot_2': 1370044800}, - 'appspot-1': {'2013_06_appspot_1': 1370044800}, - 'appspot-6': {'appspot_6_2013_06': 1370044800}, - 'appspot-5': {'2013_06_appspot_5': 1370044800}, - 'appspot-4': {'2013_06_appspot_4': 1370044800}, - 'appspot': {'2013_05_appspot': 1367366400} - } - - tables = bq._get_all_tables('dataset', cache=True) - self.assertEquals(expected_result, tables) - - mock_execute.execute.side_effect = [NEXT_TABLE_LIST_RESPONSE, - FULL_TABLE_LIST_RESPONSE] - tables = bq._get_all_tables('dataset', cache=True) - self.assertEquals(expected_result, tables) - - expected_result = { - 'appspot-3': {'2013_06_appspot_3': 1370044800}, - 'appspot-2': {'2013_06_appspot_2': 1370044800}, - 'appspot-1': {'2013_06_appspot_1': 1370044800}, - 'appspot-6': {'appspot_6_2013_06': 1370044800}, - 'appspot-5': {'2013_06_appspot_5': 1370044800}, - 'appspot-4': {'2013_06_appspot_4': 1370044800}, - 'appspot': {'2013_05_appspot': 1367366400}, - 'appspot-10': {'2013_06_appspot_10': 1370044800}, - 'appspot-12': {'2013_06_appspot_12': 1370044800}, - 'appspot-11': {'2013_06_appspot_11': 1370044800}, - } - tables = 
bq._get_all_tables('dataset', cache=False) - self.assertEquals(expected_result, tables) - - -class TestGetTables(unittest.TestCase): - - def test_get_tables(self): - """Ensure tables falling in the time window are returned.""" - - mock_execute = mock.Mock() - mock_execute.execute.return_value = FULL_TABLE_LIST_RESPONSE - - mock_tables = mock.Mock() - mock_tables.list.return_value = mock_execute - - mock_bq_service = mock.Mock() - mock_bq_service.tables.return_value = mock_tables - - bq = client.BigQueryClient(mock_bq_service, 'project') - - tables = bq.get_tables('dataset', 'appspot-1', 0, 10000000000) - six.assertCountEqual(self, tables, ['2013_06_appspot_1']) - - def test_get_tables_from_datetimes(self): - """Ensure tables falling in the time window, specified with datetimes, - are returned. - """ - from datetime import datetime - - mock_execute = mock.Mock() - mock_execute.execute.return_value = FULL_TABLE_LIST_RESPONSE - - mock_tables = mock.Mock() - mock_tables.list.return_value = mock_execute - - mock_bq_service = mock.Mock() - mock_bq_service.tables.return_value = mock_tables - - bq = client.BigQueryClient(mock_bq_service, 'project') - - start = datetime(2013, 5, 10) - end = datetime(2013, 7, 10) - - tables = bq.get_tables('dataset', 'appspot-1', start, end) - six.assertCountEqual(self, tables, ['2013_06_appspot_1']) - - -# -# Dataset tests -# -class TestCreateDataset(unittest.TestCase): - - def setUp(self): - self.mock_bq_service = mock.Mock() - self.mock_datasets = mock.Mock() - self.mock_bq_service.datasets.return_value = self.mock_datasets - self.dataset = 'dataset' - self.project = 'project' - self.client = client.BigQueryClient(self.mock_bq_service, self.project) - self.friendly_name = "friendly name" - self.description = "description" - self.access = [{'userByEmail': "bob@gmail.com"}] - self.body = { - 'datasetReference': { - 'datasetId': self.dataset, - 'projectId': self.project}, - 'friendlyName': self.friendly_name, - 'description': self.description, - 'access': self.access - } - - def test_dataset_create_failed(self): - """Ensure that if creating the table fails, False is returned.""" - - self.mock_datasets.insert.return_value.execute.side_effect = \ - HttpError(HttpResponse(404), 'There was an error'.encode('utf8')) - - actual = self.client.create_dataset(self.dataset, - friendly_name=self.friendly_name, - description=self.description, - access=self.access) - self.assertFalse(actual) - - self.client.swallow_results = False - - actual = self.client.create_dataset(self.dataset, - friendly_name=self.friendly_name, - description=self.description, - access=self.access) - - self.assertEqual(actual, {}) - - self.client.swallow_results = True - - self.mock_datasets.insert.assert_called_with( - projectId=self.project, body=self.body) - - self.mock_datasets.insert.return_value.execute. 
\ - assert_called_with() - - def test_dataset_create_success(self): - """Ensure that if creating the table fails, False is returned.""" - - self.mock_datasets.insert.return_value.execute.side_effect = [{ - 'status': 'foo'}, {'status': 'bar'}] - - actual = self.client.create_dataset(self.dataset, - self.friendly_name, - self.description, - self.access) - self.assertTrue(actual) - - self.client.swallow_results = False - - actual = self.client.create_dataset(self.dataset, - self.friendly_name, - self.description, - self.access) - - self.assertEqual(actual, {'status': 'bar'}) - - self.client.swallow_results = True - - self.mock_datasets.insert.assert_called_with( - projectId=self.project, body=self.body) - - self.mock_datasets.insert.return_value.execute. \ - assert_called_with() - - -class TestDeleteDataset(unittest.TestCase): - - def setUp(self): - self.mock_bq_service = mock.Mock() - self.mock_datasets = mock.Mock() - self.mock_bq_service.datasets.return_value = self.mock_datasets - self.project = 'project' - self.dataset = 'dataset' - self.client = client.BigQueryClient(self.mock_bq_service, self.project) - - def test_delete_datasets_fail(self): - """Ensure that if deleting table fails, False is returned.""" - - self.mock_datasets.delete.return_value.execute.side_effect = \ - HttpError(HttpResponse(404), 'There was an error'.encode('utf8')) - - actual = self.client.delete_dataset(self.dataset) - - self.assertFalse(actual) - - self.mock_datasets.delete.assert_called_once_with( - projectId=self.project, datasetId=self.dataset, - deleteContents=False) - - self.client.swallow_results = False - - actual = self.client.delete_dataset(self.dataset) - - self.assertEqual(actual, {}) - - self.client.swallow_results = True - - self.mock_datasets.delete.return_value.execute. \ - assert_called_with() - - def test_delete_datasets_success(self): - """Ensure that if deleting table succeeds, True is returned.""" - - self.mock_datasets.delete.return_value.execute.side_effect = [{ - 'status': 'foo'}, {'status': 'bar'}] - - actual = self.client.delete_dataset(self.dataset) - - self.assertTrue(actual) - - self.client.swallow_results = False - - actual = self.client.delete_dataset(self.dataset) - - self.assertEqual(actual, {'status': 'bar'}) - - self.client.swallow_results = True - - self.mock_datasets.delete.assert_called_with( - projectId=self.project, datasetId=self.dataset, - deleteContents=False) - - self.mock_datasets.delete.return_value.execute. \ - assert_called_with() - - def test_delete_datasets_delete_contents_success(self): - """Ensure that if deleting table succeeds, True is returned.""" - - self.mock_datasets.delete.return_value.execute.side_effect = [{ - 'status': 'foo'}, {'status': 'bar'}] - - actual = self.client.delete_dataset(self.dataset, True) - - self.assertTrue(actual) - - self.client.swallow_results = False - - actual = self.client.delete_dataset(self.dataset, True) - - self.assertEqual(actual, {'status': 'bar'}) - - self.client.swallow_results = True - - self.mock_datasets.delete.assert_called_with( - projectId=self.project, datasetId=self.dataset, - deleteContents=True) - - self.mock_datasets.delete.return_value.execute. 
\ - assert_called_with() - - -FULL_DATASET_LIST_RESPONSE = { - "kind": "bigquery#dataseteList", - "etag": "\"GSclnjk0zID1ucM3F-xYinOm1oE/cn58Rpu8v8pB4eoJQaiTe11lPQc\"", - "datasets": [ - { - "kind": "bigquery#dataset", - "id": "project:dataset1", - "datasetReference": { - "projectId": "project", - "datasetId": "dataset1" - } - }, - { - "kind": "bigquery#dataset", - "id": "project:dataset2", - "datasetReference": { - "projectId": "project", - "datasetId": "dataset2", - } - }, - { - "kind": "bigquery#dataset", - "id": "project:dataset3", - "datasetReference": { - "projectId": "project", - "datasetId": "dataset3" - } - }, - { - "kind": "bigquery#dataset", - "id": "project:dataset4", - "datasetReference": { - "projectId": "project", - "datasetId": "dataset4" - } - }, - { - "kind": "bigquery#dataset", - "id": "project:dataset5", - "datasetReference": { - "projectId": "project", - "datasetId": "dataset5" - } - }, - { - "kind": "bigquery#dataset", - "id": "project:dataset6", - "datasetReference": { - "projectId": "project", - "datasetId": "dataset6" - } - }, - { - "kind": "bigquery#dataset", - "id": "project:dataset7", - "datasetReference": { - "projectId": "project", - "datasetId": "dataset7" - } - }, - { - "kind": "bigquery#dataset", - "id": "bad dataset data" - } - ], - "totalItems": 8 -} - - -class TestGetDatasets(unittest.TestCase): - - def test_get_datasets(self): - """Ensure datasets are returned.""" - - mock_execute = mock.Mock() - mock_execute.execute.return_value = FULL_DATASET_LIST_RESPONSE - - mock_datasets = mock.Mock() - mock_datasets.list.return_value = mock_execute - - mock_bq_service = mock.Mock() - mock_bq_service.datasets.return_value = mock_datasets - - bq = client.BigQueryClient(mock_bq_service, 'project') - - datasets = bq.get_datasets() - six.assertCountEqual(self, datasets, - FULL_DATASET_LIST_RESPONSE['datasets']) - - def test_get_datasets_returns_no_list(self): - """Ensure we handle the no datasets case""" - mock_execute = mock.Mock() - mock_execute.execute.return_value = { - "kind": "bigquery#dataseteList", - "etag": "\"GSclnjk0zID1ucM3F-xYinOm1oE/cn58Rpu8v8pB4eoJQaiTe11lP\"" - } - - mock_datasets = mock.Mock() - mock_datasets.list.return_value = mock_execute - - mock_bq_service = mock.Mock() - mock_bq_service.datasets.return_value = mock_datasets - - bq = client.BigQueryClient(mock_bq_service, 'project') - - datasets = bq.get_datasets() - six.assertCountEqual(self, datasets, []) - - -class TestUpdateDataset(unittest.TestCase): - - def setUp(self): - self.mock_bq_service = mock.Mock() - self.mock_datasets = mock.Mock() - self.mock_bq_service.datasets.return_value = self.mock_datasets - self.dataset = 'dataset' - self.project = 'project' - self.client = client.BigQueryClient(self.mock_bq_service, self.project) - self.friendly_name = "friendly name" - self.description = "description" - self.access = [{'userByEmail': "bob@gmail.com"}] - self.body = { - 'datasetReference': { - 'datasetId': self.dataset, - 'projectId': self.project}, - 'friendlyName': self.friendly_name, - 'description': self.description, - 'access': self.access - } - - def test_dataset_update_failed(self): - """Ensure that if creating the table fails, False is returned.""" - - self.mock_datasets.update.return_value.execute.side_effect = \ - HttpError(HttpResponse(404), 'There was an error'.encode('utf8')) - - actual = self.client.update_dataset(self.dataset, - friendly_name=self.friendly_name, - description=self.description, - access=self.access) - self.assertFalse(actual) - - self.client.swallow_results = 
False - - actual = self.client.update_dataset(self.dataset, - friendly_name=self.friendly_name, - description=self.description, - access=self.access) - - self.assertEqual(actual, {}) - - self.client.swallow_results = True - - self.mock_datasets.update.assert_called_with( - projectId=self.project, datasetId=self.dataset, body=self.body) - - self.mock_datasets.update.return_value.execute. \ - assert_called_with() - - def test_dataset_update_success(self): - """Ensure that if creating the table fails, False is returned.""" - - self.mock_datasets.update.return_value.execute.side_effect = [{ - 'status': 'foo'}, {'status': 'bar'}] - - actual = self.client.update_dataset(self.dataset, - self.friendly_name, - self.description, - self.access) - self.assertTrue(actual) - - self.client.swallow_results = False - - actual = self.client.update_dataset(self.dataset, - self.friendly_name, - self.description, - self.access) - - self.assertEqual(actual, {'status': 'bar'}) - - self.client.swallow_results = True - - self.mock_datasets.update.assert_called_with( - projectId=self.project, datasetId=self.dataset, body=self.body) - - self.mock_datasets.update.return_value.execute. \ - assert_called_with() diff --git a/bigquery/tests/test_schema_builder.py.bak b/bigquery/tests/test_schema_builder.py.bak deleted file mode 100644 index 060162b..0000000 --- a/bigquery/tests/test_schema_builder.py.bak +++ /dev/null @@ -1,140 +0,0 @@ -from six.moves.builtins import object -from datetime import datetime -import unittest - -import six -from bigquery.schema_builder import schema_from_record -from bigquery.schema_builder import describe_field -from bigquery.schema_builder import bigquery_type -from bigquery.schema_builder import InvalidTypeException - - -class TestBigQueryTypes(unittest.TestCase): - - def test_str_is_string(self): - six.assertCountEqual(self, bigquery_type("Bob"), 'string') - - def test_unicode_is_string(self): - six.assertCountEqual(self, bigquery_type(u"Here is a happy face \u263A"), - 'string') - - def test_int_is_integer(self): - six.assertCountEqual(self, bigquery_type(123), 'integer') - - def test_datetime_is_timestamp(self): - six.assertCountEqual(self, bigquery_type(datetime.now()), 'timestamp') - - def test_isoformat_timestring(self): - six.assertCountEqual(self, bigquery_type(datetime.now().isoformat()), - 'timestamp') - - def test_timestring_feb_20_1973(self): - six.assertCountEqual(self, bigquery_type("February 20th 1973"), - 'timestamp') - - def test_timestring_thu_1_july_2004_22_30_00(self): - six.assertCountEqual(self, bigquery_type("Thu, 1 July 2004 22:30:00"), - 'timestamp') - - def test_today_is_not_timestring(self): - six.assertCountEqual(self, bigquery_type("today"), 'string') - - def test_timestring_next_thursday(self): - six.assertCountEqual(self, bigquery_type("February 20th 1973"), 'timestamp') - - def test_timestring_arbitrary_fn_success(self): - six.assertCountEqual( - self, bigquery_type("whatever", timestamp_parser=lambda x: True), - 'timestamp') - - def test_timestring_arbitrary_fn_fail(self): - six.assertCountEqual( - self, bigquery_type("February 20th 1973", - timestamp_parser=lambda x: False), - 'string') - - def test_class_instance_is_invalid_type(self): - class SomeClass(object): - pass - - self.assertIsNone(bigquery_type(SomeClass())) - - def test_list_is_invalid_type(self): - self.assertIsNone(bigquery_type([1, 2, 3])) - - def test_dict_is_record(self): - six.assertCountEqual(self, bigquery_type({"a": 1}), 'record') - - -class TestFieldDescription(unittest.TestCase): - - 
def test_simple_string_field(self): - six.assertCountEqual(self, describe_field("user", "Bob"), - {"name": "user", "type": "string", "mode": - "nullable"}) - - -class TestSchemaGenerator(unittest.TestCase): - - def test_simple_record(self): - record = {"username": "Bob", "id": 123} - schema = [{"name": "username", "type": "string", "mode": "nullable"}, - {"name": "id", "type": "integer", "mode": "nullable"}] - - six.assertCountEqual(self, schema_from_record(record), schema) - - def test_hierarchical_record(self): - record = {"user": {"username": "Bob", "id": 123}} - schema = [{"name": "user", "type": "record", "mode": "nullable", - "fields": [{"name": "username", "type": "string", "mode": - "nullable"}, {"name": "id", "type": "integer", - "mode": "nullable"}]}] - generated_schema = schema_from_record(record) - schema_fields = schema[0].pop('fields') - generated_fields = generated_schema[0].pop('fields') - six.assertCountEqual(self, schema_fields, generated_fields) - six.assertCountEqual(self, generated_schema, schema) - - def test_hierarchical_record_with_timestamps(self): - record = {"global": "2001-01-01", "user": {"local": "2001-01-01"}} - - schema_with_ts = [ - {"name": "global", "type": "timestamp", "mode": "nullable"}, - {"name": "user", "type": "record", "mode": "nullable", - "fields": [{ - "name": "local", - "type": "timestamp", - "mode": "nullable"}]}] - - schema_without_ts = [ - {"name": "global", "type": "string", "mode": "nullable"}, - {"name": "user", "type": "record", "mode": "nullable", - "fields": [{ - "name": "local", - "type": "string", - "mode": "nullable"}]}] - - six.assertCountEqual(self, schema_from_record(record), schema_with_ts) - - six.assertCountEqual( - self, schema_from_record(record, timestamp_parser=lambda x: False), - schema_without_ts) - - def test_repeated_field(self): - record = {"ids": [1, 2, 3, 4, 5]} - schema = [{"name": "ids", "type": "integer", "mode": "repeated"}] - - six.assertCountEqual(self, schema_from_record(record), schema) - - def test_nested_invalid_type_reported_correctly(self): - key = "wrong answer" - value = "wrong answer" - - try: - schema_from_record({"a": {"b": [{"c": None}]}}) - except InvalidTypeException as e: - key = e.key - value = e.value - - self.assertEqual(key, "a.b.c") - self.assertEqual(value, None) From 09dc78a7b348f1867faaa0fbee41645d63988484 Mon Sep 17 00:00:00 2001 From: Python3pkg Date: Thu, 18 May 2017 10:21:30 -0700 Subject: [PATCH 3/6] stability update --- bigquery/__init__.py | 2 +- bigquery/schema_builder.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bigquery/__init__.py b/bigquery/__init__.py index beb89bb..b393875 100644 --- a/bigquery/__init__.py +++ b/bigquery/__init__.py @@ -1,4 +1,4 @@ - +from __future__ import absolute_import from .version import __version__ diff --git a/bigquery/schema_builder.py b/bigquery/schema_builder.py index dafda39..65027b8 100644 --- a/bigquery/schema_builder.py +++ b/bigquery/schema_builder.py @@ -1,4 +1,4 @@ - +from __future__ import absolute_import __author__ = 'Aneil Mallavarapu (http://github.com/aneilbaboo)' from datetime import datetime From b8b398b08f66526ff1afcc7d8b50989f3589048e Mon Sep 17 00:00:00 2001 From: Python3pkg Date: Thu, 18 May 2017 10:24:05 -0700 Subject: [PATCH 4/6] Hopefully this shall pass python2 and python3 --- bigquery/client.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bigquery/client.py b/bigquery/client.py index 7e5d6f0..bd4a700 100644 --- a/bigquery/client.py +++ b/bigquery/client.py @@ -15,6 +15,9 
@@ from googleapiclient.errors import HttpError from httplib2 import Http +if sys.version_info >= (3, 0): + basestring = str + BIGQUERY_SCOPE = [ 'https://www.googleapis.com/auth/bigquery' ] @@ -124,7 +127,7 @@ def get_client(project_id=None, credentials=None, if private_key: try: - if isinstance(private_key, str): + if isinstance(private_key, basestring): private_key = private_key.decode('utf-8') except NameError: # python3 -- private_key is already unicode From e5a3f0ad0508d88ff81356d767c78148f417b2df Mon Sep 17 00:00:00 2001 From: Python3pkg Date: Thu, 18 May 2017 10:26:42 -0700 Subject: [PATCH 5/6] Update travis-ci --- .travis.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9f422c6..2895a3f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,10 @@ language: python + +python: + - "2.7" + - "3.6" + - "3.7-dev" # 3.7 development branch + install: - python setup.py develop - pip install tox @@ -7,7 +13,5 @@ notifications: email: false env: - TOXENV=py27 - - TOXENV=py33 - - TOXENV=py34 - TOXENV=nightly - TOXENV=pypy From 58751d22cde2fbee499977241993217ee62396e4 Mon Sep 17 00:00:00 2001 From: Python3pkg Date: Thu, 18 May 2017 10:29:43 -0700 Subject: [PATCH 6/6] Turn off tox --- .travis.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 2895a3f..8d17484 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,11 +7,5 @@ python: install: - python setup.py develop - - pip install tox -script: tox -e $TOXENV notifications: email: false -env: - - TOXENV=py27 - - TOXENV=nightly - - TOXENV=pypy
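PATCH 4/6 restores the basestring check by aliasing basestring to str whenever sys.version_info reports Python 3. Only lines 15 onward of client.py are visible in that hunk, so the diff alone does not show whether sys is imported at the top of the module; the sketch below is a self-contained version of the same shim, and the coerce_private_key helper is purely illustrative, not a function in this library.

    import sys

    # Python 3 has no basestring builtin; alias it to str so the same
    # isinstance() check can run unchanged on both interpreter lines.
    if sys.version_info >= (3, 0):
        basestring = str


    def coerce_private_key(private_key):
        """Illustrative helper: return the key as text on Python 2 and 3."""
        if isinstance(private_key, bytes):
            # Covers Python 3 bytes and Python 2 str (str is bytes on 2.x).
            return private_key.decode('utf-8')
        if isinstance(private_key, basestring):
            # Already text: Python 3 str or Python 2 unicode.
            return private_key
        raise TypeError('private_key must be bytes or text')

Note that with the alias in place, a Python 3 text key passes the isinstance(private_key, basestring) check and then hits str.decode(), which raises AttributeError rather than the NameError that the surrounding try/except in the hunk anticipates; decoding only bytes, as in the sketch, sidesteps that guard entirely.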
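The test file removed earlier in this series (bigquery/tests/test_client.py.bak) pins down how push_rows is expected to build the tabledata().insertAll payload: a dot-separated insert_id_key such as 'nested.col' is resolved against nested record values, rows whose key path is missing are sent without an insertId, and the optional ignoreUnknownValues, skipInvalidRows and templateSuffix properties appear in the body only when the corresponding arguments are passed. The following is a rough reconstruction of that body-building step, not the library's actual implementation; build_insert_all_body and resolve are illustrative names.

    from functools import reduce


    def build_insert_all_body(rows, insert_id_key=None,
                              ignore_unknown_values=None,
                              skip_invalid_rows=None,
                              template_suffix=None):
        """Sketch of the insertAll request body the deleted tests expect."""
        def resolve(row, dotted_key):
            # Walk a 'nested.col' style key through nested dicts; return None
            # if any path segment is missing.
            return reduce(
                lambda value, key: value.get(key) if isinstance(value, dict)
                else None,
                dotted_key.split('.'), row)

        body = {'rows': []}
        for row in rows:
            entry = {'json': row}
            if insert_id_key is not None:
                insert_id = resolve(row, insert_id_key)
                if insert_id is not None:
                    entry['insertId'] = insert_id
            body['rows'].append(entry)

        # Optional properties are added only when explicitly requested,
        # mirroring test_request_data_with_options.
        if ignore_unknown_values is not None:
            body['ignoreUnknownValues'] = ignore_unknown_values
        if skip_invalid_rows is not None:
            body['skipInvalidRows'] = skip_invalid_rows
        if template_suffix is not None:
            body['templateSuffix'] = template_suffix
        return body


    # Mirrors test_insert_id_key_with_nested_column:
    rows = [{'nested': {'col': 'nested_col1'}, 'val': 1},
            {'nested': {'col': 'nested_col2'}, 'val': 2}]
    assert build_insert_all_body(rows, 'nested.col')['rows'][0]['insertId'] == \
        'nested_col1'
    assert 'insertId' not in \
        build_insert_all_body(rows, 'no_such.column')['rows'][0]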
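The deleted TestGetTables cases likewise fix the expected time-window behaviour of get_tables: the table 2013_06_appspot_1, listed with timestamp 1370044800 (2013-06-01 UTC), is returned both for the raw epoch range 0..10000000000 and for the datetime range datetime(2013, 5, 10)..datetime(2013, 7, 10). A simplified sketch of that filtering step follows; it assumes datetimes are converted to epoch seconds before comparison, and filter_tables_by_time is an illustrative stand-in rather than the client's private helper.

    import calendar
    from datetime import datetime


    def to_epoch_seconds(value):
        # Accept either epoch seconds or a naive UTC datetime (assumption).
        if isinstance(value, datetime):
            return calendar.timegm(value.timetuple())
        return value


    def filter_tables_by_time(tables, start_time, end_time):
        """Keep table names whose timestamp lies inside [start, end]."""
        start = to_epoch_seconds(start_time)
        end = to_epoch_seconds(end_time)
        return [name for name, unix_seconds in tables.items()
                if start <= unix_seconds <= end]


    # Mirrors the TestGetTables expectations for the 'appspot-1' tables:
    tables = {'2013_06_appspot_1': 1370044800}
    assert filter_tables_by_time(tables, 0, 10000000000) == \
        ['2013_06_appspot_1']
    assert filter_tables_by_time(tables, datetime(2013, 5, 10),
                                 datetime(2013, 7, 10)) == \
        ['2013_06_appspot_1']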