
Commit 21f49da

DOC: Update "writing" docs with information about schema inference. (#259)
* DOC: Update "writing" docs with information about schema inference.

  This commit started as a clean-up change to remove the unnecessary
  pandas_gbq.gbq._update_bq_schema method, but I then also updated the docs
  for to_gbq to be clearer about how the table_schema argument is to be used.
  I added a section about the table_schema parameter to the writing.rst
  how-to guide as well.

  Some of the "notes" in writing.rst were better as their own subsections. I
  moved the note about not using BigQuery as a transactional database to the
  landing page. I link to the BigQuery sandbox docs in the warning about
  creating a BigQuery account, because you can follow those instructions to
  use BigQuery without entering credit card information.

* Blacken
1 parent 547812e commit 21f49da


7 files changed, +115 -66 lines changed


docs/source/conf.py

Lines changed: 8 additions & 3 deletions
@@ -16,9 +16,12 @@
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 #
+import datetime
 import os
 import sys
 
+import pandas_gbq
+
 # sys.path.insert(0, os.path.abspath('.'))
 
 # -- General configuration ------------------------------------------------
@@ -62,17 +65,19 @@
 
 # General information about the project.
 project = u"pandas-gbq"
-copyright = u"2017, PyData Development Team"
+copyright = u"2017-{}, PyData Development Team".format(
+    datetime.datetime.now().year
+)
 author = u"PyData Development Team"
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
 # built documents.
 #
 # The short X.Y version.
-version = u"0.1.0"
+version = pandas_gbq.__version__
 # The full version, including alpha/beta/rc tags.
-release = u"0.1.0"
+release = version
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

docs/source/index.rst

Lines changed: 15 additions & 8 deletions
@@ -8,16 +8,24 @@ Welcome to pandas-gbq's documentation!
 
 The :mod:`pandas_gbq` module provides a wrapper for Google's BigQuery
 analytics web service to simplify retrieving results from BigQuery tables
-using SQL-like queries. Result sets are parsed into a pandas
-DataFrame with a shape and data types derived from the source table.
-Additionally, DataFrames can be inserted into new BigQuery tables or appended
-to existing tables.
+using SQL-like queries. Result sets are parsed into a :class:`pandas.DataFrame`
+with a shape and data types derived from the source table. Additionally,
+DataFrames can be inserted into new BigQuery tables or appended to existing
+tables.
 
 .. warning::
 
-   To use this module, you will need a valid BigQuery account. Refer to the
-   `BigQuery Documentation <https://cloud.google.com/bigquery/what-is-bigquery>`__
-   for details on the service itself.
+   To use this module, you will need a valid BigQuery account. Use the
+   `BigQuery sandbox <https://cloud.google.com/bigquery/docs/sandbox>`__ to
+   try the service for free.
+
+   While BigQuery uses standard SQL syntax, it has some important differences
+   from traditional databases both in functionality, API limitations (size and
+   quantity of queries or uploads), and how Google charges for use of the
+   service. BiqQuery is best for analyzing large sets of data quickly. It is not
+   a direct replacement for a transactional database. Refer to the `BigQuery
+   Documentation <https://cloud.google.com/bigquery/what-is-bigquery>`__ for
+   details on the service itself.
 
 Contents:
 
@@ -29,7 +37,6 @@ Contents:
    howto/authentication.rst
    reading.rst
    writing.rst
-   tables.rst
    api.rst
    contributing.rst
    changelog.rst

docs/source/tables.rst

Lines changed: 0 additions & 16 deletions
This file was deleted.

docs/source/writing.rst

Lines changed: 46 additions & 24 deletions
@@ -3,8 +3,8 @@
 Writing DataFrames
 ==================
 
-Assume we want to write a DataFrame ``df`` into a BigQuery table using
-:func:`~pandas_gbq.to_gbq`.
+Assume we want to write a :class:`~pandas.DataFrame` named ``df`` into a
+BigQuery table using :func:`~pandas_gbq.to_gbq`.
 
 .. ipython:: python
 
@@ -21,40 +21,62 @@ Assume we want to write a DataFrame ``df`` into a BigQuery table using
 
 .. code-block:: python
 
-   to_gbq(df, 'my_dataset.my_table', projectid)
+   import pandas_gbq
+   pandas_gbq.to_gbq(df, 'my_dataset.my_table', project_id=projectid)
 
-.. note::
+The destination table and destination dataset will automatically be created
+if they do not already exist.
 
-   The destination table and destination dataset will automatically be created if they do not already exist.
 
-   The ``if_exists`` argument can be used to dictate whether to ``'fail'``, ``'replace'``
-   or ``'append'`` if the destination table already exists. The default value is ``'fail'``.
+Writing to an Existing Table
+----------------------------
+
+Use the ``if_exists`` argument to dictate whether to ``'fail'``,
+``'replace'`` or ``'append'`` if the destination table already exists. The
+default value is ``'fail'``.
 
 For example, assume that ``if_exists`` is set to ``'fail'``. The following snippet will raise
 a ``TableCreationError`` if the destination table already exists.
 
 .. code-block:: python
 
-   to_gbq(df, 'my_dataset.my_table', projectid, if_exists='fail')
+   import pandas_gbq
+   pandas_gbq.to_gbq(
+       df, 'my_dataset.my_table', project_id=projectid, if_exists='fail',
+   )
+
+If the ``if_exists`` argument is set to ``'append'``, the destination
+dataframe will be written to the table using the defined table schema and
+column types. The dataframe must contain fields (matching name and type)
+currently in the destination table.
+
+
+.. _writing-schema:
+
+Inferring the Table Schema
+--------------------------
 
-.. note::
+The :func:`~pandas_gbq.to_gbq` method infers the BigQuery table schema based
+on the dtypes of the uploaded :class:`~pandas.DataFrame`.
 
-   If the ``if_exists`` argument is set to ``'append'``, the destination
-   dataframe will be written to the table using the defined table schema and
-   column types. The dataframe must contain fields (matching name and type)
-   currently in the destination table.
+========================= ==================
+dtype                     BigQuery Data Type
+========================= ==================
+i (integer)               INTEGER
+b (boolean)               BOOLEAN
+f (float)                 FLOAT
+O (object)                STRING
+S (zero-terminated bytes) STRING
+U (Unicode string)        STRING
+M (datetime)              TIMESTAMP
+========================= ==================
 
-.. note::
+If the data type inference does not suit your needs, supply a BigQuery schema
+as the ``table_schema`` parameter of :func:`~pandas_gbq.to_gbq`.
 
-   If an error occurs while streaming data to BigQuery, see
-   `Troubleshooting BigQuery Errors <https://cloud.google.com/bigquery/troubleshooting-errors>`__.
 
-.. note::
+Troubleshooting Errors
+----------------------
 
-   While BigQuery uses SQL-like syntax, it has some important differences
-   from traditional databases both in functionality, API limitations (size
-   and quantity of queries or uploads), and how Google charges for use of the
-   service. You should refer to `Google BigQuery documentation
-   <https://cloud.google.com/bigquery/docs>`__ often as the service is always
-   evolving. BiqQuery is best for analyzing large sets of data quickly, but
-   it is not a direct replacement for a transactional database.
+If an error occurs while writing data to BigQuery, see
+`Troubleshooting BigQuery Errors <https://cloud.google.com/bigquery/troubleshooting-errors>`__.
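
For readers following the new "Inferring the Table Schema" section above, a minimal sketch of the ``table_schema`` parameter in practice might look like the following. The column names, dataset, table, and project id are hypothetical; only one column is overridden, and the rest are inferred from the DataFrame dtypes as described in the table above.

    import pandas
    import pandas_gbq

    df = pandas.DataFrame(
        {
            "name": ["alpha", "beta"],                    # object dtype -> STRING (inferred)
            "signup_date": ["2018-01-01", "2018-06-01"],  # object dtype -> STRING by default
        }
    )

    # Override only "signup_date"; the "name" column keeps its inferred type.
    pandas_gbq.to_gbq(
        df,
        "my_dataset.my_table",      # hypothetical destination
        project_id="my-project",    # hypothetical project id
        table_schema=[{"name": "signup_date", "type": "DATE"}],
    )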

noxfile.py

Lines changed: 23 additions & 0 deletions
@@ -5,6 +5,7 @@
 
 import os
 import os.path
+import shutil
 
 import nox
 
@@ -51,6 +52,28 @@ def cover(session, python=latest_python):
     session.run("coverage", "erase")
 
 
+@nox.session(python=latest_python)
+def docs(session):
+    """Build the docs."""
+
+    session.install("-r", os.path.join("docs", "requirements-docs.txt"))
+    session.install("-e", ".")
+
+    shutil.rmtree(os.path.join("docs", "source", "_build"), ignore_errors=True)
+    session.run(
+        "sphinx-build",
+        "-W",  # warnings as errors
+        "-T",  # show full traceback on exception
+        "-N",  # no colors
+        "-b",
+        "html",
+        "-d",
+        os.path.join("docs", "source", "_build", "doctrees", ""),
+        os.path.join("docs", "source", ""),
+        os.path.join("docs", "source", "_build", "html", ""),
+    )
+
+
 @nox.session(python=supported_pythons)
 def system(session):
     session.install("pytest", "pytest-cov")
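
Assuming nox is installed in the development environment, the new session above can be run by name (for example ``nox --session docs``) to build the HTML documentation into docs/source/_build/html.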

pandas_gbq/gbq.py

Lines changed: 21 additions & 15 deletions
@@ -937,14 +937,18 @@ def to_gbq(
         List of BigQuery table fields to which according DataFrame
         columns conform to, e.g. ``[{'name': 'col1', 'type':
         'STRING'},...]``.
-        If schema is not provided, it will be
-        generated according to dtypes of DataFrame columns.
-        If schema is provided, it may contain all or a subset of DataFrame
-        columns. If a subset is provided, the rest will be inferred from
-        the DataFrame dtypes.
-        pandas_gbq.gbq._generate_bq_schema() may be used to create an
-        initial schema, though it doesn't preserve column order.
-        See BigQuery API documentation on available names of a field.
+
+        - If ``table_schema`` is provided, it may contain all or a subset of
+          DataFrame columns. If a subset is provided, the rest will be
+          inferred from the DataFrame dtypes.
+        - If ``table_schema`` is **not** provided, it will be
+          generated according to dtypes of DataFrame columns. See
+          `Inferring the Table Schema
+          <https://pandas-gbq.readthedocs.io/en/latest/writing.html#writing-schema>`__.
+          for a description of the schema inference.
+
+        See `BigQuery API documentation on valid column names
+        <https://cloud.google.com/bigquery/docs/schemas#column_names`>__.
 
         .. versionadded:: 0.3.1
     location : str, optional
@@ -985,6 +989,7 @@
     """
 
     _test_google_api_imports()
+    from pandas_gbq import schema
 
     if verbose is not None and SHOW_VERBOSE_DEPRECATION:
         warnings.warn(
@@ -1029,7 +1034,7 @@
     if not table_schema:
         table_schema = default_schema
     else:
-        table_schema = _update_bq_schema(
+        table_schema = schema.update_schema(
             default_schema, dict(fields=table_schema)
         )
 
@@ -1091,15 +1096,16 @@ def generate_bq_schema(df, default_type="STRING"):
 
 
 def _generate_bq_schema(df, default_type="STRING"):
-    from pandas_gbq import schema
+    """DEPRECATED: Given a dataframe, generate a Google BigQuery schema.
 
-    return schema.generate_bq_schema(df, default_type=default_type)
-
-
-def _update_bq_schema(schema_old, schema_new):
+    This is a private method, but was used in external code to work around
+    issues in the default schema generation. Now that individual columns can
+    be overridden: https://github.com/pydata/pandas-gbq/issues/218, this
+    method can be removed after there is time to migrate away from this
+    method. """
     from pandas_gbq import schema
 
-    return schema.update_schema(schema_old, schema_new)
+    return schema.generate_bq_schema(df, default_type=default_type)
 
 
 class _Table(GbqConnector):
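
To make the refactored schema-merging path above concrete, here is a rough sketch of how a partial ``table_schema`` is combined with the inferred default schema. It uses the same ``pandas_gbq.schema`` helpers that ``to_gbq`` now calls; the dictionary shapes shown in the comments are assumptions based on the mapping in pandas_gbq/schema.py, not verified output.

    import pandas
    from pandas_gbq import schema

    df = pandas.DataFrame({"id": [1, 2], "payload": ["a", "b"]})

    # Inferred from dtypes: int64 -> INTEGER, object -> STRING.
    default_schema = schema.generate_bq_schema(df)
    # Assumed shape: {'fields': [{'name': 'id', 'type': 'INTEGER'},
    #                            {'name': 'payload', 'type': 'STRING'}]}

    # A user-supplied subset overrides only the matching column, mirroring the
    # to_gbq() branch that calls schema.update_schema above.
    user_schema = [{"name": "payload", "type": "BYTES"}]
    merged = schema.update_schema(default_schema, dict(fields=user_schema))
    print(merged)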

pandas_gbq/schema.py

Lines changed: 2 additions & 0 deletions
@@ -11,6 +11,8 @@ def generate_bq_schema(dataframe, default_type="STRING"):
         does not exist in the schema.
     """
 
+    # If you update this mapping, also update the table at
+    # `docs/source/writing.rst`.
     type_mapping = {
         "i": "INTEGER",
         "b": "BOOLEAN",

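One detail worth noting about the mapping above: dtype kinds that do not appear in ``type_mapping`` fall back to the ``default_type`` argument (``STRING`` unless overridden). A small, illustrative sketch; the timedelta column is an assumed example of a dtype kind absent from the mapping.

    import pandas
    from pandas_gbq import schema

    df = pandas.DataFrame({"wait": pandas.to_timedelta(["1 day", "2 days"])})

    # timedelta64 columns have dtype kind 'm', which is not in type_mapping,
    # so the generated field should use the default type.
    print(schema.generate_bq_schema(df, default_type="STRING"))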