From 089b80f757b66112c4254ee9b1172f8ca3b3e565 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 11 Sep 2019 08:51:31 +0200 Subject: [PATCH] Add 5_add_columns.ipynb from master --- notebooks/5_add_columns.ipynb | 727 ++++++++++++++++++++++++++++++++++ 1 file changed, 727 insertions(+) create mode 100644 notebooks/5_add_columns.ipynb diff --git a/notebooks/5_add_columns.ipynb b/notebooks/5_add_columns.ipynb new file mode 100644 index 0000000..f2b6881 --- /dev/null +++ b/notebooks/5_add_columns.ipynb @@ -0,0 +1,727 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "Objectives\n", + "\n", + "- Add a new column to a DataFrame with a chosen name\n", + "- Understand that operations work element-wise (no loops required)\n", + "- Rename an existing column\n", + "\n", + "Content to cover\n", + "\n", + "- df[“...”] = f(other column)\n", + "- df[“...”] = f(other columns), eg. df[“...”] + df[“...”]\n", + "- df.rename\n" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For this tutorial, air quality data about $NO_2$ is used, made available by [openaq](https://openaq.org) and using the [py-openaq](http://dhhagan.github.io/py-openaq/index.html) package. The `air_quality_no2.csv` data set provides $NO_2$ values for the measurement stations _FR04014_, _BETR801_ and _London Westminster_ in respectively Paris, Antwerp and London." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
station_antwerpstation_parisstation_london
datetime
2019-04-09 03:00:0022.524.467.0
2019-04-09 04:00:0053.527.467.0
2019-04-09 05:00:0054.534.241.0
2019-04-09 06:00:0034.548.541.0
2019-04-09 07:00:0046.559.541.0
\n", + "
" + ], + "text/plain": [ + " station_antwerp station_paris station_london\n", + "datetime \n", + "2019-04-09 03:00:00 22.5 24.4 67.0\n", + "2019-04-09 04:00:00 53.5 27.4 67.0\n", + "2019-04-09 05:00:00 54.5 34.2 41.0\n", + "2019-04-09 06:00:00 34.5 48.5 41.0\n", + "2019-04-09 07:00:00 46.5 59.5 41.0" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "air_quality = pd.read_csv(\"../data/air_quality_no2.csv\", \n", + " index_col=0, parse_dates=True)\n", + "air_quality.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add new columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](../schemas/05_newcolumn_1.svg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> I want to express the $NO_2$ concentration of the station in London in mg/m$^3$ (conversion factor 1.882)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(_Assume temperature of 25 degrees Celsius and pressure of 1013 hPa. The molecular weight of $NO_2$ is 46.01 g/mol, resulting in 1 ppm $NO_2$ being equivalent to 1.882 mg/m$^3$_)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
station_antwerpstation_parisstation_londonlondon_mg_per_cubic
datetime
2019-04-09 03:00:0022.524.467.0126.094
2019-04-09 04:00:0053.527.467.0126.094
2019-04-09 05:00:0054.534.241.077.162
2019-04-09 06:00:0034.548.541.077.162
2019-04-09 07:00:0046.559.541.077.162
\n", + "
" + ], + "text/plain": [ + " station_antwerp station_paris station_london \\\n", + "datetime \n", + "2019-04-09 03:00:00 22.5 24.4 67.0 \n", + "2019-04-09 04:00:00 53.5 27.4 67.0 \n", + "2019-04-09 05:00:00 54.5 34.2 41.0 \n", + "2019-04-09 06:00:00 34.5 48.5 41.0 \n", + "2019-04-09 07:00:00 46.5 59.5 41.0 \n", + "\n", + " london_mg_per_cubic \n", + "datetime \n", + "2019-04-09 03:00:00 126.094 \n", + "2019-04-09 04:00:00 126.094 \n", + "2019-04-09 05:00:00 77.162 \n", + "2019-04-09 06:00:00 77.162 \n", + "2019-04-09 07:00:00 77.162 " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "air_quality[\"london_mg_per_cubic\"] = air_quality[\"station_london\"] * 1.882\n", + "air_quality.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To create a new column, use the `[]` brackets with the new column name on the left side of the assignment. If you are familiar with Python dictionaries, the syntax is similar to adding new key/value combinations." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + " \n", + "__Note__: The calculation of the values is done __element-wise__. This means all values in the given column are multiplied by the value 1.882 at once. You do not need to use a loop to iterate over each of the rows!\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](../schemas/05_newcolumn_2.svg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> I want to check the ratio of the values in Paris versus Antwerp and save the result in a new column" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
station_antwerpstation_parisstation_londonlondon_mg_per_cubicratio_paris_antwerp
datetime
2019-04-09 03:00:0022.524.467.0126.0941.084444
2019-04-09 04:00:0053.527.467.0126.0940.512150
2019-04-09 05:00:0054.534.241.077.1620.627523
2019-04-09 06:00:0034.548.541.077.1621.405797
2019-04-09 07:00:0046.559.541.077.1621.279570
\n", + "
" + ], + "text/plain": [ + " station_antwerp station_paris station_london \\\n", + "datetime \n", + "2019-04-09 03:00:00 22.5 24.4 67.0 \n", + "2019-04-09 04:00:00 53.5 27.4 67.0 \n", + "2019-04-09 05:00:00 54.5 34.2 41.0 \n", + "2019-04-09 06:00:00 34.5 48.5 41.0 \n", + "2019-04-09 07:00:00 46.5 59.5 41.0 \n", + "\n", + " london_mg_per_cubic ratio_paris_antwerp \n", + "datetime \n", + "2019-04-09 03:00:00 126.094 1.084444 \n", + "2019-04-09 04:00:00 126.094 0.512150 \n", + "2019-04-09 05:00:00 77.162 0.627523 \n", + "2019-04-09 06:00:00 77.162 1.405797 \n", + "2019-04-09 07:00:00 77.162 1.279570 " + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "air_quality[\"ratio_paris_antwerp\"] = air_quality[\"station_paris\"] / air_quality[\"station_antwerp\"]\n", + "air_quality.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The calculation is again element-wise, so the `/` is applied _for the values in each row_. Other mathematical operators (+, -, *, /) and logical operators (<, >, ==, ...) also work element-wise. The latter was already used in the [subset data tutorial](./3_subset_data.ipynb) to filter rows of a table using a conditional expression." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> I want to rename the data columns to the corresponding station identifiers used by openAQ" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "air_quality_renamed = air_quality.rename(columns = {\"station_antwerp\": \"BETR801\", \n", + " \"station_paris\": \"FR04014\",\n", + " \"station_london\": \"London Westminster\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BETR801FR04014London Westminsterlondon_mg_per_cubicratio_paris_antwerp
datetime
2019-04-09 03:00:0022.524.467.0126.0941.084444
2019-04-09 04:00:0053.527.467.0126.0940.512150
2019-04-09 05:00:0054.534.241.077.1620.627523
2019-04-09 06:00:0034.548.541.077.1621.405797
2019-04-09 07:00:0046.559.541.077.1621.279570
\n", + "
" + ], + "text/plain": [ + " BETR801 FR04014 London Westminster \\\n", + "datetime \n", + "2019-04-09 03:00:00 22.5 24.4 67.0 \n", + "2019-04-09 04:00:00 53.5 27.4 67.0 \n", + "2019-04-09 05:00:00 54.5 34.2 41.0 \n", + "2019-04-09 06:00:00 34.5 48.5 41.0 \n", + "2019-04-09 07:00:00 46.5 59.5 41.0 \n", + "\n", + " london_mg_per_cubic ratio_paris_antwerp \n", + "datetime \n", + "2019-04-09 03:00:00 126.094 1.084444 \n", + "2019-04-09 04:00:00 126.094 0.512150 \n", + "2019-04-09 05:00:00 77.162 0.627523 \n", + "2019-04-09 06:00:00 77.162 1.405797 \n", + "2019-04-09 07:00:00 77.162 1.279570 " + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "air_quality_renamed.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `rename` function can be used for both row labels and column names. Provide a dictionary with the current names as keys and the new names as values to update the corresponding names." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The mapping is not restricted to fixed names only; it can be a mapping function as well. For example, converting the column names to lowercase letters can be done using a function:" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
betr801fr04014london westminsterlondon_mg_per_cubicratio_paris_antwerp
datetime
2019-04-09 03:00:0022.524.467.0126.0941.084444
2019-04-09 04:00:0053.527.467.0126.0940.512150
2019-04-09 05:00:0054.534.241.077.1620.627523
2019-04-09 06:00:0034.548.541.077.1621.405797
2019-04-09 07:00:0046.559.541.077.1621.279570
\n", + "
" + ], + "text/plain": [ + " betr801 fr04014 london westminster \\\n", + "datetime \n", + "2019-04-09 03:00:00 22.5 24.4 67.0 \n", + "2019-04-09 04:00:00 53.5 27.4 67.0 \n", + "2019-04-09 05:00:00 54.5 34.2 41.0 \n", + "2019-04-09 06:00:00 34.5 48.5 41.0 \n", + "2019-04-09 07:00:00 46.5 59.5 41.0 \n", + "\n", + " london_mg_per_cubic ratio_paris_antwerp \n", + "datetime \n", + "2019-04-09 03:00:00 126.094 1.084444 \n", + "2019-04-09 04:00:00 126.094 0.512150 \n", + "2019-04-09 05:00:00 77.162 0.627523 \n", + "2019-04-09 06:00:00 77.162 1.405797 \n", + "2019-04-09 07:00:00 77.162 1.279570 " + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "air_quality_renamed = air_quality_renamed.rename(columns = str.lower)\n", + "air_quality_renamed.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__To user guide:__ Further details about column or row label renaming are provided in :ref:`basics.rename`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## REMEMBER\n", + "\n", + "- Create a new column by assigning the output to the DataFrame with a new column name in between the `[]`.\n", + "- Operations are element-wise, no need to loop over rows.\n", + "- Use `rename` with a dictionary or function to rename row labels or column names." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__To user guide:__ Further details about column addition and deletion are provided in :ref:`TODO` ([label](https://pandas.pydata.org/pandas-docs/stable/getting_started/dsintro.html#column-selection-addition-deletion) to add in sphinx)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}