From 71b74383bc7b035969cc61e7921c365f2f57bf9b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 11 Sep 2019 08:47:33 +0200 Subject: [PATCH 1/2] Add 3_subset_data.ipynb from master --- notebooks/3_subset_data.ipynb | 1358 +++++++++++++++++++++++++++++++++ 1 file changed, 1358 insertions(+) create mode 100644 notebooks/3_subset_data.ipynb diff --git a/notebooks/3_subset_data.ipynb b/notebooks/3_subset_data.ipynb new file mode 100644 index 0000000..da9eb46 --- /dev/null +++ b/notebooks/3_subset_data.ipynb @@ -0,0 +1,1358 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Objectives\n", + "\n", + "- Extract and manipulate data using column headings\n", + "- Query / select a subset of data using boolean indexing\n", + "- Understand the difference between loc and iloc\n", + "- Drop rows with Nan values in a given column\n", + "\n", + "Content to cover\n", + "\n", + "- df[\"COLUMN_NAME\"] and df[[\"COLUMN_NAME_1\", \"COLUMN_NAME_2\"]]\n", + "- assign new value to selection\n", + "- df[df[\"NAME] < 18] conditional setup \n", + "- df[df[“Name”].isin([...])] conditional function\n", + "- loc/iloc\n", + "- df[“column”].dropna() or df.dropna(“column”)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Pclass \\\n", + "0 1 0 3 \n", + "1 2 1 1 \n", + "2 3 1 3 \n", + "3 4 1 1 \n", + "4 5 0 3 \n", + "\n", + " Name Sex Age SibSp \\\n", + "0 Braund, Mr. Owen Harris male 22.0 1 \n", + "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", + "2 Heikkinen, Miss. Laina female 26.0 0 \n", + "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", + "4 Allen, Mr. William Henry male 35.0 0 \n", + "\n", + " Parch Ticket Fare Cabin Embarked \n", + "0 0 A/5 21171 7.2500 NaN S \n", + "1 0 PC 17599 71.2833 C85 C \n", + "2 0 STON/O2. 3101282 7.9250 NaN S \n", + "3 0 113803 53.1000 C123 S \n", + "4 0 373450 8.0500 NaN S " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic = pd.read_csv(\"../data/titanic.csv\")\n", + "titanic.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Select the data you need" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Select specific columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](../schemas/03_subset_columns.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " > I'm interested in the age of the titanic passengers" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 22.0\n", + "1 38.0\n", + "2 26.0\n", + "3 35.0\n", + "4 35.0\n", + "Name: Age, dtype: float64" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ages = titanic[\"Age\"]\n", + "ages.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To select a single column, use square brackets `[]` with the column name of the column of interest." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The returned data type is a Pandas Series, as a single column is selected." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pandas.core.series.Series" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(titanic[\"Age\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " > I 'm interested in the age and sex of the titanic passengers" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeSex
022.0male
138.0female
226.0female
335.0female
435.0male
\n", + "
" + ], + "text/plain": [ + " Age Sex\n", + "0 22.0 male\n", + "1 38.0 female\n", + "2 26.0 female\n", + "3 35.0 female\n", + "4 35.0 male" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "age_sex = titanic[[\"Age\", \"Sex\"]]\n", + "age_sex.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To select multiple columns, use a list of column names within the selection brackets `[]`. Note, the inner square brackets define the list of column names, the outer brackets are to select data from a Pandas DataFrame as seen in the previous example." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The returned data type is a Pandas DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pandas.core.frame.DataFrame" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(titanic[[\"Age\", \"Sex\"]])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__To user guide:__ For basic information on indexing, see :ref:`indexing.basics`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Filter rows of a table" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](../schemas/03_subset_rows.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> I 'm interested in the passengers older than 18 years" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Pclass \\\n", + "0 1 0 3 \n", + "1 2 1 1 \n", + "2 3 1 3 \n", + "3 4 1 1 \n", + "4 5 0 3 \n", + "\n", + " Name Sex Age SibSp \\\n", + "0 Braund, Mr. Owen Harris male 22.0 1 \n", + "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", + "2 Heikkinen, Miss. Laina female 26.0 0 \n", + "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", + "4 Allen, Mr. William Henry male 35.0 0 \n", + "\n", + " Parch Ticket Fare Cabin Embarked \n", + "0 0 A/5 21171 7.2500 NaN S \n", + "1 0 PC 17599 71.2833 C85 C \n", + "2 0 STON/O2. 3101282 7.9250 NaN S \n", + "3 0 113803 53.1000 C123 S \n", + "4 0 373450 8.0500 NaN S " + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adults = titanic[titanic[\"Age\"] > 18]\n", + "adults.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To select rows based on a conditional expression, use a conditional statement inside the selection brackets `[]`. The condition inside the selection brackets `titanic[\"Age\"] > 18` checks for which rows the `Age` column has a value larger than 18. Each row for which the condition is `True`, is selected." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> I 'm interested in the titanic passengers from cabin class 2 and 3" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
5603Moran, Mr. JamesmaleNaN003308778.4583NaNQ
7803Palsson, Master. Gosta Leonardmale2.03134990921.0750NaNS
\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Pclass Name Sex \\\n", + "0 1 0 3 Braund, Mr. Owen Harris male \n", + "2 3 1 3 Heikkinen, Miss. Laina female \n", + "4 5 0 3 Allen, Mr. William Henry male \n", + "5 6 0 3 Moran, Mr. James male \n", + "7 8 0 3 Palsson, Master. Gosta Leonard male \n", + "\n", + " Age SibSp Parch Ticket Fare Cabin Embarked \n", + "0 22.0 1 0 A/5 21171 7.2500 NaN S \n", + "2 26.0 0 0 STON/O2. 3101282 7.9250 NaN S \n", + "4 35.0 0 0 373450 8.0500 NaN S \n", + "5 NaN 0 0 330877 8.4583 NaN Q \n", + "7 2.0 3 1 349909 21.0750 NaN S " + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "class_23 = titanic[titanic[\"Pclass\"].isin([2, 3])]\n", + "class_23.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Similar to the conditional expression, the `isin` conditional function returns a `True` for each row the values are in the provided list. To filter the rows based on such a function, use the conditional function inside the selection brackets `[]`. In this case, the condition inside the selection brackets `titanic[\"Pclass\"].isin([2, 3])` checks for which rows the `Pclass` column is either 2 or 3." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above is equivalent to filtering by rows for which the class is either 2 or 3 and combiniing the two statements with an `|` (or) operator:" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
5603Moran, Mr. JamesmaleNaN003308778.4583NaNQ
7803Palsson, Master. Gosta Leonardmale2.03134990921.0750NaNS
\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Pclass Name Sex \\\n", + "0 1 0 3 Braund, Mr. Owen Harris male \n", + "2 3 1 3 Heikkinen, Miss. Laina female \n", + "4 5 0 3 Allen, Mr. William Henry male \n", + "5 6 0 3 Moran, Mr. James male \n", + "7 8 0 3 Palsson, Master. Gosta Leonard male \n", + "\n", + " Age SibSp Parch Ticket Fare Cabin Embarked \n", + "0 22.0 1 0 A/5 21171 7.2500 NaN S \n", + "2 26.0 0 0 STON/O2. 3101282 7.9250 NaN S \n", + "4 35.0 0 0 373450 8.0500 NaN S \n", + "5 NaN 0 0 330877 8.4583 NaN Q \n", + "7 2.0 3 1 349909 21.0750 NaN S " + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "class_23 = titanic[(titanic[\"Pclass\"] == 2) | (titanic[\"Pclass\"] == 3)]\n", + "class_23.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__To user guide:__ Conditional (boolean) indexing, see :ref:`indexing.boolean`. Specific information on `isin`, see :ref:`indexing.basics.indexing_isin`. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> I want to work with passenger data for which the age is known" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Pclass \\\n", + "0 1 0 3 \n", + "1 2 1 1 \n", + "2 3 1 3 \n", + "3 4 1 1 \n", + "4 5 0 3 \n", + "\n", + " Name Sex Age SibSp \\\n", + "0 Braund, Mr. Owen Harris male 22.0 1 \n", + "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", + "2 Heikkinen, Miss. Laina female 26.0 0 \n", + "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", + "4 Allen, Mr. William Henry male 35.0 0 \n", + "\n", + " Parch Ticket Fare Cabin Embarked \n", + "0 0 A/5 21171 7.2500 NaN S \n", + "1 0 PC 17599 71.2833 C85 C \n", + "2 0 STON/O2. 3101282 7.9250 NaN S \n", + "3 0 113803 53.1000 C123 S \n", + "4 0 373450 8.0500 NaN S " + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "age_nonull = titanic[titanic[\"Age\"].notnull()]\n", + "age_nonull.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `notnull` conditional function returns a `True` for each row the values are not an `Null` value. As such, this can be combined with the selection brackets `[]` to filter the data table." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__To user guide:__ For more dedicated functions on missing values, see :ref:`missing-data`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Select specific rows and/or columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](../schemas/03_subset_columns_rows.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> I 'm interested in the Names of the passengers older than 18 years" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 Braund, Mr. Owen Harris\n", + "1 Cumings, Mrs. John Bradley (Florence Briggs Th...\n", + "2 Heikkinen, Miss. Laina\n", + "3 Futrelle, Mrs. 
Jacques Heath (Lily May Peel)\n", + "4 Allen, Mr. William Henry\n", + "Name: Name, dtype: object" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adult_names = titanic.loc[titanic[\"Age\"] > 18, \"Name\"]\n", + "adult_names.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When using the column names, row labels or a condition expression, use the `loc` operator in front of the selection brackets `[]`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> I 'm interested in rows 10 till 25 and columns 3 to 5" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PclassNameSex
92Nasser, Mrs. Nicholas (Adele Achem)female
103Sandstrom, Miss. Marguerite Rutfemale
111Bonnell, Miss. Elizabethfemale
123Saundercock, Mr. William Henrymale
133Andersson, Mr. Anders Johanmale
143Vestrom, Miss. Hulda Amanda Adolfinafemale
152Hewlett, Mrs. (Mary D Kingcome)female
163Rice, Master. Eugenemale
172Williams, Mr. Charles Eugenemale
183Vander Planke, Mrs. Julius (Emelia Maria Vande...female
193Masselmani, Mrs. Fatimafemale
202Fynney, Mr. Joseph Jmale
212Beesley, Mr. Lawrencemale
223McGowan, Miss. Anna \"Annie\"female
231Sloper, Mr. William Thompsonmale
243Palsson, Miss. Torborg Danirafemale
\n", + "
" + ], + "text/plain": [ + " Pclass Name Sex\n", + "9 2 Nasser, Mrs. Nicholas (Adele Achem) female\n", + "10 3 Sandstrom, Miss. Marguerite Rut female\n", + "11 1 Bonnell, Miss. Elizabeth female\n", + "12 3 Saundercock, Mr. William Henry male\n", + "13 3 Andersson, Mr. Anders Johan male\n", + "14 3 Vestrom, Miss. Hulda Amanda Adolfina female\n", + "15 2 Hewlett, Mrs. (Mary D Kingcome) female\n", + "16 3 Rice, Master. Eugene male\n", + "17 2 Williams, Mr. Charles Eugene male\n", + "18 3 Vander Planke, Mrs. Julius (Emelia Maria Vande... female\n", + "19 3 Masselmani, Mrs. Fatima female\n", + "20 2 Fynney, Mr. Joseph J male\n", + "21 2 Beesley, Mr. Lawrence male\n", + "22 3 McGowan, Miss. Anna \"Annie\" female\n", + "23 1 Sloper, Mr. William Thompson male\n", + "24 3 Palsson, Miss. Torborg Danira female" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic.iloc[9:25, 2:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When specifically interested in certain rows and/or columns based on their position in the table, use the `iloc` operator in front of the selection brackets `[]`." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__To user guide:__ For more detailed description on selecting subsets of a data table, see :ref:`indexing.choice`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## REMEMBER\n", + "\n", + "- When selecting subsets of data, square brackets `[]` are used.\n", + "- Inside these brackets, you can use a single column name, multiple columns within a list, conditional expressions or conditional statements\n", + "- Select specific rows and/or columns using `loc` when using the row and column names\n", + "- Select specific rows and/or columns using `iloc` when using the positions in the table" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__To user guide:__ Further details about indexing is provided in :ref:`indexing`" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 1681efb59a5f872f08981ae799ed4a843a00a1a5 Mon Sep 17 00:00:00 2001 From: stijnvanhoey Date: Mon, 7 Oct 2019 18:47:41 +0200 Subject: [PATCH 2/2] update from master --- notebooks/3_subset_data.ipynb | 679 +++++++++++++++++++++++++--------- 1 file changed, 507 insertions(+), 172 deletions(-) diff --git a/notebooks/3_subset_data.ipynb b/notebooks/3_subset_data.ipynb index da9eb46..aaa1880 100644 --- a/notebooks/3_subset_data.ipynb +++ b/notebooks/3_subset_data.ipynb @@ -1,41 +1,43 @@ { "cells": [ { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 41, "metadata": {}, + "outputs": [], "source": [ - "Objectives\n", - "\n", - "- Extract and manipulate data using column headings\n", - "- Query 
/ select a subset of data using boolean indexing\n", - "- Understand the difference between loc and iloc\n", - "- Drop rows with Nan values in a given column\n", - "\n", - "Content to cover\n", - "\n", - "- df[\"COLUMN_NAME\"] and df[[\"COLUMN_NAME_1\", \"COLUMN_NAME_2\"]]\n", - "- assign new value to selection\n", - "- df[df[\"NAME] < 18] conditional setup \n", - "- df[df[“Name”].isin([...])] conditional function\n", - "- loc/iloc\n", - "- df[“column”].dropna() or df.dropna(“column”)\n" + "import pandas as pd" ] }, { - "cell_type": "code", - "execution_count": 2, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "%matplotlib inline" + "
\n", + " \n", + "This tutorial uses the titanic data set, stored as CSV. The data consists of the following data columns:\n", + "\n", + "- PassengerId: Id of every passenger.\n", + "- Survived: This feature has the value 0 or 1. 0 for not survived and 1 for survived.\n", + "- Pclass: There are 3 classes: Class 1, Class 2 and Class 3.\n", + "- Name: Name of passenger.\n", + "- Sex: Gender of passenger.\n", + "- Age: Age of passenger.\n", + "- SibSp: Number of siblings and spouses of the passenger aboard.\n", + "- Parch: Number of parents and children of the passenger aboard.\n", + "- Ticket: Ticket number of passenger.\n", + "- Fare: Indicating the fare.\n", + "- Cabin: The cabin of passenger.\n", + "- Embarked: The embarked category.\n", + "\n", + "Reading in a data set is explained in the [tutorial on read/write operations](./2_read_write.ipynb).\n", + "\n", + "
" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 42, "metadata": {}, "outputs": [ { @@ -75,7 +77,7 @@ " \n", " \n", " \n", - " 0\n", + " 0\n", " 1\n", " 0\n", " 3\n", @@ -90,7 +92,7 @@ " S\n", " \n", " \n", - " 1\n", + " 1\n", " 2\n", " 1\n", " 1\n", @@ -105,7 +107,7 @@ " C\n", " \n", " \n", - " 2\n", + " 2\n", " 3\n", " 1\n", " 3\n", @@ -120,7 +122,7 @@ " S\n", " \n", " \n", - " 3\n", + " 3\n", " 4\n", " 1\n", " 1\n", @@ -135,7 +137,7 @@ " S\n", " \n", " \n", - " 4\n", + " 4\n", " 5\n", " 0\n", " 3\n", @@ -176,7 +178,7 @@ "4 0 373450 8.0500 NaN S " ] }, - "execution_count": 3, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } @@ -190,14 +192,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Select the data you need" + "# How do I select a subset of data in a `DataFrame`? " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Select specific columns" + "### How do I select specific columns from a `DataFrame`?" ] }, { @@ -211,12 +213,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - " > I'm interested in the age of the titanic passengers" + " > I'm interested in the age of the titanic passengers." ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 43, "metadata": {}, "outputs": [ { @@ -230,7 +232,7 @@ "Name: Age, dtype: float64" ] }, - "execution_count": 20, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } @@ -251,12 +253,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The returned data type is a Pandas Series, as a single column is selected." + "Each column in a `DataFrame` is a `Series`. As a single column is selected, the returned object is a pandas `Series`. 
We can verify this by checking the type of the output:" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 65, "metadata": {}, "outputs": [ { @@ -265,7 +267,7 @@ "pandas.core.series.Series" ] }, - "execution_count": 5, + "execution_count": 65, "metadata": {}, "output_type": "execute_result" } @@ -278,12 +280,46 @@ "cell_type": "markdown", "metadata": {}, "source": [ - " > I 'm interested in the age and sex of the titanic passengers" + "And have a look at the `shape` of the output:" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(891,)" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic[\"Age\"].shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`shape` is an attribute (remember [previous tutorial](./2_read_write.ipynb), no parentheses for attributes) of a pandas `Series` and `DataFrame` containing the number of rows and columns: _(nrows, ncolumns)_. A pandas Series is 1-dimensional and only the number of rows is returned." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " > I'm interested in the age and sex of the titanic passengers." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 66, "metadata": {}, "outputs": [ { @@ -313,27 +349,27 @@ " \n", " \n", " \n", - " 0\n", + " 0\n", " 22.0\n", " male\n", " \n", " \n", - " 1\n", + " 1\n", " 38.0\n", " female\n", " \n", " \n", - " 2\n", + " 2\n", " 26.0\n", " female\n", " \n", " \n", - " 3\n", + " 3\n", " 35.0\n", " female\n", " \n", " \n", - " 4\n", + " 4\n", " 35.0\n", " male\n", " \n", @@ -350,7 +386,7 @@ "4 35.0 male" ] }, - "execution_count": 23, + "execution_count": 66, "metadata": {}, "output_type": "execute_result" } @@ -364,7 +400,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To select multiple columns, use a list of column names within the selection brackets `[]`. Note, the inner square brackets define the list of column names, the outer brackets are to select data from a Pandas DataFrame as seen in the previous example." + "To select multiple columns, use a list of column names within the selection brackets `[]`. \n", + "\n", + "
\n", + " \n", + "__Note:__ The inner square brackets define a :ref:`Python list <python:tut-morelists>` with column names, whereas the outer brackets are used to select the data from a pandas `DataFrame` as seen in the previous example.\n", + "\n", + "
" ] }, { @@ -376,7 +418,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 67, "metadata": {}, "outputs": [ { @@ -385,7 +427,7 @@ "pandas.core.frame.DataFrame" ] }, - "execution_count": 24, + "execution_count": 67, "metadata": {}, "output_type": "execute_result" } @@ -394,6 +436,33 @@ "type(titanic[[\"Age\", \"Sex\"]])" ] }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(891, 2)" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic[[\"Age\", \"Sex\"]].shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The selection returned a `DataFrame` with 891 rows and 2 columns. A `DataFrame` is 2-dimensional with both a row and column dimension." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -405,7 +474,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Filter rows of a table" + "### How do I filter specific rows from a `DataFrame`?" ] }, { @@ -419,12 +488,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "> I 'm interested in the passengers older than 18 years" + "> I'm interested in the passengers older than 35 years." ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 73, "metadata": {}, "outputs": [ { @@ -464,22 +533,7 @@ " \n", " \n", " \n", - " 0\n", - " 1\n", - " 0\n", - " 3\n", - " Braund, Mr. Owen Harris\n", - " male\n", - " 22.0\n", " 1\n", - " 0\n", - " A/5 21171\n", - " 7.2500\n", - " NaN\n", - " S\n", - " \n", - " \n", - " 1\n", " 2\n", " 1\n", " 1\n", @@ -494,47 +548,62 @@ " C\n", " \n", " \n", - " 2\n", - " 3\n", + " 6\n", + " 7\n", + " 0\n", " 1\n", - " 3\n", - " Heikkinen, Miss. Laina\n", - " female\n", - " 26.0\n", + " McCarthy, Mr. Timothy J\n", + " male\n", + " 54.0\n", " 0\n", " 0\n", - " STON/O2. 
3101282\n", - " 7.9250\n", - " NaN\n", + " 17463\n", + " 51.8625\n", + " E46\n", " S\n", " \n", " \n", - " 3\n", - " 4\n", + " 11\n", + " 12\n", " 1\n", " 1\n", - " Futrelle, Mrs. Jacques Heath (Lily May Peel)\n", + " Bonnell, Miss. Elizabeth\n", " female\n", - " 35.0\n", - " 1\n", + " 58.0\n", " 0\n", - " 113803\n", - " 53.1000\n", - " C123\n", + " 0\n", + " 113783\n", + " 26.5500\n", + " C103\n", " S\n", " \n", " \n", - " 4\n", - " 5\n", + " 13\n", + " 14\n", " 0\n", " 3\n", - " Allen, Mr. William Henry\n", + " Andersson, Mr. Anders Johan\n", " male\n", - " 35.0\n", + " 39.0\n", + " 1\n", + " 5\n", + " 347082\n", + " 31.2750\n", + " NaN\n", + " S\n", + " \n", + " \n", + " 15\n", + " 16\n", + " 1\n", + " 2\n", + " Hewlett, Mrs. (Mary D Kingcome)\n", + " female\n", + " 55.0\n", " 0\n", " 0\n", - " 373450\n", - " 8.0500\n", + " 248706\n", + " 16.0000\n", " NaN\n", " S\n", " \n", @@ -543,55 +612,120 @@ "" ], "text/plain": [ - " PassengerId Survived Pclass \\\n", - "0 1 0 3 \n", - "1 2 1 1 \n", - "2 3 1 3 \n", - "3 4 1 1 \n", - "4 5 0 3 \n", + " PassengerId Survived Pclass \\\n", + "1 2 1 1 \n", + "6 7 0 1 \n", + "11 12 1 1 \n", + "13 14 0 3 \n", + "15 16 1 2 \n", "\n", - " Name Sex Age SibSp \\\n", - "0 Braund, Mr. Owen Harris male 22.0 1 \n", - "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", - "2 Heikkinen, Miss. Laina female 26.0 0 \n", - "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", - "4 Allen, Mr. William Henry male 35.0 0 \n", + " Name Sex Age SibSp \\\n", + "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", + "6 McCarthy, Mr. Timothy J male 54.0 0 \n", + "11 Bonnell, Miss. Elizabeth female 58.0 0 \n", + "13 Andersson, Mr. Anders Johan male 39.0 1 \n", + "15 Hewlett, Mrs. (Mary D Kingcome) female 55.0 0 \n", "\n", - " Parch Ticket Fare Cabin Embarked \n", - "0 0 A/5 21171 7.2500 NaN S \n", - "1 0 PC 17599 71.2833 C85 C \n", - "2 0 STON/O2. 
3101282 7.9250 NaN S \n", - "3 0 113803 53.1000 C123 S \n", - "4 0 373450 8.0500 NaN S " + " Parch Ticket Fare Cabin Embarked \n", + "1 0 PC 17599 71.2833 C85 C \n", + "6 0 17463 51.8625 E46 S \n", + "11 0 113783 26.5500 C103 S \n", + "13 5 347082 31.2750 NaN S \n", + "15 0 248706 16.0000 NaN S " + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "above_35 = titanic[titanic[\"Age\"] > 35]\n", + "above_35.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To select rows based on a conditional expression, use a condition inside the selection brackets `[]`. The condition inside the selection brackets `titanic[\"Age\"] > 35` checks for which rows the `Age` column has a value larger than 35:" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 False\n", + "1 True\n", + "2 False\n", + "3 False\n", + "4 False\n", + " ... \n", + "886 False\n", + "887 False\n", + "888 False\n", + "889 False\n", + "890 False\n", + "Name: Age, Length: 891, dtype: bool" ] }, - "execution_count": 25, + "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "adults = titanic[titanic[\"Age\"] > 18]\n", - "adults.head()" + "titanic[\"Age\"] > 35" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The output of the conditional expression (`>`, but also `==`, `!=`, `<`, `<=`,... would work) is actually a pandas `Series` of boolean values (either `True` or `False`) with the same number of rows as the original `DataFrame`. Such a `Series` of boolean values can be used to filter the `DataFrame` by putting it in between the selection brackets `[]`. Only rows for which the value is `True` will be selected." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "To select rows based on a conditional expression, use a conditional statement inside the selection brackets `[]`. The condition inside the selection brackets `titanic[\"Age\"] > 18` checks for which rows the `Age` column has a value larger than 18. Each row for which the condition is `True`, is selected." + "We know from before that the original titanic `DataFrame` consists of 891 rows. Let's have a look at the number of rows which satisfy the condition by checking the `shape` attribute of the resulting `DataFrame` `above_35`:" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(217, 12)" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "above_35.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "> I 'm interested in the titanic passengers from cabin class 2 and 3" + "> I'm interested in the titanic passengers from cabin class 2 and 3." 
] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 76, "metadata": {}, "outputs": [ { @@ -631,7 +765,7 @@ " \n", " \n", " \n", - " 0\n", + " 0\n", " 1\n", " 0\n", " 3\n", @@ -646,7 +780,7 @@ " S\n", " \n", " \n", - " 2\n", + " 2\n", " 3\n", " 1\n", " 3\n", @@ -661,7 +795,7 @@ " S\n", " \n", " \n", - " 4\n", + " 4\n", " 5\n", " 0\n", " 3\n", @@ -676,7 +810,7 @@ " S\n", " \n", " \n", - " 5\n", + " 5\n", " 6\n", " 0\n", " 3\n", @@ -691,7 +825,7 @@ " Q\n", " \n", " \n", - " 7\n", + " 7\n", " 8\n", " 0\n", " 3\n", @@ -725,7 +859,7 @@ "7 2.0 3 1 349909 21.0750 NaN S " ] }, - "execution_count": 26, + "execution_count": 76, "metadata": {}, "output_type": "execute_result" } @@ -746,12 +880,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The above is equivalent to filtering by rows for which the class is either 2 or 3 and combiniing the two statements with an `|` (or) operator:" + "The above is equivalent to filtering by rows for which the class is either 2 or 3 and combining the two statements with an `|` (or) operator:" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 58, "metadata": {}, "outputs": [ { @@ -791,7 +925,7 @@ " \n", " \n", " \n", - " 0\n", + " 0\n", " 1\n", " 0\n", " 3\n", @@ -806,7 +940,7 @@ " S\n", " \n", " \n", - " 2\n", + " 2\n", " 3\n", " 1\n", " 3\n", @@ -821,7 +955,7 @@ " S\n", " \n", " \n", - " 4\n", + " 4\n", " 5\n", " 0\n", " 3\n", @@ -836,7 +970,7 @@ " S\n", " \n", " \n", - " 5\n", + " 5\n", " 6\n", " 0\n", " 3\n", @@ -851,7 +985,7 @@ " Q\n", " \n", " \n", - " 7\n", + " 7\n", " 8\n", " 0\n", " 3\n", @@ -885,7 +1019,7 @@ "7 2.0 3 1 349909 21.0750 NaN S " ] }, - "execution_count": 27, + "execution_count": 58, "metadata": {}, "output_type": "execute_result" } @@ -895,6 +1029,17 @@ "class_23.head()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + " \n", + "__Note:__ When combining multiple conditional statements, each condition must be surrounded by parentheses `()`. Moreover, you cannot use the Python keywords `or`/`and`; use the element-wise operators `|` (or) and `&` (and) instead.\n", + "\n", + "
" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -906,12 +1051,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "> I want to work with passenger data for which the age is known" + "> I want to work with passenger data for which the age is known." ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 59, "metadata": { "scrolled": true }, @@ -953,7 +1098,7 @@ " \n", " \n", " \n", - " 0\n", + " 0\n", " 1\n", " 0\n", " 3\n", @@ -968,7 +1113,7 @@ " S\n", " \n", " \n", - " 1\n", + " 1\n", " 2\n", " 1\n", " 1\n", @@ -983,7 +1128,7 @@ " C\n", " \n", " \n", - " 2\n", + " 2\n", " 3\n", " 1\n", " 3\n", @@ -998,7 +1143,7 @@ " S\n", " \n", " \n", - " 3\n", + " 3\n", " 4\n", " 1\n", " 1\n", @@ -1013,7 +1158,7 @@ " S\n", " \n", " \n", - " 4\n", + " 4\n", " 5\n", " 0\n", " 3\n", @@ -1054,21 +1199,48 @@ "4 0 373450 8.0500 NaN S " ] }, - "execution_count": 28, + "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "age_nonull = titanic[titanic[\"Age\"].notnull()]\n", - "age_nonull.head()" + "age_no_na = titanic[titanic[\"Age\"].notna()]\n", + "age_no_na.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `notna` conditional function returns a `True` for each row the values are not an `Null` value. As such, this can be combined with the selection brackets `[]` to filter the data table." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The `notnull` conditional function returns a `True` for each row the values are not an `Null` value. As such, this can be combined with the selection brackets `[]` to filter the data table." + "You might wonder what actually changed, as the first 5 lines are still the same values. 
One way to verify is to check if the shape has changed:" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(714, 12)" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "age_no_na.shape" ] }, { @@ -1082,7 +1254,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Select specific rows and/or columns" + "### How do I select specific rows and columns from a `DataFrame`? " ] }, { @@ -1096,32 +1268,32 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "> I 'm interested in the Names of the passengers older than 18 years" + "> I'm interested in the names of the passengers older than 35 years." ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 Braund, Mr. Owen Harris\n", - "1 Cumings, Mrs. John Bradley (Florence Briggs Th...\n", - "2 Heikkinen, Miss. Laina\n", - "3 Futrelle, Mrs. Jacques Heath (Lily May Peel)\n", - "4 Allen, Mr. William Henry\n", + "1 Cumings, Mrs. John Bradley (Florence Briggs Th...\n", + "6 McCarthy, Mr. Timothy J\n", + "11 Bonnell, Miss. Elizabeth\n", + "13 Andersson, Mr. Anders Johan\n", + "15 Hewlett, Mrs. (Mary D Kingcome) \n", "Name: Name, dtype: object" ] }, - "execution_count": 34, + "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "adult_names = titanic.loc[titanic[\"Age\"] > 18, \"Name\"]\n", + "adult_names = titanic.loc[titanic[\"Age\"] > 35, \"Name\"]\n", "adult_names.head()" ] }, @@ -1129,19 +1301,21 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "When using the column names, row labels or a condition expression, use the `loc` operator in front of the selection brackets `[]`." + "In this case, a subset of both rows and columns is made in one go and just using selection brackets `[]` is not sufficient anymore. 
The `loc`/`iloc` operators are required in front of the selection brackets `[]`. When using `loc`/`iloc`, the part before the comma is the rows you want, and the part after the comma is the columns you want to select.\n", + "\n", + "When using the column names, row labels or a condition expression, use the `loc` operator in front of the selection brackets `[]`. For both the part before and after the comma, you can use a single label, a list of labels, a slice of labels, a conditional expression or a colon. Using a colon specifies that you want to select all rows or columns." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "> I 'm interested in rows 10 till 25 and columns 3 to 5" + "> I'm interested in rows 10 till 25 and columns 3 to 5." ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 61, "metadata": {}, "outputs": [ { @@ -1172,97 +1346,97 @@ " \n", " \n", " \n", - " 9\n", + " 9\n", " 2\n", " Nasser, Mrs. Nicholas (Adele Achem)\n", " female\n", " \n", " \n", - " 10\n", + " 10\n", " 3\n", " Sandstrom, Miss. Marguerite Rut\n", " female\n", " \n", " \n", - " 11\n", + " 11\n", " 1\n", " Bonnell, Miss. Elizabeth\n", " female\n", " \n", " \n", - " 12\n", + " 12\n", " 3\n", " Saundercock, Mr. William Henry\n", " male\n", " \n", " \n", - " 13\n", + " 13\n", " 3\n", " Andersson, Mr. Anders Johan\n", " male\n", " \n", " \n", - " 14\n", + " 14\n", " 3\n", " Vestrom, Miss. Hulda Amanda Adolfina\n", " female\n", " \n", " \n", - " 15\n", + " 15\n", " 2\n", " Hewlett, Mrs. (Mary D Kingcome)\n", " female\n", " \n", " \n", - " 16\n", + " 16\n", " 3\n", " Rice, Master. Eugene\n", " male\n", " \n", " \n", - " 17\n", + " 17\n", " 2\n", " Williams, Mr. Charles Eugene\n", " male\n", " \n", " \n", - " 18\n", + " 18\n", " 3\n", " Vander Planke, Mrs. Julius (Emelia Maria Vande...\n", " female\n", " \n", " \n", - " 19\n", + " 19\n", " 3\n", " Masselmani, Mrs. Fatima\n", " female\n", " \n", " \n", - " 20\n", + " 20\n", " 2\n", " Fynney, Mr. 
Joseph J\n", " male\n", " \n", " \n", - " 21\n", + " 21\n", " 2\n", " Beesley, Mr. Lawrence\n", " male\n", " \n", " \n", - " 22\n", + " 22\n", " 3\n", " McGowan, Miss. Anna \"Annie\"\n", " female\n", " \n", " \n", - " 23\n", + " 23\n", " 1\n", " Sloper, Mr. William Thompson\n", " male\n", " \n", " \n", - " 24\n", + " 24\n", " 3\n", " Palsson, Miss. Torborg Danira\n", " female\n", @@ -1291,7 +1465,7 @@ "24 3 Palsson, Miss. Torborg Danira female" ] }, - "execution_count": 35, + "execution_count": 61, "metadata": {}, "output_type": "execute_result" } @@ -1304,7 +1478,167 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "When specifically interested in certain rows and/or columns based on their position in the table, use the `iloc` operator in front of the selection brackets `[]`." + "Again, a subset of both rows and columns is made in one go and just using selection brackets `[]` is not sufficient anymore. When specifically interested in certain rows and/or columns based on their position in the table, use the `iloc` operator in front of the selection brackets `[]`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When selecting specific rows and/or columns with `loc` or `iloc`, new values can be assigned to the selected data. For example, to assign the name `anonymous` to the first 3 elements of the fourth column (`Name`):" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103anonymousmale22.010A/5 211717.2500NaNS
1211anonymousfemale38.010PC 1759971.2833C85C
2313anonymousfemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Pclass \\\n", + "0 1 0 3 \n", + "1 2 1 1 \n", + "2 3 1 3 \n", + "3 4 1 1 \n", + "4 5 0 3 \n", + "\n", + " Name Sex Age SibSp Parch \\\n", + "0 anonymous male 22.0 1 0 \n", + "1 anonymous female 38.0 1 0 \n", + "2 anonymous female 26.0 0 0 \n", + "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 \n", + "4 Allen, Mr. William Henry male 35.0 0 0 \n", + "\n", + " Ticket Fare Cabin Embarked \n", + "0 A/5 21171 7.2500 NaN S \n", + "1 PC 17599 71.2833 C85 C \n", + "2 STON/O2. 3101282 7.9250 NaN S \n", + "3 113803 53.1000 C123 S \n", + "4 373450 8.0500 NaN S " + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic.iloc[0:3, 3] = \"anonymous\"\n", + "titanic.head()" ] }, { @@ -1321,9 +1655,10 @@ "## REMEMBER\n", "\n", "- When selecting subsets of data, square brackets `[]` are used.\n", - "- Inside these brackets, you can use a single column name, multiple columns within a list, conditional expressions or conditional statements\n", + "- Inside these brackets, you can use a single column/row label, a list of column/row labels, a slice of labels, a conditional expression or a colon.\n", "- Select specific rows and/or columns using `loc` when using the row and column names\n", - "- Select specific rows and/or columns using `iloc` when using the positions in the table" + "- Select specific rows and/or columns using `iloc` when using the positions in the table\n", + "- You can assign new values to a selection based on `loc`/`iloc`." ] }, {