From d0ddf97ab5b9aca2d5070a857dc6043e78f26f80 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 11 Sep 2019 08:46:50 +0200 Subject: [PATCH 1/2] Add 2_read_write.ipynb from master --- notebooks/2_read_write.ipynb | 684 +++++++++++++++++++++++++++++++++++ 1 file changed, 684 insertions(+) create mode 100644 notebooks/2_read_write.ipynb diff --git a/notebooks/2_read_write.ipynb b/notebooks/2_read_write.ipynb new file mode 100644 index 0000000..b1f4a7d --- /dev/null +++ b/notebooks/2_read_write.ipynb @@ -0,0 +1,684 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Objectives\n", + "\n", + "- Use `read_csv` to read tabular text data into Python\n", + "- Know how to find and use other read_/to_ methods of Pandas\n", + "- Verify the data types of the data after reading data\n", + "\n", + "Content to cover\n", + "\n", + "- read_csv\n", + "- to_excel/read_excel example\n", + "- head()/tail()\n", + "- dtypes\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read and write different file formats" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](../schemas/02_io_readwrite.svg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> I want to start analyzing the titanic passenger data, available as a CSV file" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "titanic = pd.read_csv(\"../data/titanic.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To read CSV data, the reading functionalality provide the `read_csv` function to read the data into a Pandas DataFrame. 
Other read methods are available, each of them with the prefix `read_*`. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Make sure to have a first check on the data. Checking the first and or last 5 records is a good first step, provided by the `head`, respectively `tail`, method:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Pclass \\\n", + "0 1 0 3 \n", + "1 2 1 1 \n", + "2 3 1 3 \n", + "3 4 1 1 \n", + "4 5 0 3 \n", + "\n", + " Name Sex Age SibSp \\\n", + "0 Braund, Mr. Owen Harris male 22.0 1 \n", + "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", + "2 Heikkinen, Miss. Laina female 26.0 0 \n", + "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", + "4 Allen, Mr. William Henry male 35.0 0 \n", + "\n", + " Parch Ticket Fare Cabin Embarked \n", + "0 0 A/5 21171 7.2500 NaN S \n", + "1 0 PC 17599 71.2833 C85 C \n", + "2 0 STON/O2. 3101282 7.9250 NaN S \n", + "3 0 113803 53.1000 C123 S \n", + "4 0 373450 8.0500 NaN S " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
88688702Montvila, Rev. Juozasmale27.00021153613.00NaNS
88788811Graham, Miss. Margaret Edithfemale19.00011205330.00B42S
88888903Johnston, Miss. Catherine Helen \"Carrie\"femaleNaN12W./C. 660723.45NaNS
88989011Behr, Mr. Karl Howellmale26.00011136930.00C148C
89089103Dooley, Mr. Patrickmale32.0003703767.75NaNQ
\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Pclass Name \\\n", + "886 887 0 2 Montvila, Rev. Juozas \n", + "887 888 1 1 Graham, Miss. Margaret Edith \n", + "888 889 0 3 Johnston, Miss. Catherine Helen \"Carrie\" \n", + "889 890 1 1 Behr, Mr. Karl Howell \n", + "890 891 0 3 Dooley, Mr. Patrick \n", + "\n", + " Sex Age SibSp Parch Ticket Fare Cabin Embarked \n", + "886 male 27.0 0 0 211536 13.00 NaN S \n", + "887 female 19.0 0 0 112053 30.00 B42 S \n", + "888 female NaN 1 2 W./C. 6607 23.45 NaN S \n", + "889 male 26.0 0 0 111369 30.00 C148 C \n", + "890 male 32.0 0 0 370376 7.75 NaN Q " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic.tail()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A check on how Pandas interpreted each of the column data types can be done by requesting the Pandas `dtypes` attribute:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "PassengerId int64\n", + "Survived int64\n", + "Pclass int64\n", + "Name object\n", + "Sex object\n", + "Age float64\n", + "SibSp int64\n", + "Parch int64\n", + "Ticket object\n", + "Fare float64\n", + "Cabin object\n", + "Embarked object\n", + "dtype: object" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> My colleague requested the titanic data as a spreadsheet" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "titanic.to_excel('titanic.xlsx', sheet_name='passengers', index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Whereas `read_*` functions are used to read data to Pandas, the `to_*` functions are used to store data. The `to_excel` function stores the data as an excel file. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The equivalent read function `read_excel` would reload the data to a DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "titanic = pd.read_excel('titanic.xlsx', 'passengers', \n", + " index_col=None, na_values=['NA'])" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Pclass \\\n", + "0 1 0 3 \n", + "1 2 1 1 \n", + "2 3 1 3 \n", + "3 4 1 1 \n", + "4 5 0 3 \n", + "\n", + " Name Sex Age SibSp \\\n", + "0 Braund, Mr. Owen Harris male 22.0 1 \n", + "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", + "2 Heikkinen, Miss. Laina female 26.0 0 \n", + "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", + "4 Allen, Mr. William Henry male 35.0 0 \n", + "\n", + " Parch Ticket Fare Cabin Embarked \n", + "0 0 A/5 21171 7.2500 NaN S \n", + "1 0 PC 17599 71.2833 C85 C \n", + "2 0 STON/O2. 3101282 7.9250 NaN S \n", + "3 0 113803 53.1000 C123 S \n", + "4 0 373450 8.0500 NaN S " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "PassengerId int64\n", + "Survived int64\n", + "Pclass int64\n", + "Name object\n", + "Sex object\n", + "Age float64\n", + "SibSp int64\n", + "Parch int64\n", + "Ticket object\n", + "Fare float64\n", + "Cabin object\n", + "Embarked object\n", + "dtype: object" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> I want to know what other read and write methods are available in Pandas" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## REMEMBER\n", + "\n", + "- Getting data in to Pandas is supported by multiple `read_*` functions.\n", + "- Exporting data out of Pandas is provided by multiple `to_*`functions.\n", + "- The `head`/`tail` methods and `dtypes` attribute are convenient for a first check." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__To user guide:__ For a complete overview of the input and output possibilites from and to Pandas, see :ref:`io`" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 27ae45b9bb08f352c5e62a908b762c3e4b4efc71 Mon Sep 17 00:00:00 2001 From: stijnvanhoey Date: Mon, 7 Oct 2019 18:46:56 +0200 Subject: [PATCH 2/2] update from master --- notebooks/2_read_write.ipynb | 534 +++++++++++++++++++++++++---------- 1 file changed, 381 insertions(+), 153 deletions(-) diff --git a/notebooks/2_read_write.ipynb b/notebooks/2_read_write.ipynb index b1f4a7d..e21c689 100644 --- a/notebooks/2_read_write.ipynb +++ b/notebooks/2_read_write.ipynb @@ -1,40 +1,43 @@ { "cells": [ { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 1, "metadata": {}, + "outputs": [], "source": [ - "Objectives\n", - "\n", - "- Use `read_csv` to read tabular text data into Python\n", - "- Know how to find and use other read_/to_ methods of Pandas\n", - "- Verify the data types of the data after reading data\n", - "\n", - "Content to cover\n", - "\n", - "- read_csv\n", - "- to_excel/read_excel example\n", - "- head()/tail()\n", - "- dtypes\n" + "import pandas as pd" ] }, { - "cell_type": "code", - "execution_count": 4, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "%matplotlib inline" + "
\n", + " \n", + "This tutorial uses the titanic data set, stored as CSV. The data consists of the following data columns:\n", + "\n", + "- PassengerId: Id of every passenger.\n", + "- Survived: This feature has value 0 or 1: 0 for not survived and 1 for survived.\n", + "- Pclass: There are 3 classes: Class 1, Class 2 and Class 3.\n", + "- Name: Name of passenger.\n", + "- Sex: Gender of passenger.\n", + "- Age: Age of passenger.\n", + "- SibSp: Number of siblings and spouses of the passenger aboard.\n", + "- Parch: Number of parents and children of the passenger aboard.\n", + "- Ticket: Ticket number of passenger.\n", + "- Fare: Indicating the fare.\n", + "- Cabin: The cabin of passenger.\n", + "- Embarked: The embarked category.\n", + "\n", + "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Read and write different file formats" + "# How do I read and write tabular data? " ] }, { @@ -48,12 +51,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "> I want to start analyzing the titanic passenger data, available as a CSV file" + "> I want to start analyzing the titanic passenger data, available as a CSV file." ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -64,19 +67,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To read CSV data, the reading functionalality provide the `read_csv` function to read the data into a Pandas DataFrame. Other read methods are available, each of them with the prefix `read_*`. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Make sure to have a first check on the data. Checking the first and or last 5 records is a good first step, provided by the `head`, respectively `tail`, method:" + "Pandas provides the `read_csv` function to read data stored as a csv file into a pandas `DataFrame`. Pandas supports many different file formats or data sources out of the box (csv, excel, sql, json, parquet, ...), each of them with the prefix `read_*`. \n", + "\n", + "Make sure to always have a first check on the data after reading in the data. 
When displaying a `DataFrame`, the first and last 5 rows will be shown by default:" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -116,7 +114,7 @@ " \n", " \n", " \n", - " 0\n", + " 0\n", " 1\n", " 0\n", " 3\n", @@ -131,7 +129,7 @@ " S\n", " \n", " \n", - " 1\n", + " 1\n", " 2\n", " 1\n", " 1\n", @@ -146,7 +144,7 @@ " C\n", " \n", " \n", - " 2\n", + " 2\n", " 3\n", " 1\n", " 3\n", @@ -161,7 +159,7 @@ " S\n", " \n", " \n", - " 3\n", + " 3\n", " 4\n", " 1\n", " 1\n", @@ -176,7 +174,7 @@ " S\n", " \n", " \n", - " 4\n", + " 4\n", " 5\n", " 0\n", " 3\n", @@ -190,45 +188,163 @@ " NaN\n", " S\n", " \n", + " \n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " \n", + " \n", + " 886\n", + " 887\n", + " 0\n", + " 2\n", + " Montvila, Rev. Juozas\n", + " male\n", + " 27.0\n", + " 0\n", + " 0\n", + " 211536\n", + " 13.0000\n", + " NaN\n", + " S\n", + " \n", + " \n", + " 887\n", + " 888\n", + " 1\n", + " 1\n", + " Graham, Miss. Margaret Edith\n", + " female\n", + " 19.0\n", + " 0\n", + " 0\n", + " 112053\n", + " 30.0000\n", + " B42\n", + " S\n", + " \n", + " \n", + " 888\n", + " 889\n", + " 0\n", + " 3\n", + " Johnston, Miss. Catherine Helen \"Carrie\"\n", + " female\n", + " NaN\n", + " 1\n", + " 2\n", + " W./C. 6607\n", + " 23.4500\n", + " NaN\n", + " S\n", + " \n", + " \n", + " 889\n", + " 890\n", + " 1\n", + " 1\n", + " Behr, Mr. Karl Howell\n", + " male\n", + " 26.0\n", + " 0\n", + " 0\n", + " 111369\n", + " 30.0000\n", + " C148\n", + " C\n", + " \n", + " \n", + " 890\n", + " 891\n", + " 0\n", + " 3\n", + " Dooley, Mr. Patrick\n", + " male\n", + " 32.0\n", + " 0\n", + " 0\n", + " 370376\n", + " 7.7500\n", + " NaN\n", + " Q\n", + " \n", " \n", "\n", + "

891 rows × 12 columns

\n", "" ], "text/plain": [ - " PassengerId Survived Pclass \\\n", - "0 1 0 3 \n", - "1 2 1 1 \n", - "2 3 1 3 \n", - "3 4 1 1 \n", - "4 5 0 3 \n", + " PassengerId Survived Pclass \\\n", + "0 1 0 3 \n", + "1 2 1 1 \n", + "2 3 1 3 \n", + "3 4 1 1 \n", + "4 5 0 3 \n", + ".. ... ... ... \n", + "886 887 0 2 \n", + "887 888 1 1 \n", + "888 889 0 3 \n", + "889 890 1 1 \n", + "890 891 0 3 \n", "\n", - " Name Sex Age SibSp \\\n", - "0 Braund, Mr. Owen Harris male 22.0 1 \n", - "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", - "2 Heikkinen, Miss. Laina female 26.0 0 \n", - "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", - "4 Allen, Mr. William Henry male 35.0 0 \n", + " Name Sex Age SibSp \\\n", + "0 Braund, Mr. Owen Harris male 22.0 1 \n", + "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", + "2 Heikkinen, Miss. Laina female 26.0 0 \n", + "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", + "4 Allen, Mr. William Henry male 35.0 0 \n", + ".. ... ... ... ... \n", + "886 Montvila, Rev. Juozas male 27.0 0 \n", + "887 Graham, Miss. Margaret Edith female 19.0 0 \n", + "888 Johnston, Miss. Catherine Helen \"Carrie\" female NaN 1 \n", + "889 Behr, Mr. Karl Howell male 26.0 0 \n", + "890 Dooley, Mr. Patrick male 32.0 0 \n", "\n", - " Parch Ticket Fare Cabin Embarked \n", - "0 0 A/5 21171 7.2500 NaN S \n", - "1 0 PC 17599 71.2833 C85 C \n", - "2 0 STON/O2. 3101282 7.9250 NaN S \n", - "3 0 113803 53.1000 C123 S \n", - "4 0 373450 8.0500 NaN S " + " Parch Ticket Fare Cabin Embarked \n", + "0 0 A/5 21171 7.2500 NaN S \n", + "1 0 PC 17599 71.2833 C85 C \n", + "2 0 STON/O2. 3101282 7.9250 NaN S \n", + "3 0 113803 53.1000 C123 S \n", + "4 0 373450 8.0500 NaN S \n", + ".. ... ... ... ... ... \n", + "886 0 211536 13.0000 NaN S \n", + "887 0 112053 30.0000 B42 S \n", + "888 2 W./C. 
6607 23.4500 NaN S \n", + "889 0 111369 30.0000 C148 C \n", + "890 0 370376 7.7500 NaN Q \n", + "\n", + "[891 rows x 12 columns]" ] }, - "execution_count": 12, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "titanic.head()" + "titanic" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> I want to see the first 8 rows of a pandas DataFrame." ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -268,107 +384,186 @@ " \n", " \n", " \n", - " 886\n", - " 887\n", " 0\n", - " 2\n", - " Montvila, Rev. Juozas\n", - " male\n", - " 27.0\n", + " 1\n", " 0\n", + " 3\n", + " Braund, Mr. Owen Harris\n", + " male\n", + " 22.0\n", + " 1\n", " 0\n", - " 211536\n", - " 13.00\n", + " A/5 21171\n", + " 7.2500\n", " NaN\n", " S\n", " \n", " \n", - " 887\n", - " 888\n", " 1\n", + " 2\n", " 1\n", - " Graham, Miss. Margaret Edith\n", + " 1\n", + " Cumings, Mrs. John Bradley (Florence Briggs Th...\n", " female\n", - " 19.0\n", - " 0\n", + " 38.0\n", + " 1\n", " 0\n", - " 112053\n", - " 30.00\n", - " B42\n", - " S\n", + " PC 17599\n", + " 71.2833\n", + " C85\n", + " C\n", " \n", " \n", - " 888\n", - " 889\n", - " 0\n", + " 2\n", " 3\n", - " Johnston, Miss. Catherine Helen \"Carrie\"\n", - " female\n", - " NaN\n", " 1\n", - " 2\n", - " W./C. 6607\n", - " 23.45\n", + " 3\n", + " Heikkinen, Miss. Laina\n", + " female\n", + " 26.0\n", + " 0\n", + " 0\n", + " STON/O2. 3101282\n", + " 7.9250\n", " NaN\n", " S\n", " \n", " \n", - " 889\n", - " 890\n", + " 3\n", + " 4\n", " 1\n", " 1\n", - " Behr, Mr. Karl Howell\n", + " Futrelle, Mrs. Jacques Heath (Lily May Peel)\n", + " female\n", + " 35.0\n", + " 1\n", + " 0\n", + " 113803\n", + " 53.1000\n", + " C123\n", + " S\n", + " \n", + " \n", + " 4\n", + " 5\n", + " 0\n", + " 3\n", + " Allen, Mr. 
William Henry\n", " male\n", - " 26.0\n", + " 35.0\n", " 0\n", " 0\n", - " 111369\n", - " 30.00\n", - " C148\n", - " C\n", + " 373450\n", + " 8.0500\n", + " NaN\n", + " S\n", " \n", " \n", - " 890\n", - " 891\n", + " 5\n", + " 6\n", " 0\n", " 3\n", - " Dooley, Mr. Patrick\n", + " Moran, Mr. James\n", " male\n", - " 32.0\n", + " NaN\n", " 0\n", " 0\n", - " 370376\n", - " 7.75\n", + " 330877\n", + " 8.4583\n", " NaN\n", " Q\n", " \n", + " \n", + " 6\n", + " 7\n", + " 0\n", + " 1\n", + " McCarthy, Mr. Timothy J\n", + " male\n", + " 54.0\n", + " 0\n", + " 0\n", + " 17463\n", + " 51.8625\n", + " E46\n", + " S\n", + " \n", + " \n", + " 7\n", + " 8\n", + " 0\n", + " 3\n", + " Palsson, Master. Gosta Leonard\n", + " male\n", + " 2.0\n", + " 3\n", + " 1\n", + " 349909\n", + " 21.0750\n", + " NaN\n", + " S\n", + " \n", " \n", "\n", "" ], "text/plain": [ - " PassengerId Survived Pclass Name \\\n", - "886 887 0 2 Montvila, Rev. Juozas \n", - "887 888 1 1 Graham, Miss. Margaret Edith \n", - "888 889 0 3 Johnston, Miss. Catherine Helen \"Carrie\" \n", - "889 890 1 1 Behr, Mr. Karl Howell \n", - "890 891 0 3 Dooley, Mr. Patrick \n", + " PassengerId Survived Pclass \\\n", + "0 1 0 3 \n", + "1 2 1 1 \n", + "2 3 1 3 \n", + "3 4 1 1 \n", + "4 5 0 3 \n", + "5 6 0 3 \n", + "6 7 0 1 \n", + "7 8 0 3 \n", + "\n", + " Name Sex Age SibSp \\\n", + "0 Braund, Mr. Owen Harris male 22.0 1 \n", + "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", + "2 Heikkinen, Miss. Laina female 26.0 0 \n", + "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", + "4 Allen, Mr. William Henry male 35.0 0 \n", + "5 Moran, Mr. James male NaN 0 \n", + "6 McCarthy, Mr. Timothy J male 54.0 0 \n", + "7 Palsson, Master. Gosta Leonard male 2.0 3 \n", "\n", - " Sex Age SibSp Parch Ticket Fare Cabin Embarked \n", - "886 male 27.0 0 0 211536 13.00 NaN S \n", - "887 female 19.0 0 0 112053 30.00 B42 S \n", - "888 female NaN 1 2 W./C. 
6607 23.45 NaN S \n", - "889 male 26.0 0 0 111369 30.00 C148 C \n", - "890 male 32.0 0 0 370376 7.75 NaN Q " + " Parch Ticket Fare Cabin Embarked \n", + "0 0 A/5 21171 7.2500 NaN S \n", + "1 0 PC 17599 71.2833 C85 C \n", + "2 0 STON/O2. 3101282 7.9250 NaN S \n", + "3 0 113803 53.1000 C123 S \n", + "4 0 373450 8.0500 NaN S \n", + "5 0 330877 8.4583 NaN Q \n", + "6 0 17463 51.8625 E46 S \n", + "7 1 349909 21.0750 NaN S " ] }, - "execution_count": 13, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "titanic.tail()" + "titanic.head(8)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To see the first N rows of a `DataFrame`, use the `head` method with the required number of rows (in this case 8) as argument. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + " \n", + "__Note__: Interested in the last N rows instead? Pandas also provides a `tail` method. For example, `titanic.tail(10)` will return the last 10 rows of the DataFrame.\n", + "\n", + "
" ] }, { @@ -380,7 +575,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -401,7 +596,7 @@ "dtype: object" ] }, - "execution_count": 14, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -414,12 +609,30 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "> My colleague requested the titanic data as a spreadsheet" + "For each of the columns, the used data type is listed. The data types in this `DataFrame` are integers (`int64`), floats (`float64`) and strings (`object`)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + " \n", + "__Note__: When asking for the `dtypes`, no brackets are used! `dtypes` is an attribute of a `DataFrame` and `Series`. Attributes of `DataFrame` or `Series` do not need brackets. Attributes represent a characteristic of a `DataFrame`/`Series`, whereas a method (which requires brackets) _does_ something with the `DataFrame`/`Series` as introduced in the [first tutorial](./1_table_oriented.ipynb).\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> My colleague requested the titanic data as a spreadsheet." ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -430,7 +643,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Whereas `read_*` functions are used to read data to Pandas, the `to_*` functions are used to store data. The `to_excel` function stores the data as an excel file. " + "Whereas `read_*` functions are used to read data to Pandas, the `to_*` methods are used to store data. The `to_excel` method stores the data as an excel file. In the example here, the `sheet_name` is named _passengers_ instead of the default _Sheet1_. By setting `index=False` the row index labels are not saved in the spreadsheet." ] }, { @@ -442,17 +655,16 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "titanic = pd.read_excel('titanic.xlsx', 'passengers', \n", - " index_col=None, na_values=['NA'])" + "titanic = pd.read_excel('titanic.xlsx', sheet_name='passengers')" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -492,7 +704,7 @@ " \n", " \n", " \n", - " 0\n", + " 0\n", " 1\n", " 0\n", " 3\n", @@ -507,7 +719,7 @@ " S\n", " \n", " \n", - " 1\n", + " 1\n", " 2\n", " 1\n", " 1\n", @@ -522,7 +734,7 @@ " C\n", " \n", " \n", - " 2\n", + " 2\n", " 3\n", " 1\n", " 3\n", @@ -537,7 +749,7 @@ " S\n", " \n", " \n", - " 3\n", + " 3\n", " 4\n", " 1\n", " 1\n", @@ -552,7 +764,7 @@ " S\n", " \n", " \n", - " 4\n", + " 4\n", " 5\n", " 0\n", " 3\n", @@ -593,7 +805,7 @@ "4 0 373450 8.0500 NaN S " ] }, - "execution_count": 17, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -602,43 +814,59 @@ "titanic.head()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> I'm interested in a technical summary of a
`DataFrame`" + ] + }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 12, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "PassengerId int64\n", - "Survived int64\n", - "Pclass int64\n", - "Name object\n", - "Sex object\n", - "Age float64\n", - "SibSp int64\n", - "Parch int64\n", - "Ticket object\n", - "Fare float64\n", - "Cabin object\n", - "Embarked object\n", - "dtype: object" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 891 entries, 0 to 890\n", + "Data columns (total 12 columns):\n", + "PassengerId 891 non-null int64\n", + "Survived 891 non-null int64\n", + "Pclass 891 non-null int64\n", + "Name 891 non-null object\n", + "Sex 891 non-null object\n", + "Age 714 non-null float64\n", + "SibSp 891 non-null int64\n", + "Parch 891 non-null int64\n", + "Ticket 891 non-null object\n", + "Fare 891 non-null float64\n", + "Cabin 204 non-null object\n", + "Embarked 889 non-null object\n", + "dtypes: float64(2), int64(5), object(5)\n", + "memory usage: 83.7+ KB\n" + ] } ], "source": [ - "titanic.dtypes" + "titanic.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "> I want to know what other read and write methods are available in Pandas" + "The command provides a lot of technical information about the `DataFrame`, so let's explain the output in more detail:\n", + "\n", + "- It is indeed a `DataFrame`.\n", + "- There are 891 entries, i.e. 891 rows.\n", + "- Each row has a row label (aka the `index`) with values ranging from 0 to 890.\n", + "- The table has 12 columns. Most columns have a value for each of the rows (all 891 values are `non-null`). Some columns do have missing values and fewer than 891 `non-null` values. \n", + "- The columns `Name`, `Sex`, `Ticket`, `Cabin` and `Embarked` consist of textual data (strings, aka `object`).
The other columns are numerical data with some of them whole numbers (aka `integer`) and others are real numbers (aka `float`).\n", + "- The kind of data (characters, integers,...) in the different columns is summarized by listing the `dtypes`.\n", + "- The approximate amount of RAM used to hold the DataFrame is provided as well." ] }, { @@ -647,9 +875,9 @@ "source": [ "## REMEMBER\n", "\n", - "- Getting data in to Pandas is supported by multiple `read_*` functions.\n", - "- Exporting data out of Pandas is provided by multiple `to_*`functions.\n", - "- The `head`/`tail` methods and `dtypes` attribute are convenient for a first check." + "- Getting data into Pandas from many different file formats or data sources is supported by `read_*` functions.\n", + "- Exporting data out of Pandas is provided by different `to_*` methods.\n", + "- The `head`/`tail`/`info` methods and the `dtypes` attribute are convenient for a first check." ] }, {