Skip to content

Commit c3e4d28

Browse files
author
craigsdennis
committed
Review pass
1 parent 99db252 commit c3e4d28

File tree

1 file changed

+193
-26
lines changed

1 file changed

+193
-26
lines changed

s2n10-grouping.ipynb

Lines changed: 193 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,12 @@
11
{
22
"cells": [
3-
{
4-
"cell_type": "markdown",
5-
"metadata": {},
6-
"source": [
7-
"[![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/treehouse-projects/python-introducing-pandas/master?filepath=s2n7-grouping.ipynb)"
8-
]
9-
},
103
{
114
"cell_type": "markdown",
125
"metadata": {},
136
"source": [
147
"# Grouping\n",
158
"\n",
16-
"When dealing with data a common need arises where you need to look at an aggregate view of a `DataFrame` by a certain value. This is where grouping comes in.\n",
9+
"You will often need to look at an aggregate view of a `DataFrame` by a certain value. This is where grouping comes in.\n",
1710
"\n",
1811
"CashBox has asked that we produce a list of the top 10 users who have been on the receiving side of transactions the most. They would like to see the user's first and last name, their email, and the total number of transactions where the user was the receiver.\n",
1912
"\n",
@@ -46,6 +39,7 @@
4639
"\n",
4740
"import pandas as pd\n",
4841
"\n",
42+
"pd.options.display.max_rows = 10\n",
4943
"users = pd.read_csv(os.path.join('data', 'users.csv'), index_col=0)\n",
5044
"transactions = pd.read_csv(os.path.join('data', 'transactions.csv'), index_col=0)\n",
5145
"# Sanity check\n",
@@ -108,6 +102,7 @@
108102
],
109103
"source": [
110104
"grouped_by_receiver = transactions.groupby('receiver')\n",
105+
"\n",
111106
"# Let's see what type of object we got back\n",
112107
"type(grouped_by_receiver)"
113108
]
@@ -123,7 +118,7 @@
123118
},
124119
{
125120
"cell_type": "code",
126-
"execution_count": 4,
121+
"execution_count": 11,
127122
"metadata": {},
128123
"outputs": [
129124
{
@@ -135,17 +130,23 @@
135130
"adam.saunders 2\n",
136131
"adrian 3\n",
137132
"adrian.blair 7\n",
138-
"dtype: int64"
133+
" ..\n",
134+
"wilson 2\n",
135+
"wking 2\n",
136+
"wright3590 4\n",
137+
"young 2\n",
138+
"zachary.neal 4\n",
139+
"Length: 410, dtype: int64"
139140
]
140141
},
141-
"execution_count": 4,
142+
"execution_count": 11,
142143
"metadata": {},
143144
"output_type": "execute_result"
144145
}
145146
],
146147
"source": [
147148
"# Returns a Series containing the number of rows in each group\n",
148-
"grouped_by_receiver.size().head()"
149+
"grouped_by_receiver.size()"
149150
]
150151
},
151152
{
@@ -157,7 +158,7 @@
157158
},
158159
{
159160
"cell_type": "code",
160-
"execution_count": 5,
161+
"execution_count": 12,
161162
"metadata": {},
162163
"outputs": [
163164
{
@@ -223,8 +224,45 @@
223224
" <td>7</td>\n",
224225
" <td>7</td>\n",
225226
" </tr>\n",
227+
" <tr>\n",
228+
" <th>...</th>\n",
229+
" <td>...</td>\n",
230+
" <td>...</td>\n",
231+
" <td>...</td>\n",
232+
" </tr>\n",
233+
" <tr>\n",
234+
" <th>wilson</th>\n",
235+
" <td>2</td>\n",
236+
" <td>2</td>\n",
237+
" <td>2</td>\n",
238+
" </tr>\n",
239+
" <tr>\n",
240+
" <th>wking</th>\n",
241+
" <td>2</td>\n",
242+
" <td>2</td>\n",
243+
" <td>2</td>\n",
244+
" </tr>\n",
245+
" <tr>\n",
246+
" <th>wright3590</th>\n",
247+
" <td>4</td>\n",
248+
" <td>4</td>\n",
249+
" <td>4</td>\n",
250+
" </tr>\n",
251+
" <tr>\n",
252+
" <th>young</th>\n",
253+
" <td>2</td>\n",
254+
" <td>2</td>\n",
255+
" <td>2</td>\n",
256+
" </tr>\n",
257+
" <tr>\n",
258+
" <th>zachary.neal</th>\n",
259+
" <td>4</td>\n",
260+
" <td>4</td>\n",
261+
" <td>4</td>\n",
262+
" </tr>\n",
226263
" </tbody>\n",
227264
"</table>\n",
265+
"<p>410 rows × 3 columns</p>\n",
228266
"</div>"
229267
],
230268
"text/plain": [
@@ -234,16 +272,24 @@
234272
"acook 1 1 1\n",
235273
"adam.saunders 2 2 2\n",
236274
"adrian 3 3 3\n",
237-
"adrian.blair 7 7 7"
275+
"adrian.blair 7 7 7\n",
276+
"... ... ... ...\n",
277+
"wilson 2 2 2\n",
278+
"wking 2 2 2\n",
279+
"wright3590 4 4 4\n",
280+
"young 2 2 2\n",
281+
"zachary.neal 4 4 4\n",
282+
"\n",
283+
"[410 rows x 3 columns]"
238284
]
239285
},
240-
"execution_count": 5,
286+
"execution_count": 12,
241287
"metadata": {},
242288
"output_type": "execute_result"
243289
}
244290
],
245291
"source": [
246-
"grouped_by_receiver.count().head()"
292+
"grouped_by_receiver.count()"
247293
]
248294
},
249295
{
@@ -307,8 +353,33 @@
307353
" <th>adrian.blair</th>\n",
308354
" <td>462.88</td>\n",
309355
" </tr>\n",
356+
" <tr>\n",
357+
" <th>...</th>\n",
358+
" <td>...</td>\n",
359+
" </tr>\n",
360+
" <tr>\n",
361+
" <th>wilson</th>\n",
362+
" <td>44.39</td>\n",
363+
" </tr>\n",
364+
" <tr>\n",
365+
" <th>wking</th>\n",
366+
" <td>74.07</td>\n",
367+
" </tr>\n",
368+
" <tr>\n",
369+
" <th>wright3590</th>\n",
370+
" <td>195.45</td>\n",
371+
" </tr>\n",
372+
" <tr>\n",
373+
" <th>young</th>\n",
374+
" <td>83.57</td>\n",
375+
" </tr>\n",
376+
" <tr>\n",
377+
" <th>zachary.neal</th>\n",
378+
" <td>186.01</td>\n",
379+
" </tr>\n",
310380
" </tbody>\n",
311381
"</table>\n",
382+
"<p>410 rows × 1 columns</p>\n",
312383
"</div>"
313384
],
314385
"text/plain": [
@@ -318,7 +389,15 @@
318389
"acook 94.65\n",
319390
"adam.saunders 101.15\n",
320391
"adrian 124.36\n",
321-
"adrian.blair 462.88"
392+
"adrian.blair 462.88\n",
393+
"... ...\n",
394+
"wilson 44.39\n",
395+
"wking 74.07\n",
396+
"wright3590 195.45\n",
397+
"young 83.57\n",
398+
"zachary.neal 186.01\n",
399+
"\n",
400+
"[410 rows x 1 columns]"
322401
]
323402
},
324403
"execution_count": 6,
@@ -327,7 +406,7 @@
327406
}
328407
],
329408
"source": [
330-
"grouped_by_receiver.sum().head()"
409+
"grouped_by_receiver.sum()"
331410
]
332411
},
333412
{
@@ -356,6 +435,7 @@
356435
"source": [
357436
"# Create a new column in users called transaction_count, and set each value to the size of the matching receiver group\n",
358437
"users['transaction_count'] = grouped_by_receiver.size()\n",
438+
"\n",
359439
"# Not every user has made a transaction, let's see what kind of missing data we are dealing with\n",
360440
"len(users[users.transaction_count.isna()])"
361441
]
@@ -459,31 +539,118 @@
459539
" <td>25.85</td>\n",
460540
" <td>7.0</td>\n",
461541
" </tr>\n",
542+
" <tr>\n",
543+
" <th>...</th>\n",
544+
" <td>...</td>\n",
545+
" <td>...</td>\n",
546+
" <td>...</td>\n",
547+
" <td>...</td>\n",
548+
" <td>...</td>\n",
549+
" <td>...</td>\n",
550+
" <td>...</td>\n",
551+
" <td>...</td>\n",
552+
" </tr>\n",
553+
" <tr>\n",
554+
" <th>wilson</th>\n",
555+
" <td>Robert</td>\n",
556+
" <td>Wilson</td>\n",
557+
" <td>robert@yahoo.com</td>\n",
558+
" <td>False</td>\n",
559+
" <td>2018-05-16</td>\n",
560+
" <td>5</td>\n",
561+
" <td>59.75</td>\n",
562+
" <td>2.0</td>\n",
563+
" </tr>\n",
564+
" <tr>\n",
565+
" <th>wking</th>\n",
566+
" <td>Wanda</td>\n",
567+
" <td>King</td>\n",
568+
" <td>wanda.king@holt.com</td>\n",
569+
" <td>True</td>\n",
570+
" <td>2018-06-01</td>\n",
571+
" <td>2</td>\n",
572+
" <td>67.08</td>\n",
573+
" <td>2.0</td>\n",
574+
" </tr>\n",
575+
" <tr>\n",
576+
" <th>wright3590</th>\n",
577+
" <td>Jacqueline</td>\n",
578+
" <td>Wright</td>\n",
579+
" <td>jacqueline.wright@gonzalez.com</td>\n",
580+
" <td>True</td>\n",
581+
" <td>2018-02-08</td>\n",
582+
" <td>6</td>\n",
583+
" <td>18.48</td>\n",
584+
" <td>4.0</td>\n",
585+
" </tr>\n",
586+
" <tr>\n",
587+
" <th>young</th>\n",
588+
" <td>Jessica</td>\n",
589+
" <td>Young</td>\n",
590+
" <td>jessica4028@yahoo.com</td>\n",
591+
" <td>True</td>\n",
592+
" <td>2018-07-17</td>\n",
593+
" <td>4</td>\n",
594+
" <td>75.39</td>\n",
595+
" <td>2.0</td>\n",
596+
" </tr>\n",
597+
" <tr>\n",
598+
" <th>zachary.neal</th>\n",
599+
" <td>Zachary</td>\n",
600+
" <td>Neal</td>\n",
601+
" <td>zneal@gmail.com</td>\n",
602+
" <td>True</td>\n",
603+
" <td>2018-07-26</td>\n",
604+
" <td>1</td>\n",
605+
" <td>39.90</td>\n",
606+
" <td>4.0</td>\n",
607+
" </tr>\n",
462608
" </tbody>\n",
463609
"</table>\n",
610+
"<p>475 rows × 8 columns</p>\n",
464611
"</div>"
465612
],
466613
"text/plain": [
467-
" first_name last_name email \\\n",
468-
"aaron Aaron Davis aaron6348@gmail.com \n",
469-
"acook Anthony Cook cook@gmail.com \n",
470-
"adam.saunders Adam Saunders adam@gmail.com \n",
471-
"adrian Adrian Yang adrian.yang@teamtreehouse.com \n",
472-
"adrian.blair Adrian Blair adrian9335@gmail.com \n",
614+
" first_name last_name email \\\n",
615+
"aaron Aaron Davis aaron6348@gmail.com \n",
616+
"acook Anthony Cook cook@gmail.com \n",
617+
"adam.saunders Adam Saunders adam@gmail.com \n",
618+
"adrian Adrian Yang adrian.yang@teamtreehouse.com \n",
619+
"adrian.blair Adrian Blair adrian9335@gmail.com \n",
620+
"... ... ... ... \n",
621+
"wilson Robert Wilson robert@yahoo.com \n",
622+
"wking Wanda King wanda.king@holt.com \n",
623+
"wright3590 Jacqueline Wright jacqueline.wright@gonzalez.com \n",
624+
"young Jessica Young jessica4028@yahoo.com \n",
625+
"zachary.neal Zachary Neal zneal@gmail.com \n",
473626
"\n",
474627
" email_verified signup_date referral_count balance \\\n",
475628
"aaron True 2018-08-31 6 18.14 \n",
476629
"acook True 2018-05-12 2 55.45 \n",
477630
"adam.saunders False 2018-05-29 3 72.12 \n",
478631
"adrian True 2018-04-28 3 30.01 \n",
479632
"adrian.blair True 2018-06-16 7 25.85 \n",
633+
"... ... ... ... ... \n",
634+
"wilson False 2018-05-16 5 59.75 \n",
635+
"wking True 2018-06-01 2 67.08 \n",
636+
"wright3590 True 2018-02-08 6 18.48 \n",
637+
"young True 2018-07-17 4 75.39 \n",
638+
"zachary.neal True 2018-07-26 1 39.90 \n",
480639
"\n",
481640
" transaction_count \n",
482641
"aaron 6.0 \n",
483642
"acook 1.0 \n",
484643
"adam.saunders 2.0 \n",
485644
"adrian 3.0 \n",
486-
"adrian.blair 7.0 "
645+
"adrian.blair 7.0 \n",
646+
"... ... \n",
647+
"wilson 2.0 \n",
648+
"wking 2.0 \n",
649+
"wright3590 4.0 \n",
650+
"young 2.0 \n",
651+
"zachary.neal 4.0 \n",
652+
"\n",
653+
"[475 rows x 8 columns]"
487654
]
488655
},
489656
"execution_count": 8,
@@ -494,7 +661,7 @@
494661
"source": [
495662
"# Set missing transaction counts to 0, since those users have not received any transactions\n",
496663
"users.transaction_count.fillna(0, inplace=True)\n",
497-
"users.head()"
664+
"users"
498665
]
499666
},
500667
{

0 commit comments

Comments
 (0)