From 6ac0e21f0ac24b7904f8e93821f347cfd610b2e5 Mon Sep 17 00:00:00 2001
From: Priyatharsan Rajasekar
Date: Fri, 18 Jan 2019 16:29:19 -0500
Subject: [PATCH] fixed typo; removed outdated APIs; changed styling wrt plotly brand colors

---
 ...15-06-30-principal_component_analysis.html | 541 +++++++++---------
 .../principal_component_analysis.ipynb        | 361 ++++++------
 2 files changed, 479 insertions(+), 423 deletions(-)

diff --git a/_posts/ipython-notebooks/2015-06-30-principal_component_analysis.html b/_posts/ipython-notebooks/2015-06-30-principal_component_analysis.html
index 0bc5594ffa3e..969aa44549f8 100755
--- a/_posts/ipython-notebooks/2015-06-30-principal_component_analysis.html
+++ b/_posts/ipython-notebooks/2015-06-30-principal_component_analysis.html
@@ -1,17 +1,17 @@
 ---
 permalink: ipython-notebooks/principal-component-analysis/
 description: A step by step tutorial to Principal Component Analysis, a simple yet powerful transformation technique.
-title: Principal Component Analysis in 3 Simple Steps
+name: Principal Component Analysis in 3 Simple Steps
 has_thumbnail: false
 thumbnail: /images/static-image
-layout: user-guide
 name: Principal Component Analysis
-language: python
+ipynb: ~notebook_demo/264
+layout: user-guide
 page_type: u-guide
+language: python
 ---
 {% raw %}
-
-
+
@@ -22,8 +22,7 @@
-
-
+
@@ -32,8 +31,7 @@

Principal Component Anal

-
-
+
@@ -42,8 +40,7 @@

Introduction&#

-
-
+ -
-
+ -
-
+
-
-
+
-

Often, the desired goal is to reduce the dimensions of a $d$-dimensional dataset by projecting it onto a $(k)$-dimensional subspace (where $k\;<\;d$) in order to increase the computational efficiency while retaining most of the information. An important question is "what is the size of $k$ that represents the data 'well'?"

+

Often, the desired goal is to reduce the dimensions of a $d$-dimensional dataset by projecting it onto a $(k)$-dimensional subspace (where $k\;<\;d$) in order to increase the computational efficiency while retaining most of the information. An important question is "what is the size of $k$ that represents the data 'well'?"

Later, we will compute eigenvectors (the principal components) of a dataset and collect them in a projection matrix. Each of those eigenvectors is associated with an eigenvalue, which can be interpreted as the "length" or "magnitude" of the corresponding eigenvector. If some eigenvalues have a significantly larger magnitude than others, then the reduction of the dataset via PCA onto a smaller-dimensional subspace by dropping the "less informative" eigenpairs is reasonable.
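To make the projection idea above concrete, here is a minimal NumPy sketch (an illustrative addition, not part of the patched notebook; the randomly generated X_std merely stands in for the standardized Iris data prepared later):

import numpy as np

# stand-in for a standardized data matrix: 150 samples, d = 4 features
rng = np.random.RandomState(0)
X_std = rng.randn(150, 4)

# eigendecomposition of the covariance matrix
cov_mat = np.cov(X_std.T)
eig_vals, eig_vecs = np.linalg.eig(cov_mat)

# keep the k eigenvectors (columns of eig_vecs) with the largest eigenvalues,
# i.e. the "most informative" eigenpairs
k = 2
top = np.argsort(eig_vals)[::-1][:k]
matrix_w = eig_vecs[:, top]      # d x k projection matrix

# project the d-dimensional samples onto the k-dimensional subspace
Y = X_std.dot(matrix_w)          # 150 x k
print(Y.shape)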

-
-
+
@@ -91,8 +84,7 @@

A Summary of the PCA Approach -
+
@@ -107,8 +99,7 @@

A Summary of the PCA Approach -
+
@@ -116,8 +107,7 @@

Preparing the Iris Dataset -
+
@@ -141,8 +131,7 @@

Preparing the Iris Dataset -
+
@@ -150,8 +139,7 @@

Loading the Dataset -
+
-
import pandas as pd
+
import pandas as pd
 
 df = pd.read_csv(
     filepath_or_buffer='https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', 
-    header=None, 
+    header=None, 
     sep=',')
 
 df.columns=['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class']
-df.dropna(how="all", inplace=True) # drops the empty line at file-end
+df.dropna(how="all", inplace=True) # drops the empty line at file-end
 
 df.tail()
 
@@ -187,11 +175,26 @@

Loading the Dataset +
Out[1]:
+
-
+
+ @@ -206,43 +209,43 @@

Loading the DatasetLoading the DatasetIn [2]:
-
# split data table into data X and class labels y
+
# split data table into data X and class labels y
 
-X = df.ix[:,0:4].values
-y = df.ix[:,4].values
+X = df.iloc[:,0:4].values
+y = df.iloc[:,4].values
 
@@ -271,8 +274,7 @@

Loading the Dataset -
+
@@ -284,8 +286,7 @@

Loading the Dataset -
+
In [4]:
-
# plotting histograms
+
import plotly.plotly as py
 
-traces = []
+# plotting histograms
+data = []
 
-legend = {0:False, 1:False, 2:False, 3:True}
+legend = {0:False, 1:False, 2:False, 3:True}
 
-colors = {'Iris-setosa': 'rgb(31, 119, 180)', 
-          'Iris-versicolor': 'rgb(255, 127, 14)', 
-          'Iris-virginica': 'rgb(44, 160, 44)'}
+colors = {'Iris-setosa': '#0D76BF', 
+          'Iris-versicolor': '#00cc96', 
+          'Iris-virginica': '#EF553B'}
 
 for col in range(4):
     for key in colors:
-        traces.append(Histogram(x=X[y==key, col], 
-                        opacity=0.75,
-                        xaxis='x%s' %(col+1),
-                        marker=Marker(color=colors[key]),
-                        name=key,
-                        showlegend=legend[col]))
-
-data = Data(traces)
-
-layout = Layout(barmode='overlay',
-                xaxis=XAxis(domain=[0, 0.25], title='sepal length (cm)'),
-                xaxis2=XAxis(domain=[0.3, 0.5], title='sepal width (cm)'),
-                xaxis3=XAxis(domain=[0.55, 0.75], title='petal length (cm)'),
-                xaxis4=XAxis(domain=[0.8, 1], title='petal width (cm)'),
-                yaxis=YAxis(title='count'),
-                title='Distribution of the different Iris flower features')
-
-fig = Figure(data=data, layout=layout)
-py.iplot(fig)
+        trace = dict(
+            type='histogram',
+            x=list(X[y==key, col]),
+            opacity=0.75,
+            xaxis='x%s' %(col+1),
+            marker=dict(color=colors[key]),
+            name=key,
+            showlegend=legend[col]
+        )
+        data.append(trace)
+
+layout = dict(
+    barmode='overlay',
+    xaxis=dict(domain=[0, 0.25], title='sepal length (cm)'),
+    xaxis2=dict(domain=[0.3, 0.5], title='sepal width (cm)'),
+    xaxis3=dict(domain=[0.55, 0.75], title='petal length (cm)'),
+    xaxis4=dict(domain=[0.8, 1], title='petal width (cm)'),
+    yaxis=dict(title='count'),
+    title='Distribution of the different Iris flower features'
+)
+
+fig = dict(data=data, layout=layout)
+py.iplot(fig, filename='exploratory-vis-histogram')
 
@@ -356,11 +347,13 @@

Exploratory Visualization +
Out[4]:
+
- +

@@ -369,8 +362,7 @@

Exploratory Visualization -
+
-
-
+
@@ -390,10 +381,10 @@

Standardizing
-
In [5]:
+
In [6]:
- -
-
+
@@ -411,8 +401,7 @@

1 - Eig

-
-
+
@@ -421,8 +410,7 @@

1 - Eig

-
-
+
@@ -430,8 +418,7 @@

Covariance Matrix -
+
@@ -448,13 +435,13 @@

Covariance Matrix
-
In [6]:
+
In [7]:
-
import numpy as np
+
import numpy as np
 mean_vec = np.mean(X_std, axis=0)
 cov_mat = (X_std - mean_vec).T.dot((X_std - mean_vec)) / (X_std.shape[0]-1)
-print('Covariance matrix \n%s' %cov_mat)
+print('Covariance matrix \n%s' %cov_mat)
 
@@ -466,8 +453,10 @@

Covariance Matrix +
+
Covariance matrix 
 [[ 1.00671141 -0.11010327  0.87760486  0.82344326]
@@ -482,8 +471,7 @@ 

Covariance Matrix -
+
@@ -494,10 +482,10 @@

Covariance Matrix
-
In [7]:
+
In [8]:
-
print('NumPy covariance matrix: \n%s' %np.cov(X_std.T))
+
print('NumPy covariance matrix: \n%s' %np.cov(X_std.T))
 
@@ -509,8 +497,10 @@

Covariance Matrix +
+
NumPy covariance matrix: 
 [[ 1.00671141 -0.11010327  0.87760486  0.82344326]
@@ -525,8 +515,7 @@ 

Covariance Matrix -
+
@@ -537,15 +526,15 @@

Covariance Matrix
-
In [8]:
+
In [9]:
-
cov_mat = np.cov(X_std.T)
+
cov_mat = np.cov(X_std.T)
 
 eig_vals, eig_vecs = np.linalg.eig(cov_mat)
 
-print('Eigenvectors \n%s' %eig_vecs)
-print('\nEigenvalues \n%s' %eig_vals)
+print('Eigenvectors \n%s' %eig_vecs)
+print('\nEigenvalues \n%s' %eig_vals)
 
@@ -557,8 +546,10 @@

Covariance Matrix +
+
Eigenvectors 
 [[ 0.52237162 -0.37231836 -0.72101681  0.26199559]
@@ -567,7 +558,7 @@ 

Covariance MatrixCovariance Matrix -
+
@@ -589,15 +579,15 @@

Correlation Matrix
-
In [9]:
+
In [10]:
-
cor_mat1 = np.corrcoef(X_std.T)
+
cor_mat1 = np.corrcoef(X_std.T)
 
 eig_vals, eig_vecs = np.linalg.eig(cor_mat1)
 
-print('Eigenvectors \n%s' %eig_vecs)
-print('\nEigenvalues \n%s' %eig_vals)
+print('Eigenvectors \n%s' %eig_vecs)
+print('\nEigenvalues \n%s' %eig_vals)
 
@@ -609,8 +599,10 @@

Correlation Matrix +
+
Eigenvectors 
 [[ 0.52237162 -0.37231836 -0.72101681  0.26199559]
@@ -619,7 +611,7 @@ 

Correlation MatrixCorrelation Matrix -
+
@@ -640,15 +631,15 @@

Correlation Matrix
-
In [10]:
+
In [11]:
-
cor_mat2 = np.corrcoef(X.T)
+
cor_mat2 = np.corrcoef(X.T)
 
 eig_vals, eig_vecs = np.linalg.eig(cor_mat2)
 
-print('Eigenvectors \n%s' %eig_vecs)
-print('\nEigenvalues \n%s' %eig_vals)
+print('Eigenvectors \n%s' %eig_vecs)
+print('\nEigenvalues \n%s' %eig_vals)
 
@@ -660,8 +651,10 @@

Correlation Matrix +
+
Eigenvectors 
 [[ 0.52237162 -0.37231836 -0.72101681  0.26199559]
@@ -670,7 +663,7 @@ 

Correlation MatrixCorrelation Matrix -
+
@@ -694,8 +686,7 @@

Correlation Matrix -
+
@@ -706,10 +697,10 @@

Singular Vector Decomposition
-
In [11]:
+
In [12]:
-
u,s,v = np.linalg.svd(X_std.T)
+
u,s,v = np.linalg.svd(X_std.T)
 u
 
@@ -722,7 +713,9 @@

Singular Vector Decomposition -
Out[11]:
+ +
Out[12]:
+ @@ -739,8 +732,7 @@

Singular Vector Decomposition -
+
-
-
+
@@ -760,12 +751,12 @@

2 - Selecting Principal Components
-
In [12]:
+
In [13]:
-
for ev in eig_vecs:
+
for ev in eig_vecs:
     np.testing.assert_array_almost_equal(1.0, np.linalg.norm(ev))
-print('Everything ok!')
+print('Everything ok!')
 
@@ -777,8 +768,10 @@

2 - Selecting Principal Components +
+
-
-
+
-

In order to decide which eigenvector(s) can dropped without losing too much information +

In order to decide which eigenvector(s) can be dropped without losing too much information for the construction of a lower-dimensional subspace, we need to inspect the corresponding eigenvalues: the eigenvectors with the lowest eigenvalues bear the least information about the distribution of the data; those are the ones that can be dropped.
In order to do so, the common approach is to rank the eigenvalues from highest to lowest in order to choose the top $k$ eigenvectors.
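A minimal sketch of this ranking step (added for illustration; it assumes the eig_vals and eig_vecs computed in the cells above and mirrors the approach described here, not necessarily the notebook's exact code):

import numpy as np

# pair each eigenvalue with its eigenvector (the columns of eig_vecs), then sort by eigenvalue
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

# the top-k eigenvectors (here k = 2) become the columns of the projection matrix
matrix_w = np.hstack((eig_pairs[0][1].reshape(-1, 1),
                      eig_pairs[1][1].reshape(-1, 1)))
print(matrix_w.shape)   # (4, 2) for the four Iris features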

@@ -803,10 +795,10 @@

2 - Selecting Principal Components
-
In [13]:
+
In [14]:
-
-
+
@@ -856,31 +849,48 @@

2 - Selecting Principal Components
-
In [14]:
+
In [16]:
-
-
+
-
-
+
@@ -928,13 +938,13 @@

2 - Selecting Principal Components
-
In [15]:
+
In [17]:
-
-
+
@@ -970,8 +982,7 @@

3 - Projection Onto the New F

-
-
+
@@ -983,10 +994,10 @@

3 - Projection Onto the New F

-
In [16]:
+
In [19]:
-
Y = X_std.dot(matrix_w)
+
Y = X_std.dot(matrix_w)
 
@@ -996,34 +1007,38 @@

3 - Projection Onto the New F

-
In [17]:
+
In [20]:
-
traces = []
+
data = []
 
-for name in ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica'):
-
-    trace = Scatter(
+for name, col in zip(('Iris-setosa', 'Iris-versicolor', 'Iris-virginica'), colors.values()):
+    trace = dict(
+        type='scatter',
         x=Y[y==name,0],
         y=Y[y==name,1],
         mode='markers',
         name=name,
-        marker=Marker(
+        marker=dict(
+            color=col,
             size=12,
-            line=Line(
+            line=dict(
                 color='rgba(217, 217, 217, 0.14)',
                 width=0.5),
-            opacity=0.8))
-    traces.append(trace)
-
-
-data = Data(traces)
-layout = Layout(showlegend=True,
-                scene=Scene(xaxis=XAxis(title='PC1'),
-                yaxis=YAxis(title='PC2'),))
-
-fig = Figure(data=data, layout=layout)
-py.iplot(fig)
+            opacity=0.8)
+    )
+    data.append(trace)
+
+layout = dict(
+    showlegend=True,
+    scene=dict(
+        xaxis=dict(title='PC1'),
+        yaxis=dict(title='PC2')
+    )
+)
+
+fig = dict(data=data, layout=layout)
+py.iplot(fig, filename='projection-matrix')
 
@@ -1035,11 +1050,13 @@

3 - Projection Onto the New F
-
Out[17]:
+ +
Out[20]:
+
- +
@@ -1048,8 +1065,7 @@

3 - Projection Onto the New F

-
-
+
@@ -1059,8 +1075,7 @@

3 - Projection Onto the New F

-
-
+
@@ -1068,8 +1083,7 @@

Shortcut - PCA in scikit-learn -
+
@@ -1080,10 +1094,10 @@

Shortcut - PCA in scikit-learn
-
In [18]:
+
In [21]:
-
from sklearn.decomposition import PCA as sklearnPCA
+
from sklearn.decomposition import PCA as sklearnPCA
 sklearn_pca = sklearnPCA(n_components=2)
 Y_sklearn = sklearn_pca.fit_transform(X_std)
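As a quick cross-check of this shortcut (an illustrative addition, assuming the sklearn_pca object and Y_sklearn array from the cell above), the explained_variance_ratio_ attribute should match the explained-variance figures derived from the eigenvalues earlier (there expressed as percentages):

# fraction of total variance captured by each of the two retained components
print(sklearn_pca.explained_variance_ratio_)
print(Y_sklearn.shape)   # (150, 2)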
 
@@ -1095,32 +1109,35 @@

Shortcut - PCA in scikit-learn
-
In [19]:
+
In [23]:
diff --git a/_posts/ipython-notebooks/principal_component_analysis.ipynb b/_posts/ipython-notebooks/principal_component_analysis.ipynb index 59baad29468b..c1973c04a5c2 100755 --- a/_posts/ipython-notebooks/principal_component_analysis.ipynb +++ b/_posts/ipython-notebooks/principal_component_analysis.ipynb @@ -128,7 +128,20 @@ { "data": { "text/html": [ - "
\n", + "
\n", + "\n", "

\n", " \n", " \n", @@ -143,43 +156,43 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
145 6.7 3.0 5.2 2.3 Iris-virginica6.73.05.22.3Iris-virginica
146 6.3 2.5 5.0 1.9 Iris-virginica6.32.55.01.9Iris-virginica
147 6.5 3.0 5.2 2.0 Iris-virginica6.53.05.22.0Iris-virginica
148 6.2 3.4 5.4 2.3 Iris-virginica6.23.45.42.3Iris-virginica
149 5.9 3.0 5.1 1.8 Iris-virginica5.93.05.11.8Iris-virginica
\n", @@ -216,15 +229,13 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# split data table into data X and class labels y\n", "\n", - "X = df.ix[:,0:4].values\n", - "y = df.ix[:,4].values" + "X = df.iloc[:,0:4].values\n", + "y = df.iloc[:,4].values" ] }, { @@ -247,19 +258,6 @@ "To get a feeling for how the 3 different flower classes are distributes along the 4 different features, let us visualize them via histograms." ] }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import plotly.plotly as py\n", - "from plotly.graph_objs import *\n", - "import plotly.tools as tls" - ] - }, { "cell_type": "code", "execution_count": 4, @@ -268,7 +266,7 @@ { "data": { "text/html": [ - "" + "" ], "text/plain": [ "" @@ -280,37 +278,42 @@ } ], "source": [ - "# plotting histograms\n", + "import plotly.plotly as py\n", "\n", - "traces = []\n", + "# plotting histograms\n", + "data = []\n", "\n", "legend = {0:False, 1:False, 2:False, 3:True}\n", "\n", - "colors = {'Iris-setosa': 'rgb(31, 119, 180)', \n", - " 'Iris-versicolor': 'rgb(255, 127, 14)', \n", - " 'Iris-virginica': 'rgb(44, 160, 44)'}\n", + "colors = {'Iris-setosa': '#0D76BF', \n", + " 'Iris-versicolor': '#00cc96', \n", + " 'Iris-virginica': '#EF553B'}\n", "\n", "for col in range(4):\n", " for key in colors:\n", - " traces.append(Histogram(x=X[y==key, col], \n", - " opacity=0.75,\n", - " xaxis='x%s' %(col+1),\n", - " marker=Marker(color=colors[key]),\n", - " name=key,\n", - " showlegend=legend[col]))\n", - "\n", - "data = Data(traces)\n", + " trace = dict(\n", + " type='histogram',\n", + " x=list(X[y==key, col]),\n", + " opacity=0.75,\n", + " xaxis='x%s' %(col+1),\n", + " marker=dict(color=colors[key]),\n", + " name=key,\n", + " showlegend=legend[col]\n", + " )\n", + " data.append(trace)\n", "\n", - "layout = Layout(barmode='overlay',\n", - " xaxis=XAxis(domain=[0, 0.25], title='sepal length (cm)'),\n", - " xaxis2=XAxis(domain=[0.3, 0.5], title='sepal width (cm)'),\n", - " xaxis3=XAxis(domain=[0.55, 0.75], title='petal length (cm)'),\n", - " xaxis4=XAxis(domain=[0.8, 1], title='petal width (cm)'),\n", - " yaxis=YAxis(title='count'),\n", - " title='Distribution of the different Iris flower features')\n", + "layout = dict(\n", + " barmode='overlay',\n", + " xaxis=dict(domain=[0, 0.25], title='sepal length (cm)'),\n", + " xaxis2=dict(domain=[0.3, 0.5], title='sepal width (cm)'),\n", + " xaxis3=dict(domain=[0.55, 0.75], title='petal length (cm)'),\n", + " xaxis4=dict(domain=[0.8, 1], title='petal width (cm)'),\n", + " yaxis=dict(title='count'),\n", + " title='Distribution of the different Iris flower features'\n", + ")\n", "\n", - "fig = Figure(data=data, layout=layout)\n", - "py.iplot(fig)" + "fig = dict(data=data, layout=layout)\n", + "py.iplot(fig, filename='exploratory-vis-histogram')" ] }, { @@ -329,10 +332,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": true - }, + "execution_count": 6, + "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import StandardScaler\n", @@ -377,7 +378,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -408,7 +409,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -436,7 +437,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { 
@@ -450,7 +451,7 @@ " [ 0.56561105 -0.06541577 0.6338014 0.52354627]]\n", "\n", "Eigenvalues \n", - "[ 2.93035378 0.92740362 0.14834223 0.02074601]\n" + "[2.93035378 0.92740362 0.14834223 0.02074601]\n" ] } ], @@ -474,7 +475,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -488,7 +489,7 @@ " [ 0.56561105 -0.06541577 0.6338014 0.52354627]]\n", "\n", "Eigenvalues \n", - "[ 2.91081808 0.92122093 0.14735328 0.02060771]\n" + "[2.91081808 0.92122093 0.14735328 0.02060771]\n" ] } ], @@ -510,7 +511,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -524,7 +525,7 @@ " [ 0.56561105 -0.06541577 0.6338014 0.52354627]]\n", "\n", "Eigenvalues \n", - "[ 2.91081808 0.92122093 0.14735328 0.02060771]\n" + "[2.91081808 0.92122093 0.14735328 0.02060771]\n" ] } ], @@ -558,7 +559,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -570,7 +571,7 @@ " [-0.56561105, -0.06541577, -0.6338014 , 0.52354627]])" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -596,7 +597,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -617,14 +618,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In order to decide which eigenvector(s) can dropped without losing too much information\n", + "In order to decide which eigenvector(s) can be dropped without losing too much information\n", "for the construction of lower-dimensional subspace, we need to inspect the corresponding eigenvalues: The eigenvectors with the lowest eigenvalues bear the least information about the distribution of the data; those are the ones can be dropped. \n", "In order to do so, the common approach is to rank the eigenvalues from highest to lowest in order choose the top $k$ eigenvectors." 
] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -632,10 +633,10 @@ "output_type": "stream", "text": [ "Eigenvalues in descending order:\n", - "2.91081808375\n", - "0.921220930707\n", - "0.147353278305\n", - "0.0206077072356\n" + "2.910818083752054\n", + "0.9212209307072242\n", + "0.14735327830509573\n", + "0.020607707235625678\n" ] } ], @@ -662,19 +663,19 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "" + "" ], "text/plain": [ "" ] }, - "execution_count": 14, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -684,24 +685,41 @@ "var_exp = [(i / tot)*100 for i in sorted(eig_vals, reverse=True)]\n", "cum_var_exp = np.cumsum(var_exp)\n", "\n", - "trace1 = Bar(\n", - " x=['PC %s' %i for i in range(1,5)],\n", - " y=var_exp,\n", - " showlegend=False)\n", + "trace1 = dict(\n", + " type='bar',\n", + " x=['PC %s' %i for i in range(1,5)],\n", + " y=var_exp,\n", + " name='Individual'\n", + ")\n", "\n", - "trace2 = Scatter(\n", - " x=['PC %s' %i for i in range(1,5)], \n", - " y=cum_var_exp,\n", - " name='cumulative explained variance')\n", + "trace2 = dict(\n", + " type='scatter',\n", + " x=['PC %s' %i for i in range(1,5)], \n", + " y=cum_var_exp,\n", + " name='Cumulative'\n", + ")\n", "\n", - "data = Data([trace1, trace2])\n", + "data = [trace1, trace2]\n", "\n", - "layout=Layout(\n", - " yaxis=YAxis(title='Explained variance in percent'),\n", - " title='Explained variance by different principal components')\n", + "layout=dict(\n", + " title='Explained variance by different principal components',\n", + " yaxis=dict(\n", + " title='Explained variance in percent'\n", + " ),\n", + " annotations=list([\n", + " dict(\n", + " x=1.16,\n", + " y=1.05,\n", + " xref='paper',\n", + " yref='paper',\n", + " text='Explained Variance',\n", + " showarrow=False,\n", + " )\n", + " ])\n", + ")\n", "\n", - "fig = Figure(data=data, layout=layout)\n", - "py.iplot(fig)" + "fig = dict(data=data, layout=layout)\n", + "py.iplot(fig, filename='selecting-principal-components')" ] }, { @@ -722,17 +740,18 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "('Matrix W:\\n', array([[ 0.52237162, -0.37231836],\n", - " [-0.26335492, -0.92555649],\n", - " [ 0.58125401, -0.02109478],\n", - " [ 0.56561105, -0.06541577]]))\n" + "Matrix W:\n", + " [[ 0.52237162 -0.37231836]\n", + " [-0.26335492 -0.92555649]\n", + " [ 0.58125401 -0.02109478]\n", + " [ 0.56561105 -0.06541577]]\n" ] } ], @@ -760,10 +779,8 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": true - }, + "execution_count": 19, + "metadata": {}, "outputs": [], "source": [ "Y = X_std.dot(matrix_w)" @@ -771,49 +788,54 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "" + "" ], "text/plain": [ "" ] }, - "execution_count": 17, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "traces = []\n", "\n", - "for name in ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica'):\n", + "data = []\n", "\n", - " trace = Scatter(\n", + "for name, col in zip(('Iris-setosa', 'Iris-versicolor', 'Iris-virginica'), colors.values()):\n", + " trace = dict(\n", + " type='scatter',\n", " x=Y[y==name,0],\n", " y=Y[y==name,1],\n", " mode='markers',\n", " 
name=name,\n", - " marker=Marker(\n", + " marker=dict(\n", + " color=col,\n", " size=12,\n", - " line=Line(\n", + " line=dict(\n", " color='rgba(217, 217, 217, 0.14)',\n", " width=0.5),\n", - " opacity=0.8))\n", - " traces.append(trace)\n", - "\n", + " opacity=0.8)\n", + " )\n", + " data.append(trace)\n", "\n", - "data = Data(traces)\n", - "layout = Layout(showlegend=True,\n", - " scene=Scene(xaxis=XAxis(title='PC1'),\n", - " yaxis=YAxis(title='PC2'),))\n", + "layout = dict(\n", + " showlegend=True,\n", + " scene=dict(\n", + " xaxis=dict(title='PC1'),\n", + " yaxis=dict(title='PC2')\n", + " )\n", + ")\n", "\n", - "fig = Figure(data=data, layout=layout)\n", - "py.iplot(fig)" + "fig = dict(data=data, layout=layout)\n", + "py.iplot(fig, filename='projection-matrix')" ] }, { @@ -840,10 +862,8 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": true - }, + "execution_count": 21, + "metadata": {}, "outputs": [], "source": [ "from sklearn.decomposition import PCA as sklearnPCA\n", @@ -853,52 +873,55 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "" + "" ], "text/plain": [ "" ] }, - "execution_count": 19, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "traces = []\n", + "data = []\n", "\n", - "for name in ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica'):\n", + "for name, col in zip(('Iris-setosa', 'Iris-versicolor', 'Iris-virginica'), colors.values()):\n", "\n", - " trace = Scatter(\n", + " trace = dict(\n", + " type='scatter',\n", " x=Y_sklearn[y==name,0],\n", " y=Y_sklearn[y==name,1],\n", " mode='markers',\n", " name=name,\n", - " marker=Marker(\n", + " marker=dict(\n", + " color=col,\n", " size=12,\n", - " line=Line(\n", + " line=dict(\n", " color='rgba(217, 217, 217, 0.14)',\n", " width=0.5),\n", - " opacity=0.8))\n", - " traces.append(trace)\n", + " opacity=0.8)\n", + " )\n", + " data.append(trace)\n", "\n", - "\n", - "data = Data(traces)\n", - "layout = Layout(xaxis=XAxis(title='PC1', showline=False),\n", - " yaxis=YAxis(title='PC2', showline=False))\n", - "fig = Figure(data=data, layout=layout)\n", - "py.iplot(fig)" + "layout = dict(\n", + " xaxis=dict(title='PC1', showline=False),\n", + " yaxis=dict(title='PC2', showline=False)\n", + ")\n", + "fig = dict(data=data, layout=layout)\n", + "py.iplot(fig, filename='pca-scikitlearn')" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -929,7 +952,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "Requirement already up-to-date: publisher in /Users/chelsea/venv/venv2.7/lib/python2.7/site-packages\r\n" + "Requirement already up-to-date: publisher in c:\\anaconda\\anaconda3\\lib\\site-packages (0.13)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Anaconda\\Anaconda3\\lib\\site-packages\\IPython\\nbconvert.py:13: ShimWarning:\n", + "\n", + "The `IPython.nbconvert` package has been deprecated since IPython 4.0. You should import from nbconvert instead.\n", + "\n", + "C:\\Anaconda\\Anaconda3\\lib\\site-packages\\publisher\\publisher.py:53: UserWarning:\n", + "\n", + "Did you \"Save\" this notebook before running this command? 
Remember to save, always save.\n", + "\n" ] } ], @@ -946,36 +983,36 @@ " 'ipython-notebooks/principal-component-analysis/', \n", " 'Principal Component Analysis in 3 Simple Steps', \n", " 'A step by step tutorial to Principal Component Analysis, a simple yet powerful transformation technique.',\n", - " name='Principal Component Analysis')" + " name='Principal Component Analysis',\n", + " ipynb='~notebook_demo/264'\n", + ")" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.10" + "pygments_lexer": "ipython3", + "version": "3.6.8" } }, "nbformat": 4,