# [Deprecated] Starting AI/ML section in python docs #2172

Closed · wants to merge 16 commits
**doc/python/ml-knn.md** (251 additions, 0 deletions)

---
jupyter:
jupytext:
notebook_metadata_filter: all
text_representation:
extension: .md
format_name: markdown
format_version: '1.1'
jupytext_version: 1.1.1
kernelspec:
display_name: Python 3
language: python
name: python3
language_info:
codemirror_mode:
name: ipython
version: 3
file_extension: .py
mimetype: text/x-python
name: python
nbconvert_exporter: python
pygments_lexer: ipython3
version: 3.7.6
plotly:
description: Visualize scikit-learn's k-Nearest Neighbors (kNN) classification
with Plotly
display_as: ai_ml
language: python
layout: base
name: kNN Classification
order: 1
page_type: example_index
permalink: python/knn-classification/
thumbnail: thumbnail/knn-classification.png
---

## Basic Binary Classification with `plotly.express`

```python
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import make_moons
from sklearn.neighbors import KNeighborsClassifier

X, y = make_moons(noise=0.3, random_state=0)
X_test, _ = make_moons(noise=0.3, random_state=1)

clf = KNeighborsClassifier(15)
clf.fit(X, y.astype(str)) # Fit on training set
y_pred = clf.predict(X_test) # Predict on new data

fig = px.scatter(x=X_test[:, 0], y=X_test[:, 1], color=y_pred, labels={'color': 'predicted'})
fig.update_traces(marker_size=10)
fig.show()
```
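Since `make_moons` also returns labels, you can keep the labels of the second draw and sanity-check the fit with a held-out accuracy score. A minimal sketch (not part of the original example), continuing from the cell above:

```python
from sklearn.metrics import accuracy_score

# Keep the labels of the second draw instead of discarding them
X_test, y_test = make_moons(noise=0.3, random_state=1)

# y_pred contains strings because the classifier was fit on y.astype(str)
print(accuracy_score(y_test.astype(str), y_pred))
```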

## Visualize Binary Prediction Scores

```python
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier

X, y = make_classification(n_features=2, n_redundant=0, random_state=0)
X_test, _ = make_classification(n_features=2, n_redundant=0, random_state=1)

clf = KNeighborsClassifier(15)
clf.fit(X, y) # Fit on training set
y_score = clf.predict_proba(X_test)[:, 1] # Predict on new data

fig = px.scatter(x=X_test[:, 0], y=X_test[:, 1], color=y_score, labels={'color': 'score'})
fig.update_traces(marker_size=10)
fig.show()
```
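If you keep the labels of the second draw, the scores above can also be summarized as an ROC curve. A hedged sketch, continuing from the cell above:

```python
from sklearn.metrics import roc_curve, roc_auc_score

# Keep the labels this time so the scores can be evaluated
X_test, y_test = make_classification(n_features=2, n_redundant=0, random_state=1)
y_score = clf.predict_proba(X_test)[:, 1]

fpr, tpr, _ = roc_curve(y_test, y_score)
fig = px.line(
    x=fpr, y=tpr,
    labels={'x': 'False Positive Rate', 'y': 'True Positive Rate'},
    title=f'ROC Curve (AUC = {roc_auc_score(y_test, y_score):.3f})'
)
fig.show()
```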

## Probability Estimates with `go.Contour`

```python
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import make_moons
from sklearn.neighbors import KNeighborsClassifier

mesh_size = .02
margin = 1

X, y = make_moons(noise=0.3, random_state=0)

# Create a mesh grid on which we will run our model
x_min, x_max = X[:, 0].min() - margin, X[:, 0].max() + margin
y_min, y_max = X[:, 1].min() - margin, X[:, 1].max() + margin
xrange = np.arange(x_min, x_max, mesh_size)
yrange = np.arange(y_min, y_max, mesh_size)
xx, yy = np.meshgrid(xrange, yrange)

# Create classifier, run predictions on grid
clf = KNeighborsClassifier(15, weights='uniform')
clf.fit(X, y)
Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)

fig = px.scatter(X, x=0, y=1, color=y.astype(str), labels={'0':'', '1':''})
fig.update_traces(marker_size=10, marker_line_width=1)
fig.add_trace(
go.Contour(
x=xrange,
y=yrange,
z=Z,
showscale=False,
colorscale=['Blue', 'Red'],
opacity=0.4,
name='Confidence'
)
)
fig.show()
```
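The mesh-grid construction above reappears in the next example. If you reuse the pattern often, a small helper (a hypothetical `make_meshgrid`, not part of the original code) keeps it in one place:

```python
def make_meshgrid(X, mesh_size=0.02, margin=1):
    """Build a rectangular grid covering the two columns of X, plus a margin."""
    x_min, x_max = X[:, 0].min() - margin, X[:, 0].max() + margin
    y_min, y_max = X[:, 1].min() - margin, X[:, 1].max() + margin
    xrange = np.arange(x_min, x_max, mesh_size)
    yrange = np.arange(y_min, y_max, mesh_size)
    xx, yy = np.meshgrid(xrange, yrange)
    return xrange, yrange, xx, yy

xrange, yrange, xx, yy = make_meshgrid(X)
```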

## Multi-class Prediction Confidence with `go.Heatmap`

```python
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.neighbors import KNeighborsClassifier

mesh_size = .02
margin = 1

# We will use the iris data, which is included in px
df = px.data.iris()
X = df[['sepal_length', 'sepal_width']]
y = df.species_id

# Create a mesh grid on which we will run our model
l_min, l_max = df.sepal_length.min() - margin, df.sepal_length.max() + margin
w_min, w_max = df.sepal_width.min() - margin, df.sepal_width.max() + margin
lrange = np.arange(l_min, l_max, mesh_size)
wrange = np.arange(w_min, w_max, mesh_size)
ll, ww = np.meshgrid(lrange, wrange)

# Create classifier, run predictions on grid
clf = KNeighborsClassifier(15, weights='distance')
clf.fit(X, y)
Z = clf.predict(np.c_[ll.ravel(), ww.ravel()])
Z = Z.reshape(ll.shape)
proba = clf.predict_proba(np.c_[ll.ravel(), ww.ravel()])
proba = proba.reshape(ll.shape + (3,))

fig = px.scatter(df, x='sepal_length', y='sepal_width', color='species', width=1000, height=1000)
fig.update_traces(marker_size=10, marker_line_width=1)
fig.add_trace(
go.Heatmap(
x=lrange,
y=wrange,
z=Z,
showscale=False,
colorscale=[[0.0, 'blue'], [0.5, 'red'], [1.0, 'green']],
opacity=0.25,
customdata=proba,
hovertemplate=(
'sepal length: %{x} <br>'
'sepal width: %{y} <br>'
'p(setosa): %{customdata[0]:.3f}<br>'
'p(versicolor): %{customdata[1]:.3f}<br>'
'p(virginica): %{customdata[2]:.3f}<extra></extra>'
)
)
)
fig.show()
```

## 3D Classification with `px.scatter_3d`

```python
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

df = px.data.iris()
features = ["sepal_width", "sepal_length", "petal_width"]

X = df[features]
y = df.species
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Create classifier, run predictions on the test split
clf = KNeighborsClassifier(15, weights='distance')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_score = clf.predict_proba(X_test)
y_score = np.around(y_score.max(axis=1), 4)

fig = px.scatter_3d(
X_test,
x='sepal_length',
y='sepal_width',
z='petal_width',
symbol=y_pred,
color=y_score,
labels={'symbol': 'prediction', 'color': 'score'}
)
fig.update_layout(legend=dict(x=0, y=0))
fig.show()
```
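With a labelled test split available, you can also summarize the predictions above as a confusion matrix. A minimal sketch, continuing from the previous cell and plotting with `px.imshow` (an assumption, not part of the original tutorial; `text_auto` requires a recent Plotly release):

```python
from sklearn.metrics import confusion_matrix

classes = sorted(y.unique())
cm = confusion_matrix(y_test, y_pred, labels=classes)

# Rows are actual species, columns are predicted species
fig = px.imshow(cm, x=classes, y=classes, text_auto=True,
                labels={'x': 'predicted', 'y': 'actual', 'color': 'count'})
fig.show()
```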

## High Dimension Visualization with `px.scatter_matrix`

If you need to visualize classifications that go beyond 3D, you can use the [scatter plot matrix](https://plot.ly/python/splom/).

```python
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

df = px.data.iris()
features = ["sepal_width", "sepal_length", "petal_width", "petal_length"]

X = df[features]
y = df.species
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Create classifier, run predictions on the test split
clf = KNeighborsClassifier(15, weights='distance')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

fig = px.scatter_matrix(X_test, dimensions=features, color=y_pred, labels={'color': 'prediction'})
fig.show()
```

### Reference

Learn more about `px`, `go.Contour`, and `go.Heatmap` here:
* https://plot.ly/python/plotly-express/
* https://plot.ly/python/heatmaps/
* https://plot.ly/python/contour-plots/
* https://plot.ly/python/3d-scatter-plots/
* https://plot.ly/python/splom/

This tutorial was inspired by amazing examples from the official scikit-learn docs:
* https://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html
* https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
* https://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html

**doc/python/ml-pca.md** (135 additions, 0 deletions)

---
jupyter:
jupytext:
notebook_metadata_filter: all
text_representation:
extension: .md
format_name: markdown
format_version: '1.1'
jupytext_version: 1.1.1
kernelspec:
display_name: Python 3
language: python
name: python3
language_info:
codemirror_mode:
name: ipython
version: 3
file_extension: .py
mimetype: text/x-python
name: python
nbconvert_exporter: python
pygments_lexer: ipython3
version: 3.7.6
plotly:
description: Visualize Principal Component Analysis (PCA) of your high-dimensional
data with Plotly in Python.
display_as: ai_ml
language: python
layout: base
name: PCA Visualization
order: 4
page_type: example_index
permalink: python/pca-visualization/
thumbnail: thumbnail/ml-pca.png
---

## Basic PCA Scatter Plot

This example shows you how to visualize the first two principal components of a PCA by reducing a 4-dimensional dataset to 2D. It uses scikit-learn's `PCA`.

```python
import plotly.express as px
from sklearn.decomposition import PCA

df = px.data.iris()
X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]

pca = PCA(n_components=2)
components = pca.fit_transform(X)

fig = px.scatter(x=components[:, 0], y=components[:, 1], color=df['species'])
fig.show()
```
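Before moving on, note that the fitted `PCA` object also reports the fraction of variance each component explains, which the next examples build on. For the unscaled iris data, the first component alone carries roughly 92%:

```python
# Fraction of total variance explained by each of the two components
print(pca.explained_variance_ratio_)
```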

## Visualize PCA with `px.scatter_3d`

Just like the basic PCA plot, but this time visualizing the first three principal components. The plot title additionally displays the total variance explained by those components.

```python
import plotly.express as px
from sklearn.decomposition import PCA

df = px.data.iris()
X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]

pca = PCA(n_components=3)
components = pca.fit_transform(X)

total_var = pca.explained_variance_ratio_.sum() * 100

fig = px.scatter_3d(
x=components[:, 0], y=components[:, 1], z=components[:, 2],
color=df['species'],
title=f'Total Explained Variance: {total_var:.2f}%',
labels={'x': 'PC 1', 'y': 'PC 2', 'z': 'PC 3'},
)
fig.show()
```

## Plot high-dimensional components with `px.scatter_matrix`

If you need to visualize more than 3 dimensions, you can use scatter plot matrices.

```python
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.datasets import load_boston

# Note: load_boston was removed in scikit-learn 1.2 (see the alternative sketch below)
boston = load_boston()
df = pd.DataFrame(boston.data, columns=boston.feature_names)

pca = PCA(n_components=5)
components = pca.fit_transform(df)

total_var = pca.explained_variance_ratio_.sum() * 100

labels = {str(i): f"PC {i+1}" for i in range(5)}
labels['color'] = 'Median Price'

fig = px.scatter_matrix(
components,
color=boston.target,
dimensions=range(5),
labels=labels,
title=f'Total Explained Variance: {total_var:.2f}%',
)
fig.update_traces(diagonal_visible=False)
fig.show()
```
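Because `load_boston` has been removed from recent scikit-learn releases (1.2 and later), here is a sketch of the same plot using the California housing dataset instead (an assumption, not part of the original tutorial; `fetch_california_housing` downloads the data on first use):

```python
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
df = pd.DataFrame(housing.data, columns=housing.feature_names)

pca = PCA(n_components=5)
components = pca.fit_transform(df)

total_var = pca.explained_variance_ratio_.sum() * 100

labels = {str(i): f"PC {i+1}" for i in range(5)}
labels['color'] = 'Median House Value'

fig = px.scatter_matrix(
    components,
    color=housing.target,
    dimensions=range(5),
    labels=labels,
    title=f'Total Explained Variance: {total_var:.2f}%',
)
fig.update_traces(diagonal_visible=False)
fig.show()
```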

## Plotting explained variance

Often, you might be interested in seeing how much variance the PCA is able to explain as you increase the number of components, in order to decide how many dimensions to ultimately keep or analyze. This example shows you how to quickly plot the cumulative sum of explained variance for a high-dimensional dataset like [Diabetes](https://scikit-learn.org/stable/datasets/index.html#diabetes-dataset).

```python
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.datasets import load_diabetes

diabetes = load_diabetes()
df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)

pca = PCA()
pca.fit(df)
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)

fig = px.area(
    x=range(1, exp_var_cumul.shape[0] + 1),
    y=exp_var_cumul,
    labels={"x": "# Components", "y": "Explained Variance"}
)
fig.show()
```
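Instead of reading the number of components off the plot, you can let scikit-learn choose it: passing a float between 0 and 1 as `n_components` keeps just enough components to explain that fraction of the variance. A short sketch, continuing from the cell above:

```python
# Keep the smallest number of components explaining >= 90% of the variance
pca = PCA(n_components=0.90)
pca.fit(df)
print(pca.n_components_)
```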

## Visualize loadings
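A common way to visualize loadings is to scale `pca.components_` by the square root of the explained variance and draw the resulting feature vectors over the 2D projection. A minimal sketch of that approach (the exact scaling convention is an assumption):

```python
import numpy as np
import plotly.express as px
from sklearn.decomposition import PCA

df = px.data.iris()
features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
X = df[features]

pca = PCA(n_components=2)
components = pca.fit_transform(X)

# Loadings: contribution of each original feature to each principal component
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

fig = px.scatter(x=components[:, 0], y=components[:, 1], color=df['species'])
for i, feature in enumerate(features):
    # Draw a line from the origin to each feature's loading vector
    fig.add_shape(type='line', x0=0, y0=0, x1=loadings[i, 0], y1=loadings[i, 1])
    fig.add_annotation(x=loadings[i, 0], y=loadings[i, 1],
                       text=feature, showarrow=False, yshift=10)
fig.show()
```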