From 7d709d19ed904b263851f812ad33280bf8890aa6 Mon Sep 17 00:00:00 2001 From: xhlulu Date: Fri, 7 Feb 2020 21:32:54 -0500 Subject: [PATCH 01/16] Add sklearn to docs requirements --- doc/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/requirements.txt b/doc/requirements.txt index 988f05efdbb..1222b140821 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -17,6 +17,7 @@ requests networkx squarify scikit-image +scikit-learn sphinx sphinx_bootstrap_theme recommonmark From 5ef8038a616c8e245d73ef2c22178c4afdca2c21 Mon Sep 17 00:00:00 2001 From: xhlulu Date: Fri, 7 Feb 2020 21:34:44 -0500 Subject: [PATCH 02/16] Create kNN docs draft --- doc/python/ml-knn.md | 119 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 doc/python/ml-knn.md diff --git a/doc/python/ml-knn.md b/doc/python/ml-knn.md new file mode 100644 index 00000000000..78e04479db1 --- /dev/null +++ b/doc/python/ml-knn.md @@ -0,0 +1,119 @@ +## K-Nearest Neighbors (kNN) + +How to visualize the K-Nearest Neighbors (kNN) algorithm using scikit-learn. + + +### Binary Probability Estimates with `go.Contour` + +```python +import numpy as np +from sklearn.datasets import make_moons +from sklearn.neighbors import KNeighborsClassifier +import plotly.express as px +import plotly.graph_objects as go + +X, y = make_moons(noise=0.3, random_state=0) + +# Create a mesh grid on which we will run our model +x_min, x_max = X[:, 0].min() - margin, X[:, 0].max() + margin +y_min, y_max = X[:, 1].min() - margin, X[:, 1].max() + margin +xrange = np.arange(x_min, x_max, mesh_size) +yrange = np.arange(y_min, y_max, mesh_size) +xx, yy = np.meshgrid(xrange, yrange) + +# Create classifier, run predictions on grid +clf = neighbors.KNeighborsClassifier(15, weights='uniform') +clf.fit(X, y) +Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] +Z = Z.reshape(xx.shape) + +fig = px.scatter(X, x=0, y=1, color=y.astype(str)) +fig.add_trace( + go.Contour( + x=xrange, + y=yrange, + z=Z, + showscale=False, + colorscale=['Blue', 'Red'], + opacity=0.4 + ) +) +``` + +### Multi-class classification with `px.data` and `go.Heatmap` + +```python +import numpy as np +from sklearn.neighbors import KNeighborsClassifier +import plotly.express as px +import plotly.graph_objects as go + +mesh_size = .02 +margin = 1 + +df = px.data.iris() +X = df[['sepal_length', 'sepal_width']] +y = df.species_id + +# Create a mesh grid on which we will run our model +l_min, l_max = df.sepal_length.min() - margin, df.sepal_length.max() + margin +w_min, w_max = df.sepal_width.min() - margin, df.sepal_width.max() + margin +lrange = np.arange(l_min, l_max, mesh_size) +wrange = np.arange(w_min, w_max, mesh_size) +ll, ww = np.meshgrid(lrange, wrange) + +# Create classifier, run predictions on grid +clf = KNeighborsClassifier(15, weights='distance') +clf.fit(X, y) +Z = clf.predict(np.c_[ll.ravel(), ww.ravel()]) +Z = Z.reshape(ll.shape) + +fig = px.scatter(df, x='sepal_length', y='sepal_width', color='species') +fig.update_traces(marker_size=10, marker_line_width=1) +fig.add_trace( + go.Heatmap( + x=lrange, + y=wrange, + z=Z, + showscale=False, + colorscale=[[0.0, 'blue'], [0.5, 'red'], [1.0, 'green']], + opacity=0.25 + ) +) +``` + +### Visualizing kNN Regression + +```python +from sklearn.neighbors import KNeighborsRegressor +import plotly.express as px +import plotly.graph_objects as go + +df = px.data.tips() +X = df.total_bill.values.reshape(-1, 1) + +knn_dist = KNeighborsRegressor(10, weights='distance') +knn_uni = 
KNeighborsRegressor(10, weights='uniform') +knn_dist.fit(X, df.tip) +knn_uni.fit(X, df.tip) + +x_range = np.linspace(X.min(), X.max(), 100) +y_dist = knn_dist.predict(x_range.reshape(-1, 1)) +y_uni = knn_uni.predict(x_range.reshape(-1, 1)) + +fig = px.scatter(df, x='total_bill', y='tip', color='sex', opacity=0.65) +fig.add_traces(go.Scatter(x=x_range, y=y_uni, name='Weights: Uniform')) +fig.add_traces(go.Scatter(x=x_range, y=y_dist, name='Weights: Distance')) +``` + +### Reference + +Learn more about `px`, `go.Contour`, and `go.Heatmap` here: +* https://plot.ly/python/plotly-express/ +* https://plot.ly/python/heatmaps/ +* https://plot.ly/python/contour-plots/ + +This tutorial was inspired by amazing examples from the official scikit-learn docs: +* https://scikit-learn.org/stable/auto_examples/neighbors/plot_regression.html +* https://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html +* https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html From 93ba0dc87511f656aec47c2553c991b5c207eadc Mon Sep 17 00:00:00 2001 From: Xing Han Date: Sat, 22 Feb 2020 15:52:39 -0500 Subject: [PATCH 03/16] Update based on Emma's suggestions --- doc/python/ml-knn.md | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/doc/python/ml-knn.md b/doc/python/ml-knn.md index 78e04479db1..031097a4404 100644 --- a/doc/python/ml-knn.md +++ b/doc/python/ml-knn.md @@ -7,10 +7,13 @@ How to visualize the K-Nearest Neighbors (kNN) algorithm using scikit-learn. ```python import numpy as np -from sklearn.datasets import make_moons -from sklearn.neighbors import KNeighborsClassifier import plotly.express as px import plotly.graph_objects as go +from sklearn.datasets import make_moons +from sklearn.neighbors import KNeighborsClassifier + +mesh_size = .02 +margin = 1 X, y = make_moons(noise=0.3, random_state=0) @@ -22,12 +25,12 @@ yrange = np.arange(y_min, y_max, mesh_size) xx, yy = np.meshgrid(xrange, yrange) # Create classifier, run predictions on grid -clf = neighbors.KNeighborsClassifier(15, weights='uniform') +clf = KNeighborsClassifier(15, weights='uniform') clf.fit(X, y) Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] Z = Z.reshape(xx.shape) -fig = px.scatter(X, x=0, y=1, color=y.astype(str)) +fig = px.scatter(X, x=0, y=1, color=y.astype(str), labels={'0':'', '1':''}) fig.add_trace( go.Contour( x=xrange, @@ -38,15 +41,16 @@ fig.add_trace( opacity=0.4 ) ) +fig.show() ``` ### Multi-class classification with `px.data` and `go.Heatmap` ```python import numpy as np -from sklearn.neighbors import KNeighborsClassifier import plotly.express as px import plotly.graph_objects as go +from sklearn.neighbors import KNeighborsClassifier mesh_size = .02 margin = 1 @@ -67,6 +71,8 @@ clf = KNeighborsClassifier(15, weights='distance') clf.fit(X, y) Z = clf.predict(np.c_[ll.ravel(), ww.ravel()]) Z = Z.reshape(ll.shape) +proba = clf.predict_proba(np.c_[ll.ravel(), ww.ravel()]) +proba = proba.reshape(ll.shape + (3,)) fig = px.scatter(df, x='sepal_length', y='sepal_width', color='species') fig.update_traces(marker_size=10, marker_line_width=1) @@ -77,17 +83,27 @@ fig.add_trace( z=Z, showscale=False, colorscale=[[0.0, 'blue'], [0.5, 'red'], [1.0, 'green']], - opacity=0.25 + opacity=0.25, + customdata=proba, + hovertemplate=( + 'sepal length: %{x}
            'sepal width: %{y} <br>'
            'p(setosa): %{customdata[0]:.3f}<br>'
            'p(versicolor): %{customdata[1]:.3f}<br>'
' + 'p(virginica): %{customdata[2]:.3f}' + ) ) ) +fig.show() ``` ### Visualizing kNN Regression ```python -from sklearn.neighbors import KNeighborsRegressor +import numpy as np import plotly.express as px import plotly.graph_objects as go +from sklearn.neighbors import KNeighborsRegressor df = px.data.tips() X = df.total_bill.values.reshape(-1, 1) @@ -104,6 +120,7 @@ y_uni = knn_uni.predict(x_range.reshape(-1, 1)) fig = px.scatter(df, x='total_bill', y='tip', color='sex', opacity=0.65) fig.add_traces(go.Scatter(x=x_range, y=y_uni, name='Weights: Uniform')) fig.add_traces(go.Scatter(x=x_range, y=y_dist, name='Weights: Distance')) +fig.show() ``` ### Reference From 0f9cf11e978afead9d8af5be5a6f7df617ee2545 Mon Sep 17 00:00:00 2001 From: Xing Han Date: Sat, 22 Feb 2020 16:44:08 -0500 Subject: [PATCH 04/16] Add a header --- doc/python/ml-knn.md | 43 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/doc/python/ml-knn.md b/doc/python/ml-knn.md index 031097a4404..7e265ee8485 100644 --- a/doc/python/ml-knn.md +++ b/doc/python/ml-knn.md @@ -1,6 +1,43 @@ -## K-Nearest Neighbors (kNN) - -How to visualize the K-Nearest Neighbors (kNN) algorithm using scikit-learn. +--- +jupyter: + jupytext: + notebook_metadata_filter: all + text_representation: + extension: .md + format_name: markdown + format_version: '1.1' + jupytext_version: 1.1.1 + kernelspec: + display_name: Python 3 + language: python + name: python3 + language_info: + codemirror_mode: + name: ipython + version: 3 + file_extension: .py + mimetype: text/x-python + name: python + nbconvert_exporter: python + pygments_lexer: ipython3 + version: 3.6.10 + plotly: + description: How to visualize k-Nearest Neighbors (kNN) created using scikit-learn + in Python with Plotly. + display_as: basic + language: python + layout: base + name: k-Nearest Neighbors + order: 1 + page_type: example_index + permalink: python/knn/ + redirect_from: python/machine-learning-tutorials/ + thumbnail: thumbnail/line-and-scatter.jpg +--- + +## K-Nearest Neighbors (kNN) Classification + +How to visualize K-Nearest Neighbors (kNN) classification using scikit-learn. 
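Every example on this page follows the same scikit-learn pattern: create a `KNeighborsClassifier`, `fit` it on training data, then call `predict` or `predict_proba` on new points or on a mesh grid. As a minimal sketch of that pattern (the moons dataset and the neighbor count are illustrative choices, not requirements):

```python
from sklearn.datasets import make_moons
from sklearn.neighbors import KNeighborsClassifier

# Any feature matrix X and label vector y work here
X, y = make_moons(noise=0.3, random_state=0)

# fit() stores the training set; predict() labels new points by a
# majority vote among the 15 nearest neighbors
clf = KNeighborsClassifier(n_neighbors=15)
clf.fit(X, y)
print(clf.predict(X[:5]))
```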
### Binary Probability Estimates with `go.Contour` From 2a666a4d28a1756b5b6f1b94858ca2f86c401768 Mon Sep 17 00:00:00 2001 From: xhlulu Date: Sun, 23 Feb 2020 01:26:10 -0500 Subject: [PATCH 05/16] Placeholder Regression Section --- doc/python/ml-regression.md | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 doc/python/ml-regression.md diff --git a/doc/python/ml-regression.md b/doc/python/ml-regression.md new file mode 100644 index 00000000000..e2b0d37724d --- /dev/null +++ b/doc/python/ml-regression.md @@ -0,0 +1,36 @@ +# Regression + + +### Visualizing kNN Regression + +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.neighbors import KNeighborsRegressor + +df = px.data.tips() +X = df.total_bill.values.reshape(-1, 1) + +knn_dist = KNeighborsRegressor(10, weights='distance') +knn_uni = KNeighborsRegressor(10, weights='uniform') +knn_dist.fit(X, df.tip) +knn_uni.fit(X, df.tip) + +x_range = np.linspace(X.min(), X.max(), 100) +y_dist = knn_dist.predict(x_range.reshape(-1, 1)) +y_uni = knn_uni.predict(x_range.reshape(-1, 1)) + +fig = px.scatter(df, x='total_bill', y='tip', color='sex', opacity=0.65) +fig.add_traces(go.Scatter(x=x_range, y=y_uni, name='Weights: Uniform')) +fig.add_traces(go.Scatter(x=x_range, y=y_dist, name='Weights: Distance')) +fig.show() +``` + +### Reference + +Learn more about `px` here: +* https://plot.ly/python/plotly-express/ + +This tutorial was inspired by amazing examples from the official scikit-learn docs: +* https://scikit-learn.org/stable/auto_examples/neighbors/plot_regression.html From 70f7131b939e4396b46405b02e42b3914c9bdfe7 Mon Sep 17 00:00:00 2001 From: xhlulu Date: Sun, 23 Feb 2020 01:26:27 -0500 Subject: [PATCH 06/16] Create 2 basic sections, 2 advanced sections --- doc/python/ml-knn.md | 124 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 102 insertions(+), 22 deletions(-) diff --git a/doc/python/ml-knn.md b/doc/python/ml-knn.md index 7e265ee8485..27ef20c7388 100644 --- a/doc/python/ml-knn.md +++ b/doc/python/ml-knn.md @@ -1,6 +1,7 @@ --- jupyter: jupytext: + formats: ipynb,md notebook_metadata_filter: all text_representation: extension: .md @@ -20,14 +21,14 @@ jupyter: name: python nbconvert_exporter: python pygments_lexer: ipython3 - version: 3.6.10 + version: 3.7.6 plotly: description: How to visualize k-Nearest Neighbors (kNN) created using scikit-learn in Python with Plotly. display_as: basic language: python layout: base - name: k-Nearest Neighbors + name: K-Nearest Neighbors (kNN) Classification order: 1 page_type: example_index permalink: python/knn/ @@ -35,12 +36,49 @@ jupyter: thumbnail: thumbnail/line-and-scatter.jpg --- -## K-Nearest Neighbors (kNN) Classification +## Basic Binary Classification with `plotly.express` -How to visualize K-Nearest Neighbors (kNN) classification using scikit-learn. 
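One detail worth flagging before the first example: the labels are fitted as `y.astype(str)` so that `plotly.express` colors the predictions as discrete categories (a legend) rather than mapping them onto a continuous color scale (a colorbar). A minimal sketch of the difference:

```python
import numpy as np
import plotly.express as px

y = np.array([0, 1, 0, 1])

# Numeric colors produce a continuous colorbar...
fig_continuous = px.scatter(x=[1, 2, 3, 4], y=[4, 3, 2, 1], color=y)

# ...while string labels produce a discrete legend
fig_discrete = px.scatter(x=[1, 2, 3, 4], y=[4, 3, 2, 1], color=y.astype(str))
fig_discrete.show()
```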
+```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.datasets import make_moons +from sklearn.neighbors import KNeighborsClassifier + +X, y = make_moons(noise=0.3, random_state=0) +X_test, _ = make_moons(noise=0.3, random_state=1) + +clf = KNeighborsClassifier(15) +clf.fit(X, y.astype(str)) # Fit on training set +y_pred = clf.predict(X_test) # Predict on new data + +fig = px.scatter(x=X_test[:, 0], y=X_test[:, 1], color=y_pred, labels={'color': 'predicted'}) +fig.update_traces(marker_size=10) +fig.show() +``` +## Visualize Binary Prediction Scores + +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.datasets import make_classification +from sklearn.neighbors import KNeighborsClassifier + +X, y = make_classification(n_features=2, n_redundant=0, random_state=0) +X_test, _ = make_classification(n_features=2, n_redundant=0, random_state=1) + +clf = KNeighborsClassifier(15) +clf.fit(X, y) # Fit on training set +y_score = clf.predict_proba(X_test)[:, 1] # Predict on new data + +fig = px.scatter(x=X_test[:, 0], y=X_test[:, 1], color=y_score, labels={'color': 'score'}) +fig.update_traces(marker_size=10) +fig.show() +``` -### Binary Probability Estimates with `go.Contour` +## Probability Estimates with `go.Contour` ```python import numpy as np @@ -68,6 +106,7 @@ Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] Z = Z.reshape(xx.shape) fig = px.scatter(X, x=0, y=1, color=y.astype(str), labels={'0':'', '1':''}) +fig.update_traces(marker_size=10, marker_line_width=1) fig.add_trace( go.Contour( x=xrange, @@ -75,13 +114,14 @@ fig.add_trace( z=Z, showscale=False, colorscale=['Blue', 'Red'], - opacity=0.4 + opacity=0.4, + name='Confidence' ) ) fig.show() ``` -### Multi-class classification with `px.data` and `go.Heatmap` +## Multi-class prediction confidence with `go.Heatmap` ```python import numpy as np @@ -92,6 +132,7 @@ from sklearn.neighbors import KNeighborsClassifier mesh_size = .02 margin = 1 +# We will use the iris data, which is included in px df = px.data.iris() X = df[['sepal_length', 'sepal_width']] y = df.species_id @@ -134,29 +175,66 @@ fig.add_trace( fig.show() ``` -### Visualizing kNN Regression +## 3D Classification with `px.scatter_3d` + +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.neighbors import KNeighborsClassifier +from sklearn.model_selection import train_test_split + +df = px.data.iris() +features = ["sepal_width", "sepal_length", "petal_width"] + +X = df[features] +y = df.species +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) + +# Create classifier, run predictions on grid +clf = KNeighborsClassifier(15, weights='distance') +clf.fit(X_train, y_train) +y_pred = clf.predict(X_test) +y_score = clf.predict_proba(X_test) +y_score = np.around(y_score.max(axis=1), 4) + +fig = px.scatter_3d( + X_test, + x='sepal_length', + y='sepal_width', + z='petal_width', + symbol=y_pred, + color=y_score, + labels={'symbol': 'prediction', 'color': 'score'} +) +fig.update_layout(legend=dict(x=0, y=0)) +fig.show() +``` + +## High Dimension Visualization with `px.scatter_matrix` + +If you need to visualize classifications that go beyond 3D, you can use the [scatter plot matrix](https://plot.ly/python/splom/). 
```python import numpy as np import plotly.express as px import plotly.graph_objects as go -from sklearn.neighbors import KNeighborsRegressor +from sklearn.neighbors import KNeighborsClassifier +from sklearn.model_selection import train_test_split -df = px.data.tips() -X = df.total_bill.values.reshape(-1, 1) +df = px.data.iris() +features = ["sepal_width", "sepal_length", "petal_width", "petal_length"] -knn_dist = KNeighborsRegressor(10, weights='distance') -knn_uni = KNeighborsRegressor(10, weights='uniform') -knn_dist.fit(X, df.tip) -knn_uni.fit(X, df.tip) +X = df[features] +y = df.species +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) -x_range = np.linspace(X.min(), X.max(), 100) -y_dist = knn_dist.predict(x_range.reshape(-1, 1)) -y_uni = knn_uni.predict(x_range.reshape(-1, 1)) +# Create classifier, run predictions on grid +clf = KNeighborsClassifier(15, weights='distance') +clf.fit(X_train, y_train) +y_pred = clf.predict(X_test) -fig = px.scatter(df, x='total_bill', y='tip', color='sex', opacity=0.65) -fig.add_traces(go.Scatter(x=x_range, y=y_uni, name='Weights: Uniform')) -fig.add_traces(go.Scatter(x=x_range, y=y_dist, name='Weights: Distance')) +fig = px.scatter_matrix(X_test, dimensions=features, color=y_pred, labels={'color': 'prediction'}) fig.show() ``` @@ -166,8 +244,10 @@ Learn more about `px`, `go.Contour`, and `go.Heatmap` here: * https://plot.ly/python/plotly-express/ * https://plot.ly/python/heatmaps/ * https://plot.ly/python/contour-plots/ +* https://plot.ly/python/3d-scatter-plots/ +* https://plot.ly/python/splom/ This tutorial was inspired by amazing examples from the official scikit-learn docs: -* https://scikit-learn.org/stable/auto_examples/neighbors/plot_regression.html * https://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html * https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html +* https://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html From 120f0d8d2abfa1ec92513c63186af837acd2f5fc Mon Sep 17 00:00:00 2001 From: xhlulu Date: Fri, 28 Feb 2020 12:40:16 -0500 Subject: [PATCH 07/16] KNN ML docs: Update thumbnail, name, permalink, description, display_as --- doc/python/ml-knn.md | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/doc/python/ml-knn.md b/doc/python/ml-knn.md index 27ef20c7388..2bcab469875 100644 --- a/doc/python/ml-knn.md +++ b/doc/python/ml-knn.md @@ -1,7 +1,6 @@ --- jupyter: jupytext: - formats: ipynb,md notebook_metadata_filter: all text_representation: extension: .md @@ -23,17 +22,16 @@ jupyter: pygments_lexer: ipython3 version: 3.7.6 plotly: - description: How to visualize k-Nearest Neighbors (kNN) created using scikit-learn - in Python with Plotly. 
- display_as: basic + description: Visualize scikit-learn's k-Nearest Neighbors (kNN) classification + with Plotly + display_as: ai_ml language: python layout: base - name: K-Nearest Neighbors (kNN) Classification + name: kNN Classification order: 1 page_type: example_index - permalink: python/knn/ - redirect_from: python/machine-learning-tutorials/ - thumbnail: thumbnail/line-and-scatter.jpg + permalink: python/knn-classification/ + thumbnail: thumbnail/knn-classification.png --- ## Basic Binary Classification with `plotly.express` @@ -152,7 +150,7 @@ Z = Z.reshape(ll.shape) proba = clf.predict_proba(np.c_[ll.ravel(), ww.ravel()]) proba = proba.reshape(ll.shape + (3,)) -fig = px.scatter(df, x='sepal_length', y='sepal_width', color='species') +fig = px.scatter(df, x='sepal_length', y='sepal_width', color='species', width=1000, height=1000) fig.update_traces(marker_size=10, marker_line_width=1) fig.add_trace( go.Heatmap( From 121dc0c6731ee8ad27d21fcf77e88eec4beed2ce Mon Sep 17 00:00:00 2001 From: xhlulu Date: Fri, 28 Feb 2020 16:55:42 -0500 Subject: [PATCH 08/16] Added 3 sections, drafted out 2 sections --- doc/python/ml-regression.md | 157 +++++++++++++++++++++++++++++++++++- 1 file changed, 155 insertions(+), 2 deletions(-) diff --git a/doc/python/ml-regression.md b/doc/python/ml-regression.md index e2b0d37724d..3e34f73de3f 100644 --- a/doc/python/ml-regression.md +++ b/doc/python/ml-regression.md @@ -1,7 +1,91 @@ -# Regression +--- +jupyter: + jupytext: + notebook_metadata_filter: all + text_representation: + extension: .md + format_name: markdown + format_version: '1.1' + jupytext_version: 1.1.1 + kernelspec: + display_name: Python 3 + language: python + name: python3 + language_info: + codemirror_mode: + name: ipython + version: 3 + file_extension: .py + mimetype: text/x-python + name: python + nbconvert_exporter: python + pygments_lexer: ipython3 + version: 3.7.6 + plotly: + description: Visualize regression in scikit-learn with Plotly + display_as: ai_ml + language: python + layout: base + name: ML Regression + order: 2 + page_type: example_index + permalink: python/ml-regression/ + thumbnail: thumbnail/knn-classification.png +--- +## Basic linear regression -### Visualizing kNN Regression +This example shows how to train a simple linear regression from `sklearn` to predicts the tips servers will receive based on the value of the total bill (dataset is included in `px.data`). 
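For reference, the estimator fitted below solves the ordinary least-squares problem; with feature matrix $X$ and target $y$, the coefficients have the familiar closed form (assuming $X^\top X$ is invertible):

$$\hat{\beta} = \underset{\beta}{\arg\min} \; \lVert y - X\beta \rVert_2^2 = (X^\top X)^{-1} X^\top y$$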
+ +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.linear_model import LinearRegression + +df = px.data.tips() +X = df.total_bill.values.reshape(-1, 1) + +model = LinearRegression() +model.fit(X, df.tip) + +x_range = np.linspace(X.min(), X.max(), 100) +y_range = model.predict(x_range.reshape(-1, 1)) + +fig = px.scatter(df, x='total_bill', y='tip', opacity=0.65) +fig.add_traces(go.Scatter(x=x_range, y=y_range, name='Regression Fit')) +fig.show() +``` + +## Model generalization on unseen data + +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.linear_model import LinearRegression +from sklearn.model_selection import train_test_split + +df = px.data.tips() +X = df.total_bill.values.reshape(-1, 1) +X_train, X_test, y_train, y_test = train_test_split(X, df.tip, random_state=0) + +model = LinearRegression() +model.fit(X_train, y_train) + +x_range = np.linspace(X.min(), X.max(), 100) +y_range = model.predict(x_range.reshape(-1, 1)) + + +fig = go.Figure([ + go.Scatter(x=X_train.squeeze(), y=y_train, name='train', mode='markers'), + go.Scatter(x=X_test.squeeze(), y=y_test, name='test', mode='markers'), + go.Scatter(x=x_range, y=y_range, name='prediction') +]) +fig.show() +``` + +## Comparing different kNN models parameters ```python import numpy as np @@ -27,6 +111,75 @@ fig.add_traces(go.Scatter(x=x_range, y=y_dist, name='Weights: Distance')) fig.show() ``` +## 3D regression surface with `px.scatter_3d` and `go.Surface` + +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.neighbors import KNeighborsRegressor + +mesh_size = .02 +margin = 0 + +df = px.data.iris() +features = ["sepal_width", "sepal_length", "petal_width"] + +X = df[['sepal_width', 'sepal_length']] +y = df['petal_width'] + +# Condition the model on sepal width and length, predict the petal width +knn = KNeighborsRegressor(10, weights='distance') +knn.fit(X, y) + +# Create a mesh grid on which we will run our model +x_min, x_max = X.sepal_width.min() - margin, X.sepal_width.max() + margin +y_min, y_max = X.sepal_length.min() - margin, X.sepal_length.max() + margin +xrange = np.arange(x_min, x_max, mesh_size) +yrange = np.arange(y_min, y_max, mesh_size) +xx, yy = np.meshgrid(xrange, yrange) + +# Run kNN +pred = knn.predict(np.c_[xx.ravel(), yy.ravel()]) +pred = pred.reshape(xx.shape) + +# Generate the plot +fig = px.scatter_3d(df, x='sepal_width', y='sepal_length', z='petal_width') +fig.update_traces(marker=dict(size=5)) +fig.add_traces(go.Surface(x=xrange, y=yrange, z=pred, name='pred_surface')) +fig.show() +``` + +## Label polynomial fits with latex + +```python + +``` + +## Prediction Error Plots + + +### Simple Prediction Error + +```python + +``` + +### Augmented Prediction Error plot using `px` + +```python + +``` + +### Grid Search Visualization using `px.scatter_matrix` + + +## Residual Plots + +```python + +``` + ### Reference Learn more about `px` here: From c6eeeb2a8cffed96a0a5f88b62807dff75898374 Mon Sep 17 00:00:00 2001 From: xhlulu Date: Mon, 2 Mar 2020 15:39:38 -0500 Subject: [PATCH 09/16] ML Docs: Added 3 new sections to regression notebook --- doc/python/ml-regression.md | 206 ++++++++++++++++++++++++++++++++++-- 1 file changed, 199 insertions(+), 7 deletions(-) diff --git a/doc/python/ml-regression.md b/doc/python/ml-regression.md index 3e34f73de3f..2e0087982bd 100644 --- a/doc/python/ml-regression.md +++ b/doc/python/ml-regression.md @@ -33,9 
+33,28 @@ jupyter: thumbnail: thumbnail/knn-classification.png --- -## Basic linear regression +## Basic linear regression plots -This example shows how to train a simple linear regression from `sklearn` to predicts the tips servers will receive based on the value of the total bill (dataset is included in `px.data`). + +### Ordinary Least Square (OLS) with `plotly.express` + + +This example shows how to use `plotly.express` to train a simply Ordinary Least Square (OLS) that can predict the tips servers will receive based on the value of the total bill. + +```python +import plotly.express as px + +df = px.data.tips() +fig = px.scatter( + df, x='total_bill', y='tip', opacity=0.65, + trendline='ols', trendline_color_override='red' +) +fig.show() +``` + +### Linear Regression with scikit-learn + +You can also perform the same prediction using scikit-learn's `LinearRegression`. ```python import numpy as np @@ -123,7 +142,6 @@ mesh_size = .02 margin = 0 df = px.data.iris() -features = ["sepal_width", "sepal_length", "petal_width"] X = df[['sepal_width', 'sepal_length']] y = df['petal_width'] @@ -150,10 +168,46 @@ fig.add_traces(go.Surface(x=xrange, y=yrange, z=pred, name='pred_surface')) fig.show() ``` -## Label polynomial fits with latex +## Displaying `PolynomialFeatures` using $\LaTeX$ + +It's easy to diplay latex equations in legend and titles by simply adding `$` before and after your equation. ```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.linear_model import LinearRegression +from sklearn.preprocessing import PolynomialFeatures + +def format_coefs(coefs): + equation_list = [f"{coef}x^{i}" for i, coef in enumerate(coefs)] + equation = "$" + " + ".join(equation_list) + "$" + + replace_map = {"x^0": "", "x^1": "x", '+ -': '- '} + for old, new in replace_map.items(): + equation = equation.replace(old, new) + + return equation +df = px.data.tips() +X = df.total_bill.values.reshape(-1, 1) +x_range = np.linspace(X.min(), X.max(), 100).reshape(-1, 1) + +fig = px.scatter(df, x='total_bill', y='tip', opacity=0.65) +for n_features in [1, 2, 3, 4]: + poly = PolynomialFeatures(n_features) + poly.fit(X) + X_poly = poly.transform(X) + x_range_poly = poly.transform(x_range) + + model = LinearRegression(fit_intercept=False) + model.fit(X_poly, df.tip) + y_poly = model.predict(x_range_poly) + + equation = format_coefs(model.coef_.round(2)) + fig.add_traces(go.Scatter(x=x_range.squeeze(), y=y_poly, name=equation)) + +fig.show() ``` ## Prediction Error Plots @@ -162,22 +216,160 @@ fig.show() ### Simple Prediction Error ```python +import plotly.express as px +import plotly.graph_objects as go +from sklearn.linear_model import LinearRegression +df = px.data.iris() +X = df.loc[train_idx, ['sepal_width', 'sepal_length']] +y = df.loc[train_idx, 'petal_width'] + +# Condition the model on sepal width and length, predict the petal width +model = LinearRegression() +model.fit(X, y) +y_pred = model.predict(X) + +fig = px.scatter(x=y, y=y_pred, labels={'x': 'y true', 'y': 'y pred'}) +fig.add_shape( + type="line", line=dict(dash='dash'), + x0=y.min(), y0=y.min(), + x1=y.max(), y1=y.max() +) +fig.show() ``` -### Augmented Prediction Error plot using `px` +### Augmented Prediction Error analysis using `plotly.express` ```python +import plotly.express as px +import plotly.graph_objects as go +from sklearn.linear_model import LinearRegression +from sklearn.model_selection import train_test_split -``` +df = px.data.iris() -### Grid Search Visualization using 
`px.scatter_matrix` +# Split data into training and test splits +train_idx, test_idx = train_test_split(df.index, test_size=.25, random_state=0) +df['split'] = 'train' +df.loc[test_idx, 'split'] = 'test' +X = df[['sepal_width', 'sepal_length']] +X_train = df.loc[train_idx, ['sepal_width', 'sepal_length']] +y_train = df.loc[train_idx, 'petal_width'] + +# Condition the model on sepal width and length, predict the petal width +model = LinearRegression() +model.fit(X_train, y_train) +df['prediction'] = model.predict(X) + +fig = px.scatter( + df, x='petal_width', y='prediction', + marginal_x='histogram', marginal_y='histogram', + color='split', trendline='ols' +) +fig.add_shape( + type="line", line=dict(dash='dash'), + x0=y.min(), y0=y.min(), + x1=y.max(), y1=y.max() +) + +fig.show() +``` ## Residual Plots +Just like prediction error plots, it's easy to visualize your prediction residuals in just a few lines of codes using `plotly.express` built-in capabilities. + +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.linear_model import LinearRegression +from sklearn.model_selection import train_test_split + +df = px.data.iris() + +# Split data into training and test splits +train_idx, test_idx = train_test_split(df.index, test_size=.25, random_state=0) +df['split'] = 'train' +df.loc[test_idx, 'split'] = 'test' + +X = df[['sepal_width', 'sepal_length']] +X_train = df.loc[train_idx, ['sepal_width', 'sepal_length']] +y_train = df.loc[train_idx, 'petal_width'] + +# Condition the model on sepal width and length, predict the petal width +model = LinearRegression() +model.fit(X_train, y_train) +df['prediction'] = model.predict(X) +df['residual'] = df['prediction'] - df['petal_width'] + +fig = px.scatter( + df, x='prediction', y='residual', + marginal_y='violin', + color='split', trendline='ols' +) +fig.show() +``` + +## Grid Search Visualization using `px` facets + ```python +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go +from sklearn.model_selection import GridSearchCV +from sklearn.tree import DecisionTreeRegressor +N_FOLD = 5 + +df = px.data.iris() +X = df.loc[train_idx, ['sepal_width', 'sepal_length']] +y = df.loc[train_idx, 'petal_width'] + +model = DecisionTreeRegressor() +param_grid = { + 'criterion': ['mse', 'friedman_mse', 'mae'], + 'max_depth': range(2, 5) +} +grid = GridSearchCV(model, param_grid, cv=N_FOLD) + +grid.fit(X, y) +grid_df = pd.DataFrame(grid.cv_results_) + +# Convert the wide format of the grid into the long format +# accepted by plotly.express +melted = ( + grid_df + .rename(columns=lambda col: col.replace('param_', '')) + .melt( + value_vars=[f'split{i}_test_score' for i in range(N_FOLD)], + id_vars=['rank_test_score', 'mean_test_score', + 'mean_fit_time', 'criterion', 'max_depth'] + ) +) + +# Convert R-Squared measure to % +melted[['value', 'mean_test_score']] *= 100 + +# Format the variable names for simplicity +melted['variable'] = ( + melted['variable'] + .str.replace('_test_score', '') + .str.replace('split', '') +) + +px.bar( + melted, x='variable', y='value', + color='mean_test_score', + facet_row='max_depth', + facet_col='criterion', + title='Test Scores of Grid Search', + hover_data=['mean_fit_time', 'rank_test_score'], + labels={'variable': 'cv_split', + 'value': 'r_squared', + 'mean_test_score': "mean_r_squared"} +) ``` ### Reference From cf420032021168daae65f7476925c75d902d4726 Mon Sep 17 00:00:00 2001 From: xhlulu Date: Mon, 2 Mar 2020 16:48:38 -0500 Subject: [PATCH 
10/16] ML Docs: Updated last ML regression section for clarity --- doc/python/ml-regression.md | 70 +++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 27 deletions(-) diff --git a/doc/python/ml-regression.md b/doc/python/ml-regression.md index 2e0087982bd..3c9a2326188 100644 --- a/doc/python/ml-regression.md +++ b/doc/python/ml-regression.md @@ -213,7 +213,7 @@ fig.show() ## Prediction Error Plots -### Simple Prediction Error +### Simple actual vs predicted plot ```python import plotly.express as px @@ -221,8 +221,8 @@ import plotly.graph_objects as go from sklearn.linear_model import LinearRegression df = px.data.iris() -X = df.loc[train_idx, ['sepal_width', 'sepal_length']] -y = df.loc[train_idx, 'petal_width'] +X = df[['sepal_width', 'sepal_length']] +y = df['petal_width'] # Condition the model on sepal width and length, predict the petal width model = LinearRegression() @@ -238,7 +238,7 @@ fig.add_shape( fig.show() ``` -### Augmented Prediction Error analysis using `plotly.express` +### Augmented prediction error analysis using `plotly.express` ```python import plotly.express as px @@ -276,7 +276,7 @@ fig.add_shape( fig.show() ``` -## Residual Plots +## Residual plots Just like prediction error plots, it's easy to visualize your prediction residuals in just a few lines of codes using `plotly.express` built-in capabilities. @@ -312,28 +312,34 @@ fig = px.scatter( fig.show() ``` -## Grid Search Visualization using `px` facets +## Grid search visualization using `px.density_heatmap` and `px.box` + +In this example, we show how to visualize the results of a grid search on a `DecisionTreeRegressor`. The first plot shows how to visualize the score of each model parameter on individual splits (grouped using facets). The second plot aggregates the results of all splits such that each box represents a single model. 
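Both figures are built from `GridSearchCV.cv_results_`. As a quick orientation, this sketch (with a smaller, purely illustrative grid) shows how to inspect the raw structure that the example below melts into long format:

```python
import pandas as pd
import plotly.express as px
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

df = px.data.iris()

grid = GridSearchCV(
    DecisionTreeRegressor(),
    param_grid={'max_depth': range(2, 5)},
    cv=3
)
grid.fit(df[['sepal_width', 'sepal_length']], df['petal_width'])

# One row per parameter combination; the split<i>_test_score columns
# hold the per-fold scores that the figures below visualize
print(pd.DataFrame(grid.cv_results_).filter(regex='test_score'))
```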
```python +import numpy as np import pandas as pd import plotly.express as px import plotly.graph_objects as go from sklearn.model_selection import GridSearchCV from sklearn.tree import DecisionTreeRegressor -N_FOLD = 5 +N_FOLD = 6 +# Load and shuffle dataframe df = px.data.iris() -X = df.loc[train_idx, ['sepal_width', 'sepal_length']] -y = df.loc[train_idx, 'petal_width'] +df = df.sample(frac=1, random_state=0) + +X = df[['sepal_width', 'sepal_length']] +y = df['petal_width'] +# Define and fit the grid model = DecisionTreeRegressor() param_grid = { 'criterion': ['mse', 'friedman_mse', 'mae'], 'max_depth': range(2, 5) } grid = GridSearchCV(model, param_grid, cv=N_FOLD) - grid.fit(X, y) grid_df = pd.DataFrame(grid.cv_results_) @@ -344,32 +350,42 @@ melted = ( .rename(columns=lambda col: col.replace('param_', '')) .melt( value_vars=[f'split{i}_test_score' for i in range(N_FOLD)], - id_vars=['rank_test_score', 'mean_test_score', - 'mean_fit_time', 'criterion', 'max_depth'] + id_vars=['mean_test_score', 'mean_fit_time', 'criterion', 'max_depth'], + var_name="cv_split", + value_name="r_squared" ) ) -# Convert R-Squared measure to % -melted[['value', 'mean_test_score']] *= 100 - # Format the variable names for simplicity -melted['variable'] = ( - melted['variable'] +melted['cv_split'] = ( + melted['cv_split'] .str.replace('_test_score', '') .str.replace('split', '') ) -px.bar( - melted, x='variable', y='value', - color='mean_test_score', - facet_row='max_depth', - facet_col='criterion', - title='Test Scores of Grid Search', - hover_data=['mean_fit_time', 'rank_test_score'], - labels={'variable': 'cv_split', - 'value': 'r_squared', - 'mean_test_score': "mean_r_squared"} +# Single function call to plot each figure +fig_hmap = px.density_heatmap( + melted, x="max_depth", y='criterion', + histfunc="sum", z="r_squared", + title='Grid search results on individual fold', + hover_data=['mean_fit_time'], + facet_col="cv_split", facet_col_wrap=3, + labels={'mean_test_score': "mean_r_squared"} ) + +fig_box = px.box( + melted, x='max_depth', y='r_squared', + title='Grid search results ', + hover_data=['mean_fit_time'], + points='all', + color="criterion", + hover_name='cv_split', + labels={'mean_test_score': "mean_r_squared"} +) + +# Display +fig_hmap.show() +fig_box.show() ``` ### Reference From e92d340ea5c0637b2deb22c0c213020104c81903 Mon Sep 17 00:00:00 2001 From: xhlulu Date: Mon, 2 Mar 2020 17:14:44 -0500 Subject: [PATCH 11/16] ML Docs: Added annotations after each section of regression notebook --- doc/python/ml-regression.md | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/doc/python/ml-regression.md b/doc/python/ml-regression.md index 3c9a2326188..6414dbf43a9 100644 --- a/doc/python/ml-regression.md +++ b/doc/python/ml-regression.md @@ -78,6 +78,8 @@ fig.show() ## Model generalization on unseen data +Easily color your plot based on a predefined data split. + ```python import numpy as np import plotly.express as px @@ -106,6 +108,8 @@ fig.show() ## Comparing different kNN models parameters +Compare the performance of two different models on the same dataset. This can be easily combined with discrete color legends from `px`. 
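The two models compared below differ only in their `weights` argument. With `'uniform'` weights every neighbor in the k-neighborhood votes equally, while `'distance'` weights each neighbor by the inverse of its distance to the query point, which is the standard formulation scikit-learn uses:

$$\hat{y}(x) = \frac{\sum_{i \in N_k(x)} w_i \, y_i}{\sum_{i \in N_k(x)} w_i}, \qquad w_i = 1 \ \text{(uniform)} \quad \text{or} \quad w_i = \frac{1}{d(x, x_i)} \ \text{(distance)}$$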
+ ```python import numpy as np import plotly.express as px @@ -114,14 +118,16 @@ from sklearn.neighbors import KNeighborsRegressor df = px.data.tips() X = df.total_bill.values.reshape(-1, 1) +x_range = np.linspace(X.min(), X.max(), 100) +# Model #1 knn_dist = KNeighborsRegressor(10, weights='distance') -knn_uni = KNeighborsRegressor(10, weights='uniform') knn_dist.fit(X, df.tip) -knn_uni.fit(X, df.tip) - -x_range = np.linspace(X.min(), X.max(), 100) y_dist = knn_dist.predict(x_range.reshape(-1, 1)) + +# Model #2 +knn_uni = KNeighborsRegressor(10, weights='uniform') +knn_uni.fit(X, df.tip) y_uni = knn_uni.predict(x_range.reshape(-1, 1)) fig = px.scatter(df, x='total_bill', y='tip', color='sex', opacity=0.65) @@ -132,6 +138,8 @@ fig.show() ## 3D regression surface with `px.scatter_3d` and `go.Surface` +Visualize the decision plane of your model whenever you have more than one variable in your `X`. + ```python import numpy as np import plotly.express as px @@ -229,7 +237,7 @@ model = LinearRegression() model.fit(X, y) y_pred = model.predict(X) -fig = px.scatter(x=y, y=y_pred, labels={'x': 'y true', 'y': 'y pred'}) +fig = px.scatter(x=y_pred, y=y, labels={'x': 'prediction', 'y': 'actual'}) fig.add_shape( type="line", line=dict(dash='dash'), x0=y.min(), y0=y.min(), @@ -238,7 +246,9 @@ fig.add_shape( fig.show() ``` -### Augmented prediction error analysis using `plotly.express` +### Enhanced prediction error analysis using `plotly.express` + +Add marginal histograms to quickly diagnoses any prediction bias your model might have. The built-in `OLS` functionality let you visualize how well your model generalizes by comparing it with the theoretical optimal fit (black dotted line). ```python import plotly.express as px @@ -254,6 +264,7 @@ df['split'] = 'train' df.loc[test_idx, 'split'] = 'test' X = df[['sepal_width', 'sepal_length']] +y = df['petal_width'] X_train = df.loc[train_idx, ['sepal_width', 'sepal_length']] y_train = df.loc[train_idx, 'petal_width'] @@ -263,7 +274,7 @@ model.fit(X_train, y_train) df['prediction'] = model.predict(X) fig = px.scatter( - df, x='petal_width', y='prediction', + df, x='prediction', y='petal_width', marginal_x='histogram', marginal_y='histogram', color='split', trendline='ols' ) From 9ad8ea22bf3aafd01b6f670d03c517b3a6d2f4dd Mon Sep 17 00:00:00 2001 From: xhlulu Date: Mon, 2 Mar 2020 17:31:59 -0500 Subject: [PATCH 12/16] ML Docs: updated ml regression header --- doc/python/ml-regression.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/python/ml-regression.md b/doc/python/ml-regression.md index 6414dbf43a9..968858ec64b 100644 --- a/doc/python/ml-regression.md +++ b/doc/python/ml-regression.md @@ -22,7 +22,7 @@ jupyter: pygments_lexer: ipython3 version: 3.7.6 plotly: - description: Visualize regression in scikit-learn with Plotly + description: Visualize regression in scikit-learn with Plotly. 
display_as: ai_ml language: python layout: base @@ -30,7 +30,7 @@ jupyter: order: 2 page_type: example_index permalink: python/ml-regression/ - thumbnail: thumbnail/knn-classification.png + thumbnail: thumbnail/ml-regression.png --- ## Basic linear regression plots From 1af0416f6b1aa61ad1fec7e2508a5f8c169be7c0 Mon Sep 17 00:00:00 2001 From: xhlulu Date: Mon, 2 Mar 2020 21:11:47 -0500 Subject: [PATCH 13/16] ML Docs: Added new section to regression, updated references --- doc/python/ml-regression.md | 76 ++++++++++++++++++++++++++++++++++--- 1 file changed, 71 insertions(+), 5 deletions(-) diff --git a/doc/python/ml-regression.md b/doc/python/ml-regression.md index 968858ec64b..f345cc22445 100644 --- a/doc/python/ml-regression.md +++ b/doc/python/ml-regression.md @@ -323,6 +323,58 @@ fig = px.scatter( fig.show() ``` +## Regularization visualization + + +### Plot alphas for individual folds + +```python +import pandas as pd +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.linear_model import LassoCV + +# Load and preprocess the data +df = px.data.gapminder() +X = df.drop(columns=['lifeExp', 'iso_num']) +X = pd.get_dummies(X, columns=['country', 'continent', 'iso_alpha']) +y = df['lifeExp'] + +# Train model to predict life expectancy +model = LassoCV(cv=N_FOLD, normalize=True) +model.fit(X, y) +mean_alphas = model.mse_path_.mean(axis=-1) + +fig = go.Figure([ + go.Scatter( + x=model.alphas_, y=model.mse_path_[:, i], + name=f"Fold: {i+1}", opacity=.5, line=dict(dash='dash'), + hovertemplate="alpha: %{x}
<br>MSE: %{y}"
    )
    for i in range(N_FOLD)
])
fig.add_traces(go.Scatter(
    x=model.alphas_, y=mean_alphas,
    name='Mean', line=dict(color='black', width=3),
    hovertemplate="alpha: %{x} <br>
MSE: %{y}", +)) + +fig.add_shape( + type="line", line=dict(dash='dash'), + x0=model.alpha_, y0=0, + x1=model.alpha_, y1=1, + yref='paper' +) + +fig.update_layout( + xaxis_title='alpha', + xaxis_type="log", + yaxis_title="Mean Square Error (MSE)" +) +fig.show() +``` + ## Grid search visualization using `px.density_heatmap` and `px.box` In this example, we show how to visualize the results of a grid search on a `DecisionTreeRegressor`. The first plot shows how to visualize the score of each model parameter on individual splits (grouped using facets). The second plot aggregates the results of all splits such that each box represents a single model. @@ -401,8 +453,22 @@ fig_box.show() ### Reference -Learn more about `px` here: -* https://plot.ly/python/plotly-express/ - -This tutorial was inspired by amazing examples from the official scikit-learn docs: -* https://scikit-learn.org/stable/auto_examples/neighbors/plot_regression.html +Learn more about the `px` figures used in this tutorial: +* Plotly Express: https://plot.ly/python/plotly-express/ +* Vertical Lines: https://plot.ly/python/shapes/ +* Heatmaps: https://plot.ly/python/heatmaps/ +* Box Plots: https://plot.ly/python/box-plots/ +* 3D Scatter: https://plot.ly/python/3d-scatter-plots/ +* Surface Plots: https://plot.ly/python/3d-surface-plots/ + +Learn more about the Machine Learning models used in this tutorial: +* https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html +* https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html +* https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html +* https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html +* https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html + +Other tutorials that inspired this notebook: +* https://seaborn.pydata.org/examples/residplot.html +* https://scikit-learn.org/stable/auto_examples/linear_model/plot_lasso_model_selection.html +* http://www.scikit-yb.org/zh/latest/api/regressor/peplot.html From e9098df5075dfa2fb1d82ee1db1c47ec6ef929f9 Mon Sep 17 00:00:00 2001 From: xhlulu Date: Fri, 6 Mar 2020 15:05:49 -0500 Subject: [PATCH 14/16] ML Docs: Added coefficient MLR example --- doc/python/ml-regression.md | 101 ++++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 33 deletions(-) diff --git a/doc/python/ml-regression.md b/doc/python/ml-regression.md index f345cc22445..e37e1d04e95 100644 --- a/doc/python/ml-regression.md +++ b/doc/python/ml-regression.md @@ -39,7 +39,7 @@ jupyter: ### Ordinary Least Square (OLS) with `plotly.express` -This example shows how to use `plotly.express` to train a simply Ordinary Least Square (OLS) that can predict the tips servers will receive based on the value of the total bill. +This example shows how to use `plotly.express`'s `trendline` parameter to train a simply Ordinary Least Square (OLS) for predicting the tips servers will receive based on the value of the total bill. ```python import plotly.express as px @@ -108,7 +108,7 @@ fig.show() ## Comparing different kNN models parameters -Compare the performance of two different models on the same dataset. This can be easily combined with discrete color legends from `px`. +Compare the performance of two different models on the same dataset. This can be easily combined with discrete color legends from `px`, such as coloring by the assigned `sex`. 
```python import numpy as np @@ -136,9 +136,51 @@ fig.add_traces(go.Scatter(x=x_range, y=y_dist, name='Weights: Distance')) fig.show() ``` +## Displaying `PolynomialFeatures` using $\LaTeX$ + +It's easy to diplay latex equations in legend and titles by simply adding `$` before and after your equation. + +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.linear_model import LinearRegression +from sklearn.preprocessing import PolynomialFeatures + +def format_coefs(coefs): + equation_list = [f"{coef}x^{i}" for i, coef in enumerate(coefs)] + equation = "$" + " + ".join(equation_list) + "$" + + replace_map = {"x^0": "", "x^1": "x", '+ -': '- '} + for old, new in replace_map.items(): + equation = equation.replace(old, new) + + return equation + +df = px.data.tips() +X = df.total_bill.values.reshape(-1, 1) +x_range = np.linspace(X.min(), X.max(), 100).reshape(-1, 1) + +fig = px.scatter(df, x='total_bill', y='tip', opacity=0.65) +for n_features in [1, 2, 3, 4]: + poly = PolynomialFeatures(n_features) + poly.fit(X) + X_poly = poly.transform(X) + x_range_poly = poly.transform(x_range) + + model = LinearRegression(fit_intercept=False) + model.fit(X_poly, df.tip) + y_poly = model.predict(x_range_poly) + + equation = format_coefs(model.coef_.round(2)) + fig.add_traces(go.Scatter(x=x_range.squeeze(), y=y_poly, name=equation)) + +fig.show() +``` + ## 3D regression surface with `px.scatter_3d` and `go.Surface` -Visualize the decision plane of your model whenever you have more than one variable in your `X`. +Visualize the decision plane of your model whenever you have more than one variable in your input data. ```python import numpy as np @@ -176,53 +218,44 @@ fig.add_traces(go.Surface(x=xrange, y=yrange, z=pred, name='pred_surface')) fig.show() ``` -## Displaying `PolynomialFeatures` using $\LaTeX$ +## Visualizing coefficients for multiple linear regression (MLR) -It's easy to diplay latex equations in legend and titles by simply adding `$` before and after your equation. +When you are fitting a linear regression, you want to often know what feature matters the most in your regression's output. 
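One caveat the next example does not address: raw coefficients are only directly comparable when the features share a scale. A common remedy, sketched here with a scikit-learn pipeline (an illustration, not part of the original example), is to standardize the features first:

```python
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

df = px.data.iris()
X = df[['sepal_length', 'sepal_width', 'petal_length']]
y = df['petal_width']

# After standardization, each coefficient is in "per standard deviation" units
model = make_pipeline(StandardScaler(), LinearRegression())
model.fit(X, y)
print(model.named_steps['linearregression'].coef_)
```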
```python -import numpy as np import plotly.express as px import plotly.graph_objects as go from sklearn.linear_model import LinearRegression -from sklearn.preprocessing import PolynomialFeatures -def format_coefs(coefs): - equation_list = [f"{coef}x^{i}" for i, coef in enumerate(coefs)] - equation = "$" + " + ".join(equation_list) + "$" - - replace_map = {"x^0": "", "x^1": "x", '+ -': '- '} - for old, new in replace_map.items(): - equation = equation.replace(old, new) - - return equation +df = px.data.iris() -df = px.data.tips() -X = df.total_bill.values.reshape(-1, 1) -x_range = np.linspace(X.min(), X.max(), 100).reshape(-1, 1) +X = df.drop(columns=['petal_width', 'species_id']) +X = pd.get_dummies(X, columns=['species'], prefix_sep='=') +y = df['petal_width'] -fig = px.scatter(df, x='total_bill', y='tip', opacity=0.65) -for n_features in [1, 2, 3, 4]: - poly = PolynomialFeatures(n_features) - poly.fit(X) - X_poly = poly.transform(X) - x_range_poly = poly.transform(x_range) +model = LinearRegression() +model.fit(X, y) - model = LinearRegression(fit_intercept=False) - model.fit(X_poly, df.tip) - y_poly = model.predict(x_range_poly) - - equation = format_coefs(model.coef_.round(2)) - fig.add_traces(go.Scatter(x=x_range.squeeze(), y=y_poly, name=equation)) +colors = ['Positive' if c > 0 else 'Negative' for c in model.coef_] +fig = px.bar( + x=X.columns, y=model.coef_, color=colors, + color_discrete_sequence=['red', 'blue'], + labels=dict(x='Feature', y='Linear coefficient'), + title='Weight of each feature for predicting petal width' +) fig.show() ``` ## Prediction Error Plots +When you are working with very high-dimensional data, it is inconvenient to plot every dimension with your output `y`. Instead, you can use methods such as prediction error plots, which let you visualize how well your model does compared to the ground truth. + ### Simple actual vs predicted plot +This example shows you the simplest way to compare the predicted output vs. the actual output. A good model will have most of the scatter dots near the diagonal black line. + ```python import plotly.express as px import plotly.graph_objects as go @@ -323,10 +356,10 @@ fig = px.scatter( fig.show() ``` -## Regularization visualization +## Visualize regularization across different cross-validation folds -### Plot alphas for individual folds +In this example, we show how to plot the results of various $\alpha$ penalization values from the results of cross-validation using scikit-learn's `LassoCV`. This is useful to see how much the error of the optimal alpha actually varies across CV folds. 
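As a reminder of what the $\alpha$ values in this figure control, `LassoCV` minimizes the L1-penalized least-squares objective, in scikit-learn's parameterization with $n$ samples:

$$\min_{\beta} \; \frac{1}{2n} \lVert y - X\beta \rVert_2^2 + \alpha \lVert \beta \rVert_1$$

Larger $\alpha$ values shrink more coefficients exactly to zero, which is why the mean squared error typically first falls and then rises as $\alpha$ grows.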
```python import pandas as pd @@ -335,6 +368,8 @@ import plotly.express as px import plotly.graph_objects as go from sklearn.linear_model import LassoCV +N_FOLD = 6 + # Load and preprocess the data df = px.data.gapminder() X = df.drop(columns=['lifeExp', 'iso_num']) From 3857aea538bcf6e5af02160e6df1a61e42abffdb Mon Sep 17 00:00:00 2001 From: xhlulu Date: Fri, 6 Mar 2020 16:18:32 -0500 Subject: [PATCH 15/16] ML Docs: Start pca notebook --- doc/python/ml-pca.md | 135 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 doc/python/ml-pca.md diff --git a/doc/python/ml-pca.md b/doc/python/ml-pca.md new file mode 100644 index 00000000000..105edd8af66 --- /dev/null +++ b/doc/python/ml-pca.md @@ -0,0 +1,135 @@ +--- +jupyter: + jupytext: + notebook_metadata_filter: all + text_representation: + extension: .md + format_name: markdown + format_version: '1.1' + jupytext_version: 1.1.1 + kernelspec: + display_name: Python 3 + language: python + name: python3 + language_info: + codemirror_mode: + name: ipython + version: 3 + file_extension: .py + mimetype: text/x-python + name: python + nbconvert_exporter: python + pygments_lexer: ipython3 + version: 3.7.6 + plotly: + description: Visualize Principle Component Analysis (PCA) of your high-dimensional + data with Plotly on Python. + display_as: ai_ml + language: python + layout: base + name: PCA Visualization + order: 4 + page_type: example_index + permalink: python/pca-visualization/ + thumbnail: thumbnail/ml-pca.png +--- + +## Basic PCA Scatter Plot + +This example shows you how to simply visualize the first two principal components of a PCA, by reducing a dataset of 4 dimensions to 2D. It uses scikit-learn's `PCA`. + +```python +import plotly.express as px +from sklearn.decomposition import PCA + +df = px.data.iris() +X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']] + +pca = PCA(n_components=2) +components = pca.fit_transform(X) + +fig = px.scatter(x=components[:, 0], y=components[:, 1], color=df['species']) +fig.show() +``` + +## Visualize PCA with `px.scatter_3d` + +Just like the basic PCA plot, this let you visualize the first 3 dimensions. This additionally displays the total variance explained by those components. + +```python +import plotly.express as px +from sklearn.decomposition import PCA + +df = px.data.iris() +X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']] + +pca = PCA(n_components=3) +components = pca.fit_transform(X) + +total_var = pca.explained_variance_ratio_.sum() * 100 + +fig = px.scatter_3d( + x=components[:, 0], y=components[:, 1], z=components[:, 2], + color=df['species'], + title=f'Total Explained Variance: {total_var:.2f}%', + labels={'x': 'PC 1', 'y': 'PC 2', 'z': 'PC 3'}, +) +fig.show() +``` + +## Plot high-dimensional components with `px.scatter_matrix` + +If you need to visualize more than 3 dimensions, you can use scatter plot matrices. 
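The figure titles in this section report the total explained variance. In terms of the eigenvalues $\lambda_i$ of the data's covariance matrix, keeping the first $k$ of $d$ principal components explains the fraction

$$\frac{\sum_{i=1}^{k} \lambda_i}{\sum_{j=1}^{d} \lambda_j},$$

and `pca.explained_variance_ratio_` holds the individual terms $\lambda_i / \sum_j \lambda_j$ that the examples below sum up.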

```python
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.datasets import load_boston

boston = load_boston()
df = pd.DataFrame(boston.data, columns=boston.feature_names)

pca = PCA(n_components=5)
components = pca.fit_transform(df)

total_var = pca.explained_variance_ratio_.sum() * 100

labels = {str(i): f"PC {i+1}" for i in range(5)}
labels['color'] = 'Median Price'

fig = px.scatter_matrix(
    components,
    color=boston.target,
    dimensions=range(5),
    labels=labels,
    title=f'Total Explained Variance: {total_var:.2f}%',
)
fig.update_traces(diagonal_visible=False)
fig.show()
```

## Plotting explained variance

Often, you might be interested in seeing how much variance the PCA is able to explain as you increase the number of components, in order to decide how many dimensions to ultimately keep or analyze. This example shows you how to quickly plot the cumulative sum of explained variance for a high-dimensional dataset like [Diabetes](https://scikit-learn.org/stable/datasets/index.html#diabetes-dataset).

```python
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.datasets import load_diabetes

diabetes = load_diabetes()
df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)

pca = PCA()
pca.fit(df)
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)

px.area(
    x=range(1, exp_var_cumul.shape[0] + 1),
    y=exp_var_cumul,
    labels={"x": "# Components", "y": "Explained Variance"}
)
```

## Visualize loadings

From 77a3d82088dec7a909eda676c2153a603b97ce86 Mon Sep 17 00:00:00 2001
From: xhlulu
Date: Fri, 6 Mar 2020 18:06:03 -0500
Subject: [PATCH 16/16] ML Docs: Start ROC/PR section

---
 doc/python/ml-roc-pr.md | 201 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 201 insertions(+)
 create mode 100644 doc/python/ml-roc-pr.md

diff --git a/doc/python/ml-roc-pr.md b/doc/python/ml-roc-pr.md
new file mode 100644
index 00000000000..8c1bc6becb4
--- /dev/null
+++ b/doc/python/ml-roc-pr.md
@@ -0,0 +1,201 @@
---
jupyter:
  jupytext:
    notebook_metadata_filter: all
    text_representation:
      extension: .md
      format_name: markdown
      format_version: '1.1'
      jupytext_version: 1.1.1
  kernelspec:
    display_name: Python 3
    language: python
    name: python3
  language_info:
    codemirror_mode:
      name: ipython
      version: 3
    file_extension: .py
    mimetype: text/x-python
    name: python
    nbconvert_exporter: python
    pygments_lexer: ipython3
    version: 3.7.6
  plotly:
    description: Interpret the results of your classification using Receiver Operating
      Characteristics (ROC) and Precision-Recall (PR) Curves using Plotly on Python.
    display_as: ai_ml
    language: python
    layout: base
    name: ROC and PR Curves
    order: 3
    page_type: example_index
    permalink: python/roc-and-pr-curves/
    thumbnail: thumbnail/ml-roc-pr.png
---

## Basic Binary ROC Curve

```python
import plotly.express as px
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, random_state=0)

model = LogisticRegression()
model.fit(X, y)
y_score = model.predict_proba(X)[:, 1]

fpr, tpr, thresholds = roc_curve(y, y_score)

fig = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate')
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)
fig.show()
```

## Multiclass ROC Curve

When you have more than 2 classes, you will need to plot the ROC curve for each class separately. Make sure that you use a [one-versus-rest](https://scikit-learn.org/stable/modules/multiclass.html#one-vs-the-rest) model, or make sure that your problem has a [multi-label](https://scikit-learn.org/stable/modules/multiclass.html#multilabel-classification-format) format; otherwise, your ROC curve might not return the expected results.

```python
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score
import plotly.graph_objects as go
import plotly.express as px

np.random.seed(0)

# Artificially add noise to make task harder
df = px.data.iris()
samples = df.species.sample(n=50, random_state=0)
np.random.shuffle(samples.values)
df.loc[samples.index, 'species'] = samples.values

# Define the inputs and outputs
X = df.drop(columns=['species', 'species_id'])
y = df['species']

# Fit the model
model = LogisticRegression(max_iter=200)
model.fit(X, y)
y_scores = model.predict_proba(X)

# One-hot encode the labels after fitting, so that model.classes_ exists
y_onehot = pd.get_dummies(y, columns=model.classes_)

# Create an empty figure, and iteratively add new lines
# every time we compute a new class
fig = go.Figure()
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

for i in range(y_scores.shape[1]):
    y_true = y_onehot.iloc[:, i]
    y_score = y_scores[:, i]

    fpr, tpr, _ = roc_curve(y_true, y_score)
    auc_score = roc_auc_score(y_true, y_score)

    name = f"{y_onehot.columns[i]} (AUC={auc_score:.2f})"
    fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name, mode='lines'))

fig.update_layout(
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate'
)
fig.show()
```

## Precision-Recall Curves

Plotting the PR curve is very similar to plotting the ROC curve.
The following examples are slightly modified from the previous ones:

```python
import plotly.express as px
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, auc
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, random_state=0)

model = LogisticRegression()
model.fit(X, y)
y_score = model.predict_proba(X)[:, 1]

precision, recall, thresholds = precision_recall_curve(y, y_score)

fig = px.area(
    x=recall, y=precision,
    title=f'Precision-Recall Curve (AUC={auc(recall, precision):.4f})',
    labels=dict(x='Recall', y='Precision')
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=1, y1=0
)
fig.show()
```

In this example, we use the [average precision](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html) metric, which is an alternative scoring method to the area under the PR curve.

```python
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, average_precision_score
import plotly.graph_objects as go
import plotly.express as px

np.random.seed(0)

# Artificially add noise to make task harder
df = px.data.iris()
samples = df.species.sample(n=30, random_state=0)
np.random.shuffle(samples.values)
df.loc[samples.index, 'species'] = samples.values

# Define the inputs and outputs
X = df.drop(columns=['species', 'species_id'])
y = df['species']

# Fit the model
model = LogisticRegression(max_iter=200)
model.fit(X, y)
y_scores = model.predict_proba(X)

# One-hot encode the labels after fitting, so that model.classes_ exists
y_onehot = pd.get_dummies(y, columns=model.classes_)

# Create an empty figure, and iteratively add new lines
# every time we compute a new class
fig = go.Figure()
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=1, y1=0
)

for i in range(y_scores.shape[1]):
    y_true = y_onehot.iloc[:, i]
    y_score = y_scores[:, i]

    precision, recall, _ = precision_recall_curve(y_true, y_score)
    auc_score = average_precision_score(y_true, y_score)

    name = f"{y_onehot.columns[i]} (AP={auc_score:.2f})"
    fig.add_trace(go.Scatter(x=recall, y=precision, name=name, mode='lines'))

fig.update_layout(
    xaxis_title='Recall',
    yaxis_title='Precision'
)
fig.show()
```
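For reference, the average precision (AP) reported in the legend above summarizes the PR curve as a weighted mean of precisions at successive thresholds, following scikit-learn's definition:

$$\text{AP} = \sum_n (R_n - R_{n-1}) \, P_n$$

where $P_n$ and $R_n$ are the precision and recall at the $n$-th threshold. Unlike a trapezoidal estimate of the area under the PR curve, this measure does not interpolate between points.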