diff --git a/doc/python/ml-knn.md b/doc/python/ml-knn.md new file mode 100644 index 00000000000..2bcab469875 --- /dev/null +++ b/doc/python/ml-knn.md @@ -0,0 +1,251 @@ +--- +jupyter: + jupytext: + notebook_metadata_filter: all + text_representation: + extension: .md + format_name: markdown + format_version: '1.1' + jupytext_version: 1.1.1 + kernelspec: + display_name: Python 3 + language: python + name: python3 + language_info: + codemirror_mode: + name: ipython + version: 3 + file_extension: .py + mimetype: text/x-python + name: python + nbconvert_exporter: python + pygments_lexer: ipython3 + version: 3.7.6 + plotly: + description: Visualize scikit-learn's k-Nearest Neighbors (kNN) classification + with Plotly + display_as: ai_ml + language: python + layout: base + name: kNN Classification + order: 1 + page_type: example_index + permalink: python/knn-classification/ + thumbnail: thumbnail/knn-classification.png +--- + +## Basic Binary Classification with `plotly.express` + +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.datasets import make_moons +from sklearn.neighbors import KNeighborsClassifier + +X, y = make_moons(noise=0.3, random_state=0) +X_test, _ = make_moons(noise=0.3, random_state=1) + +clf = KNeighborsClassifier(15) +clf.fit(X, y.astype(str)) # Fit on training set +y_pred = clf.predict(X_test) # Predict on new data + +fig = px.scatter(x=X_test[:, 0], y=X_test[:, 1], color=y_pred, labels={'color': 'predicted'}) +fig.update_traces(marker_size=10) +fig.show() +``` + +## Visualize Binary Prediction Scores + +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.datasets import make_classification +from sklearn.neighbors import KNeighborsClassifier + +X, y = make_classification(n_features=2, n_redundant=0, random_state=0) +X_test, _ = make_classification(n_features=2, n_redundant=0, random_state=1) + +clf = KNeighborsClassifier(15) +clf.fit(X, 
y) # Fit on training set +y_score = clf.predict_proba(X_test)[:, 1] # Predict on new data + +fig = px.scatter(x=X_test[:, 0], y=X_test[:, 1], color=y_score, labels={'color': 'score'}) +fig.update_traces(marker_size=10) +fig.show() +``` + +## Probability Estimates with `go.Contour` + +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.datasets import make_moons +from sklearn.neighbors import KNeighborsClassifier + +mesh_size = .02 +margin = 1 + +X, y = make_moons(noise=0.3, random_state=0) + +# Create a mesh grid on which we will run our model +x_min, x_max = X[:, 0].min() - margin, X[:, 0].max() + margin +y_min, y_max = X[:, 1].min() - margin, X[:, 1].max() + margin +xrange = np.arange(x_min, x_max, mesh_size) +yrange = np.arange(y_min, y_max, mesh_size) +xx, yy = np.meshgrid(xrange, yrange) + +# Create classifier, run predictions on grid +clf = KNeighborsClassifier(15, weights='uniform') +clf.fit(X, y) +Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] +Z = Z.reshape(xx.shape) + +fig = px.scatter(X, x=0, y=1, color=y.astype(str), labels={'0':'', '1':''}) +fig.update_traces(marker_size=10, marker_line_width=1) +fig.add_trace( + go.Contour( + x=xrange, + y=yrange, + z=Z, + showscale=False, + colorscale=['Blue', 'Red'], + opacity=0.4, + name='Confidence' + ) +) +fig.show() +``` + +## Multi-class prediction confidence with `go.Heatmap` + +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.neighbors import KNeighborsClassifier + +mesh_size = .02 +margin = 1 + +# We will use the iris data, which is included in px +df = px.data.iris() +X = df[['sepal_length', 'sepal_width']] +y = df.species_id + +# Create a mesh grid on which we will run our model +l_min, l_max = df.sepal_length.min() - margin, df.sepal_length.max() + margin +w_min, w_max = df.sepal_width.min() - margin, df.sepal_width.max() + margin +lrange = np.arange(l_min, l_max, mesh_size) 
+wrange = np.arange(w_min, w_max, mesh_size) +ll, ww = np.meshgrid(lrange, wrange) + +# Create classifier, run predictions on grid +clf = KNeighborsClassifier(15, weights='distance') +clf.fit(X, y) +Z = clf.predict(np.c_[ll.ravel(), ww.ravel()]) +Z = Z.reshape(ll.shape) +proba = clf.predict_proba(np.c_[ll.ravel(), ww.ravel()]) +proba = proba.reshape(ll.shape + (3,)) # one probability per class at every grid point + +fig = px.scatter(df, x='sepal_length', y='sepal_width', color='species', width=1000, height=1000) +fig.update_traces(marker_size=10, marker_line_width=1) +fig.add_trace( + go.Heatmap( + x=lrange, + y=wrange, + z=Z, + showscale=False, + colorscale=[[0.0, 'blue'], [0.5, 'red'], [1.0, 'green']], + opacity=0.25, + customdata=proba, + hovertemplate=( # each <br> starts a new line in the hover label + 'sepal length: %{x}<br>' + 'sepal width: %{y}<br>' + 'p(setosa): %{customdata[0]:.3f}<br>' + 'p(versicolor): %{customdata[1]:.3f}<br>' + 'p(virginica): %{customdata[2]:.3f}' + ) + ) +) +fig.show() +``` + +## 3D Classification with `px.scatter_3d` + +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.neighbors import KNeighborsClassifier +from sklearn.model_selection import train_test_split + +df = px.data.iris() +features = ["sepal_width", "sepal_length", "petal_width"] + +X = df[features] +y = df.species +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) + +# Create classifier, run predictions on the test set +clf = KNeighborsClassifier(15, weights='distance') +clf.fit(X_train, y_train) +y_pred = clf.predict(X_test) +y_score = clf.predict_proba(X_test) +y_score = np.around(y_score.max(axis=1), 4) # keep only the highest class probability per sample + +fig = px.scatter_3d( + X_test, + x='sepal_length', + y='sepal_width', + z='petal_width', + symbol=y_pred, + color=y_score, + labels={'symbol': 'prediction', 'color': 'score'} +) +fig.update_layout(legend=dict(x=0, y=0)) +fig.show() +``` + +## High Dimension Visualization with `px.scatter_matrix` + +If you need to visualize classifications that go beyond 3D, you can use the [scatter plot matrix](https://plot.ly/python/splom/). 
+ +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.neighbors import KNeighborsClassifier +from sklearn.model_selection import train_test_split + +df = px.data.iris() +features = ["sepal_width", "sepal_length", "petal_width", "petal_length"] + +X = df[features] +y = df.species +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) + +# Create classifier, run predictions on the test set +clf = KNeighborsClassifier(15, weights='distance') +clf.fit(X_train, y_train) +y_pred = clf.predict(X_test) + +fig = px.scatter_matrix(X_test, dimensions=features, color=y_pred, labels={'color': 'prediction'}) +fig.show() +``` + +### Reference + +Learn more about `px`, `go.Contour`, and `go.Heatmap` here: +* https://plot.ly/python/plotly-express/ +* https://plot.ly/python/heatmaps/ +* https://plot.ly/python/contour-plots/ +* https://plot.ly/python/3d-scatter-plots/ +* https://plot.ly/python/splom/ + +This tutorial was inspired by amazing examples from the official scikit-learn docs: +* https://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html +* https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html +* https://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html diff --git a/doc/python/ml-pca.md b/doc/python/ml-pca.md new file mode 100644 index 00000000000..105edd8af66 --- /dev/null +++ b/doc/python/ml-pca.md @@ -0,0 +1,135 @@ +--- +jupyter: + jupytext: + notebook_metadata_filter: all + text_representation: + extension: .md + format_name: markdown + format_version: '1.1' + jupytext_version: 1.1.1 + kernelspec: + display_name: Python 3 + language: python + name: python3 + language_info: + codemirror_mode: + name: ipython + version: 3 + file_extension: .py + mimetype: text/x-python + name: python + nbconvert_exporter: python + pygments_lexer: ipython3 + version: 3.7.6 + plotly: + description: Visualize Principal Component 
Analysis (PCA) of your high-dimensional + data with Plotly on Python. + display_as: ai_ml + language: python + layout: base + name: PCA Visualization + order: 4 + page_type: example_index + permalink: python/pca-visualization/ + thumbnail: thumbnail/ml-pca.png +--- + +## Basic PCA Scatter Plot + +This example shows you how to simply visualize the first two principal components of a PCA, by reducing a dataset of 4 dimensions to 2D. It uses scikit-learn's `PCA`. + +```python +import plotly.express as px +from sklearn.decomposition import PCA + +df = px.data.iris() +X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']] + +pca = PCA(n_components=2) +components = pca.fit_transform(X) + +fig = px.scatter(x=components[:, 0], y=components[:, 1], color=df['species']) +fig.show() +``` + +## Visualize PCA with `px.scatter_3d` + +Just like the basic PCA plot, this lets you visualize the first 3 principal components. This additionally displays the total variance explained by those components. + +```python +import plotly.express as px +from sklearn.decomposition import PCA + +df = px.data.iris() +X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']] + +pca = PCA(n_components=3) +components = pca.fit_transform(X) + +total_var = pca.explained_variance_ratio_.sum() * 100 + +fig = px.scatter_3d( + x=components[:, 0], y=components[:, 1], z=components[:, 2], + color=df['species'], + title=f'Total Explained Variance: {total_var:.2f}%', + labels={'x': 'PC 1', 'y': 'PC 2', 'z': 'PC 3'}, +) +fig.show() +``` + +## Plot high-dimensional components with `px.scatter_matrix` + +If you need to visualize more than 3 dimensions, you can use scatter plot matrices. 
+ +```python +import pandas as pd +import plotly.express as px +from sklearn.decomposition import PCA +from sklearn.datasets import load_boston + +boston = load_boston() +df = pd.DataFrame(boston.data, columns=boston.feature_names) + +pca = PCA(n_components=5) +components = pca.fit_transform(df) +total_var = pca.explained_variance_ratio_.sum() * 100 # share of variance kept by the 5 components, in percent + +labels = {str(i): f"PC {i+1}" for i in range(5)} +labels['color'] = 'Median Price' + +fig = px.scatter_matrix( + components, + color=boston.target, + dimensions=range(5), + labels=labels, + title=f'Total Explained Variance: {total_var:.2f}%', +) +fig.update_traces(diagonal_visible=False) +fig.show() +``` + +## Plotting explained variance + +Often, you might be interested in seeing how much variance the PCA is able to explain as you increase the number of components, in order to decide how many dimensions to ultimately keep or analyze. This example shows you how to quickly plot the cumulative sum of explained variance for a high-dimensional dataset like [Diabetes](https://scikit-learn.org/stable/datasets/index.html#diabetes-dataset). 
+ +```python +import numpy as np +import pandas as pd +import plotly.express as px +from sklearn.decomposition import PCA +from sklearn.datasets import load_diabetes + +diabetes = load_diabetes() +df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names) +pca = PCA() +pca.fit(df) +exp_var_cumul = np.cumsum(pca.explained_variance_ratio_) # running total of the per-component variance ratios + +px.area( + x=range(1, exp_var_cumul.shape[0] + 1), + y=exp_var_cumul, + labels={"x": "# Components", "y": "Explained Variance"} +) +``` + +## Visualize loadings diff --git a/doc/python/ml-regression.md b/doc/python/ml-regression.md new file mode 100644 index 00000000000..e37e1d04e95 --- /dev/null +++ b/doc/python/ml-regression.md @@ -0,0 +1,509 @@ +--- +jupyter: + jupytext: + notebook_metadata_filter: all + text_representation: + extension: .md + format_name: markdown + format_version: '1.1' + jupytext_version: 1.1.1 + kernelspec: + display_name: Python 3 + language: python + name: python3 + language_info: + codemirror_mode: + name: ipython + version: 3 + file_extension: .py + mimetype: text/x-python + name: python + nbconvert_exporter: python + pygments_lexer: ipython3 + version: 3.7.6 + plotly: + description: Visualize regression in scikit-learn with Plotly. + display_as: ai_ml + language: python + layout: base + name: ML Regression + order: 2 + page_type: example_index + permalink: python/ml-regression/ + thumbnail: thumbnail/ml-regression.png +--- + +## Basic linear regression plots + + +### Ordinary Least Squares (OLS) with `plotly.express` + + +This example shows how to use `plotly.express`'s `trendline` parameter to fit a simple Ordinary Least Squares (OLS) regression that predicts the tips servers will receive based on the value of the total bill. 
+ +```python +import plotly.express as px + +df = px.data.tips() +fig = px.scatter( + df, x='total_bill', y='tip', opacity=0.65, + trendline='ols', trendline_color_override='red' +) +fig.show() +``` + +### Linear Regression with scikit-learn + +You can also perform the same prediction using scikit-learn's `LinearRegression`. + +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.linear_model import LinearRegression + +df = px.data.tips() +X = df.total_bill.values.reshape(-1, 1) + +model = LinearRegression() +model.fit(X, df.tip) + +x_range = np.linspace(X.min(), X.max(), 100) +y_range = model.predict(x_range.reshape(-1, 1)) + +fig = px.scatter(df, x='total_bill', y='tip', opacity=0.65) +fig.add_traces(go.Scatter(x=x_range, y=y_range, name='Regression Fit')) +fig.show() +``` + +## Model generalization on unseen data + +Easily color your plot based on a predefined data split. + +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.linear_model import LinearRegression +from sklearn.model_selection import train_test_split + +df = px.data.tips() +X = df.total_bill.values.reshape(-1, 1) +X_train, X_test, y_train, y_test = train_test_split(X, df.tip, random_state=0) + +model = LinearRegression() +model.fit(X_train, y_train) + +x_range = np.linspace(X.min(), X.max(), 100) +y_range = model.predict(x_range.reshape(-1, 1)) + + +fig = go.Figure([ + go.Scatter(x=X_train.squeeze(), y=y_train, name='train', mode='markers'), + go.Scatter(x=X_test.squeeze(), y=y_test, name='test', mode='markers'), + go.Scatter(x=x_range, y=y_range, name='prediction') +]) +fig.show() +``` + +## Comparing different kNN models parameters + +Compare the performance of two different models on the same dataset. This can be easily combined with discrete color legends from `px`, such as coloring by the assigned `sex`. 
+ +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.neighbors import KNeighborsRegressor + +df = px.data.tips() +X = df.total_bill.values.reshape(-1, 1) +x_range = np.linspace(X.min(), X.max(), 100) + +# Model #1 +knn_dist = KNeighborsRegressor(10, weights='distance') +knn_dist.fit(X, df.tip) +y_dist = knn_dist.predict(x_range.reshape(-1, 1)) + +# Model #2 +knn_uni = KNeighborsRegressor(10, weights='uniform') +knn_uni.fit(X, df.tip) +y_uni = knn_uni.predict(x_range.reshape(-1, 1)) + +fig = px.scatter(df, x='total_bill', y='tip', color='sex', opacity=0.65) +fig.add_traces(go.Scatter(x=x_range, y=y_uni, name='Weights: Uniform')) +fig.add_traces(go.Scatter(x=x_range, y=y_dist, name='Weights: Distance')) +fig.show() +``` + +## Displaying `PolynomialFeatures` using $\LaTeX$ + +It's easy to display LaTeX equations in legends and titles by simply adding `$` before and after your equation. + +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.linear_model import LinearRegression +from sklearn.preprocessing import PolynomialFeatures + +def format_coefs(coefs): + """Format coefficients (lowest degree first) as a LaTeX polynomial, e.g. `$1.0 - 0.5x + 0.01x^{2}$`.""" + # Build every term directly: replacing "x^1" inside the joined string + # would also corrupt higher-order terms such as x^10. + variables = ["", "x"] + [f"x^{{{i}}}" for i in range(2, len(coefs))] + terms = [f"{coef}{var}" for coef, var in zip(coefs, variables)] + equation = "$" + " + ".join(terms) + "$" + + return equation.replace("+ -", "- ") # show "a + -b" as "a - b" + +df = px.data.tips() +X = df.total_bill.values.reshape(-1, 1) +x_range = np.linspace(X.min(), X.max(), 100).reshape(-1, 1) + +fig = px.scatter(df, x='total_bill', y='tip', opacity=0.65) +for n_features in [1, 2, 3, 4]: + poly = PolynomialFeatures(n_features) + poly.fit(X) + X_poly = poly.transform(X) + x_range_poly = poly.transform(x_range) + + model = LinearRegression(fit_intercept=False) + model.fit(X_poly, df.tip) + y_poly = model.predict(x_range_poly) + + equation = format_coefs(model.coef_.round(2)) + 
fig.add_traces(go.Scatter(x=x_range.squeeze(), y=y_poly, name=equation)) + +fig.show() +``` + +## 3D regression surface with `px.scatter_3d` and `go.Surface` + +Visualize the prediction surface of your model whenever you have more than one variable in your input data. + +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.neighbors import KNeighborsRegressor + +mesh_size = .02 +margin = 0 + +df = px.data.iris() + +X = df[['sepal_width', 'sepal_length']] +y = df['petal_width'] + +# Condition the model on sepal width and length, predict the petal width +knn = KNeighborsRegressor(10, weights='distance') +knn.fit(X, y) + +# Create a mesh grid on which we will run our model +x_min, x_max = X.sepal_width.min() - margin, X.sepal_width.max() + margin +y_min, y_max = X.sepal_length.min() - margin, X.sepal_length.max() + margin +xrange = np.arange(x_min, x_max, mesh_size) +yrange = np.arange(y_min, y_max, mesh_size) +xx, yy = np.meshgrid(xrange, yrange) + +# Run kNN +pred = knn.predict(np.c_[xx.ravel(), yy.ravel()]) +pred = pred.reshape(xx.shape) + +# Generate the plot +fig = px.scatter_3d(df, x='sepal_width', y='sepal_length', z='petal_width') +fig.update_traces(marker=dict(size=5)) +fig.add_traces(go.Surface(x=xrange, y=yrange, z=pred, name='pred_surface')) +fig.show() +``` + +## Visualizing coefficients for multiple linear regression (MLR) + +When you are fitting a linear regression, you often want to know which features matter the most in your regression's output. 
+ +```python +import pandas as pd +import plotly.express as px +from sklearn.linear_model import LinearRegression + +df = px.data.iris() + +X = df.drop(columns=['petal_width', 'species_id']) +X = pd.get_dummies(X, columns=['species'], prefix_sep='=') # one-hot encode the categorical species column +y = df['petal_width'] + +model = LinearRegression() +model.fit(X, y) + +colors = ['Positive' if c > 0 else 'Negative' for c in model.coef_] # color each bar by the sign of its coefficient + +fig = px.bar( + x=X.columns, y=model.coef_, color=colors, + color_discrete_sequence=['red', 'blue'], + labels=dict(x='Feature', y='Linear coefficient'), + title='Weight of each feature for predicting petal width' +) +fig.show() +``` + +## Prediction Error Plots + +When you are working with very high-dimensional data, it is inconvenient to plot every dimension with your output `y`. Instead, you can use methods such as prediction error plots, which let you visualize how well your model does compared to the ground truth. + + +### Simple actual vs predicted plot + +This example shows you the simplest way to compare the predicted output vs. the actual output. A good model will have most of the scatter dots near the diagonal black line. + +```python +import plotly.express as px +import plotly.graph_objects as go +from sklearn.linear_model import LinearRegression + +df = px.data.iris() +X = df[['sepal_width', 'sepal_length']] +y = df['petal_width'] + +# Condition the model on sepal width and length, predict the petal width +model = LinearRegression() +model.fit(X, y) +y_pred = model.predict(X) + +fig = px.scatter(x=y_pred, y=y, labels={'x': 'prediction', 'y': 'actual'}) +fig.add_shape( + type="line", line=dict(dash='dash'), + x0=y.min(), y0=y.min(), + x1=y.max(), y1=y.max() +) +fig.show() +``` + +### Enhanced prediction error analysis using `plotly.express` + +Add marginal histograms to quickly diagnose any prediction bias your model might have. 
The built-in `OLS` functionality lets you visualize how well your model generalizes by comparing it with the theoretical optimal fit (black dashed line). + +```python +import plotly.express as px +import plotly.graph_objects as go +from sklearn.linear_model import LinearRegression +from sklearn.model_selection import train_test_split + +df = px.data.iris() + +# Split data into training and test splits +train_idx, test_idx = train_test_split(df.index, test_size=.25, random_state=0) +df['split'] = 'train' +df.loc[test_idx, 'split'] = 'test' + +X = df[['sepal_width', 'sepal_length']] +y = df['petal_width'] +X_train = df.loc[train_idx, ['sepal_width', 'sepal_length']] +y_train = df.loc[train_idx, 'petal_width'] + +# Condition the model on sepal width and length, predict the petal width +model = LinearRegression() +model.fit(X_train, y_train) +df['prediction'] = model.predict(X) + +fig = px.scatter( + df, x='prediction', y='petal_width', + marginal_x='histogram', marginal_y='histogram', + color='split', trendline='ols' +) +fig.add_shape( + type="line", line=dict(dash='dash'), + x0=y.min(), y0=y.min(), + x1=y.max(), y1=y.max() +) + +fig.show() +``` + +## Residual plots + +Just like prediction error plots, it's easy to visualize your prediction residuals in just a few lines of code using `plotly.express`'s built-in capabilities. 
+ +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.linear_model import LinearRegression +from sklearn.model_selection import train_test_split + +df = px.data.iris() + +# Split data into training and test splits +train_idx, test_idx = train_test_split(df.index, test_size=.25, random_state=0) +df['split'] = 'train' +df.loc[test_idx, 'split'] = 'test' + +X = df[['sepal_width', 'sepal_length']] +X_train = df.loc[train_idx, ['sepal_width', 'sepal_length']] +y_train = df.loc[train_idx, 'petal_width'] + +# Condition the model on sepal width and length, predict the petal width +model = LinearRegression() +model.fit(X_train, y_train) +df['prediction'] = model.predict(X) +df['residual'] = df['prediction'] - df['petal_width'] + +fig = px.scatter( + df, x='prediction', y='residual', + marginal_y='violin', + color='split', trendline='ols' +) +fig.show() +``` + +## Visualize regularization across different cross-validation folds + + +In this example, we show how to plot the results of various $\alpha$ penalization values from the results of cross-validation using scikit-learn's `LassoCV`. This is useful to see how much the error of the optimal alpha actually varies across CV folds. + +```python +import pandas as pd +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.linear_model import LassoCV + +N_FOLD = 6 + +# Load and preprocess the data +df = px.data.gapminder() +X = df.drop(columns=['lifeExp', 'iso_num']) +X = pd.get_dummies(X, columns=['country', 'continent', 'iso_alpha']) +y = df['lifeExp'] + +# Train model to predict life expectancy +model = LassoCV(cv=N_FOLD, normalize=True) +model.fit(X, y) +mean_mse = model.mse_path_.mean(axis=-1) # average the MSE over the folds for each alpha + +fig = go.Figure([ + go.Scatter( + x=model.alphas_, y=model.mse_path_[:, i], + name=f"Fold: {i+1}", opacity=.5, line=dict(dash='dash'), + hovertemplate="alpha: %{x} <br>MSE: %{y}" + ) + for i in range(N_FOLD) +]) +fig.add_traces(go.Scatter( + x=model.alphas_, y=mean_mse, + name='Mean', line=dict(color='black', width=3), + hovertemplate="alpha: %{x} <br>MSE: %{y}", +)) + +fig.add_shape( + type="line", line=dict(dash='dash'), + x0=model.alpha_, y0=0, + x1=model.alpha_, y1=1, + yref='paper' +) + +fig.update_layout( + xaxis_title='alpha', + xaxis_type="log", + yaxis_title="Mean Square Error (MSE)" +) +fig.show() +``` + +## Grid search visualization using `px.density_heatmap` and `px.box` + +In this example, we show how to visualize the results of a grid search on a `DecisionTreeRegressor`. The first plot shows how to visualize the score of each model parameter on individual splits (grouped using facets). The second plot aggregates the results of all splits such that each box represents a single model. + +```python +import numpy as np +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go +from sklearn.model_selection import GridSearchCV +from sklearn.tree import DecisionTreeRegressor + +N_FOLD = 6 + +# Load and shuffle dataframe +df = px.data.iris() +df = df.sample(frac=1, random_state=0) + +X = df[['sepal_width', 'sepal_length']] +y = df['petal_width'] + +# Define and fit the grid +model = DecisionTreeRegressor() +param_grid = { + 'criterion': ['mse', 'friedman_mse', 'mae'], + 'max_depth': range(2, 5) +} +grid = GridSearchCV(model, param_grid, cv=N_FOLD) +grid.fit(X, y) +grid_df = pd.DataFrame(grid.cv_results_) + +# Convert the wide format of the grid into the long format +# accepted by plotly.express +melted = ( + grid_df + .rename(columns=lambda col: col.replace('param_', '')) + .melt( + value_vars=[f'split{i}_test_score' for i in range(N_FOLD)], + id_vars=['mean_test_score', 'mean_fit_time', 'criterion', 'max_depth'], + var_name="cv_split", + value_name="r_squared" + ) +) + +# Format the variable names for simplicity +melted['cv_split'] = ( + melted['cv_split'] + .str.replace('_test_score', '') + .str.replace('split', '') +) + +# Single function call to plot each figure +fig_hmap = px.density_heatmap( + melted, x="max_depth", y='criterion', + histfunc="sum", z="r_squared", + 
title='Grid search results on individual fold', + hover_data=['mean_fit_time'], + facet_col="cv_split", facet_col_wrap=3, + labels={'mean_test_score': "mean_r_squared"} +) + +fig_box = px.box( + melted, x='max_depth', y='r_squared', + title='Grid search results ', + hover_data=['mean_fit_time'], + points='all', + color="criterion", + hover_name='cv_split', + labels={'mean_test_score': "mean_r_squared"} +) + +# Display +fig_hmap.show() +fig_box.show() +``` + +### Reference + +Learn more about the `px` figures used in this tutorial: +* Plotly Express: https://plot.ly/python/plotly-express/ +* Vertical Lines: https://plot.ly/python/shapes/ +* Heatmaps: https://plot.ly/python/heatmaps/ +* Box Plots: https://plot.ly/python/box-plots/ +* 3D Scatter: https://plot.ly/python/3d-scatter-plots/ +* Surface Plots: https://plot.ly/python/3d-surface-plots/ + +Learn more about the Machine Learning models used in this tutorial: +* https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html +* https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html +* https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html +* https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html +* https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html + +Other tutorials that inspired this notebook: +* https://seaborn.pydata.org/examples/residplot.html +* https://scikit-learn.org/stable/auto_examples/linear_model/plot_lasso_model_selection.html +* http://www.scikit-yb.org/zh/latest/api/regressor/peplot.html diff --git a/doc/python/ml-roc-pr.md b/doc/python/ml-roc-pr.md new file mode 100644 index 00000000000..8c1bc6becb4 --- /dev/null +++ b/doc/python/ml-roc-pr.md @@ -0,0 +1,201 @@ +--- +jupyter: + jupytext: + notebook_metadata_filter: all + text_representation: + extension: .md + format_name: markdown + format_version: '1.1' + jupytext_version: 
1.1.1 + kernelspec: + display_name: Python 3 + language: python + name: python3 + language_info: + codemirror_mode: + name: ipython + version: 3 + file_extension: .py + mimetype: text/x-python + name: python + nbconvert_exporter: python + pygments_lexer: ipython3 + version: 3.7.6 + plotly: + description: Interpret the results of your classification using Receiver Operating + Characteristics (ROC) and Precision-Recall (PR) Curves using Plotly on Python. + display_as: ai_ml + language: python + layout: base + name: ROC and PR Curves + order: 3 + page_type: example_index + permalink: python/roc-and-pr-curves/ + thumbnail: thumbnail/ml-roc-pr.png +--- + +## Basic Binary ROC Curve + +```python +import plotly.express as px +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import roc_curve, auc +from sklearn.datasets import make_classification + +X, y = make_classification(n_samples=500, random_state=0) + +model = LogisticRegression() +model.fit(X, y) +y_score = model.predict_proba(X)[:, 1] + +fpr, tpr, thresholds = roc_curve(y, y_score) + +fig = px.area( + x=fpr, y=tpr, + title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})', + labels=dict(x='False Positive Rate', y='True Positive Rate') +) +fig.add_shape( + type='line', line=dict(dash='dash'), + x0=0, x1=1, y0=0, y1=1 +) +fig.show() +``` + +## Multiclass ROC Curve + +When you have more than 2 classes, you will need to plot the ROC curve for each class separately. Make sure that you use a [one-versus-rest](https://scikit-learn.org/stable/modules/multiclass.html#one-vs-the-rest) model, or make sure that your problem has a [multi-label](https://scikit-learn.org/stable/modules/multiclass.html#multilabel-classification-format) format; otherwise, your ROC curve might not return the expected results. 
+ +```python +import numpy as np +import pandas as pd +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import roc_curve, roc_auc_score +import plotly.graph_objects as go +import plotly.express as px + +np.random.seed(0) + +# Artificially add noise to make task harder +df = px.data.iris() +samples = df.species.sample(n=50, random_state=0) +np.random.shuffle(samples.values) +df.loc[samples.index, 'species'] = samples.values + +# Define the inputs and outputs +X = df.drop(columns=['species', 'species_id']) +y = df['species'] + +# Fit the model +model = LogisticRegression(max_iter=200) +model.fit(X, y) +y_scores = model.predict_proba(X) +y_onehot = pd.get_dummies(y)[model.classes_] # one-hot labels; needs the fitted model so columns follow model.classes_ + +# Create an empty figure, and iteratively add new lines +# every time we compute a new class +fig = go.Figure() +fig.add_shape( + type='line', line=dict(dash='dash'), + x0=0, x1=1, y0=0, y1=1 +) + +for i in range(y_scores.shape[1]): + y_true = y_onehot.iloc[:, i] + y_score = y_scores[:, i] + + fpr, tpr, _ = roc_curve(y_true, y_score) + auc_score = roc_auc_score(y_true, y_score) + + name = f"{y_onehot.columns[i]} (AUC={auc_score:.2f})" + fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name, mode='lines')) + +fig.update_layout( + xaxis_title='False Positive Rate', + yaxis_title='True Positive Rate' +) +fig.show() +``` + +## Precision-Recall Curves + +Plotting the PR curve is very similar to plotting the ROC curve. 
The following examples are slightly modified from the previous examples: + +```python +import plotly.express as px +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import precision_recall_curve, auc +from sklearn.datasets import make_classification + +X, y = make_classification(n_samples=500, random_state=0) + +model = LogisticRegression() +model.fit(X, y) +y_score = model.predict_proba(X)[:, 1] + +precision, recall, thresholds = precision_recall_curve(y, y_score) + +fig = px.area( + x=recall, y=precision, + title=f'Precision-Recall Curve (AUC={auc(recall, precision):.4f})', + labels=dict(x='Recall', y='Precision') +) +fig.add_shape( + type='line', line=dict(dash='dash'), + x0=0, x1=1, y0=1, y1=0 +) +fig.show() +``` + +In this example, we use the [average precision](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html) metric, which is an alternative scoring method to the area under the PR curve. + +```python +import numpy as np +import pandas as pd +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import precision_recall_curve, average_precision_score +import plotly.graph_objects as go +import plotly.express as px + +np.random.seed(0) + +# Artificially add noise to make task harder +df = px.data.iris() +samples = df.species.sample(n=30, random_state=0) +np.random.shuffle(samples.values) +df.loc[samples.index, 'species'] = samples.values + +# Define the inputs and outputs +X = df.drop(columns=['species', 'species_id']) +y = df['species'] + +# Fit the model +model = LogisticRegression(max_iter=200) +model.fit(X, y) +y_scores = model.predict_proba(X) +y_onehot = pd.get_dummies(y)[model.classes_] # one-hot labels; needs the fitted model so columns follow model.classes_ + +# Create an empty figure, and iteratively add new lines +# every time we compute a new class +fig = go.Figure() +fig.add_shape( + type='line', line=dict(dash='dash'), + x0=0, x1=1, y0=1, y1=0 +) + +for i in range(y_scores.shape[1]): + y_true = y_onehot.iloc[:, i] + y_score = 
y_scores[:, i] + + precision, recall, _ = precision_recall_curve(y_true, y_score) + auc_score = average_precision_score(y_true, y_score) + + name = f"{y_onehot.columns[i]} (AP={auc_score:.2f})" + fig.add_trace(go.Scatter(x=recall, y=precision, name=name, mode='lines')) + +fig.update_layout( + xaxis_title='Recall', + yaxis_title='Precision' +) +fig.show() +``` diff --git a/doc/requirements.txt b/doc/requirements.txt index 988f05efdbb..1222b140821 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -17,6 +17,7 @@ requests networkx squarify scikit-image +scikit-learn sphinx sphinx_bootstrap_theme recommonmark