# [Deprecated] Starting AI/ML section in python docs #2172

Closed · wants to merge 16 commits
**doc/python/ml-knn.md** (251 additions, 0 deletions)

---
jupyter:
jupytext:
notebook_metadata_filter: all
text_representation:
extension: .md
format_name: markdown
format_version: '1.1'
jupytext_version: 1.1.1
kernelspec:
display_name: Python 3
language: python
name: python3
language_info:
codemirror_mode:
name: ipython
version: 3
file_extension: .py
mimetype: text/x-python
name: python
nbconvert_exporter: python
pygments_lexer: ipython3
version: 3.7.6
plotly:
description: Visualize scikit-learn's k-Nearest Neighbors (kNN) classification
with Plotly
display_as: ai_ml
language: python
layout: base
name: kNN Classification
order: 1
page_type: example_index
permalink: python/knn-classification/
thumbnail: thumbnail/knn-classification.png
---

## Basic Binary Classification with `plotly.express`

```python
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import make_moons
from sklearn.neighbors import KNeighborsClassifier

X, y = make_moons(noise=0.3, random_state=0)
X_test, _ = make_moons(noise=0.3, random_state=1)

clf = KNeighborsClassifier(15)
clf.fit(X, y.astype(str)) # Fit on training set
y_pred = clf.predict(X_test) # Predict on new data

fig = px.scatter(x=X_test[:, 0], y=X_test[:, 1], color=y_pred, labels={'color': 'predicted'})
fig.update_traces(marker_size=10)
fig.show()
```
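Since `make_moons` also returns labels, you can keep the labels of the second draw and sanity-check the fit with a held-out accuracy score. A minimal sketch (not part of the original example), continuing from the cell above:

```python
from sklearn.metrics import accuracy_score

# Keep the labels of the second draw instead of discarding them
X_test, y_test = make_moons(noise=0.3, random_state=1)

# y_pred contains strings because the classifier was fit on y.astype(str)
print(accuracy_score(y_test.astype(str), y_pred))
```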

## Visualize Binary Prediction Scores

```python
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier

X, y = make_classification(n_features=2, n_redundant=0, random_state=0)
X_test, _ = make_classification(n_features=2, n_redundant=0, random_state=1)

clf = KNeighborsClassifier(15)
clf.fit(X, y) # Fit on training set
y_score = clf.predict_proba(X_test)[:, 1] # Predict on new data

fig = px.scatter(x=X_test[:, 0], y=X_test[:, 1], color=y_score, labels={'color': 'score'})
fig.update_traces(marker_size=10)
fig.show()
```
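If you keep the labels of the second draw, the scores above can also be summarized as an ROC curve. A hedged sketch, continuing from the cell above:

```python
from sklearn.metrics import roc_curve, roc_auc_score

# Keep the labels this time so the scores can be evaluated
X_test, y_test = make_classification(n_features=2, n_redundant=0, random_state=1)
y_score = clf.predict_proba(X_test)[:, 1]

fpr, tpr, _ = roc_curve(y_test, y_score)
fig = px.line(
    x=fpr, y=tpr,
    labels={'x': 'False Positive Rate', 'y': 'True Positive Rate'},
    title=f'ROC Curve (AUC = {roc_auc_score(y_test, y_score):.3f})'
)
fig.show()
```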

## Probability Estimates with `go.Contour`

```python
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import make_moons
from sklearn.neighbors import KNeighborsClassifier

mesh_size = .02
margin = 1

X, y = make_moons(noise=0.3, random_state=0)

# Create a mesh grid on which we will run our model
x_min, x_max = X[:, 0].min() - margin, X[:, 0].max() + margin
y_min, y_max = X[:, 1].min() - margin, X[:, 1].max() + margin
xrange = np.arange(x_min, x_max, mesh_size)
yrange = np.arange(y_min, y_max, mesh_size)
xx, yy = np.meshgrid(xrange, yrange)

# Create classifier, run predictions on grid
clf = KNeighborsClassifier(15, weights='uniform')
clf.fit(X, y)
Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)

fig = px.scatter(X, x=0, y=1, color=y.astype(str), labels={'0':'', '1':''})
fig.update_traces(marker_size=10, marker_line_width=1)
fig.add_trace(
go.Contour(
x=xrange,
y=yrange,
z=Z,
showscale=False,
colorscale=['Blue', 'Red'],
opacity=0.4,
name='Confidence'
)
)
fig.show()
```
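The mesh-grid construction above reappears in the next example. If you reuse the pattern often, a small helper (a hypothetical `make_meshgrid`, not part of the original code) keeps it in one place:

```python
def make_meshgrid(X, mesh_size=0.02, margin=1):
    """Build a rectangular grid covering the two columns of X, plus a margin."""
    x_min, x_max = X[:, 0].min() - margin, X[:, 0].max() + margin
    y_min, y_max = X[:, 1].min() - margin, X[:, 1].max() + margin
    xrange = np.arange(x_min, x_max, mesh_size)
    yrange = np.arange(y_min, y_max, mesh_size)
    xx, yy = np.meshgrid(xrange, yrange)
    return xrange, yrange, xx, yy

xrange, yrange, xx, yy = make_meshgrid(X)
```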

## Multi-class Prediction Confidence with `go.Heatmap`

```python
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.neighbors import KNeighborsClassifier

mesh_size = .02
margin = 1

# We will use the iris data, which is included in px
df = px.data.iris()
X = df[['sepal_length', 'sepal_width']]
y = df.species_id

# Create a mesh grid on which we will run our model
l_min, l_max = df.sepal_length.min() - margin, df.sepal_length.max() + margin
w_min, w_max = df.sepal_width.min() - margin, df.sepal_width.max() + margin
lrange = np.arange(l_min, l_max, mesh_size)
wrange = np.arange(w_min, w_max, mesh_size)
ll, ww = np.meshgrid(lrange, wrange)

# Create classifier, run predictions on grid
clf = KNeighborsClassifier(15, weights='distance')
clf.fit(X, y)
Z = clf.predict(np.c_[ll.ravel(), ww.ravel()])
Z = Z.reshape(ll.shape)
proba = clf.predict_proba(np.c_[ll.ravel(), ww.ravel()])
proba = proba.reshape(ll.shape + (3,))

fig = px.scatter(df, x='sepal_length', y='sepal_width', color='species', width=1000, height=1000)
fig.update_traces(marker_size=10, marker_line_width=1)
fig.add_trace(
go.Heatmap(
x=lrange,
y=wrange,
z=Z,
showscale=False,
colorscale=[[0.0, 'blue'], [0.5, 'red'], [1.0, 'green']],
opacity=0.25,
customdata=proba,
hovertemplate=(
'sepal length: %{x} <br>'
'sepal width: %{y} <br>'
'p(setosa): %{customdata[0]:.3f}<br>'
'p(versicolor): %{customdata[1]:.3f}<br>'
'p(virginica): %{customdata[2]:.3f}<extra></extra>'
)
)
)
fig.show()
```

## 3D Classification with `px.scatter_3d`

```python
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

df = px.data.iris()
features = ["sepal_width", "sepal_length", "petal_width"]

X = df[features]
y = df.species
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Create classifier, run predictions on the test split
clf = KNeighborsClassifier(15, weights='distance')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_score = clf.predict_proba(X_test)
y_score = np.around(y_score.max(axis=1), 4)

fig = px.scatter_3d(
X_test,
x='sepal_length',
y='sepal_width',
z='petal_width',
symbol=y_pred,
color=y_score,
labels={'symbol': 'prediction', 'color': 'score'}
)
fig.update_layout(legend=dict(x=0, y=0))
fig.show()
```
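With a labelled test split available, you can also summarize the predictions above as a confusion matrix. A minimal sketch, continuing from the previous cell and plotting with `px.imshow` (an assumption, not part of the original tutorial; `text_auto` requires a recent Plotly release):

```python
from sklearn.metrics import confusion_matrix

classes = sorted(y.unique())
cm = confusion_matrix(y_test, y_pred, labels=classes)

# Rows are actual species, columns are predicted species
fig = px.imshow(cm, x=classes, y=classes, text_auto=True,
                labels={'x': 'predicted', 'y': 'actual', 'color': 'count'})
fig.show()
```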

## High Dimension Visualization with `px.scatter_matrix`

If you need to visualize classifications that go beyond 3D, you can use the [scatter plot matrix](https://plot.ly/python/splom/).

```python
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

df = px.data.iris()
features = ["sepal_width", "sepal_length", "petal_width", "petal_length"]

X = df[features]
y = df.species
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Create classifier, run predictions on the test split
clf = KNeighborsClassifier(15, weights='distance')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

fig = px.scatter_matrix(X_test, dimensions=features, color=y_pred, labels={'color': 'prediction'})
fig.show()
```

### Reference

Learn more about `px`, `go.Contour`, and `go.Heatmap` here:
* https://plot.ly/python/plotly-express/
* https://plot.ly/python/heatmaps/
* https://plot.ly/python/contour-plots/
* https://plot.ly/python/3d-scatter-plots/
* https://plot.ly/python/splom/

This tutorial was inspired by amazing examples from the official scikit-learn docs:
* https://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html
* https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
* https://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html

**doc/python/ml-pca.md** (135 additions, 0 deletions)

---
jupyter:
jupytext:
notebook_metadata_filter: all
text_representation:
extension: .md
format_name: markdown
format_version: '1.1'
jupytext_version: 1.1.1
kernelspec:
display_name: Python 3
language: python
name: python3
language_info:
codemirror_mode:
name: ipython
version: 3
file_extension: .py
mimetype: text/x-python
name: python
nbconvert_exporter: python
pygments_lexer: ipython3
version: 3.7.6
plotly:
description: Visualize Principal Component Analysis (PCA) of your high-dimensional
data with Plotly in Python.
display_as: ai_ml
language: python
layout: base
name: PCA Visualization
order: 4
page_type: example_index
permalink: python/pca-visualization/
thumbnail: thumbnail/ml-pca.png
---

## Basic PCA Scatter Plot

This example shows you how to visualize the first two principal components of a PCA by reducing a 4-dimensional dataset to 2D. It uses scikit-learn's `PCA`.

```python
import plotly.express as px
from sklearn.decomposition import PCA

df = px.data.iris()
X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]

pca = PCA(n_components=2)
components = pca.fit_transform(X)

fig = px.scatter(x=components[:, 0], y=components[:, 1], color=df['species'])
fig.show()
```
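Before moving on, note that the fitted `PCA` object also reports the fraction of variance each component explains, which the next examples build on. For the unscaled iris data, the first component alone carries roughly 92%:

```python
# Fraction of total variance explained by each of the two components
print(pca.explained_variance_ratio_)
```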

## Visualize PCA with `px.scatter_3d`

Just like the basic PCA plot, but this time visualizing the first three principal components. The plot title additionally displays the total variance explained by those components.

```python
import plotly.express as px
from sklearn.decomposition import PCA

df = px.data.iris()
X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]

pca = PCA(n_components=3)
components = pca.fit_transform(X)

total_var = pca.explained_variance_ratio_.sum() * 100

fig = px.scatter_3d(
x=components[:, 0], y=components[:, 1], z=components[:, 2],
color=df['species'],
title=f'Total Explained Variance: {total_var:.2f}%',
labels={'x': 'PC 1', 'y': 'PC 2', 'z': 'PC 3'},
)
fig.show()
```

## Plot high-dimensional components with `px.scatter_matrix`

If you need to visualize more than 3 dimensions, you can use scatter plot matrices.

```python
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.datasets import load_boston

# Note: load_boston was removed in scikit-learn 1.2 (see the alternative sketch below)
boston = load_boston()
df = pd.DataFrame(boston.data, columns=boston.feature_names)

pca = PCA(n_components=5)
components = pca.fit_transform(df)

total_var = pca.explained_variance_ratio_.sum() * 100

labels = {str(i): f"PC {i+1}" for i in range(5)}
labels['color'] = 'Median Price'

fig = px.scatter_matrix(
components,
color=boston.target,
dimensions=range(5),
labels=labels,
title=f'Total Explained Variance: {total_var:.2f}%',
)
fig.update_traces(diagonal_visible=False)
fig.show()
```
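Because `load_boston` has been removed from recent scikit-learn releases (1.2 and later), here is a sketch of the same plot using the California housing dataset instead (an assumption, not part of the original tutorial; `fetch_california_housing` downloads the data on first use):

```python
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
df = pd.DataFrame(housing.data, columns=housing.feature_names)

pca = PCA(n_components=5)
components = pca.fit_transform(df)

total_var = pca.explained_variance_ratio_.sum() * 100

labels = {str(i): f"PC {i+1}" for i in range(5)}
labels['color'] = 'Median House Value'

fig = px.scatter_matrix(
    components,
    color=housing.target,
    dimensions=range(5),
    labels=labels,
    title=f'Total Explained Variance: {total_var:.2f}%',
)
fig.update_traces(diagonal_visible=False)
fig.show()
```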

## Plotting explained variance

Often, you might be interested in seeing how much variance the PCA is able to explain as you increase the number of components, in order to decide how many dimensions to ultimately keep or analyze. This example shows you how to quickly plot the cumulative sum of explained variance for a high-dimensional dataset like [Diabetes](https://scikit-learn.org/stable/datasets/index.html#diabetes-dataset).

```python
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.datasets import load_diabetes

diabetes = load_diabetes()
df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)

pca = PCA()
pca.fit(df)
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)

fig = px.area(
    x=range(1, exp_var_cumul.shape[0] + 1),
    y=exp_var_cumul,
    labels={"x": "# Components", "y": "Explained Variance"}
)
fig.show()
```
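Instead of reading the number of components off the plot, you can let scikit-learn choose it: passing a float between 0 and 1 as `n_components` keeps just enough components to explain that fraction of the variance. A short sketch, continuing from the cell above:

```python
# Keep the smallest number of components explaining >= 90% of the variance
pca = PCA(n_components=0.90)
pca.fit(df)
print(pca.n_components_)
```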

## Visualize loadings
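A common way to visualize loadings is to scale `pca.components_` by the square root of the explained variance and draw the resulting feature vectors over the 2D projection. A minimal sketch of that approach (the exact scaling convention is an assumption):

```python
import numpy as np
import plotly.express as px
from sklearn.decomposition import PCA

df = px.data.iris()
features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
X = df[features]

pca = PCA(n_components=2)
components = pca.fit_transform(X)

# Loadings: contribution of each original feature to each principal component
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

fig = px.scatter(x=components[:, 0], y=components[:, 1], color=df['species'])
for i, feature in enumerate(features):
    # Draw a line from the origin to each feature's loading vector
    fig.add_shape(type='line', x0=0, y0=0, x1=loadings[i, 0], y1=loadings[i, 1])
    fig.add_annotation(x=loadings[i, 0], y=loadings[i, 1],
                       text=feature, showarrow=False, yshift=10)
fig.show()
```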