
Commit ed870ad

Pushing the docs to dev/ for branch: main, commit d8d5637cfe372dd353dfc9f79dbb63c3189a9ecc
1 parent 3e7069e commit ed870ad

1,231 files changed (+4869 / -4571 lines)


dev/_downloads/751db3d5e6b909ff00972495eaae53df/plot_document_clustering.ipynb

Lines changed: 84 additions & 1 deletion
@@ -26,7 +26,90 @@
},
"outputs": [],
"source": [
-
"# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>\n# Lars Buitinck\n# License: BSD 3 clause\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.decomposition import TruncatedSVD\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.feature_extraction.text import HashingVectorizer\nfrom sklearn.feature_extraction.text import TfidfTransformer\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import Normalizer\nfrom sklearn import metrics\n\nfrom sklearn.cluster import KMeans, MiniBatchKMeans\n\nimport logging\nfrom optparse import OptionParser\nimport sys\nfrom time import time\n\nimport numpy as np\n\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO, format=\"%(asctime)s %(levelname)s %(message)s\")\n\n# parse commandline arguments\nop = OptionParser()\nop.add_option(\n \"--lsa\",\n dest=\"n_components\",\n type=\"int\",\n help=\"Preprocess documents with latent semantic analysis.\",\n)\nop.add_option(\n \"--no-minibatch\",\n action=\"store_false\",\n dest=\"minibatch\",\n default=True,\n help=\"Use ordinary k-means algorithm (in batch mode).\",\n)\nop.add_option(\n \"--no-idf\",\n action=\"store_false\",\n dest=\"use_idf\",\n default=True,\n help=\"Disable Inverse Document Frequency feature weighting.\",\n)\nop.add_option(\n \"--use-hashing\",\n action=\"store_true\",\n default=False,\n help=\"Use a hashing feature vectorizer\",\n)\nop.add_option(\n \"--n-features\",\n type=int,\n default=10000,\n help=\"Maximum number of features (dimensions) to extract from text.\",\n)\nop.add_option(\n \"--verbose\",\n action=\"store_true\",\n dest=\"verbose\",\n default=False,\n help=\"Print progress reports inside k-means algorithm.\",\n)\n\nprint(__doc__)\nop.print_help()\nprint()\n\n\ndef is_interactive():\n return not hasattr(sys.modules[\"__main__\"], \"__file__\")\n\n\n# work-around for Jupyter notebook and IPython console\nargv = [] if is_interactive() else sys.argv[1:]\n(opts, args) = op.parse_args(argv)\nif len(args) > 0:\n op.error(\"this script takes no arguments.\")\n sys.exit(1)\n\n\n# #############################################################################\n# Load some categories from the training set\ncategories = [\n \"alt.atheism\",\n \"talk.religion.misc\",\n \"comp.graphics\",\n \"sci.space\",\n]\n# Uncomment the following to do the analysis on all the categories\n# categories = None\n\nprint(\"Loading 20 newsgroups dataset for categories:\")\nprint(categories)\n\ndataset = fetch_20newsgroups(\n subset=\"all\", categories=categories, shuffle=True, random_state=42\n)\n\nprint(\"%d documents\" % len(dataset.data))\nprint(\"%d categories\" % len(dataset.target_names))\nprint()\n\nlabels = dataset.target\ntrue_k = np.unique(labels).shape[0]\n\nprint(\"Extracting features from the training dataset using a sparse vectorizer\")\nt0 = time()\nif opts.use_hashing:\n if opts.use_idf:\n # Perform an IDF normalization on the output of HashingVectorizer\n hasher = HashingVectorizer(\n n_features=opts.n_features,\n stop_words=\"english\",\n alternate_sign=False,\n norm=None,\n )\n vectorizer = make_pipeline(hasher, TfidfTransformer())\n else:\n vectorizer = HashingVectorizer(\n n_features=opts.n_features,\n stop_words=\"english\",\n alternate_sign=False,\n norm=\"l2\",\n )\nelse:\n vectorizer = TfidfVectorizer(\n max_df=0.5,\n max_features=opts.n_features,\n min_df=2,\n stop_words=\"english\",\n use_idf=opts.use_idf,\n )\nX = vectorizer.fit_transform(dataset.data)\n\nprint(\"done in %fs\" % (time() - 
t0))\nprint(\"n_samples: %d, n_features: %d\" % X.shape)\nprint()\n\nif opts.n_components:\n print(\"Performing dimensionality reduction using LSA\")\n t0 = time()\n # Vectorizer results are normalized, which makes KMeans behave as\n # spherical k-means for better results. Since LSA/SVD results are\n # not normalized, we have to redo the normalization.\n svd = TruncatedSVD(opts.n_components)\n normalizer = Normalizer(copy=False)\n lsa = make_pipeline(svd, normalizer)\n\n X = lsa.fit_transform(X)\n\n print(\"done in %fs\" % (time() - t0))\n\n explained_variance = svd.explained_variance_ratio_.sum()\n print(\n \"Explained variance of the SVD step: {}%\".format(int(explained_variance * 100))\n )\n\n print()\n\n\n# #############################################################################\n# Do the actual clustering\n\nif opts.minibatch:\n km = MiniBatchKMeans(\n n_clusters=true_k,\n init=\"k-means++\",\n n_init=1,\n init_size=1000,\n batch_size=1000,\n verbose=opts.verbose,\n )\nelse:\n km = KMeans(\n n_clusters=true_k,\n init=\"k-means++\",\n max_iter=100,\n n_init=1,\n verbose=opts.verbose,\n )\n\nprint(\"Clustering sparse data with %s\" % km)\nt0 = time()\nkm.fit(X)\nprint(\"done in %0.3fs\" % (time() - t0))\nprint()\n\nprint(\"Homogeneity: %0.3f\" % metrics.homogeneity_score(labels, km.labels_))\nprint(\"Completeness: %0.3f\" % metrics.completeness_score(labels, km.labels_))\nprint(\"V-measure: %0.3f\" % metrics.v_measure_score(labels, km.labels_))\nprint(\"Adjusted Rand-Index: %.3f\" % metrics.adjusted_rand_score(labels, km.labels_))\nprint(\n \"Silhouette Coefficient: %0.3f\"\n % metrics.silhouette_score(X, km.labels_, sample_size=1000)\n)\n\nprint()\n\n\nif not opts.use_hashing:\n print(\"Top terms per cluster:\")\n\n if opts.n_components:\n original_space_centroids = svd.inverse_transform(km.cluster_centers_)\n order_centroids = original_space_centroids.argsort()[:, ::-1]\n else:\n order_centroids = km.cluster_centers_.argsort()[:, ::-1]\n\n terms = vectorizer.get_feature_names_out()\n for i in range(true_k):\n print(\"Cluster %d:\" % i, end=\"\")\n for ind in order_centroids[i, :10]:\n print(\" %s\" % terms[ind], end=\"\")\n print()"
+
"# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>\n# Lars Buitinck\n# License: BSD 3 clause\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.decomposition import TruncatedSVD\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.feature_extraction.text import HashingVectorizer\nfrom sklearn.feature_extraction.text import TfidfTransformer\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import Normalizer\nfrom sklearn import metrics\n\nfrom sklearn.cluster import KMeans, MiniBatchKMeans\n\nimport logging\nfrom optparse import OptionParser\nimport sys\nfrom time import time\n\nimport numpy as np\n\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO, format=\"%(asctime)s %(levelname)s %(message)s\")\n\n# parse commandline arguments\nop = OptionParser()\nop.add_option(\n \"--lsa\",\n dest=\"n_components\",\n type=\"int\",\n help=\"Preprocess documents with latent semantic analysis.\",\n)\nop.add_option(\n \"--no-minibatch\",\n action=\"store_false\",\n dest=\"minibatch\",\n default=True,\n help=\"Use ordinary k-means algorithm (in batch mode).\",\n)\nop.add_option(\n \"--no-idf\",\n action=\"store_false\",\n dest=\"use_idf\",\n default=True,\n help=\"Disable Inverse Document Frequency feature weighting.\",\n)\nop.add_option(\n \"--use-hashing\",\n action=\"store_true\",\n default=False,\n help=\"Use a hashing feature vectorizer\",\n)\nop.add_option(\n \"--n-features\",\n type=int,\n default=10000,\n help=\"Maximum number of features (dimensions) to extract from text.\",\n)\nop.add_option(\n \"--verbose\",\n action=\"store_true\",\n dest=\"verbose\",\n default=False,\n help=\"Print progress reports inside k-means algorithm.\",\n)\n\nprint(__doc__)\n\n\ndef is_interactive():\n return not hasattr(sys.modules[\"__main__\"], \"__file__\")\n\n\nif not is_interactive():\n op.print_help()\n print()\n\n# work-around for Jupyter notebook and IPython console\nargv = [] if is_interactive() else sys.argv[1:]\n(opts, args) = op.parse_args(argv)\nif len(args) > 0:\n op.error(\"this script takes no arguments.\")\n sys.exit(1)"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Load some categories from the training set\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+
"categories = [\n \"alt.atheism\",\n \"talk.religion.misc\",\n \"comp.graphics\",\n \"sci.space\",\n]\n# Uncomment the following to do the analysis on all the categories\n# categories = None\n\nprint(\"Loading 20 newsgroups dataset for categories:\")\nprint(categories)\n\ndataset = fetch_20newsgroups(\n subset=\"all\", categories=categories, shuffle=True, random_state=42\n)\n\nprint(\"%d documents\" % len(dataset.data))\nprint(\"%d categories\" % len(dataset.target_names))\nprint()"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Feature Extraction\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+
"labels = dataset.target\ntrue_k = np.unique(labels).shape[0]\n\nprint(\"Extracting features from the training dataset using a sparse vectorizer\")\nt0 = time()\nif opts.use_hashing:\n if opts.use_idf:\n # Perform an IDF normalization on the output of HashingVectorizer\n hasher = HashingVectorizer(\n n_features=opts.n_features,\n stop_words=\"english\",\n alternate_sign=False,\n norm=None,\n )\n vectorizer = make_pipeline(hasher, TfidfTransformer())\n else:\n vectorizer = HashingVectorizer(\n n_features=opts.n_features,\n stop_words=\"english\",\n alternate_sign=False,\n norm=\"l2\",\n )\nelse:\n vectorizer = TfidfVectorizer(\n max_df=0.5,\n max_features=opts.n_features,\n min_df=2,\n stop_words=\"english\",\n use_idf=opts.use_idf,\n )\nX = vectorizer.fit_transform(dataset.data)\n\nprint(\"done in %fs\" % (time() - t0))\nprint(\"n_samples: %d, n_features: %d\" % X.shape)\nprint()\n\nif opts.n_components:\n print(\"Performing dimensionality reduction using LSA\")\n t0 = time()\n # Vectorizer results are normalized, which makes KMeans behave as\n # spherical k-means for better results. Since LSA/SVD results are\n # not normalized, we have to redo the normalization.\n svd = TruncatedSVD(opts.n_components)\n normalizer = Normalizer(copy=False)\n lsa = make_pipeline(svd, normalizer)\n\n X = lsa.fit_transform(X)\n\n print(\"done in %fs\" % (time() - t0))\n\n explained_variance = svd.explained_variance_ratio_.sum()\n print(\n \"Explained variance of the SVD step: {}%\".format(int(explained_variance * 100))\n )\n\n print()"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Clustering\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+
"if opts.minibatch:\n km = MiniBatchKMeans(\n n_clusters=true_k,\n init=\"k-means++\",\n n_init=1,\n init_size=1000,\n batch_size=1000,\n verbose=opts.verbose,\n )\nelse:\n km = KMeans(\n n_clusters=true_k,\n init=\"k-means++\",\n max_iter=100,\n n_init=1,\n verbose=opts.verbose,\n )\n\nprint(\"Clustering sparse data with %s\" % km)\nt0 = time()\nkm.fit(X)\nprint(\"done in %0.3fs\" % (time() - t0))\nprint()"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Performance metrics\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+
"print(\"Homogeneity: %0.3f\" % metrics.homogeneity_score(labels, km.labels_))\nprint(\"Completeness: %0.3f\" % metrics.completeness_score(labels, km.labels_))\nprint(\"V-measure: %0.3f\" % metrics.v_measure_score(labels, km.labels_))\nprint(\"Adjusted Rand-Index: %.3f\" % metrics.adjusted_rand_score(labels, km.labels_))\nprint(\n \"Silhouette Coefficient: %0.3f\"\n % metrics.silhouette_score(X, km.labels_, sample_size=1000)\n)\n\nprint()"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+
"if not opts.use_hashing:\n print(\"Top terms per cluster:\")\n\n if opts.n_components:\n original_space_centroids = svd.inverse_transform(km.cluster_centers_)\n order_centroids = original_space_centroids.argsort()[:, ::-1]\n else:\n order_centroids = km.cluster_centers_.argsort()[:, ::-1]\n\n terms = vectorizer.get_feature_names_out()\n for i in range(true_k):\n print(\"Cluster %d:\" % i, end=\"\")\n for ind in order_centroids[i, :10]:\n print(\" %s\" % terms[ind], end=\"\")\n print()"
]
}
],
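The cells above carve the original monolithic example into the usual pipeline: fetch a four-category slice of 20 newsgroups, vectorize with TF-IDF (or a hashing vectorizer), optionally reduce with LSA, cluster, and score against the true labels. Stripped of the optparse scaffolding, the same pipeline condenses into a short standalone sketch; the parameters below are the defaults hard-coded in the example, and the LSA branch is shown disabled, as it is unless --lsa is passed.

# Condensed sketch of the example with the command-line options fixed to their
# defaults (TF-IDF with IDF weighting, MiniBatchKMeans, LSA off).
import numpy as np

from sklearn import metrics
from sklearn.cluster import MiniBatchKMeans
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

categories = ["alt.atheism", "talk.religion.misc", "comp.graphics", "sci.space"]
dataset = fetch_20newsgroups(
    subset="all", categories=categories, shuffle=True, random_state=42
)
labels = dataset.target
true_k = np.unique(labels).shape[0]

# Sparse TF-IDF features, as in the TfidfVectorizer branch of the example.
vectorizer = TfidfVectorizer(
    max_df=0.5, max_features=10000, min_df=2, stop_words="english"
)
X = vectorizer.fit_transform(dataset.data)

# Optional LSA step (--lsa N in the example). SVD output is not normalized,
# so the example re-normalizes it before running k-means.
n_components = 0  # set to e.g. 100 to enable LSA
if n_components:
    lsa = make_pipeline(TruncatedSVD(n_components), Normalizer(copy=False))
    X = lsa.fit_transform(X)

km = MiniBatchKMeans(
    n_clusters=true_k, init="k-means++", n_init=1, init_size=1000, batch_size=1000
)
km.fit(X)

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))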

dev/_downloads/ba68199eea858ec04949b2c6c65147e0/plot_document_clustering.py

Lines changed: 22 additions & 5 deletions
@@ -118,14 +118,16 @@
)

print(__doc__)
-op.print_help()
-print()


def is_interactive():
    return not hasattr(sys.modules["__main__"], "__file__")


+if not is_interactive():
+    op.print_help()
+    print()
+
# work-around for Jupyter notebook and IPython console
argv = [] if is_interactive() else sys.argv[1:]
(opts, args) = op.parse_args(argv)
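The only behavioural change in this hunk is that the optparse help is no longer printed unconditionally: inside a Jupyter or IPython session sys.modules["__main__"] has no __file__ attribute, so is_interactive() returns True, the help text is skipped, and sys.argv (which would carry kernel arguments rather than example options) is ignored. A minimal self-contained sketch of the guard, with a single stand-in option:

# Sketch of the interactive-session guard introduced above; "--lsa" stands in
# for the example's full option set.
import sys
from optparse import OptionParser

op = OptionParser()
op.add_option(
    "--lsa",
    dest="n_components",
    type="int",
    help="Preprocess documents with latent semantic analysis.",
)


def is_interactive():
    # True inside Jupyter/IPython consoles, where __main__ has no __file__.
    return not hasattr(sys.modules["__main__"], "__file__")


if not is_interactive():
    # Only print the CLI help when the file is executed as a script.
    op.print_help()
    print()

# Ignore sys.argv inside notebooks; parse it normally when run as a script.
argv = [] if is_interactive() else sys.argv[1:]
(opts, args) = op.parse_args(argv)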
@@ -134,8 +136,10 @@ def is_interactive():
    sys.exit(1)


-# #############################################################################
+# %%
# Load some categories from the training set
+# ------------------------------------------
+
categories = [
    "alt.atheism",
    "talk.religion.misc",
@@ -156,6 +160,11 @@ def is_interactive():
print("%d categories" % len(dataset.target_names))
print()

+
+# %%
+# Feature Extraction
+# ------------------
+
labels = dataset.target
true_k = np.unique(labels).shape[0]

@@ -214,8 +223,9 @@ def is_interactive():
print()


-# #############################################################################
-# Do the actual clustering
+# %%
+# Clustering
+# ----------

if opts.minibatch:
    km = MiniBatchKMeans(
@@ -241,6 +251,11 @@ def is_interactive():
print("done in %0.3fs" % (time() - t0))
print()

+
+# %%
+# Performance metrics
+# -------------------
+
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
@@ -253,6 +268,8 @@ def is_interactive():
print()


+# %%
+
if not opts.use_hashing:
    print("Top terms per cluster:")
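The final cell ranks each centroid's coordinates to report the ten heaviest terms per cluster; when LSA was applied, the centroids live in the reduced space and are first mapped back to term space with svd.inverse_transform. A sketch of that step, assuming the fitted vectorizer, km and true_k from the condensed example above (the svd line applies only when LSA was used):

# Top terms per cluster: sort each centroid's weights in descending order and
# look the indices up in the vectorizer's vocabulary.
centers = km.cluster_centers_
# centers = svd.inverse_transform(km.cluster_centers_)  # only if LSA was used

order_centroids = centers.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(true_k):
    top = ", ".join(terms[ind] for ind in order_centroids[i, :10])
    print("Cluster %d: %s" % (i, top))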

dev/_downloads/scikit-learn-docs.zip

-13 KB
Binary file not shown.

dev/_sources/auto_examples/applications/plot_cyclical_feature_engineering.rst.txt

Lines changed: 1 addition & 1 deletion

dev/_sources/auto_examples/applications/plot_digits_denoising.rst.txt

Lines changed: 1 addition & 1 deletion

dev/_sources/auto_examples/applications/plot_face_recognition.rst.txt

Lines changed: 5 additions & 5 deletions
