|
26 | 26 | },
|
27 | 27 | "outputs": [],
|
28 | 28 | "source": [
|
29 | | - "# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>\n# Lars Buitinck\n# License: BSD 3 clause\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.decomposition import TruncatedSVD\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.feature_extraction.text import HashingVectorizer\nfrom sklearn.feature_extraction.text import TfidfTransformer\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import Normalizer\nfrom sklearn import metrics\n\nfrom sklearn.cluster import KMeans, MiniBatchKMeans\n\nimport logging\nfrom optparse import OptionParser\nimport sys\nfrom time import time\n\nimport numpy as np\n\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO, format=\"%(asctime)s %(levelname)s %(message)s\")\n\n# parse commandline arguments\nop = OptionParser()\nop.add_option(\n \"--lsa\",\n dest=\"n_components\",\n type=\"int\",\n help=\"Preprocess documents with latent semantic analysis.\",\n)\nop.add_option(\n \"--no-minibatch\",\n action=\"store_false\",\n dest=\"minibatch\",\n default=True,\n help=\"Use ordinary k-means algorithm (in batch mode).\",\n)\nop.add_option(\n \"--no-idf\",\n action=\"store_false\",\n dest=\"use_idf\",\n default=True,\n help=\"Disable Inverse Document Frequency feature weighting.\",\n)\nop.add_option(\n \"--use-hashing\",\n action=\"store_true\",\n default=False,\n help=\"Use a hashing feature vectorizer\",\n)\nop.add_option(\n \"--n-features\",\n type=int,\n default=10000,\n help=\"Maximum number of features (dimensions) to extract from text.\",\n)\nop.add_option(\n \"--verbose\",\n action=\"store_true\",\n dest=\"verbose\",\n default=False,\n help=\"Print progress reports inside k-means algorithm.\",\n)\n\nprint(__doc__)\nop.print_help()\nprint()\n\n\ndef is_interactive():\n return not hasattr(sys.modules[\"__main__\"], \"__file__\")\n\n\n# work-around for Jupyter notebook and IPython console\nargv = [] if is_interactive() else sys.argv[1:]\n(opts, args) = op.parse_args(argv)\nif len(args) > 0:\n op.error(\"this script takes no arguments.\")\n sys.exit(1)\n\n\n# #############################################################################\n# Load some categories from the training set\ncategories = [\n \"alt.atheism\",\n \"talk.religion.misc\",\n \"comp.graphics\",\n \"sci.space\",\n]\n# Uncomment the following to do the analysis on all the categories\n# categories = None\n\nprint(\"Loading 20 newsgroups dataset for categories:\")\nprint(categories)\n\ndataset = fetch_20newsgroups(\n subset=\"all\", categories=categories, shuffle=True, random_state=42\n)\n\nprint(\"%d documents\" % len(dataset.data))\nprint(\"%d categories\" % len(dataset.target_names))\nprint()\n\nlabels = dataset.target\ntrue_k = np.unique(labels).shape[0]\n\nprint(\"Extracting features from the training dataset using a sparse vectorizer\")\nt0 = time()\nif opts.use_hashing:\n if opts.use_idf:\n # Perform an IDF normalization on the output of HashingVectorizer\n hasher = HashingVectorizer(\n n_features=opts.n_features,\n stop_words=\"english\",\n alternate_sign=False,\n norm=None,\n )\n vectorizer = make_pipeline(hasher, TfidfTransformer())\n else:\n vectorizer = HashingVectorizer(\n n_features=opts.n_features,\n stop_words=\"english\",\n alternate_sign=False,\n norm=\"l2\",\n )\nelse:\n vectorizer = TfidfVectorizer(\n max_df=0.5,\n max_features=opts.n_features,\n min_df=2,\n stop_words=\"english\",\n use_idf=opts.use_idf,\n )\nX = vectorizer.fit_transform(dataset.data)\n\nprint(\"done in %fs\" % (time() - t0))\nprint(\"n_samples: %d, n_features: %d\" % X.shape)\nprint()\n\nif opts.n_components:\n print(\"Performing dimensionality reduction using LSA\")\n t0 = time()\n # Vectorizer results are normalized, which makes KMeans behave as\n # spherical k-means for better results. Since LSA/SVD results are\n # not normalized, we have to redo the normalization.\n svd = TruncatedSVD(opts.n_components)\n normalizer = Normalizer(copy=False)\n lsa = make_pipeline(svd, normalizer)\n\n X = lsa.fit_transform(X)\n\n print(\"done in %fs\" % (time() - t0))\n\n explained_variance = svd.explained_variance_ratio_.sum()\n print(\n \"Explained variance of the SVD step: {}%\".format(int(explained_variance * 100))\n )\n\n print()\n\n\n# #############################################################################\n# Do the actual clustering\n\nif opts.minibatch:\n km = MiniBatchKMeans(\n n_clusters=true_k,\n init=\"k-means++\",\n n_init=1,\n init_size=1000,\n batch_size=1000,\n verbose=opts.verbose,\n )\nelse:\n km = KMeans(\n n_clusters=true_k,\n init=\"k-means++\",\n max_iter=100,\n n_init=1,\n verbose=opts.verbose,\n )\n\nprint(\"Clustering sparse data with %s\" % km)\nt0 = time()\nkm.fit(X)\nprint(\"done in %0.3fs\" % (time() - t0))\nprint()\n\nprint(\"Homogeneity: %0.3f\" % metrics.homogeneity_score(labels, km.labels_))\nprint(\"Completeness: %0.3f\" % metrics.completeness_score(labels, km.labels_))\nprint(\"V-measure: %0.3f\" % metrics.v_measure_score(labels, km.labels_))\nprint(\"Adjusted Rand-Index: %.3f\" % metrics.adjusted_rand_score(labels, km.labels_))\nprint(\n \"Silhouette Coefficient: %0.3f\"\n % metrics.silhouette_score(X, km.labels_, sample_size=1000)\n)\n\nprint()\n\n\nif not opts.use_hashing:\n print(\"Top terms per cluster:\")\n\n if opts.n_components:\n original_space_centroids = svd.inverse_transform(km.cluster_centers_)\n order_centroids = original_space_centroids.argsort()[:, ::-1]\n else:\n order_centroids = km.cluster_centers_.argsort()[:, ::-1]\n\n terms = vectorizer.get_feature_names_out()\n for i in range(true_k):\n print(\"Cluster %d:\" % i, end=\"\")\n for ind in order_centroids[i, :10]:\n print(\" %s\" % terms[ind], end=\"\")\n print()" |
| 29 | + "# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>\n# Lars Buitinck\n# License: BSD 3 clause\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.decomposition import TruncatedSVD\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.feature_extraction.text import HashingVectorizer\nfrom sklearn.feature_extraction.text import TfidfTransformer\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import Normalizer\nfrom sklearn import metrics\n\nfrom sklearn.cluster import KMeans, MiniBatchKMeans\n\nimport logging\nfrom optparse import OptionParser\nimport sys\nfrom time import time\n\nimport numpy as np\n\n\n# Display progress logs on stdout\nlogging.basicConfig(level=logging.INFO, format=\"%(asctime)s %(levelname)s %(message)s\")\n\n# parse commandline arguments\nop = OptionParser()\nop.add_option(\n \"--lsa\",\n dest=\"n_components\",\n type=\"int\",\n help=\"Preprocess documents with latent semantic analysis.\",\n)\nop.add_option(\n \"--no-minibatch\",\n action=\"store_false\",\n dest=\"minibatch\",\n default=True,\n help=\"Use ordinary k-means algorithm (in batch mode).\",\n)\nop.add_option(\n \"--no-idf\",\n action=\"store_false\",\n dest=\"use_idf\",\n default=True,\n help=\"Disable Inverse Document Frequency feature weighting.\",\n)\nop.add_option(\n \"--use-hashing\",\n action=\"store_true\",\n default=False,\n help=\"Use a hashing feature vectorizer\",\n)\nop.add_option(\n \"--n-features\",\n type=int,\n default=10000,\n help=\"Maximum number of features (dimensions) to extract from text.\",\n)\nop.add_option(\n \"--verbose\",\n action=\"store_true\",\n dest=\"verbose\",\n default=False,\n help=\"Print progress reports inside k-means algorithm.\",\n)\n\nprint(__doc__)\n\n\ndef is_interactive():\n return not hasattr(sys.modules[\"__main__\"], \"__file__\")\n\n\nif not is_interactive():\n op.print_help()\n print()\n\n# work-around for Jupyter notebook and IPython console\nargv = [] if is_interactive() else sys.argv[1:]\n(opts, args) = op.parse_args(argv)\nif len(args) > 0:\n op.error(\"this script takes no arguments.\")\n sys.exit(1)" |
| 30 | + ] |
| 31 | + }, |
| 32 | + { |
| 33 | + "cell_type": "markdown", |
| 34 | + "metadata": {}, |
| 35 | + "source": [ |
| 36 | + "## Load some categories from the training set\n\n" |
| 37 | + ] |
| 38 | + }, |
| 39 | + { |
| 40 | + "cell_type": "code", |
| 41 | + "execution_count": null, |
| 42 | + "metadata": { |
| 43 | + "collapsed": false |
| 44 | + }, |
| 45 | + "outputs": [], |
| 46 | + "source": [ |
| 47 | + "categories = [\n \"alt.atheism\",\n \"talk.religion.misc\",\n \"comp.graphics\",\n \"sci.space\",\n]\n# Uncomment the following to do the analysis on all the categories\n# categories = None\n\nprint(\"Loading 20 newsgroups dataset for categories:\")\nprint(categories)\n\ndataset = fetch_20newsgroups(\n subset=\"all\", categories=categories, shuffle=True, random_state=42\n)\n\nprint(\"%d documents\" % len(dataset.data))\nprint(\"%d categories\" % len(dataset.target_names))\nprint()" |
| 48 | + ] |
| 49 | + }, |
| 50 | + { |
| 51 | + "cell_type": "markdown", |
| 52 | + "metadata": {}, |
| 53 | + "source": [ |
| 54 | + "## Feature Extraction\n\n" |
| 55 | + ] |
| 56 | + }, |
| 57 | + { |
| 58 | + "cell_type": "code", |
| 59 | + "execution_count": null, |
| 60 | + "metadata": { |
| 61 | + "collapsed": false |
| 62 | + }, |
| 63 | + "outputs": [], |
| 64 | + "source": [ |
| 65 | + "labels = dataset.target\ntrue_k = np.unique(labels).shape[0]\n\nprint(\"Extracting features from the training dataset using a sparse vectorizer\")\nt0 = time()\nif opts.use_hashing:\n if opts.use_idf:\n # Perform an IDF normalization on the output of HashingVectorizer\n hasher = HashingVectorizer(\n n_features=opts.n_features,\n stop_words=\"english\",\n alternate_sign=False,\n norm=None,\n )\n vectorizer = make_pipeline(hasher, TfidfTransformer())\n else:\n vectorizer = HashingVectorizer(\n n_features=opts.n_features,\n stop_words=\"english\",\n alternate_sign=False,\n norm=\"l2\",\n )\nelse:\n vectorizer = TfidfVectorizer(\n max_df=0.5,\n max_features=opts.n_features,\n min_df=2,\n stop_words=\"english\",\n use_idf=opts.use_idf,\n )\nX = vectorizer.fit_transform(dataset.data)\n\nprint(\"done in %fs\" % (time() - t0))\nprint(\"n_samples: %d, n_features: %d\" % X.shape)\nprint()\n\nif opts.n_components:\n print(\"Performing dimensionality reduction using LSA\")\n t0 = time()\n # Vectorizer results are normalized, which makes KMeans behave as\n # spherical k-means for better results. Since LSA/SVD results are\n # not normalized, we have to redo the normalization.\n svd = TruncatedSVD(opts.n_components)\n normalizer = Normalizer(copy=False)\n lsa = make_pipeline(svd, normalizer)\n\n X = lsa.fit_transform(X)\n\n print(\"done in %fs\" % (time() - t0))\n\n explained_variance = svd.explained_variance_ratio_.sum()\n print(\n \"Explained variance of the SVD step: {}%\".format(int(explained_variance * 100))\n )\n\n print()" |
| 66 | + ] |
| 67 | + }, |
| 68 | + { |
| 69 | + "cell_type": "markdown", |
| 70 | + "metadata": {}, |
| 71 | + "source": [ |
| 72 | + "## Clustering\n\n" |
| 73 | + ] |
| 74 | + }, |
| 75 | + { |
| 76 | + "cell_type": "code", |
| 77 | + "execution_count": null, |
| 78 | + "metadata": { |
| 79 | + "collapsed": false |
| 80 | + }, |
| 81 | + "outputs": [], |
| 82 | + "source": [ |
| 83 | + "if opts.minibatch:\n km = MiniBatchKMeans(\n n_clusters=true_k,\n init=\"k-means++\",\n n_init=1,\n init_size=1000,\n batch_size=1000,\n verbose=opts.verbose,\n )\nelse:\n km = KMeans(\n n_clusters=true_k,\n init=\"k-means++\",\n max_iter=100,\n n_init=1,\n verbose=opts.verbose,\n )\n\nprint(\"Clustering sparse data with %s\" % km)\nt0 = time()\nkm.fit(X)\nprint(\"done in %0.3fs\" % (time() - t0))\nprint()" |
| 84 | + ] |
| 85 | + }, |
| 86 | + { |
| 87 | + "cell_type": "markdown", |
| 88 | + "metadata": {}, |
| 89 | + "source": [ |
| 90 | + "## Performance metrics\n\n" |
| 91 | + ] |
| 92 | + }, |
| 93 | + { |
| 94 | + "cell_type": "code", |
| 95 | + "execution_count": null, |
| 96 | + "metadata": { |
| 97 | + "collapsed": false |
| 98 | + }, |
| 99 | + "outputs": [], |
| 100 | + "source": [ |
| 101 | + "print(\"Homogeneity: %0.3f\" % metrics.homogeneity_score(labels, km.labels_))\nprint(\"Completeness: %0.3f\" % metrics.completeness_score(labels, km.labels_))\nprint(\"V-measure: %0.3f\" % metrics.v_measure_score(labels, km.labels_))\nprint(\"Adjusted Rand-Index: %.3f\" % metrics.adjusted_rand_score(labels, km.labels_))\nprint(\n \"Silhouette Coefficient: %0.3f\"\n % metrics.silhouette_score(X, km.labels_, sample_size=1000)\n)\n\nprint()" |
| 102 | + ] |
| 103 | + }, |
| 104 | + { |
| 105 | + "cell_type": "code", |
| 106 | + "execution_count": null, |
| 107 | + "metadata": { |
| 108 | + "collapsed": false |
| 109 | + }, |
| 110 | + "outputs": [], |
| 111 | + "source": [ |
| 112 | + "if not opts.use_hashing:\n print(\"Top terms per cluster:\")\n\n if opts.n_components:\n original_space_centroids = svd.inverse_transform(km.cluster_centers_)\n order_centroids = original_space_centroids.argsort()[:, ::-1]\n else:\n order_centroids = km.cluster_centers_.argsort()[:, ::-1]\n\n terms = vectorizer.get_feature_names_out()\n for i in range(true_k):\n print(\"Cluster %d:\" % i, end=\"\")\n for ind in order_centroids[i, :10]:\n print(\" %s\" % terms[ind], end=\"\")\n print()" |
30 | 113 | ]
|
31 | 114 | }
|
32 | 115 | ],
|
|