Benchmarks tsne rfr #91

Merged: 34 commits, Sep 24, 2021

Commits (changes from all commits)
4d10210
Benchmarks: add Random Forest Regression and TSNE benchmarks. 10M sam…
itearsl Aug 17, 2021
4565974
benchmarks: add cuml benchmarks for Random Forest Regressor and TSNE
itearsl Aug 17, 2021
e871445
benchmarks: add cuml benchmarks for Random Forest Regressor and TSNE
itearsl Sep 9, 2021
59ae6ed
bugfix
itearsl Sep 9, 2021
f72f3a5
bugfix
itearsl Sep 15, 2021
621d986
TSNE and RF: fix cuml RF regressor benchmark and add cuml TSNE benchm…
itearsl Sep 16, 2021
d133005
Merge master, resolve conflicts
itearsl Sep 16, 2021
a8281c0
Bugfix
itearsl Sep 16, 2021
32c2412
add prints in bench.py
itearsl Sep 16, 2021
f9291ab
Add tsne in azure tests
Sep 16, 2021
6fd18d9
bugfix
Sep 16, 2021
9926f3d
fix pep8 and 'float32' bug
itearsl Sep 17, 2021
13aaeac
fix pep8
itearsl Sep 17, 2021
6ec6c52
fix pep8
itearsl Sep 17, 2021
1ea9d66
fix bench errors
itearsl Sep 17, 2021
823ef34
fix bench errors
itearsl Sep 17, 2021
80986e8
test
itearsl Sep 17, 2021
f127efd
test
itearsl Sep 17, 2021
d262328
test
itearsl Sep 17, 2021
fa3401c
fixed bug with JSON parsing
itearsl Sep 17, 2021
e5114af
fix codefactor
itearsl Sep 17, 2021
31c8aad
Fix arguments description
itearsl Sep 20, 2021
bac85b5
Fix batch.py
itearsl Sep 20, 2021
8acb449
fix bench.py
itearsl Sep 20, 2021
836982e
fix pep8
itearsl Sep 20, 2021
4d29feb
final fix cuml bench
itearsl Sep 24, 2021
433c3b9
fix pep8
itearsl Sep 24, 2021
6ee5ca9
fix codefactor
itearsl Sep 24, 2021
75a0076
fix codefactor
itearsl Sep 24, 2021
07a7581
fix codefactor
itearsl Sep 24, 2021
e448b40
fix codefactor
itearsl Sep 24, 2021
5f3066a
fix codefactor
itearsl Sep 24, 2021
7ee5d02
Update cuml_bench/tsne.py
itearsl Sep 24, 2021
b4e1519
fix codefactor
itearsl Sep 24, 2021
4 changes: 4 additions & 0 deletions bench.py
@@ -503,6 +503,10 @@ def gen_basic_dict(library, algorithm, stage, params, data, alg_instance=None,
alg_instance_params = dict(alg_instance.attributes())
else:
alg_instance_params = dict(alg_instance.get_params())
if ('min_samples_split' in alg_instance_params
and 'handle' in alg_instance_params):
alg_instance_params['dtype'] = str(
alg_instance_params['dtype'])
result['algorithm_parameters'].update(alg_instance_params)
if alg_params is not None:
result['algorithm_parameters'].update(alg_params)
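The new branch above stringifies the regressor's dtype attribute because cuml's RandomForestRegressor reports it as a numpy dtype object, which the standard json encoder rejects. A minimal sketch of the failure and the fix (a standalone illustration, not part of this PR):

import json

import numpy as np

params = {'n_estimators': 100, 'dtype': np.dtype('float32')}

try:
    json.dumps(params)
except TypeError as exc:
    # numpy dtype objects are not JSON serializable
    print(f'serialization failed: {exc}')

params['dtype'] = str(params['dtype'])  # 'float32'
print(json.dumps(params))  # now serializes cleanly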
54 changes: 54 additions & 0 deletions configs/cuml_config.json
@@ -136,6 +136,40 @@
"max-leaf-nodes": 131072,
"max-features": 0.2
},
{
"algorithm": "df_regr",
"dtype": "float32",
"dataset": [
{
"source": "npy",
"name": "year_prediction_msd",
"training":
{
"x": "data/year_prediction_msd_x_train.npy",
"y": "data/year_prediction_msd_y_train.npy"
},
"testing":
{
"x": "data/year_prediction_msd_x_test.npy",
"y": "data/year_prediction_msd_y_test.npy"
}
},
{
"source": "npy",
"name": "airline_regression",
"training":
{
"x": "data/airline_regression_x_train.npy",
"y": "data/airline_regression_y_train.npy"
},
"testing":
{
"x": "data/airline_regression_x_test.npy",
"y": "data/airline_regression_y_test.npy"
}
}
]
},
{
"algorithm": "ridge",
"dataset": [
@@ -564,6 +598,26 @@
"alpha": 2.0,
"l1_ratio": 0.5,
"tol": 1e-4
},
{
"algorithm": "tsne",
"dtype": "float32",
"dataset": [
{
"source": "npy",
"name": "mnist",
"training":
{
"x": "data/mnist_x_train.npy",
"y": "data/mnist_y_train.npy"
},
"testing":
{
"x": "data/mnist_x_test.npy",
"y": "data/mnist_y_test.npy"
}
}
]
}
]
}
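Both new cases point at pre-generated .npy files rather than synthetic data. A hypothetical helper like the sketch below (not part of this PR) could verify that every file referenced by a config exists before a run; it assumes the benchmark cases sit under a top-level "cases" key, which is not shown in this diff:

import json
from pathlib import Path


def missing_dataset_files(config_path):
    """Return referenced dataset paths that are not present on disk."""
    with open(config_path) as f:
        config = json.load(f)

    missing = set()
    # Assumes the benchmark cases live under a top-level "cases" key.
    for case in config.get('cases', []):
        for dataset in case.get('dataset', []):
            for split in ('training', 'testing'):
                for path in dataset.get(split, {}).values():
                    if not Path(path).exists():
                        missing.add(path)
    return sorted(missing)


print(missing_dataset_files('configs/cuml_config.json'))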
53 changes: 53 additions & 0 deletions configs/skl_config.json
100755 → 100644
@@ -148,6 +148,40 @@
"max-leaf-nodes": 131072,
"max-features": 0.2
},
{
"algorithm": "df_regr",
"dtype": "float32",
"dataset": [
{
"source": "npy",
"name": "year_prediction_msd",
"training":
{
"x": "data/year_prediction_msd_x_train.npy",
"y": "data/year_prediction_msd_y_train.npy"
},
"testing":
{
"x": "data/year_prediction_msd_x_test.npy",
"y": "data/year_prediction_msd_y_test.npy"
}
},
{
"source": "npy",
"name": "airline_regression",
"training":
{
"x": "data/airline_regression_x_train.npy",
"y": "data/airline_regression_y_train.npy"
},
"testing":
{
"x": "data/airline_regression_x_test.npy",
"y": "data/airline_regression_y_test.npy"
}
}
]
},
{
"algorithm": "ridge",
"dataset": [
@@ -731,6 +765,25 @@
"alpha": 2.0,
"l1_ratio": 0.5,
"tol": 1e-4
},
{
"algorithm": "tsne",
"dataset": [
{
"source": "npy",
"name": "mnist",
"training":
{
"x": "data/mnist_x_train.npy",
"y": "data/mnist_y_train.npy"
},
"testing":
{
"x": "data/mnist_x_test.npy",
"y": "data/mnist_y_test.npy"
}
}
]
}
]
}
19 changes: 18 additions & 1 deletion configs/testing/sklearn.json
100755 → 100644
@@ -244,7 +244,7 @@
"n_features": 10,
"training": {
"n_samples": 1000
}
}
}
],
"include-y": "",
@@ -323,6 +323,23 @@
}
}
]
},
{
"algorithm": "tsne",
"dataset": [
{
"source": "synthetic",
"type": "classification",
"n_classes": 5,
"n_features": 10,
"training": {
"n_samples": 1000
},
"testing": {
"n_samples": 20
}
}
]
}
]
}
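The smoke-test case above describes a synthetic 5-class, 10-feature dataset with 1000 training and 20 testing samples. Roughly equivalent data could be generated with scikit-learn as sketched below (an assumption about what the harness does for "source": "synthetic"; n_informative is raised here only to satisfy make_classification's constraint that n_classes * n_clusters_per_class <= 2**n_informative):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# 5 classes over 10 features, mirroring the "tsne" testing case above.
X, y = make_classification(n_samples=1020, n_features=10, n_classes=5,
                           n_informative=5, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=20, random_state=0)
print(X_train.shape, X_test.shape)  # (1000, 10) (20, 10)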
24 changes: 7 additions & 17 deletions cuml_bench/df_regr.py
@@ -15,44 +15,35 @@
# ===============================================================================

import argparse

import bench
from cuml.ensemble import RandomForestRegressor

parser = argparse.ArgumentParser(description='cuml random forest '
'regression benchmark')

parser.add_argument('--criterion', type=str, default='mse',
choices=('mse', 'mae'),
help='The function to measure the quality of a split')
parser.add_argument('--split-algorithm', type=str, default='hist',
choices=('hist', 'global_quantile'),
help='The algorithm to determine how '
'nodes are split in the tree')
parser.add_argument('--num-trees', type=int, default=100,
help='Number of trees in the forest')
parser.add_argument('--max-features', type=bench.float_or_int, default=None,
parser.add_argument('--max-features', type=bench.float_or_int, default=1.0,
help='Upper bound on features used at each split')
parser.add_argument('--max-depth', type=int, default=None,
parser.add_argument('--max-depth', type=int, default=16,
help='Upper bound on depth of constructed trees')
parser.add_argument('--min-samples-split', type=bench.float_or_int, default=2,
help='Minimum samples number for node splitting')
parser.add_argument('--max-leaf-nodes', type=int, default=-1,
help='Maximum leaf nodes per tree')
parser.add_argument('--min-impurity-decrease', type=float, default=0.,
parser.add_argument('--min-impurity-decrease', type=float, default=0.0,
help='Needed impurity decrease for node splitting')
parser.add_argument('--no-bootstrap', dest='bootstrap', default=True,
action='store_false', help="Don't control bootstraping")

params = bench.parse_args(parser)

# Load and convert data
X_train, X_test, y_train, y_test = bench.load_data(params)

if params.criterion == 'mse':
params.criterion = 2
else:
params.criterion = 3
X_train, X_test, y_train, y_test = bench.load_data(params, int_label=True)

if params.split_algorithm == 'hist':
params.split_algorithm = 0
@@ -61,15 +52,15 @@

# Create our random forest regressor
regr = RandomForestRegressor(
split_criterion=params.criterion,
split_algo=params.split_algorithm,
n_estimators=params.num_trees,
max_depth=params.max_depth,
split_algo=params.split_algorithm,
max_features=params.max_features,
min_samples_split=params.min_samples_split,
max_depth=params.max_depth,
max_leaves=params.max_leaf_nodes,
min_impurity_decrease=params.min_impurity_decrease,
bootstrap=params.bootstrap,

)


@@ -82,7 +73,6 @@ def predict(regr, X):


fit_time, _ = bench.measure_function_time(fit, regr, X_train, y_train, params=params)

y_pred = predict(regr, X_train)
train_rmse = bench.rmse_score(y_pred, y_train)

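bench.rmse_score is used to evaluate the regressor on both the training and testing splits. A minimal stand-in with the same intent (hypothetical; the repository's implementation may differ):

import numpy as np


def rmse_score(y_pred, y_true):
    """Root-mean-squared error between predictions and ground truth."""
    y_pred = np.asarray(y_pred, dtype=np.float64)
    y_true = np.asarray(y_true, dtype=np.float64)
    return float(np.sqrt(np.mean((y_pred - y_true) ** 2)))


print(rmse_score([2.5, 0.0, 2.0], [3.0, -0.5, 2.0]))  # ~0.408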
39 changes: 39 additions & 0 deletions cuml_bench/tsne.py
@@ -0,0 +1,39 @@
import argparse
import bench
from cuml.manifold import TSNE

parser = argparse.ArgumentParser(description='cuml tsne')

parser.add_argument('--n-components', type=int, default=2,
help='The dimension of the embedded space.')
parser.add_argument('--early-exaggeration', type=float, default=12.0,
help='This factor increases the attractive forces between points '
'and allows points to move around more freely, '
'finding their nearest neighbors more easily.')
parser.add_argument('--learning-rate', type=float, default=200.0,
help='The learning rate for t-SNE is usually in the range [10.0, 1000.0].')
parser.add_argument('--angle', type=float, default=0.5,
help='Angular size. This is the trade-off between speed and accuracy.')
parser.add_argument('--min-grad-norm', type=float, default=1e-7,
help='If the gradient norm is below this threshold,'
'the optimization is stopped.')
parser.add_argument('--random-state', type=int, default=1234)
params = bench.parse_args(parser)

# Load and convert data
X, _, _, _ = bench.load_data(params)

# Create our t-SNE transformer
tsne = TSNE(n_components=params.n_components, early_exaggeration=params.early_exaggeration,
learning_rate=params.learning_rate, angle=params.angle,
min_grad_norm=params.min_grad_norm, random_state=params.random_state)

fit_time, _ = bench.measure_function_time(tsne.fit, X, params=params)
# Need to investigate how to compare sklearn and cuml metrics for tsne

bench.print_output(library='cuml', algorithm='tsne',
stages=['training'], params=params,
functions=['tsne.fit'],
times=[fit_time], metric_type=None,
metrics=None, data=[X],
alg_instance=tsne)
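Regarding the in-code note on comparing sklearn and cuml metrics: scikit-learn's TSNE exposes the final KL divergence of the embedding as kl_divergence_ after fitting, which could serve as a common yardstick; whether the installed cuml version exposes an equivalent attribute would need checking. A minimal sketch on random data (editor illustration, not part of this PR):

import numpy as np
from sklearn.manifold import TSNE

rng = np.random.RandomState(1234)
X = rng.rand(200, 20).astype(np.float32)

tsne = TSNE(n_components=2, learning_rate=200.0, angle=0.5,
            min_grad_norm=1e-7, random_state=1234)
embedding = tsne.fit_transform(X)

# Final KL divergence of the optimized embedding; lower generally means a better fit.
print(embedding.shape, tsne.kl_divergence_)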