
Commit dcd1a3d

task: benchmarking with asv
1 parent 67317b0 commit dcd1a3d

File tree

6 files changed: +370, -0 lines changed


.gitignore

Lines changed: 3 additions & 0 deletions
@@ -102,3 +102,6 @@ dpctl/tensor/_usmarray.h
 
 # moved cmake scripts
 dpctl/resources/cmake
+
+# asv artifacts
+*.asv*

benchmarks/README.md

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
# dpctl benchmarks

Benchmarking dpctl using Airspeed Velocity.
Read more about ASV [here](https://asv.readthedocs.io/en/stable/index.html).

## Usage
This suite was written with our CI in mind. As such, you will see that `asv.conf.json` is minimal, without any environment information supplied.
The expectation is that users execute `asv run` with an existing environment.

You should therefore have conda or mamba installed, and create a `dpctl-benchmarking` environment like so:
```
conda create --name dpctl-benchmarking python=$PYTHON_VERSION dpctl asv libmambapy conda dpcpp_linux-64 --override-channels \
    -c https://software.repos.intel.com/python/conda \
    -c conda-forge -y
```

Then, activate the environment and instruct `asv run` to use it for the benchmarks by pointing it to the environment's Python binary, like so:
```
conda activate dpctl-benchmarking
asv run --environment existing:/full/mamba/path/envs/dpctl-benchmarking/bin/python
```

## Writing new benchmarks
Read ASV's guidelines for writing benchmarks [here](https://asv.readthedocs.io/en/stable/writing_benchmarks.html). A minimal, illustrative sketch of a new benchmark module follows.
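The module name, class, and timed operation below are illustrative only (they are not part of this commit); any module placed under `benchmarks/benchmarks/` whose classes define `time_*` methods will be discovered and timed by `asv run`:
```
import dpctl
import dpctl.tensor as dpt


class Unary:
    """Illustrative timing benchmark for a unary elementwise call."""

    def setup(self):
        # asv calls setup() before timing each method
        self.q = dpctl.SyclQueue()
        self.x = dpt.ones(10**6, dtype=dpt.float32, sycl_queue=self.q)

    def time_sqrt(self):
        # methods prefixed with time_ are timed by asv
        dpt.sqrt(self.x)
        self.q.wait()
```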

benchmarks/asv.conf.json

Lines changed: 182 additions & 0 deletions
@@ -0,0 +1,182 @@
{
    // The version of the config file format. Do not change, unless
    // you know what you are doing.
    "version": 1,

    // The name of the project being benchmarked
    "project": "dpctl",

    // The project's homepage
    "project_url": "https://github.com/IntelPython/dpctl",

    // The URL or local path of the source code repository for the
    // project being benchmarked
    "repo": "..",

    // The Python project's subdirectory in your repo. If missing or
    // the empty string, the project is assumed to be located at the root
    // of the repository.
    // "repo_subdir": "",

    // Customizable commands for building the project.
    // See asv.conf.json documentation.
    "build_command": [],

    // Customizable commands for installing and uninstalling the project.
    // See asv.conf.json documentation.
    // "install_command": ["in-dir={env_dir} conda install dpctl --yes"],
    // "uninstall_command": ["return-code=any conda uninstall dpctl --yes"],

    // List of branches to benchmark. If not provided, defaults to "main"
    // (for git) or "default" (for mercurial).
    "branches": ["HEAD"], // for git
    // "branches": ["default"], // for mercurial

    // The DVCS being used. If not set, it will be automatically
    // determined from "repo" by looking at the protocol in the URL
    // (if remote), or by looking for special directories, such as
    // ".git" (if local).
    "dvcs": "git",

    // The tool to use to create environments. May be "conda",
    // "virtualenv", "mamba" (above 3.8)
    // or other value depending on the plugins in use.
    // If missing or the empty string, the tool will be automatically
    // determined by looking for tools on the PATH environment
    // variable.
    "environment_type": "conda",

    // timeout in seconds for installing any dependencies in environment
    // defaults to 10 min
    //"install_timeout": 600,

    // the base URL to show a commit for the project.
    // "show_commit_url": "http://github.com/owner/project/commit/",

    // The Pythons you'd like to test against. If not provided, defaults
    // to the current version of Python used to run `asv`.
    // "pythons": ["3.8", "3.12"],

    // The list of conda channel names to be searched for benchmark
    // dependency packages in the specified order
    // "conda_channels": [
    //     "https://af01p-igk.devtools.intel.com/artifactory/api/conda/idp-conda-pkgserver-igk-local/gold",
    //     "https://af01p-igk.devtools.intel.com/artifactory/api/conda/idp-conda-pkgserver-igk-local/tools",
    //     "conda-forge"
    // ],

    // A conda environment file that is used for environment creation.
    // "conda_environment_file": "environment.yml",

    // The matrix of dependencies to test. Each key of the "req"
    // requirements dictionary is the name of a package (in PyPI) and
    // the values are version numbers. An empty list or empty string
    // indicates to just test against the default (latest)
    // version. null indicates that the package is to not be
    // installed. If the package to be tested is only available from
    // PyPi, and the 'environment_type' is conda, then you can preface
    // the package name by 'pip+', and the package will be installed
    // via pip (with all the conda available packages installed first,
    // followed by the pip installed packages).
    //
    // The ``@env`` and ``@env_nobuild`` keys contain the matrix of
    // environment variables to pass to build and benchmark commands.
    // An environment will be created for every combination of the
    // cartesian product of the "@env" variables in this matrix.
    // Variables in "@env_nobuild" will be passed to every environment
    // during the benchmark phase, but will not trigger creation of
    // new environments. A value of ``null`` means that the variable
    // will not be set for the current combination.
    //
    // "matrix": {
    //     "env": {"PACKAGE_PATH": [""]}
    // },

    // Combinations of libraries/python versions can be excluded/included
    // from the set to test. Each entry is a dictionary containing additional
    // key-value pairs to include/exclude.
    //
    // An exclude entry excludes entries where all values match. The
    // values are regexps that should match the whole string.
    //
    // An include entry adds an environment. Only the packages listed
    // are installed. The 'python' key is required. The exclude rules
    // do not apply to includes.
    //
    // In addition to package names, the following keys are available:
    //
    // - python
    //     Python version, as in the *pythons* variable above.
    // - environment_type
    //     Environment type, as above.
    // - sys_platform
    //     Platform, as in sys.platform. Possible values for the common
    //     cases: 'linux2', 'win32', 'cygwin', 'darwin'.
    // - req
    //     Required packages
    // - env
    //     Environment variables
    // - env_nobuild
    //     Non-build environment variables
    //
    // "exclude": [
    //     {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
    //     {"environment_type": "conda", "req": {"six": null}}, // don't run without six on conda
    //     {"env": {"ENV_VAR_1": "val2"}}, // skip val2 for ENV_VAR_1
    // ],
    //
    // "include": [
    //     // additional env for python3.12
    //     {"python": "3.12", "req": {"numpy": "1.26"}, "env_nobuild": {"FOO": "123"}},
    //     // additional env if run on windows+conda
    //     {"platform": "win32", "environment_type": "conda", "python": "3.12", "req": {"libpython": ""}},
    // ],

    // The directory (relative to the current directory) that benchmarks are
    // stored in. If not provided, defaults to "benchmarks"
    "benchmark_dir": "benchmarks",

    // The directory (relative to the current directory) to cache the Python
    // environments in. If not provided, defaults to "env"
    "env_dir": ".asv/env",

    // The directory (relative to the current directory) that raw benchmark
    // results are stored in. If not provided, defaults to "results".
    "results_dir": ".asv/results",

    // The directory (relative to the current directory) that the html tree
    // should be written to. If not provided, defaults to "html".
    "html_dir": ".asv/html",

    // The number of characters to retain in the commit hashes.
    // "hash_length": 8,

    // `asv` will cache results of the recent builds in each
    // environment, making them faster to install next time. This is
    // the number of builds to keep, per environment.
    // "build_cache_size": 2,

    // The commits after which the regression search in `asv publish`
    // should start looking for regressions. Dictionary whose keys are
    // regexps matching to benchmark names, and values corresponding to
    // the commit (exclusive) after which to start looking for
    // regressions. The default is to start from the first commit
    // with results. If the commit is `null`, regression detection is
    // skipped for the matching benchmark.
    //
    // "regressions_first_commits": {
    //     "some_benchmark": "352cdf",  // Consider regressions only after this commit
    //     "another_benchmark": null,   // Skip regression detection altogether
    // },

    // The thresholds for relative change in results, after which `asv
    // publish` starts reporting regressions. Dictionary of the same
    // form as in ``regressions_first_commits``, with values
    // indicating the thresholds. If multiple entries match, the
    // maximum is taken. If no entry matches, the default is 5%.
    //
    // "regressions_thresholds": {
    //     "some_benchmark": 0.01,   // Threshold of 1%
    //     "another_benchmark": 0.5, // Threshold of 50%
    // },
}

benchmarks/benchmarks/__init__.py

Whitespace-only changes.

benchmarks/benchmarks/binary.py

Lines changed: 130 additions & 0 deletions
@@ -0,0 +1,130 @@
import dpctl
import dpctl.tensor as dpt

SHARED_QUEUE = dpctl.SyclQueue(property="enable_profiling")


class Binary:
    """Benchmark class for binary operations on SYCL devices."""

    timeout = 300.0

    def setup(self):
        """Setup the benchmark environment."""
        self.q = SHARED_QUEUE
        self.iterations = 1
        self.n_values = 10**8

    def run_bench(self, q, reps, n_max, dtype1, dtype2, op):
        """Run the benchmark for a specific function and dtype combination."""

        def get_sizes(n):
            s = []
            m = 8192
            while m < n:
                s.append(m)
                m *= 2
            s.append(n)
            return s

        x1 = dpt.ones(n_max, dtype=dtype1, sycl_queue=q)
        x2 = dpt.ones(n_max, dtype=dtype2, sycl_queue=q)
        r = op(x1, x2)

        max_bytes = x1.nbytes + x2.nbytes + r.nbytes
        times_res = []

        for n in get_sizes(n_max):
            x1_n = x1[:n]
            x2_n = x2[:n]
            r_n = r[:n]
            n_bytes = x1_n.nbytes + x2_n.nbytes + r_n.nbytes

            n_iters = int((max_bytes / n_bytes) * reps)

            while True:
                timer = dpctl.SyclTimer(
                    device_timer="order_manager", time_scale=1e9
                )
                with timer(q):
                    for _ in range(n_iters):
                        op(x1_n, x2_n, out=r_n)

                dev_dt = timer.dt.device_dt
                if dev_dt > 0:
                    times_res.append((n, dev_dt / n_iters))
                    break

        return times_res


binary_instance = Binary()
binary_instance.q = SHARED_QUEUE
binary_instance.iterations = 1
binary_instance.n_values = 10**8

function_list = [
    dpt.add,
    dpt.multiply,
    dpt.divide,
    dpt.subtract,
    dpt.floor_divide,
    dpt.remainder,
    dpt.hypot,
    dpt.logaddexp,
    dpt.pow,
    dpt.atan2,
    dpt.nextafter,
    dpt.copysign,
    dpt.less,
    dpt.less_equal,
    dpt.greater,
    dpt.greater_equal,
    dpt.equal,
    dpt.not_equal,
    dpt.minimum,
    dpt.maximum,
    dpt.bitwise_and,
    dpt.bitwise_or,
    dpt.bitwise_xor,
    dpt.bitwise_left_shift,
    dpt.bitwise_right_shift,
    dpt.logical_and,
    dpt.logical_or,
    dpt.logical_xor,
]

# Generate dtype combinations for each function
dtypes = {}
for fn in function_list:
    dtypes[fn] = [list(map(dpt.dtype, sig.split("->")[0])) for sig in fn.types]


# Dynamically create benchmark methods at the module level
def generate_benchmark_functions():
    """Dynamically create benchmark functions for each
    function and dtype combination.
    """
    for fn in function_list:
        fn_name = fn.name_
        for dtype1, dtype2 in dtypes[fn]:
            # Create unique function names
            method_name = f"time_{fn_name}_{dtype1.name}_{dtype2.name}"

            def benchmark_method(self, fn=fn, dtype1=dtype1, dtype2=dtype2):
                return self.run_bench(
                    self.q,
                    self.iterations,
                    self.n_values,
                    dtype1,
                    dtype2,
                    fn,
                )

            # Attach the new method to the Binary class
            benchmark_method.__name__ = method_name
            setattr(Binary, method_name, benchmark_method)


# Generate the benchmark functions
generate_benchmark_functions()
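
A quick, illustrative way to see which benchmarks the generation above produces (not part of this commit; it assumes the module is importable as `benchmarks.binary` from the `benchmarks/` directory):
```
# Sanity check: list a few of the dynamically attached time_* methods.
from benchmarks.binary import Binary

generated = sorted(n for n in dir(Binary) if n.startswith("time_"))
print(len(generated), generated[:3])
```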

benchmarks/benchmarks/ef_bench_add.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
import dpctl
import dpctl.tensor as dpt
import dpctl.tensor._tensor_elementwise_impl as tei
import dpctl.utils as dpu


class EfBenchAdd:

    def time_ef_bench_add(self):
        q = dpctl.SyclQueue(property="enable_profiling")
        n = 2**26
        reps = 50

        dt = dpt.int8
        x1 = dpt.ones(n, dtype=dt, sycl_queue=q)
        x2 = dpt.ones(n, dtype=dt, sycl_queue=q)

        # op1 allocates the output once; op2 is the low-level kernel
        # that writes into an existing destination array
        op1, op2 = dpt.add, tei._add

        r = op1(x1, x2)

        timer = dpctl.SyclTimer(device_timer="order_manager", time_scale=1e9)

        # per-queue order manager: each submission depends on prior work
        m = dpu.SequentialOrderManager[q]
        with timer(q):
            for _ in range(reps):
                deps = m.submitted_events
                ht_e, c_e = op2(
                    src1=x1, src2=x2, dst=r, sycl_queue=q, depends=deps
                )
                m.add_event_pair(ht_e, c_e)
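
For reference, the device time accumulated by the `SyclTimer` above can be read after the `with` block, in the same way `binary.py` does; the lines below are an illustrative follow-up, not part of the committed file:
```
# Would live inside time_ef_bench_add, after the timed loop.
# time_scale=1e9 makes both durations report in nanoseconds.
host_ns = timer.dt.host_dt
device_ns = timer.dt.device_dt
print(f"average device time per add: {device_ns / reps:.1f} ns")
```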
