IntelPython · ndgrigorian · May 27, 2026 · Apr 14, 2026 · Apr 14, 2026 · May 15, 2026
@@ -9,3 +9,6 @@ mkl_fft/_pydfti.c
 mkl_fft/_pydfti.cpython*.so
 mkl_fft/_pydfti.*-win_amd64.pyd
 mkl_fft/src/mklfft.c
+
+# ASV benchmark artifacts
+.asv/
@@ -0,0 +1,67 @@
+# mkl_fft ASV Benchmarks
+
+Performance benchmarks for [mkl_fft](https://github.com/IntelPython/mkl_fft) using
+[Airspeed Velocity (ASV)](https://asv.readthedocs.io/en/stable/).
+
+## Coverage
+
+| File | API | Transforms | Dtypes | Sizes/Shapes |
+|------|-----|-----------|--------|-------------|
+| `bench_fft1d.py` | `mkl_fft` | `fft`, `ifft`, `rfft`, `irfft` | float32, float64, complex64, complex128 | power-of-two and non-power-of-two |
+| `bench_fftnd.py` | `mkl_fft` | `fft2`, `ifft2`, `rfft2`, `irfft2`, `fftn`, `ifftn`, `rfftn`, `irfftn` | float32, float64, complex64, complex128 | square and non-square/non-cubic |
+| `bench_interfaces.py` | `mkl_fft.interfaces.{numpy_fft, scipy_fft}` | All exported functions; selected by a `module` parameter. Hermitian 2-D/N-D (`hfft2`, `hfftn`) are scipy-only. | float32, float64, complex64, complex128 | power-of-two and cubic |
+| `bench_memory.py` | `mkl_fft` | Peak RSS for 1-D, 2-D, and 3-D transforms | float32, float64, complex128 | power-of-two |
+
+## Threading
+
+Set `MKL_NUM_THREADS` in the environment before running ASV to control the
+thread count used by MKL:
+
+```bash
+MKL_NUM_THREADS=8 asv run --python=same --quick HEAD^!
+```
+
+If `MKL_NUM_THREADS` is not set, `__init__.py` applies a default: **4** threads
+when the machine has 4 or more physical cores, or **1** (single-threaded)
+otherwise. This keeps results comparable across CI machines in the shared pool
+regardless of their total core count. Physical cores are detected via
+`psutil.cpu_count(logical=False)` — hyperthreads are excluded per MKL
+recommendation.
+
+## Notes on Measurement
+
+### DFTI descriptor warmup
+
+MKL creates a DFTI descriptor on the first FFT call for a given (size, dtype,
+strides) combination and reuses it on subsequent calls. To avoid charging
+that one-time cost to the first measured iteration, each benchmark's `setup`
+performs an explicit warmup call after preparing the input array. ASV's
+default `warmup_time` (0.1s) already amortizes this for sub-millisecond
+transforms, but the explicit warmup makes the intent visible.
+
+## Running Benchmarks
+
+Prerequisites:
+
+```bash
+pip install asv psutil
+```
+
+Run benchmarks against the current environment:
+
+```bash
+asv run --python=same --quick HEAD^!
+```
+
+Compare two commits:
+
+```bash
+asv continuous --python=same HEAD~1 HEAD
+```
+
+View results in a browser:
+
+```bash
+asv publish
+asv preview
+```
@@ -0,0 +1,24 @@
+{
+    "version": 1,
+    "project": "mkl_fft",
+    "project_url": "https://github.com/IntelPython/mkl_fft",
+    "show_commit_url": "https://github.com/IntelPython/mkl_fft/commit/",
+    "repo": "..",
+    "branches": [
+        "master"
+    ],
+    "environment_type": "conda",
+    "conda_channels": [
+        "https://software.repos.intel.com/python/conda/",
+        "conda-forge"
+    ],
+    "benchmark_dir": "benchmarks",
+    "env_dir": ".asv/env",
+    "results_dir": ".asv/results",
+    "html_dir": ".asv/html",
+    "build_cache_size": 2,
+    "default_benchmark_timeout": 500,
+    "regressions_thresholds": {
+        ".*": 0.3
+    }
+}
@@ -0,0 +1,21 @@
+"""ASV benchmarks for mkl_fft"""
+
+import os
+
+import psutil
+
+_MIN_THREADS = 4  # physical-core threshold for multi-threading; also the thread count then used
+
+
+def _physical_cores():
+    """Return physical core count; fall back to 1 (conservative)."""
+    return psutil.cpu_count(logical=False) or 1
+
+
+def _thread_count():
+    physical = _physical_cores()
+    return str(_MIN_THREADS) if physical >= _MIN_THREADS else "1"
+
+
+_THREADS = os.environ.get("MKL_NUM_THREADS", _thread_count())
+os.environ["MKL_NUM_THREADS"] = _THREADS
@@ -0,0 +1,73 @@
+"""Shared utilities for mkl_fft benchmarks."""
+
+import numpy as np
+
+_RNG_SEED = 42  # fixed seed: every setup() call builds the same random input
+
+
+def _make_input(rng, shape, dtype):
+    """Return an array of *shape* and *dtype*.
+
+    Complex dtypes get non-zero imaginary parts for a realistic signal.
+    `shape` may be an int (1-D) or a tuple.
+    """
+    dt = np.dtype(dtype)
+    s = (shape,) if isinstance(shape, int) else shape
+    if dt.kind == "c":
+        return (rng.standard_normal(s) + 1j * rng.standard_normal(s)).astype(dt)
+    return rng.standard_normal(s).astype(dt)
+
+
+class BenchC2C:
+    """Base setup for complex-to-complex benchmarks.
+
+    Subclasses define params, param_names, and time_* / peakmem_* methods.
+    Other positional params are ignored.
+    """
+
+    def setup(self, shape, dtype, *_):
+        rng = np.random.default_rng(_RNG_SEED)
+        self.x = _make_input(rng, shape, dtype)
+
+
+# dtype axes (NumPy dtype names used as benchmark parameter values)
+_DTYPES_ALL = ["float32", "float64", "complex64", "complex128"]
+_DTYPES_REAL = ["float32", "float64"]  # real dtypes only
+_DTYPES_REDUCED = ["float64", "complex128"]  # double precision only: one real, one complex
+
+# shape/size axes shared across multiple files
+_SHAPES_2D = [(64, 64), (128, 128), (256, 256), (512, 512)]
+_SHAPES_2D_IFACE = [(64, 64), (256, 256), (512, 512)]  # _SHAPES_2D without (128, 128)
+_SHAPES_3D = [(16, 16, 16), (32, 32, 32), (64, 64, 64)]
+
+
+class BenchR2C:
+    """Base setup for real-to-complex / complex-to-real and Hermitian benchmarks.
+
+    Prepares:
+      self.x_real    — real array of full shape (rfft / ihfft input)
+      self.x_complex — complex half-spectrum array (irfft / hfft input)
+
+    DC (index 0 of the last axis) of x_complex has its imaginary part zeroed,
+    and when the full last-axis length is even the Nyquist bin imaginary part
+    is also zeroed, satisfying Hermitian symmetry expected by hfft / ihfft2 /
+    hfftn. Extra positional params are accepted and ignored.
+    """
+
+    def setup(self, shape, dtype, *_):
+        rng = np.random.default_rng(_RNG_SEED)
+        cdtype = "complex64" if dtype == "float32" else "complex128"
+        if isinstance(shape, int):
+            n_last = shape
+            half_shape = shape // 2 + 1
+        else:
+            n_last = shape[-1]
+            half_shape = shape[:-1] + (shape[-1] // 2 + 1,)
+        self.x_real = rng.standard_normal(shape).astype(dtype)
+        self.x_complex = (
+            rng.standard_normal(half_shape)
+            + 1j * rng.standard_normal(half_shape)
+        ).astype(cdtype)
+        self.x_complex[..., 0] = self.x_complex[..., 0].real
+        if n_last % 2 == 0:
+            self.x_complex[..., -1] = self.x_complex[..., -1].real
@@ -0,0 +1,105 @@
+"""Benchmarks for 1-D FFT operations using the mkl_fft root API."""
+
+import mkl_fft
+
+from ._utils import _DTYPES_ALL, _DTYPES_REAL, BenchC2C, BenchR2C
+
+_SIZES_POW2 = [64, 256, 1024, 4096, 16384, 65536]  # 2**6 .. 2**16, even exponents
+_SIZES_NONPOW2 = [127, 509, 1000, 4001, 10007]  # primes plus the composite 1000
+
+
+# ---------------------------------------------------------------------------
+# Complex-to-complex 1-D (power-of-two sizes)
+# ---------------------------------------------------------------------------
+
+
+class BenchFFT1D(BenchC2C):
+    """Forward and inverse complex FFT — power-of-two sizes."""
+
+    params = [_SIZES_POW2, _DTYPES_ALL]
+    param_names = ["n", "dtype"]
+
+    def setup(self, n, dtype):
+        super().setup(n, dtype)
+        # prime MKL DFTI descriptor cache
+        mkl_fft.fft(self.x)
+        mkl_fft.ifft(self.x)
+
+    def time_fft(self, n, dtype):
+        mkl_fft.fft(self.x)
+
+    def time_ifft(self, n, dtype):
+        mkl_fft.ifft(self.x)
+
+
+# ---------------------------------------------------------------------------
+# Real-to-complex / complex-to-real 1-D (power-of-two sizes)
+# ---------------------------------------------------------------------------
+
+
+class BenchRFFT1D(BenchR2C):
+    """Forward rfft and inverse irfft — power-of-two sizes."""
+
+    params = [_SIZES_POW2, _DTYPES_REAL]
+    param_names = ["n", "dtype"]
+
+    def setup(self, n, dtype):
+        super().setup(n, dtype)
+        mkl_fft.rfft(self.x_real)
+        mkl_fft.irfft(self.x_complex, n=n)
+
+    def time_rfft(self, n, dtype):
+        mkl_fft.rfft(self.x_real)
+
+    def time_irfft(self, n, dtype):
+        mkl_fft.irfft(self.x_complex, n=n)
+
+
+# ---------------------------------------------------------------------------
+# Complex-to-complex 1-D (non-power-of-two sizes)
+# ---------------------------------------------------------------------------
+
+
+class BenchFFT1DNonPow2(BenchC2C):
+    """Forward and inverse complex FFT — non-power-of-two sizes.
+
+    MKL uses a different code path for non-power-of-two transforms;
+    this suite catches regressions in that path.
+    """
+
+    params = [_SIZES_NONPOW2, _DTYPES_ALL]
+    param_names = ["n", "dtype"]
+
+    def setup(self, n, dtype):
+        super().setup(n, dtype)
+        mkl_fft.fft(self.x)
+        mkl_fft.ifft(self.x)
+
+    def time_fft(self, n, dtype):
+        mkl_fft.fft(self.x)
+
+    def time_ifft(self, n, dtype):
+        mkl_fft.ifft(self.x)
+
+
+# ---------------------------------------------------------------------------
+# Real-to-complex / complex-to-real 1-D (non-power-of-two sizes)
+# ---------------------------------------------------------------------------
+
+
+class BenchRFFT1DNonPow2(BenchR2C):
+    """Forward rfft and inverse irfft — non-power-of-two sizes."""
+
+    params = [_SIZES_NONPOW2, _DTYPES_REAL]
+    param_names = ["n", "dtype"]
+
+    def setup(self, n, dtype):
+        super().setup(n, dtype)
+        mkl_fft.rfft(self.x_real)
+        mkl_fft.irfft(self.x_complex, n=n)
+
+    def time_rfft(self, n, dtype):
+        mkl_fft.rfft(self.x_real)
+
+    def time_irfft(self, n, dtype):
+        mkl_fft.irfft(self.x_complex, n=n)