From d1e5c3a442e920b7901d82b193b633c59af16280 Mon Sep 17 00:00:00 2001 From: Nikolay Petrov Date: Fri, 23 Jan 2026 17:45:13 -0800 Subject: [PATCH 1/3] first version of agents instructions --- .github/instructions/cython.instructions.md | 245 ++++++++++ .github/instructions/dpctl.instructions.md | 135 ++++++ .../instructions/elementwise.instructions.md | 415 +++++++++++++++++ .../libsyclinterface.instructions.md | 290 ++++++++++++ .../libtensor-cpp.instructions.md | 316 +++++++++++++ .github/instructions/memory.instructions.md | 279 +++++++++++ .../tensor-python.instructions.md | 261 +++++++++++ .github/instructions/testing.instructions.md | 257 ++++++++++ AGENTS.md | 437 ++++++++++++++++++ 9 files changed, 2635 insertions(+) create mode 100644 .github/instructions/cython.instructions.md create mode 100644 .github/instructions/dpctl.instructions.md create mode 100644 .github/instructions/elementwise.instructions.md create mode 100644 .github/instructions/libsyclinterface.instructions.md create mode 100644 .github/instructions/libtensor-cpp.instructions.md create mode 100644 .github/instructions/memory.instructions.md create mode 100644 .github/instructions/tensor-python.instructions.md create mode 100644 .github/instructions/testing.instructions.md create mode 100644 AGENTS.md diff --git a/.github/instructions/cython.instructions.md b/.github/instructions/cython.instructions.md new file mode 100644 index 0000000000..5bc58dc0ae --- /dev/null +++ b/.github/instructions/cython.instructions.md @@ -0,0 +1,245 @@ +--- +applyTo: + - "dpctl/**/*.pyx" + - "dpctl/**/*.pxd" + - "dpctl/**/*.pxi" +--- + +# Cython Bindings Instructions + +## Context + +Cython files in dpctl provide the bridge between Python and the C/C++ SYCL interface. They wrap SYCL runtime objects as Python extension types. + +## File Types + +| Extension | Purpose | +|-----------|---------| +| `.pyx` | Implementation files (compiled to C++) | +| `.pxd` | Declaration files (like C headers) | +| `.pxi` | Include files (textually included) | + +## Required Directives + +Every `.pyx` file must start with (after license header): + +```cython +# distutils: language = c++ +# cython: language_level=3 +# cython: linetrace=True +``` + +## Import Conventions + +### cimport vs import + +```cython +# cimport - for C-level declarations (compile-time) +from cpython cimport pycapsule +from cpython.mem cimport PyMem_Free, PyMem_Malloc +from ._backend cimport DPCTLSyclDeviceRef, DPCTLDevice_Create + +# import - for Python-level usage (runtime) +import numpy as np +from . import _device_selection +``` + +### Import Order +1. Standard library cimports +2. Third-party cimports +3. Local cimports (with `# noqa: E211` if needed) +4. Blank line +5. Standard library imports +6. Third-party imports +7. Local imports + +## Extension Type Pattern + +```cython +cdef class SyclDevice: + """ + Python wrapper for sycl::device. + + Docstring describing the class. + """ + # C-level attribute (not accessible from Python) + cdef DPCTLSyclDeviceRef _device_ref + + def __cinit__(self, filter_string=None): + """ + Called before __init__, handles C memory allocation. + Must not raise Python exceptions that leave C state invalid. + """ + if filter_string is not None: + self._device_ref = DPCTLDevice_CreateFromSelector(...) + else: + self._device_ref = NULL + + def __dealloc__(self): + """ + Called during garbage collection. Clean up C resources. + """ + if self._device_ref is not NULL: + DPCTLDevice_Delete(self._device_ref) + + cdef DPCTLSyclDeviceRef get_device_ref(self): + """ + Internal method for C-level access. + Not visible from Python. + """ + return self._device_ref + + @property + def name(self): + """Python property wrapping C getter.""" + cdef const char *name_ptr = DPCTLDevice_GetName(self._device_ref) + if name_ptr is NULL: + raise RuntimeError("Failed to get device name") + try: + return name_ptr.decode("utf-8") + finally: + DPCTLCString_Delete(name_ptr) +``` + +## Memory Management + +### Rule: Always Clean Up in __dealloc__ + +```cython +def __dealloc__(self): + # Check for NULL before deleting + if self._queue_ref is not NULL: + DPCTLQueue_Delete(self._queue_ref) +``` + +### Ownership Annotations + +Match C API annotations: +- `__dpctl_give` - Caller receives ownership, must delete +- `__dpctl_take` - Function takes ownership, don't use after +- `__dpctl_keep` - Function only observes, doesn't take ownership + +```cython +cdef void example(): + # Receives ownership (__dpctl_give) - must delete + cdef DPCTLSyclEventRef event = DPCTLQueue_Submit(...) + + # ... use event ... + + # Clean up owned resource + DPCTLEvent_Delete(event) +``` + +### GIL Management + +Release GIL for blocking C operations: + +```cython +cdef void copy_with_wait(DPCTLSyclEventRef event): + with nogil: + DPCTLEvent_Wait(event) + +# Or for longer sections +cdef void long_operation() nogil: + # Entire function runs without GIL + # Cannot call Python objects here + pass +``` + +## Error Handling + +### NULL Checks + +```cython +cdef DPCTLSyclDeviceRef dref = DPCTLDevice_Create(...) +if dref is NULL: + raise SyclDeviceCreationError("Failed to create device") +``` + +### Exception Safety + +```cython +def create_something(self): + cdef SomeRef ref = SomeFunction() + if ref is NULL: + raise SomeError("Creation failed") + try: + # Operations that might raise + result = self._process(ref) + except: + # Clean up on exception + SomeDelete(ref) + raise + return result +``` + +## Backend Declarations (.pxd) + +```cython +# _backend.pxd +cdef extern from "syclinterface/dpctl_sycl_device_interface.h": + ctypedef void* DPCTLSyclDeviceRef + + DPCTLSyclDeviceRef DPCTLDevice_Create( + DPCTLDeviceSelectorRef DSRef + ) nogil + + void DPCTLDevice_Delete( + DPCTLSyclDeviceRef DRef + ) nogil + + const char* DPCTLDevice_GetName( + DPCTLSyclDeviceRef DRef + ) nogil +``` + +## Include Files (.pxi) + +Used for shared code snippets: + +```cython +# In main .pyx file +include "_sycl_usm_array_interface_utils.pxi" +``` + +## Common Patterns + +### Property with C String Return + +```cython +@property +def driver_version(self): + cdef const char* ver = DPCTLDevice_GetDriverVersion(self._device_ref) + if ver is NULL: + return "" + try: + return ver.decode("utf-8") + finally: + DPCTLCString_Delete(ver) +``` + +### Capsule for Opaque Pointers + +```cython +def get_capsule(self): + """Return PyCapsule containing C pointer.""" + if self._device_ref is NULL: + return None + return pycapsule.PyCapsule_New( + self._device_ref, + "SyclDeviceRef", + NULL # No destructor - we manage lifetime + ) +``` + +### Type Checking + +```cython +def some_method(self, other): + if not isinstance(other, SyclDevice): + raise TypeError( + f"Expected SyclDevice, got {type(other).__name__}" + ) + cdef SyclDevice other_dev = other + # Now can access other_dev._device_ref +``` diff --git a/.github/instructions/dpctl.instructions.md b/.github/instructions/dpctl.instructions.md new file mode 100644 index 0000000000..227da04bb2 --- /dev/null +++ b/.github/instructions/dpctl.instructions.md @@ -0,0 +1,135 @@ +--- +applyTo: + - "**/*.py" + - "**/*.pyx" + - "**/*.pxd" + - "**/*.pxi" + - "**/*.cpp" + - "**/*.hpp" + - "**/*.h" + - "**/*.c" +--- + +# DPCTL General Instructions + +## Project Context + +DPCTL (Data Parallel Control) is a Python SYCL binding library for heterogeneous computing. It provides: +- Python wrappers for SYCL runtime objects (devices, queues, contexts) +- Array API-compliant tensor operations (`dpctl.tensor`) +- USM memory management (`dpctl.memory`) +- Kernel compilation (`dpctl.program`) + +## License Header Requirement + +**All source files must include the Apache 2.0 license header.** + +### Python/Cython Files +```python +# Data Parallel Control (dpctl) +# +# Copyright 2020-2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +``` + +### C/C++ Files +```cpp +//===-- filename.hpp - Brief description -*-C++-*-===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// ... +//===----------------------------------------------------------------------===// +``` + +## Code Style + +### Python +- **Formatter:** Black (line length 80) +- **Import sorting:** isort +- **Linting:** flake8 +- **Max line length:** 80 characters + +### C/C++ +- **Formatter:** clang-format (LLVM-based) +- **Indent:** 4 spaces +- **Brace style:** Break before functions, classes, namespaces + +## Naming Conventions + +### Python/Cython +- Classes: `PascalCase` (e.g., `SyclDevice`, `SyclQueue`) +- Functions/methods: `snake_case` (e.g., `get_device_info`) +- Private: prefix with `_` (e.g., `_validate_queue`) +- Constants: `UPPER_SNAKE_CASE` + +### C API (libsyclinterface) +- Functions: `DPCTL_` (e.g., `DPCTLDevice_Create`) +- Types: `DPCTLRef` (e.g., `DPCTLSyclDeviceRef`) +- Macros: `DPCTL_` prefix + +### C++ Kernels +- Functors: `Functor` (e.g., `AddFunctor`) +- Kernel classes: `_krn` suffix +- Namespaces: `dpctl::tensor::kernels` + +## Error Handling + +### Python Exceptions +- `SyclDeviceCreationError` - Device creation failed +- `SyclQueueCreationError` - Queue creation failed +- `ExecutionPlacementError` - Queue mismatch between arrays +- `TypeError`, `ValueError` - Standard Python errors for invalid input + +### C API Error Handling +- Return `NULL` for failed object creation +- Use `error_handler_callback` for async errors +- Check return values before using + +## Device Compatibility + +**Warning:** Not all SYCL devices support all data types. + +- **fp64 (double):** Check `device.has_aspect_fp64` +- **fp16 (half):** Check `device.has_aspect_fp16` + +Always use `skip_if_dtype_not_supported()` in tests. + +## Common Patterns + +### Queue Validation +```python +def some_operation(x, y): + exec_q = dpctl.utils.get_execution_queue([x.sycl_queue, y.sycl_queue]) + if exec_q is None: + raise ExecutionPlacementError("...") +``` + +### Resource Cleanup +```python +cdef class SyclDevice: + cdef DPCTLSyclDeviceRef _device_ref + + def __dealloc__(self): + DPCTLDevice_Delete(self._device_ref) +``` + +### GIL Release for Blocking Operations +```cython +with nogil: + DPCTLEvent_Wait(event_ref) +``` diff --git a/.github/instructions/elementwise.instructions.md b/.github/instructions/elementwise.instructions.md new file mode 100644 index 0000000000..f747c6c926 --- /dev/null +++ b/.github/instructions/elementwise.instructions.md @@ -0,0 +1,415 @@ +--- +applyTo: + - "dpctl/tensor/libtensor/include/kernels/elementwise_functions/**" + - "dpctl/tensor/libtensor/source/elementwise_functions/**" + - "dpctl/tensor/_elementwise_funcs.py" + - "dpctl/tensor/_elementwise_common.py" + - "dpctl/tests/elementwise/**" +--- + +# Elementwise Operations Instructions + +## Context + +Elementwise operations span the full stack from C++ SYCL kernels to Python wrappers and tests. This guide covers the complete pattern for implementing elementwise operations. + +## Full Stack Overview + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ dpctl/tensor/_elementwise_funcs.py │ +│ Python wrapper using UnaryElementwiseFunc/BinaryElementwiseFunc│ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ dpctl/tensor/_tensor_impl (pybind11) │ +│ _abs, _abs_result_type, etc. │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ libtensor/source/elementwise_functions/abs.cpp │ +│ Dispatch tables and pybind11 bindings │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ libtensor/include/kernels/elementwise_functions/abs.hpp │ +│ SYCL kernel implementation (AbsFunctor) │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## Adding a New Elementwise Operation + +### Step 1: C++ Kernel Header + +Create `libtensor/include/kernels/elementwise_functions/myop.hpp`: + +```cpp +//===-- myop.hpp - Implementation of myop -*-C++-*-===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2025 Intel Corporation +// ... license ... +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +#include "common.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::myop +{ + +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +// Output type definition +template +struct MyOpOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + // ... map all supported input types to output types + td_ns::DefaultResultEntry>::result_type; +}; + +// Kernel functor +template +struct MyOpFunctor +{ + using supports_sg_loadstore = + typename std::negation>; + using supports_vec = + typename std::negation>; + + resT operator()(const argT &x) const + { + if constexpr (tu_ns::is_complex::value) { + // Handle complex types + return /* complex implementation */; + } + else { + return /* scalar implementation */; + } + } + + template + sycl::vec + operator()(const sycl::vec &x) const + { + // Vectorized implementation + return /* vectorized implementation */; + } +}; + +// Factory for contiguous arrays +template +struct MyOpContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename MyOpOutputType::value_type, void>) + { + return nullptr; // Unsupported type + } + else { + using dpctl::tensor::kernels::unary_contig_impl; + return unary_contig_impl; + } + } +}; + +// Factory for strided arrays +template +struct MyOpStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename MyOpOutputType::value_type, void>) + { + return nullptr; + } + else { + using dpctl::tensor::kernels::unary_strided_impl; + return unary_strided_impl; + } + } +}; + +// Type support check factory +template +struct MyOpTypeSupportFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename MyOpOutputType::value_type, void>) + { + return nullptr; + } + else { + return [](void) -> bool { return true; }; + } + } +}; + +} // namespace dpctl::tensor::kernels::myop +``` + +### Step 2: C++ Source File + +Create `libtensor/source/elementwise_functions/myop.cpp`: + +```cpp +//===----------- myop.cpp - Implementation of myop -*-C++-*-===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2025 Intel Corporation +// ... license ... +//===----------------------------------------------------------------------===// + +#include "myop.hpp" +#include "elementwise_functions.hpp" + +#include "dpctl4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +// Dispatch tables +static unary_contig_impl_fn_ptr_t + myop_contig_dispatch_table[td_ns::num_types]; + +static unary_strided_impl_fn_ptr_t + myop_strided_dispatch_table[td_ns::num_types]; + +void init_myop_dispatch_tables(void) +{ + using dpctl::tensor::kernels::myop::MyOpContigFactory; + using dpctl::tensor::kernels::myop::MyOpStridedFactory; + + td_ns::DispatchTableBuilder + dtb_contig; + dtb_contig.populate_dispatch_table(myop_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb_strided; + dtb_strided.populate_dispatch_table(myop_strided_dispatch_table); +} + +// Result type function +py::object myop_result_type(const py::dtype &input_dtype, + const py::object &device_obj) +{ + // Implementation using type lookup +} + +// Main function +std::pair +myop_func(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + // Dispatch to appropriate kernel based on src dtype +} + +void init_myop(py::module_ m) +{ + init_myop_dispatch_tables(); + + m.def("_myop", &myop_func, "MyOp implementation", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_myop_result_type", &myop_result_type); +} + +} // namespace dpctl::tensor::py_internal +``` + +### Step 3: Register in tensor_elementwise.cpp + +In `libtensor/source/tensor_elementwise.cpp`: + +```cpp +#include "elementwise_functions/myop.hpp" + +void init_elementwise_functions(py::module_ m) +{ + // ... existing operations ... + init_myop(m); +} +``` + +### Step 4: Python Wrapper + +In `dpctl/tensor/_elementwise_funcs.py`: + +```python +import dpctl.tensor._tensor_impl as ti +from ._elementwise_common import UnaryElementwiseFunc + +_myop_docstring_ = """ +myop(x, /, out=None, order="K") + +Computes myop element-wise. + +Args: + x (usm_ndarray): Input array. + out (usm_ndarray, optional): Output array to use. + order ({"K", "C", "F", "A"}, optional): Memory order for output. + +Returns: + usm_ndarray: Result array. + +See Also: + :func:`related_func`: Description. +""" + +myop = UnaryElementwiseFunc( + "myop", + ti._myop_result_type, + ti._myop, + _myop_docstring_, +) +``` + +### Step 5: Export in __init__.py + +In `dpctl/tensor/__init__.py`: + +```python +from ._elementwise_funcs import ( + # ... existing exports ... + myop, +) + +__all__ = [ + # ... existing exports ... + "myop", +] +``` + +### Step 6: Create Tests + +Create `dpctl/tests/elementwise/test_myop.py`: + +```python +import numpy as np +import pytest + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _all_dtypes, _usm_types + + +class TestMyOp: + @pytest.mark.parametrize("dtype", _all_dtypes) + def test_myop_dtype(self, dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.linspace(0, 1, 100, dtype=dtype, sycl_queue=q) + result = dpt.myop(x) + + expected = np.myop(dpt.asnumpy(x)) + assert np.allclose(dpt.asnumpy(result), expected) + + @pytest.mark.parametrize("usm_type", _usm_types) + def test_myop_usm_type(self, usm_type): + q = get_queue_or_skip() + + x = dpt.ones(100, dtype="f4", usm_type=usm_type, sycl_queue=q) + result = dpt.myop(x) + + assert result.usm_type == usm_type + + def test_myop_empty(self): + q = get_queue_or_skip() + x = dpt.empty((0,), dtype="f4", sycl_queue=q) + result = dpt.myop(x) + assert result.shape == (0,) + + def test_myop_scalar(self): + q = get_queue_or_skip() + x = dpt.asarray(5.0, sycl_queue=q) + result = dpt.myop(x) + assert result.ndim == 0 + + def test_myop_out(self): + q = get_queue_or_skip() + x = dpt.ones(100, dtype="f4", sycl_queue=q) + out = dpt.empty_like(x) + result = dpt.myop(x, out=out) + assert result is out + + @pytest.mark.parametrize("order", ["C", "F"]) + def test_myop_order(self, order): + q = get_queue_or_skip() + x = dpt.ones((10, 10), dtype="f4", order=order, sycl_queue=q) + result = dpt.myop(x, order=order) + + if order == "C": + assert result.flags.c_contiguous + else: + assert result.flags.f_contiguous +``` + +## Type Support Matrix + +Common type mappings for unary operations: + +| Operation | Input Types | Output Type | +|-----------|-------------|-------------| +| abs | int, float | same | +| abs | complex | real component type | +| negative | int, float, complex | same | +| sin, cos, etc. | float, complex | same | +| isnan, isinf | float, complex | bool | +| sign | int, float, complex | same | + +## Binary Operation Pattern + +For binary operations, use `BinaryElementwiseFunc`: + +```python +from ._elementwise_common import BinaryElementwiseFunc + +add = BinaryElementwiseFunc( + "add", + ti._add_result_type, + ti._add, + _add_docstring_, + # Binary operations may need additional parameters + ti._add_inplace, # In-place variant +) +``` + +## Vectorization Notes + +- `supports_sg_loadstore` - Can use sub-group load/store +- `supports_vec` - Can use sycl::vec for vectorization +- Complex types typically don't support vectorization +- Check device capabilities for half-precision (fp16) diff --git a/.github/instructions/libsyclinterface.instructions.md b/.github/instructions/libsyclinterface.instructions.md new file mode 100644 index 0000000000..9aa23c09e1 --- /dev/null +++ b/.github/instructions/libsyclinterface.instructions.md @@ -0,0 +1,290 @@ +--- +applyTo: + - "libsyclinterface/**/*.h" + - "libsyclinterface/**/*.hpp" + - "libsyclinterface/**/*.cpp" + - "libsyclinterface/**/*.c" +--- + +# C API Layer Instructions (libsyclinterface) + +## Context + +The libsyclinterface directory provides a C API wrapper around SYCL C++ runtime objects. This enables language interoperability with Python (via Cython) and other languages. + +## Directory Structure + +``` +libsyclinterface/ +├── include/syclinterface/ +│ ├── Support/ +│ │ ├── DllExport.h # DLL export macros +│ │ ├── ExternC.h # extern "C" wrapper macros +│ │ └── MemOwnershipAttrs.h # Memory ownership annotations +│ ├── dpctl_sycl_context_interface.h +│ ├── dpctl_sycl_device_interface.h +│ ├── dpctl_sycl_device_manager.h +│ ├── dpctl_sycl_device_selector_interface.h +│ ├── dpctl_sycl_event_interface.h +│ ├── dpctl_sycl_kernel_bundle_interface.h +│ ├── dpctl_sycl_kernel_interface.h +│ ├── dpctl_sycl_platform_interface.h +│ ├── dpctl_sycl_platform_manager.h +│ ├── dpctl_sycl_queue_interface.h +│ ├── dpctl_sycl_queue_manager.h +│ ├── dpctl_sycl_usm_interface.h +│ └── dpctl_sycl_types.h +├── helper/ +│ ├── include/ +│ └── source/ +├── source/ # Implementation files +└── tests/ # C interface tests +``` + +## Function Naming Convention + +``` +DPCTL_ +``` + +### Examples +```c +DPCTLDevice_Create() +DPCTLDevice_Delete() +DPCTLDevice_GetName() +DPCTLDevice_GetVendor() +DPCTLDevice_HasAspect() + +DPCTLQueue_Create() +DPCTLQueue_Delete() +DPCTLQueue_Submit() +DPCTLQueue_Memcpy() +DPCTLQueue_Wait() + +DPCTLContext_Create() +DPCTLContext_Delete() +DPCTLContext_GetDeviceCount() +``` + +## Memory Ownership Annotations + +Defined in `Support/MemOwnershipAttrs.h`: + +| Annotation | Meaning | Caller Action | +|------------|---------|---------------| +| `__dpctl_give` | Function returns ownership | Caller must eventually free | +| `__dpctl_take` | Function takes ownership | Caller must not use after call | +| `__dpctl_keep` | Function only observes | Caller retains ownership | +| `__dpctl_null` | NULL is valid | Check for NULL returns | + +### Usage Examples + +```c +// Caller receives ownership - must call Delete later +DPCTL_API +__dpctl_give DPCTLSyclContextRef +DPCTLContext_Create(__dpctl_keep const DPCTLSyclDeviceRef DRef, + error_handler_callback *handler, + int properties); + +// Function takes ownership - don't use CRef after this +DPCTL_API +void DPCTLContext_Delete(__dpctl_take DPCTLSyclContextRef CRef); + +// Function only observes - QRef still valid after call +DPCTL_API +size_t DPCTLQueue_GetBackend(__dpctl_keep const DPCTLSyclQueueRef QRef); + +// May return NULL on failure +DPCTL_API +__dpctl_give __dpctl_null DPCTLSyclDeviceRef +DPCTLDevice_Create(__dpctl_keep DPCTLSyclDeviceSelectorRef DSRef); +``` + +## Opaque Pointer Types + +```c +// Forward declarations - actual struct defined in implementation +typedef struct DPCTLOpaqueSyclContext *DPCTLSyclContextRef; +typedef struct DPCTLOpaqueSyclDevice *DPCTLSyclDeviceRef; +typedef struct DPCTLOpaqueSyclEvent *DPCTLSyclEventRef; +typedef struct DPCTLOpaqueSyclKernel *DPCTLSyclKernelRef; +typedef struct DPCTLOpaqueSyclPlatform *DPCTLSyclPlatformRef; +typedef struct DPCTLOpaqueSyclQueue *DPCTLSyclQueueRef; +``` + +## Extern C Wrapper + +Use macros from `Support/ExternC.h`: + +```c +#include "Support/ExternC.h" + +DPCTL_C_EXTERN_C_BEGIN + +// All C API declarations here +DPCTL_API +__dpctl_give DPCTLSyclDeviceRef +DPCTLDevice_Create(...); + +DPCTL_C_EXTERN_C_END +``` + +## DLL Export + +Use `DPCTL_API` macro from `Support/DllExport.h`: + +```c +DPCTL_API +__dpctl_give DPCTLSyclContextRef +DPCTLContext_Create(...); +``` + +## Error Handling + +### Error Callback + +```c +// Type definition +typedef void error_handler_callback(int err_code); + +// Usage in async error handling +DPCTL_API +__dpctl_give DPCTLSyclQueueRef +DPCTLQueue_Create(__dpctl_keep DPCTLSyclContextRef CRef, + __dpctl_keep DPCTLSyclDeviceRef DRef, + error_handler_callback *handler, + int properties); +``` + +### NULL Return Convention + +Functions that create objects return `NULL` on failure: + +```c +DPCTLSyclDeviceRef dref = DPCTLDevice_Create(selector); +if (dref == NULL) { + // Handle error +} +``` + +## Header File Template + +```c +//===-- dpctl_sycl_foo_interface.h - C API for Foo -*-C-*-===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// ... +//===----------------------------------------------------------------------===// +/// +/// \file +/// This header declares a C API for sycl::foo class. +/// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "Support/DllExport.h" +#include "Support/ExternC.h" +#include "Support/MemOwnershipAttrs.h" +#include "dpctl_sycl_types.h" + +DPCTL_C_EXTERN_C_BEGIN + +/** + * @defgroup FooInterface Foo class C wrapper + */ + +/*! + * @brief Creates a new Foo object. + * + * @param param1 Description of param1. + * @param param2 Description of param2. + * @return A new DPCTLSyclFooRef on success, NULL on failure. + * @ingroup FooInterface + */ +DPCTL_API +__dpctl_give DPCTLSyclFooRef +DPCTLFoo_Create(__dpctl_keep DPCTLSyclBarRef BRef); + +/*! + * @brief Deletes a Foo object. + * + * @param FRef The Foo reference to delete. + * @ingroup FooInterface + */ +DPCTL_API +void DPCTLFoo_Delete(__dpctl_take DPCTLSyclFooRef FRef); + +DPCTL_C_EXTERN_C_END +``` + +## Implementation Pattern + +```cpp +// In source/dpctl_sycl_foo_interface.cpp + +#include "dpctl_sycl_foo_interface.h" +#include + +namespace +{ +// Internal helpers in anonymous namespace +struct DPCTLOpaqueSyclFoo +{ + sycl::foo *ptr; +}; + +inline sycl::foo *unwrap(DPCTLSyclFooRef ref) +{ + return ref ? ref->ptr : nullptr; +} + +inline DPCTLSyclFooRef wrap(sycl::foo *ptr) +{ + if (ptr) { + auto ref = new DPCTLOpaqueSyclFoo; + ref->ptr = ptr; + return ref; + } + return nullptr; +} +} // namespace + +__dpctl_give DPCTLSyclFooRef +DPCTLFoo_Create(__dpctl_keep DPCTLSyclBarRef BRef) +{ + try { + auto bar = unwrap(BRef); + if (!bar) return nullptr; + + auto foo = new sycl::foo(*bar); + return wrap(foo); + } + catch (const std::exception &e) { + // Log error + return nullptr; + } +} + +void DPCTLFoo_Delete(__dpctl_take DPCTLSyclFooRef FRef) +{ + if (FRef) { + delete FRef->ptr; + delete FRef; + } +} +``` + +## Best Practices + +1. **Always use ownership annotations** - Every function parameter and return value should have appropriate annotation +2. **Check for NULL** - Always validate input pointers before use +3. **Use try/catch** - Catch C++ exceptions and return NULL or error codes +4. **Wrap/unwrap consistently** - Use helper functions for opaque pointer conversion +5. **Document with Doxygen** - Every public function needs documentation +6. **Thread safety** - Document thread safety guarantees diff --git a/.github/instructions/libtensor-cpp.instructions.md b/.github/instructions/libtensor-cpp.instructions.md new file mode 100644 index 0000000000..bbb695b68d --- /dev/null +++ b/.github/instructions/libtensor-cpp.instructions.md @@ -0,0 +1,316 @@ +--- +applyTo: + - "dpctl/tensor/libtensor/**/*.hpp" + - "dpctl/tensor/libtensor/**/*.cpp" +--- + +# C++ SYCL Kernel Instructions (libtensor) + +## Context + +The libtensor directory contains C++ SYCL kernel implementations for tensor operations. These kernels are called from Python via pybind11 bindings. + +## Directory Structure + +``` +libtensor/ +├── include/ +│ ├── kernels/ +│ │ ├── elementwise_functions/ # Unary/binary operations +│ │ │ ├── common.hpp # Base iteration patterns +│ │ │ ├── common_inplace.hpp # In-place operation patterns +│ │ │ ├── add.hpp, sin.hpp, etc. +│ │ ├── linalg_functions/ # Linear algebra (gemm, dot) +│ │ ├── sorting/ # Sort, argsort, searchsorted +│ │ └── reductions.hpp # Reduction operations +│ └── utils/ +│ ├── type_dispatch.hpp # Type lookup tables +│ ├── type_dispatch_building.hpp # Dispatch table generation +│ ├── offset_utils.hpp # Stride calculations +│ └── sycl_alloc_utils.hpp # USM allocation helpers +└── source/ + ├── elementwise_functions/ # Implementation files + ├── reductions/ + ├── sorting/ + └── tensor_*.cpp # pybind11 entry points +``` + +## File Header Format + +```cpp +//===-- operation.hpp - Brief description -*-C++-*-===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// ... +//===----------------------------------------------------------------------===// +/// +/// \file +/// Detailed description of the file's purpose and contents. +/// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +// ... other includes +``` + +## Type Dispatch System + +### Type Enumeration + +```cpp +// In type_dispatch_building.hpp +enum class typenum_t : int { + BOOL = 0, + INT8, UINT8, INT16, UINT16, INT32, UINT32, + INT64, UINT64, HALF, FLOAT, DOUBLE, + CFLOAT, CDOUBLE, +}; +inline constexpr int num_types = 14; +``` + +### Dispatch Table Pattern + +```cpp +// 2D dispatch table: output_type_fn_ptr_t[dst_type][src_type] +template typename factory, + int _num_types> +class DispatchTableBuilder +{ +public: + DispatchTableBuilder() = default; + + void populate_dispatch_table(funcPtrT table[][_num_types]) const + { + // Populates table with type-specific function pointers + // using the factory template + } +}; +``` + +### Using Dispatch Tables + +```cpp +// In operation.cpp +static unary_contig_impl_fn_ptr_t abs_contig_dispatch_table[num_types]; + +void init_abs_dispatch_tables() +{ + using dpctl::tensor::kernels::abs::AbsContigFactory; + + DispatchTableBuilder + dtb; + dtb.populate_dispatch_table(abs_contig_dispatch_table); +} +``` + +## Kernel Functor Pattern + +### Unary Operation + +```cpp +template +struct AbsFunctor +{ + // Type traits for optimization + using is_constant = std::false_type; + using supports_sg_loadstore = std::true_type; + using supports_vec = std::true_type; + + // Scalar operation + resT operator()(const argT &x) const + { + if constexpr (std::is_same_v) { + return x; + } + else if constexpr (is_complex::value) { + return cabs(x); + } + else if constexpr (std::is_unsigned_v) { + return x; + } + else { + return (x >= argT(0)) ? x : -x; + } + } + + // Vectorized operation (when supports_vec is true) + template + sycl::vec + operator()(const sycl::vec &x) const + { + return sycl::abs(x); + } +}; +``` + +### Binary Operation + +```cpp +template +struct AddFunctor +{ + using supports_sg_loadstore = std::negation< + std::disjunction, is_complex>>; + using supports_vec = std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT1 &x, const argT2 &y) const + { + if constexpr (is_complex::value || is_complex::value) { + using rT1 = typename argT1::value_type; + using rT2 = typename argT2::value_type; + return exprm_ns::complex(x) + exprm_ns::complex(y); + } + else { + return x + y; + } + } + + template + sycl::vec + operator()(const sycl::vec &x, + const sycl::vec &y) const + { + return x + y; + } +}; +``` + +## Kernel Naming + +Kernel class names must be unique across the entire codebase: + +```cpp +template +class abs_contig_kernel; // Unique name + +template +class abs_strided_kernel; // Different name for different pattern +``` + +## Type Support Matrix + +Define which type combinations are supported: + +```cpp +template +struct AbsOutputType +{ + // Default: not defined (unsupported) +}; + +template <> +struct AbsOutputType +{ + using value_type = float; +}; + +template <> +struct AbsOutputType, void> +{ + using value_type = float; // abs of complex returns real +}; +``` + +## Factory Pattern + +```cpp +template +struct AbsContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename AbsOutputType::value_type, + void>) + { + // Unsupported type combination + return nullptr; + } + else { + return abs_contig_impl; + } + } +}; +``` + +## Memory Patterns + +### Contiguous Arrays + +```cpp +template +void process_contig(sycl::queue &q, + size_t nelems, + const T *src, + T *dst, + const std::vector &depends) +{ + sycl::event e = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for(sycl::range<1>(nelems), + [=](sycl::id<1> id) { + dst[id] = process(src[id]); + }); + }); +} +``` + +### Strided Arrays + +```cpp +template +void process_strided(sycl::queue &q, + int nd, + const ssize_t *shape, + const T *src, + ssize_t src_offset, + const ssize_t *src_strides, + T *dst, + ssize_t dst_offset, + const ssize_t *dst_strides) +{ + // Use indexing utilities from offset_utils.hpp + using dpctl::tensor::offset_utils::StridedIndexer; + + StridedIndexer src_indexer(nd, src_offset, shape, src_strides); + StridedIndexer dst_indexer(nd, dst_offset, shape, dst_strides); + + // Submit kernel using indexers +} +``` + +## Common Utilities + +### offset_utils.hpp +- `StridedIndexer` - Convert linear index to strided offset +- `UnpackedStridedIndexer` - Optimized for specific dimensions +- `TwoOffsets_StridedIndexer` - For binary operations + +### sycl_alloc_utils.hpp +- `usm_allocator` - SYCL USM allocator wrapper +- Temporary allocation helpers + +### type_utils.hpp +- `is_complex` - Check if type is complex +- Type promotion utilities + +## Best Practices + +1. **Use `if constexpr`** for compile-time type branching +2. **Mark kernel classes** with unique names +3. **Support vectorization** where possible via `sycl::vec` +4. **Handle complex types** separately (no vectorization) +5. **Check device capabilities** before using fp64/fp16 +6. **Use dispatch tables** for runtime type resolution +7. **Avoid raw new/delete** - use SYCL allocators diff --git a/.github/instructions/memory.instructions.md b/.github/instructions/memory.instructions.md new file mode 100644 index 0000000000..1be336aa23 --- /dev/null +++ b/.github/instructions/memory.instructions.md @@ -0,0 +1,279 @@ +--- +applyTo: + - "dpctl/memory/**" + - "dpctl/memory/*.py" + - "dpctl/memory/*.pyx" + - "dpctl/memory/*.pxd" + - "**/usm*.py" + - "**/test_sycl_usm*.py" +--- + +# USM Memory Instructions + +## Context + +The `dpctl.memory` module provides Python classes for SYCL Unified Shared Memory (USM) allocation and management. USM allows allocating memory that can be accessed by both host and device. + +## USM Types + +| Type | Class | Description | +|------|-------|-------------| +| Device | `MemoryUSMDevice` | Memory on device only, fastest device access | +| Shared | `MemoryUSMShared` | Accessible from host and device, automatic migration | +| Host | `MemoryUSMHost` | Host memory accessible from device | + +### When to Use Each Type + +- **Device**: Best performance for device-only data +- **Shared**: Convenient for data accessed by both host and device +- **Host**: When data primarily lives on host but device needs access + +## Memory Classes + +### MemoryUSMDevice + +```python +import dpctl +from dpctl.memory import MemoryUSMDevice + +q = dpctl.SyclQueue() + +# Allocate 1024 bytes of device memory +mem = MemoryUSMDevice(1024, queue=q) + +# Properties +mem.nbytes # Size in bytes +mem.sycl_queue # Associated queue +mem.sycl_device # Associated device +mem.sycl_context # Associated context + +# Copy from host +import numpy as np +host_data = np.array([1, 2, 3, 4], dtype=np.float32) +mem.copy_from_host(host_data.view(np.uint8)) + +# Copy to host +result = np.empty(4, dtype=np.float32) +mem.copy_to_host(result.view(np.uint8)) +``` + +### MemoryUSMShared + +```python +from dpctl.memory import MemoryUSMShared + +# Shared memory - accessible from both host and device +mem = MemoryUSMShared(1024, queue=q) + +# Can be used similarly to MemoryUSMDevice +``` + +### MemoryUSMHost + +```python +from dpctl.memory import MemoryUSMHost + +# Host memory with device accessibility +mem = MemoryUSMHost(1024, queue=q) +``` + +## __sycl_usm_array_interface__ + +All memory classes implement the `__sycl_usm_array_interface__` protocol for interoperability: + +```python +# The interface is a dictionary +interface = mem.__sycl_usm_array_interface__ + +# Structure: +{ + "data": (pointer, readonly_flag), # Tuple of (int, bool) + "shape": (nbytes,), # Tuple of sizes + "strides": None, # Strides (None for contiguous) + "typestr": "|u1", # Data type string (bytes) + "version": 1, # Interface version + "syclobj": queue # Associated SYCL queue +} +``` + +### Using the Interface + +```python +def accepts_usm_memory(obj): + """Accept any object with __sycl_usm_array_interface__.""" + if hasattr(obj, "__sycl_usm_array_interface__"): + iface = obj.__sycl_usm_array_interface__ + ptr, readonly = iface["data"] + nbytes = iface["shape"][0] + queue = iface["syclobj"] + return ptr, nbytes, queue + raise TypeError("Object does not support __sycl_usm_array_interface__") +``` + +## Memory Lifetime Rules + +### Rule 1: Memory is Queue-Bound + +Memory allocations are associated with a specific queue/context: + +```python +q1 = dpctl.SyclQueue("gpu:0") +q2 = dpctl.SyclQueue("gpu:1") + +mem1 = MemoryUSMDevice(1024, queue=q1) + +# Cannot directly use mem1 with q2 if different context +# Must copy through host or peer-to-peer if supported +``` + +### Rule 2: Memory Outlives Operations + +Ensure memory remains valid until all operations complete: + +```python +# BAD - memory might be freed before kernel finishes +def bad_example(): + mem = MemoryUSMDevice(1024, queue=q) + submit_kernel(mem) # Async operation + # mem goes out of scope - danger! + +# GOOD - wait for completion +def good_example(): + mem = MemoryUSMDevice(1024, queue=q) + event = submit_kernel(mem) + event.wait() # Ensure completion before mem freed +``` + +### Rule 3: View Keeps Base Alive + +Creating views extends lifetime: + +```python +base = MemoryUSMDevice(1024, queue=q) +view = base[100:200] # View into base + +# base remains valid as long as view exists +del base # view still holds reference +``` + +## Cython Memory Management + +### Allocation Pattern + +```cython +cdef class MemoryUSMDevice: + cdef DPCTLSyclUSMRef _memory_ref + cdef size_t _nbytes + cdef SyclQueue _queue + + def __cinit__(self, size_t nbytes, SyclQueue queue): + cdef DPCTLSyclQueueRef qref = queue.get_queue_ref() + + self._memory_ref = DPCTLmalloc_device(nbytes, qref) + if self._memory_ref is NULL: + raise MemoryError(f"Failed to allocate {nbytes} bytes") + + self._nbytes = nbytes + self._queue = queue + + def __dealloc__(self): + if self._memory_ref is not NULL: + DPCTLfree_with_queue( + self._memory_ref, + self._queue.get_queue_ref() + ) +``` + +### Copy Operations + +```cython +def copy_from_host(self, host_buffer): + """Copy data from host to device memory.""" + cdef unsigned char[::1] buf = host_buffer + + if buf.shape[0] > self._nbytes: + raise ValueError("Source buffer larger than allocation") + + cdef DPCTLSyclEventRef event = DPCTLQueue_Memcpy( + self._queue.get_queue_ref(), + self._memory_ref, + &buf[0], + buf.shape[0] + ) + + with nogil: + DPCTLEvent_Wait(event) + DPCTLEvent_Delete(event) +``` + +## Testing Memory Operations + +```python +import pytest +import numpy as np +from dpctl.memory import MemoryUSMDevice, MemoryUSMShared, MemoryUSMHost +from dpctl.tests.helper import get_queue_or_skip + + +class TestUSMMemory: + @pytest.mark.parametrize("mem_class", [ + MemoryUSMDevice, MemoryUSMShared, MemoryUSMHost + ]) + def test_allocation(self, mem_class): + q = get_queue_or_skip() + mem = mem_class(1024, queue=q) + assert mem.nbytes == 1024 + assert mem.sycl_queue == q + + @pytest.mark.parametrize("mem_class", [ + MemoryUSMDevice, MemoryUSMShared, MemoryUSMHost + ]) + def test_copy_roundtrip(self, mem_class): + q = get_queue_or_skip() + + src = np.array([1, 2, 3, 4], dtype=np.float32) + mem = mem_class(src.nbytes, queue=q) + + mem.copy_from_host(src.view(np.uint8)) + + dst = np.empty_like(src) + mem.copy_to_host(dst.view(np.uint8)) + + assert np.array_equal(src, dst) + + def test_interface(self): + q = get_queue_or_skip() + mem = MemoryUSMDevice(1024, queue=q) + + iface = mem.__sycl_usm_array_interface__ + assert "data" in iface + assert "shape" in iface + assert iface["shape"] == (1024,) + assert iface["syclobj"] == q +``` + +## Integration with usm_ndarray + +`dpctl.tensor.usm_ndarray` uses USM memory internally: + +```python +import dpctl.tensor as dpt + +# Creates usm_ndarray with underlying USM allocation +x = dpt.ones((100, 100), dtype=dpt.float32, usm_type="device") + +# Access memory properties +x.usm_type # "device", "shared", or "host" +x.sycl_queue # Associated queue + +# Get raw memory (if needed) +# Note: usm_ndarray manages its own memory +``` + +## Best Practices + +1. **Use usm_ndarray for arrays** - Prefer `dpctl.tensor` over raw memory +2. **Match queues** - Ensure arrays/memory share compatible queues +3. **Wait before access** - Wait for async operations before host access +4. **Use appropriate USM type** - Device for performance, shared for convenience +5. **Check allocation success** - Handle MemoryError for large allocations diff --git a/.github/instructions/tensor-python.instructions.md b/.github/instructions/tensor-python.instructions.md new file mode 100644 index 0000000000..f294321ffc --- /dev/null +++ b/.github/instructions/tensor-python.instructions.md @@ -0,0 +1,261 @@ +--- +applyTo: + - "dpctl/tensor/*.py" + - "dpctl/tensor/**/*.py" +--- + +# Tensor Python Code Instructions + +## Context + +The `dpctl.tensor` module provides Array API-compliant tensor operations. It wraps C++ SYCL kernels with Python interfaces that follow the Python Array API standard. + +## Key Files + +| File | Purpose | +|------|---------| +| `_usmarray.pyx` | `usm_ndarray` extension type | +| `_elementwise_funcs.py` | Elementwise operation wrappers | +| `_elementwise_common.py` | Base classes for elementwise ops | +| `_reduction.py` | Reduction operations | +| `_manipulation_functions.py` | reshape, stack, concat, etc. | +| `_ctors.py` | Array constructors | +| `_copy_utils.py` | Copy and cast utilities | +| `_type_utils.py` | Type promotion logic | +| `_device.py` | Device object for Array API | + +## Array API Compliance + +DPCTL tensor aims for Array API compatibility. Key requirements: + +1. **Immutable dtype and device** - Arrays don't change type or device implicitly +2. **Explicit data movement** - Use `asarray(x, device=...)` for device transfers +3. **Type promotion rules** - Follow Array API promotion table +4. **No implicit broadcasting** - Some operations require explicit broadcast + +## Queue Validation Pattern + +All operations must validate that input arrays share a compatible execution context: + +```python +def some_operation(x, y): + """Example binary operation.""" + if not isinstance(x, usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(x)}") + if not isinstance(y, usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(y)}") + + # Get common queue (returns None if incompatible) + exec_q = dpctl.utils.get_execution_queue( + [x.sycl_queue, y.sycl_queue] + ) + if exec_q is None: + raise ExecutionPlacementError( + "Input arrays must share the same SYCL queue or be on " + "compatible queues." + ) + + # Proceed with operation + return _impl(x, y, sycl_queue=exec_q) +``` + +## Order Manager + +Use `OrderManager` for handling memory order (C/F contiguous): + +```python +from dpctl.utils import OrderManager + +def operation_with_order(x, order="K"): + """ + Args: + order: Memory order for output. + - "C": C-contiguous (row-major) + - "F": Fortran-contiguous (column-major) + - "A": F if input is F-contiguous, else C + - "K": Keep input order + """ + om = OrderManager(order) + + # Determine output order based on input + out_order = om.get_order(x) + + # Create output array with determined order + result = empty(x.shape, dtype=x.dtype, order=out_order, + sycl_queue=x.sycl_queue) + return result +``` + +## Elementwise Operation Wrapper + +```python +# In _elementwise_funcs.py + +from ._elementwise_common import ( + UnaryElementwiseFunc, + BinaryElementwiseFunc, +) +import dpctl.tensor._tensor_impl as ti + +_abs_docstring_ = """ +abs(x, /, out=None, order="K") + +Computes the absolute value element-wise. + +Args: + x (usm_ndarray): Input array. + out (usm_ndarray, optional): Output array. + order ({"K", "C", "F", "A"}): Memory order of output. + +Returns: + usm_ndarray: Absolute values. +""" + +abs = UnaryElementwiseFunc( + "abs", # Operation name + ti._abs_result_type, # Result type inference function + ti._abs, # Kernel implementation + _abs_docstring_ # Docstring +) +``` + +## Docstring Format + +Follow NumPy-style docstrings: + +```python +def reshape(x, /, shape, *, order="C", copy=None): + """ + reshape(x, shape, order="C", copy=None) + + Gives a new shape to an array without changing its data. + + Args: + x (usm_ndarray): + Input array to reshape. + shape (tuple of ints): + New shape. One dimension may be -1, which is inferred. + order ({"C", "F"}, optional): + Read elements in this order. Default: "C". + copy (bool, optional): + If True, always copy. If False, never copy (raise if needed). + If None, copy only when necessary. Default: None. + + Returns: + usm_ndarray: + Reshaped array. May be a view or a copy. + + Raises: + ValueError: + If the new shape is incompatible with the original shape. + TypeError: + If input is not a usm_ndarray. + + Examples: + >>> import dpctl.tensor as dpt + >>> x = dpt.arange(6) + >>> dpt.reshape(x, (2, 3)) + usm_ndarray([[0, 1, 2], + [3, 4, 5]]) + """ +``` + +## Type Utilities + +```python +from ._type_utils import ( + _to_device_supported_dtype, + _resolve_one_strong_one_weak_types, + _resolve_one_strong_two_weak_types, +) + +def some_operation(x, scalar): + # Resolve type when mixing array and scalar + res_dtype = _resolve_one_strong_one_weak_types( + x.dtype, + type(scalar), + x.sycl_device + ) + + # Ensure dtype is supported on device + res_dtype = _to_device_supported_dtype(res_dtype, x.sycl_device) +``` + +## Common Imports + +```python +import dpctl +import dpctl.tensor as dpt +from dpctl.tensor import usm_ndarray +from dpctl.tensor._type_utils import _to_device_supported_dtype +from dpctl.utils import ExecutionPlacementError +import dpctl.tensor._tensor_impl as ti # C++ bindings + +import numpy as np +from numpy.core.numeric import normalize_axis_tuple +``` + +## Error Messages + +Provide clear, actionable error messages: + +```python +# Good +raise TypeError( + f"Expected usm_ndarray for argument 'x', got {type(x).__name__}" +) + +# Good +raise ValueError( + f"Cannot reshape array of size {x.size} into shape {shape}" +) + +# Good +raise ExecutionPlacementError( + "Input arrays are allocated on incompatible devices. " + "Use dpctl.tensor.asarray(x, device=target_device) to copy." +) +``` + +## Output Array Pattern + +Many operations accept an optional `out` parameter: + +```python +def operation(x, /, out=None, order="K"): + # Validate input + if not isinstance(x, usm_ndarray): + raise TypeError(...) + + # Determine output properties + res_dtype = _compute_result_type(x.dtype) + res_shape = x.shape + + if out is not None: + # Validate output array + if not isinstance(out, usm_ndarray): + raise TypeError("out must be usm_ndarray") + if out.shape != res_shape: + raise ValueError(f"out shape {out.shape} != {res_shape}") + if out.dtype != res_dtype: + raise ValueError(f"out dtype {out.dtype} != {res_dtype}") + if out.sycl_queue != x.sycl_queue: + raise ValueError("out must be on same queue as input") + else: + # Create output array + out = empty(res_shape, dtype=res_dtype, order=order, + sycl_queue=x.sycl_queue) + + # Call implementation + _impl(x, out) + return out +``` + +## Best Practices + +1. **Validate inputs early** - Check types and shapes before computation +2. **Use get_execution_queue** - Ensure queue compatibility +3. **Support order parameter** - Allow C/F/A/K memory orders +4. **Return same queue** - Output arrays on same queue as inputs +5. **Follow Array API** - Match standard function signatures +6. **Document thoroughly** - Include types, shapes, exceptions in docstrings diff --git a/.github/instructions/testing.instructions.md b/.github/instructions/testing.instructions.md new file mode 100644 index 0000000000..ce02e675c5 --- /dev/null +++ b/.github/instructions/testing.instructions.md @@ -0,0 +1,257 @@ +--- +applyTo: + - "dpctl/tests/**/*.py" + - "**/test_*.py" + - "**/*_test.py" +--- + +# Testing Instructions + +## Context + +DPCTL uses pytest for testing. Tests must handle device availability, dtype support variations, and multiple memory configurations. + +## Test Location + +``` +dpctl/tests/ +├── conftest.py # Fixtures and pytest configuration +├── helper/ # Test helper utilities +│ └── _helper.py +├── elementwise/ # Elementwise operation tests +│ ├── utils.py # Shared dtype lists +│ └── test_*.py +├── test_sycl_*.py # Core SYCL object tests +├── test_tensor_*.py # Tensor operation tests +└── test_usm_ndarray_*.py # Array tests +``` + +## Essential Fixtures and Helpers + +### Device/Queue Creation + +```python +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +def test_something(): + # Skip test if no device available + q = get_queue_or_skip() + + # Skip if specific device type needed + q = get_queue_or_skip(("gpu",)) +``` + +### Dtype Support Check + +```python +def test_with_dtype(): + q = get_queue_or_skip() + dtype = "f8" # float64 + + # Skip if device doesn't support this dtype + skip_if_dtype_not_supported(dtype, q) + + # Test implementation + x = dpt.ones(10, dtype=dtype, sycl_queue=q) +``` + +## Standard Dtype Lists + +From `dpctl/tests/elementwise/utils.py`: + +```python +# Integer types (always supported) +_integral_dtypes = ["i1", "u1", "i2", "u2", "i4", "u4", "i8", "u8"] + +# Floating point types (may need device support check) +_real_fp_dtypes = ["f2", "f4", "f8"] # f8 (float64) needs fp64 support + +# Complex types +_complex_fp_dtypes = ["c8", "c16"] # c16 needs fp64 support + +# All numeric types +_all_dtypes = _integral_dtypes + _real_fp_dtypes + _complex_fp_dtypes + +# Boolean +_boolean_dtypes = ["?"] + +# USM allocation types +_usm_types = ["device", "shared", "host"] + +# Memory orders +_orders = ["C", "F", "A", "K"] +``` + +## Test Patterns + +### Basic Parametrized Test + +```python +import pytest +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +@pytest.mark.parametrize("dtype", ["f4", "f8", "c8", "c16"]) +def test_operation(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.linspace(0, 1, 100, dtype=dtype, sycl_queue=q) + result = dpt.some_operation(x) + + expected = np.some_operation(dpt.asnumpy(x)) + assert_allclose(dpt.asnumpy(result), expected) +``` + +### Dtype Matrix Test (Binary Operations) + +```python +@pytest.mark.parametrize("dtype1", _all_dtypes) +@pytest.mark.parametrize("dtype2", _all_dtypes) +def test_binary_dtype_matrix(dtype1, dtype2): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype1, q) + skip_if_dtype_not_supported(dtype2, q) + + x = dpt.ones(10, dtype=dtype1, sycl_queue=q) + y = dpt.ones(10, dtype=dtype2, sycl_queue=q) + + result = dpt.add(x, y) + + # Verify result dtype matches NumPy promotion + expected_dtype = np.result_type(np.dtype(dtype1), np.dtype(dtype2)) + assert result.dtype == expected_dtype +``` + +### USM Type Test + +```python +@pytest.mark.parametrize("usm_type", ["device", "shared", "host"]) +def test_usm_types(usm_type): + q = get_queue_or_skip() + + x = dpt.ones(10, dtype="f4", usm_type=usm_type, sycl_queue=q) + result = dpt.abs(x) + + assert result.usm_type == usm_type +``` + +### Memory Order Test + +```python +@pytest.mark.parametrize("order", ["C", "F"]) +def test_memory_order(order): + q = get_queue_or_skip() + + x = dpt.ones((10, 10), dtype="f4", order=order, sycl_queue=q) + + if order == "C": + assert x.flags.c_contiguous + else: + assert x.flags.f_contiguous +``` + +### Edge Case Tests + +```python +def test_empty_array(): + q = get_queue_or_skip() + x = dpt.empty((0,), dtype="f4", sycl_queue=q) + result = dpt.abs(x) + assert result.shape == (0,) + +def test_scalar(): + q = get_queue_or_skip() + x = dpt.asarray(5.0, dtype="f4", sycl_queue=q) + result = dpt.abs(x) + assert result.ndim == 0 + +def test_broadcast(): + q = get_queue_or_skip() + x = dpt.ones((10, 1), dtype="f4", sycl_queue=q) + y = dpt.ones((1, 10), dtype="f4", sycl_queue=q) + result = dpt.add(x, y) + assert result.shape == (10, 10) +``` + +## Test Naming Convention + +```python +# Function being tested: dpt.abs() + +def test_abs_basic(): + """Basic functionality test""" + +def test_abs_dtype_matrix(): + """Test all dtype combinations""" + +def test_abs_usm_types(): + """Test all USM allocation types""" + +def test_abs_order(): + """Test memory order preservation""" + +def test_abs_empty(): + """Test with empty array""" + +def test_abs_scalar(): + """Test with 0-d array""" + +def test_abs_broadcast(): + """Test broadcasting behavior""" + +def test_abs_out_param(): + """Test output array parameter""" + +def test_abs_inplace(): + """Test in-place operation (if supported)""" +``` + +## Assertions + +```python +import numpy as np +from numpy.testing import assert_allclose, assert_array_equal + +# For exact integer comparisons +assert_array_equal(dpt.asnumpy(result), expected) + +# For floating point with tolerance +assert_allclose(dpt.asnumpy(result), expected, rtol=1e-5, atol=1e-8) + +# For dtype checking +assert result.dtype == dpt.float32 + +# For shape checking +assert result.shape == (10, 10) + +# For USM type +assert result.usm_type == "device" + +# For queue equality +assert result.sycl_queue == x.sycl_queue +``` + +## Conftest Markers + +```python +# Mark test as having known complex number issues +@pytest.mark.broken_complex + +# Skip on CI +@pytest.mark.skip(reason="Known issue #123") + +# Expected failure +@pytest.mark.xfail(reason="Not yet implemented") +``` + +## Best Practices + +1. **Always use fixtures** - Use `get_queue_or_skip()` instead of direct queue creation +2. **Check dtype support** - Use `skip_if_dtype_not_supported()` for fp64/fp16 +3. **Test all dtypes** - Use parametrization to cover dtype matrix +4. **Test USM types** - Cover device, shared, and host memory +5. **Test edge cases** - Empty arrays, scalars, broadcasts +6. **Compare with NumPy** - Verify results match NumPy behavior +7. **Clean up resources** - Let arrays go out of scope naturally +8. **Use descriptive names** - Test names should indicate what's being tested diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000000..090acbab4c --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,437 @@ +# AGENTS.md - AI Agent Guide for DPCTL + +## Overview + +**DPCTL** (Data Parallel Control) is a Python library providing bindings for SYCL, enabling heterogeneous computing on CPUs, GPUs, and accelerators. It implements the Python Array API standard for tensor operations on SYCL devices. + +- **License:** Apache 2.0 +- **Copyright:** Intel Corporation +- **Primary Language:** Python with Cython bindings and C++ SYCL kernels + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Python API Layer │ +│ dpctl, dpctl.tensor, dpctl.memory, dpctl.program, dpctl.utils │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Cython Bindings │ +│ _sycl_device.pyx, _sycl_queue.pyx, _usmarray.pyx, etc. │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ C API Layer │ +│ libsyclinterface (dpctl_*.h) │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ C++ Kernel Layer │ +│ dpctl/tensor/libtensor (SYCL kernels) │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ SYCL Runtime │ +│ (Intel oneAPI, OpenSYCL, etc.) │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Module Relationships + +- **dpctl**: Core SYCL object wrappers (Device, Queue, Context, Event, Platform) +- **dpctl.tensor**: Array API-compliant tensor operations using `usm_ndarray` +- **dpctl.memory**: USM (Unified Shared Memory) allocation classes +- **dpctl.program**: SYCL kernel compilation and program management +- **dpctl.utils**: Utility functions for device queries and order management + +--- + +## Module Reference + +### dpctl/ (Core SYCL Bindings) + +**Purpose:** Python wrappers for fundamental SYCL runtime objects. + +**Key Files:** +| File | Purpose | +|------|---------| +| `_sycl_device.pyx` | `SyclDevice` - wraps `sycl::device` | +| `_sycl_queue.pyx` | `SyclQueue` - wraps `sycl::queue` | +| `_sycl_context.pyx` | `SyclContext` - wraps `sycl::context` | +| `_sycl_event.pyx` | `SyclEvent` - wraps `sycl::event` | +| `_sycl_platform.pyx` | `SyclPlatform` - wraps `sycl::platform` | +| `_backend.pxd` | C API declarations for Cython | +| `enum_types.py` | Python enums for SYCL types | + +**Conventions:** +- Extension types use `cdef class` with C reference as `_*_ref` attribute +- All C resources cleaned up in `__dealloc__` +- Use `nogil` when calling blocking C operations +- Exceptions: `SyclDeviceCreationError`, `SyclQueueCreationError`, etc. + +### dpctl/tensor/ (Array API) + +**Purpose:** Python Array API-compliant tensor operations on SYCL devices. + +**Key Files:** +| File | Purpose | +|------|---------| +| `_usmarray.pyx` | `usm_ndarray` extension type | +| `_elementwise_funcs.py` | Elementwise operation wrappers | +| `_elementwise_common.py` | `UnaryElementwiseFunc`, `BinaryElementwiseFunc` | +| `_reduction.py` | Reduction operations (sum, prod, etc.) | +| `_manipulation_functions.py` | reshape, concat, stack, etc. | +| `_ctors.py` | Array constructors (empty, zeros, ones, etc.) | +| `_copy_utils.py` | Copy and type casting utilities | +| `_type_utils.py` | Type promotion and validation | + +**Type Dispatch Pattern:** +```python +# In _elementwise_funcs.py +abs = UnaryElementwiseFunc( + "abs", # Operation name + ti._abs_result_type, # Type inference function + ti._abs, # Kernel implementation + _abs_docstring_ # Documentation +) +``` + +**Queue Validation Pattern:** +```python +def validate_queues(q1, q2): + """Ensure arrays share execution context""" + if q1 != q2: + raise ExecutionPlacementError(...) +``` + +### dpctl/tensor/libtensor/ (C++ Kernels) + +**Purpose:** SYCL kernel implementations for tensor operations. + +**Directory Structure:** +``` +libtensor/ +├── include/ +│ ├── kernels/ +│ │ ├── elementwise_functions/ # 100+ operations +│ │ │ ├── common.hpp # Base patterns +│ │ │ ├── add.hpp, sin.hpp, etc. +│ │ ├── linalg_functions/ # GEMM, dot +│ │ ├── sorting/ # Sort algorithms +│ │ └── reductions.hpp +│ └── utils/ +│ ├── type_dispatch.hpp +│ ├── type_dispatch_building.hpp +│ └── offset_utils.hpp +└── source/ + ├── elementwise_functions/ + ├── reductions/ + └── tensor_*.cpp # Entry points +``` + +**Type Enumeration (14 types):** +```cpp +enum class typenum_t : int { + BOOL = 0, + INT8, UINT8, INT16, UINT16, INT32, UINT32, + INT64, UINT64, HALF, FLOAT, DOUBLE, + CFLOAT, CDOUBLE, +}; +``` + +**Kernel Functor Pattern:** +```cpp +template +struct MyFunctor { + using supports_sg_loadstore = std::true_type; // Vectorization hint + + resT operator()(const argT &x) const { + return /* computation */; + } + + template + sycl::vec operator()( + const sycl::vec &x) const { + return /* vectorized computation */; + } +}; +``` + +### dpctl/memory/ (USM Memory) + +**Purpose:** Unified Shared Memory allocation and management. + +**Key Classes:** +- `MemoryUSMDevice` - Device-only memory +- `MemoryUSMShared` - Host and device accessible +- `MemoryUSMHost` - Host memory accessible from device + +**Interface:** +All classes implement `__sycl_usm_array_interface__`: +```python +{ + "data": (ptr, readonly_flag), + "shape": (nbytes,), + "strides": None, + "typestr": "|u1", + "version": 1, + "syclobj": queue +} +``` + +### dpctl/program/ (Kernel Compilation) + +**Purpose:** Compile and manage SYCL kernels from source. + +**Key Classes:** +- `SyclProgram` - Compiled SYCL program +- `SyclKernel` - Individual kernel extracted from program + +### libsyclinterface/ (C API) + +**Purpose:** C wrapper around SYCL C++ runtime for language interoperability. + +**Naming Convention:** `DPCTL_` +```c +DPCTLDevice_Create() +DPCTLQueue_Submit() +DPCTLContext_Delete() +``` + +**Memory Ownership Annotations:** +| Annotation | Meaning | +|------------|---------| +| `__dpctl_give` | Caller receives ownership, must free | +| `__dpctl_take` | Function takes ownership, caller must not use after | +| `__dpctl_keep` | Function observes only, does not take ownership | +| `__dpctl_null` | NULL value expected or may be returned | + +**Example:** +```c +DPCTL_API +__dpctl_give DPCTLSyclContextRef +DPCTLContext_Create(__dpctl_keep const DPCTLSyclDeviceRef DRef, + error_handler_callback *handler, + int properties); +``` + +--- + +## Code Style + +### Python/Cython +- **Formatter:** Black (line length 80) +- **Import Sort:** isort +- **Linting:** flake8, cython-lint +- **String Quotes:** Double quotes for Cython + +### C/C++ +- **Formatter:** clang-format (LLVM-based style) +- **Indent:** 4 spaces +- **Braces:** Break before functions, classes, namespaces + +### License Header (Required in ALL files) + +**Python/Cython:** +```python +# Data Parallel Control (dpctl) +# +# Copyright 2020-2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# ... +``` + +**C/C++:** +```cpp +//===-- filename.hpp - Description -*-C++-*-===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// ... +//===----------------------------------------------------------------------===// +``` + +--- + +## Testing Guidelines + +### Test Location +- Main tests: `dpctl/tests/` +- Elementwise tests: `dpctl/tests/elementwise/` +- Libtensor tests: `dpctl/tensor/libtensor/tests/` + +### Key Fixtures (`conftest.py`) +```python +get_queue_or_skip() # Create queue or skip test +skip_if_dtype_not_supported() # Skip if device lacks dtype support +``` + +### Dtype Coverage +All operations should be tested with: +```python +_integral_dtypes = ["i1", "u1", "i2", "u2", "i4", "u4", "i8", "u8"] +_real_fp_dtypes = ["f2", "f4", "f8"] +_complex_fp_dtypes = ["c8", "c16"] +``` + +### USM Type Coverage +```python +_usm_types = ["device", "shared", "host"] +``` + +### Memory Order Coverage +```python +_orders = ["C", "F", "A", "K"] +``` + +### Test Pattern +```python +@pytest.mark.parametrize("dtype", _all_dtypes) +@pytest.mark.parametrize("usm_type", _usm_types) +def test_operation(dtype, usm_type): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.ones(100, dtype=dtype, usm_type=usm_type, sycl_queue=q) + result = dpt.operation(x) + + expected = np.operation(dpt.asnumpy(x)) + assert_allclose(dpt.asnumpy(result), expected) +``` + +--- + +## Adding New Elementwise Operations + +### Checklist + +1. **C++ Kernel** (`libtensor/include/kernels/elementwise_functions/`) + - [ ] Create `operation.hpp` with functor template + - [ ] Define type support matrix + - [ ] Implement scalar and vector operations + +2. **C++ Source** (`libtensor/source/elementwise_functions/`) + - [ ] Create `operation.cpp` + - [ ] Instantiate dispatch tables + - [ ] Register in `tensor_elementwise.cpp` + +3. **Python Wrapper** (`dpctl/tensor/`) + - [ ] Add to `_elementwise_funcs.py` + - [ ] Export in `__init__.py` + +4. **Tests** (`dpctl/tests/elementwise/`) + - [ ] Create `test_operation.py` + - [ ] Cover all dtypes, USM types, orders + - [ ] Test edge cases (empty, scalar, broadcast) + +5. **Documentation** (`docs/`) + - [ ] Add to API reference + - [ ] Include in elementwise functions list + +--- + +## Common Pitfalls + +### Memory Management +- **Always** clean up C resources in `__dealloc__` +- Use `__dpctl_give`/`__dpctl_take` annotations correctly +- Check for NULL returns from C API functions +- Release GIL (`with nogil:`) during blocking operations + +### Type Promotion +- Follow Array API type promotion rules +- Handle mixed real/complex carefully +- Check device support for fp16/fp64 before operations + +### Device Capabilities +- Not all devices support fp64 (double precision) +- Not all devices support fp16 (half precision) +- Use `skip_if_dtype_not_supported()` in tests + +### Queue Consistency +- All arrays in an operation must share the same queue +- Use `dpctl.utils.get_execution_queue()` to validate + +### Broadcasting +- Follow NumPy broadcasting rules +- Handle 0-d arrays (scalars) correctly + +--- + +## Code Review Checklist + +### All Code +- [ ] License header present and correct year +- [ ] Code formatted (black/clang-format) +- [ ] No hardcoded device assumptions (fp64, etc.) + +### Python/Cython +- [ ] Docstrings for public functions +- [ ] Type hints where applicable +- [ ] Exceptions have descriptive messages +- [ ] Resources cleaned up properly + +### C++ Kernels +- [ ] Functor handles all required types +- [ ] Vectorization hints correct (`supports_sg_loadstore`, etc.) +- [ ] No raw `new`/`delete` (use SYCL allocators) +- [ ] Kernel names are unique + +### C API +- [ ] Memory ownership annotations on all functions +- [ ] NULL checks for parameters +- [ ] Error handling via callbacks +- [ ] Extern C wrapper for C++ code + +### Tests +- [ ] Covers all supported dtypes +- [ ] Covers all USM types +- [ ] Handles device capability limitations +- [ ] Tests edge cases (empty arrays, scalars) +- [ ] Uses appropriate fixtures + +--- + +## Quick Reference + +### Filter String Syntax +``` +backend:device_type:device_num +``` +Examples: `"opencl:gpu:0"`, `"level_zero:gpu"`, `"cpu"` + +### Common Imports +```python +import dpctl +import dpctl.tensor as dpt +from dpctl.tensor import usm_ndarray +from dpctl.memory import MemoryUSMShared +``` + +### Create Queue +```python +q = dpctl.SyclQueue() # Default device +q = dpctl.SyclQueue("gpu") # First GPU +q = dpctl.SyclQueue("level_zero:gpu:0") # Specific device +``` + +### Create Array +```python +x = dpt.empty((100, 100), dtype=dpt.float32, sycl_queue=q) +x = dpt.asarray(np_array, device=q) +``` + +### Transfer Data +```python +np_array = dpt.asnumpy(x) # Device to host +x = dpt.asarray(np_array, device=q) # Host to device +``` From ffaa535f827c4a4deb32453e1081c0434ad1ba4d Mon Sep 17 00:00:00 2001 From: Nikolay Petrov Date: Fri, 23 Jan 2026 17:54:28 -0800 Subject: [PATCH 2/3] improved version of documentation --- .github/instructions/cython.instructions.md | 238 +--------- .github/instructions/dpctl.instructions.md | 129 +----- .../instructions/elementwise.instructions.md | 418 +---------------- .../libsyclinterface.instructions.md | 291 +----------- .../libtensor-cpp.instructions.md | 316 +------------ .github/instructions/memory.instructions.md | 278 +---------- .../tensor-python.instructions.md | 258 +---------- .github/instructions/testing.instructions.md | 254 +--------- AGENTS.md | 436 ++---------------- dpctl/AGENTS.md | 60 +++ dpctl/memory/AGENTS.md | 41 ++ dpctl/tensor/AGENTS.md | 52 +++ dpctl/tests/AGENTS.md | 51 ++ libsyclinterface/AGENTS.md | 58 +++ 14 files changed, 375 insertions(+), 2505 deletions(-) create mode 100644 dpctl/AGENTS.md create mode 100644 dpctl/memory/AGENTS.md create mode 100644 dpctl/tensor/AGENTS.md create mode 100644 dpctl/tests/AGENTS.md create mode 100644 libsyclinterface/AGENTS.md diff --git a/.github/instructions/cython.instructions.md b/.github/instructions/cython.instructions.md index 5bc58dc0ae..acc673bd89 100644 --- a/.github/instructions/cython.instructions.md +++ b/.github/instructions/cython.instructions.md @@ -5,241 +5,21 @@ applyTo: - "dpctl/**/*.pxi" --- -# Cython Bindings Instructions +# Cython Instructions -## Context +See [dpctl/AGENTS.md](/dpctl/AGENTS.md) for conventions and patterns. -Cython files in dpctl provide the bridge between Python and the C/C++ SYCL interface. They wrap SYCL runtime objects as Python extension types. - -## File Types - -| Extension | Purpose | -|-----------|---------| -| `.pyx` | Implementation files (compiled to C++) | -| `.pxd` | Declaration files (like C headers) | -| `.pxi` | Include files (textually included) | - -## Required Directives - -Every `.pyx` file must start with (after license header): +## Quick Reference +### Required Directives (after license header) ```cython # distutils: language = c++ # cython: language_level=3 # cython: linetrace=True ``` -## Import Conventions - -### cimport vs import - -```cython -# cimport - for C-level declarations (compile-time) -from cpython cimport pycapsule -from cpython.mem cimport PyMem_Free, PyMem_Malloc -from ._backend cimport DPCTLSyclDeviceRef, DPCTLDevice_Create - -# import - for Python-level usage (runtime) -import numpy as np -from . import _device_selection -``` - -### Import Order -1. Standard library cimports -2. Third-party cimports -3. Local cimports (with `# noqa: E211` if needed) -4. Blank line -5. Standard library imports -6. Third-party imports -7. Local imports - -## Extension Type Pattern - -```cython -cdef class SyclDevice: - """ - Python wrapper for sycl::device. - - Docstring describing the class. - """ - # C-level attribute (not accessible from Python) - cdef DPCTLSyclDeviceRef _device_ref - - def __cinit__(self, filter_string=None): - """ - Called before __init__, handles C memory allocation. - Must not raise Python exceptions that leave C state invalid. - """ - if filter_string is not None: - self._device_ref = DPCTLDevice_CreateFromSelector(...) - else: - self._device_ref = NULL - - def __dealloc__(self): - """ - Called during garbage collection. Clean up C resources. - """ - if self._device_ref is not NULL: - DPCTLDevice_Delete(self._device_ref) - - cdef DPCTLSyclDeviceRef get_device_ref(self): - """ - Internal method for C-level access. - Not visible from Python. - """ - return self._device_ref - - @property - def name(self): - """Python property wrapping C getter.""" - cdef const char *name_ptr = DPCTLDevice_GetName(self._device_ref) - if name_ptr is NULL: - raise RuntimeError("Failed to get device name") - try: - return name_ptr.decode("utf-8") - finally: - DPCTLCString_Delete(name_ptr) -``` - -## Memory Management - -### Rule: Always Clean Up in __dealloc__ - -```cython -def __dealloc__(self): - # Check for NULL before deleting - if self._queue_ref is not NULL: - DPCTLQueue_Delete(self._queue_ref) -``` - -### Ownership Annotations - -Match C API annotations: -- `__dpctl_give` - Caller receives ownership, must delete -- `__dpctl_take` - Function takes ownership, don't use after -- `__dpctl_keep` - Function only observes, doesn't take ownership - -```cython -cdef void example(): - # Receives ownership (__dpctl_give) - must delete - cdef DPCTLSyclEventRef event = DPCTLQueue_Submit(...) - - # ... use event ... - - # Clean up owned resource - DPCTLEvent_Delete(event) -``` - -### GIL Management - -Release GIL for blocking C operations: - -```cython -cdef void copy_with_wait(DPCTLSyclEventRef event): - with nogil: - DPCTLEvent_Wait(event) - -# Or for longer sections -cdef void long_operation() nogil: - # Entire function runs without GIL - # Cannot call Python objects here - pass -``` - -## Error Handling - -### NULL Checks - -```cython -cdef DPCTLSyclDeviceRef dref = DPCTLDevice_Create(...) -if dref is NULL: - raise SyclDeviceCreationError("Failed to create device") -``` - -### Exception Safety - -```cython -def create_something(self): - cdef SomeRef ref = SomeFunction() - if ref is NULL: - raise SomeError("Creation failed") - try: - # Operations that might raise - result = self._process(ref) - except: - # Clean up on exception - SomeDelete(ref) - raise - return result -``` - -## Backend Declarations (.pxd) - -```cython -# _backend.pxd -cdef extern from "syclinterface/dpctl_sycl_device_interface.h": - ctypedef void* DPCTLSyclDeviceRef - - DPCTLSyclDeviceRef DPCTLDevice_Create( - DPCTLDeviceSelectorRef DSRef - ) nogil - - void DPCTLDevice_Delete( - DPCTLSyclDeviceRef DRef - ) nogil - - const char* DPCTLDevice_GetName( - DPCTLSyclDeviceRef DRef - ) nogil -``` - -## Include Files (.pxi) - -Used for shared code snippets: - -```cython -# In main .pyx file -include "_sycl_usm_array_interface_utils.pxi" -``` - -## Common Patterns - -### Property with C String Return - -```cython -@property -def driver_version(self): - cdef const char* ver = DPCTLDevice_GetDriverVersion(self._device_ref) - if ver is NULL: - return "" - try: - return ver.decode("utf-8") - finally: - DPCTLCString_Delete(ver) -``` - -### Capsule for Opaque Pointers - -```cython -def get_capsule(self): - """Return PyCapsule containing C pointer.""" - if self._device_ref is NULL: - return None - return pycapsule.PyCapsule_New( - self._device_ref, - "SyclDeviceRef", - NULL # No destructor - we manage lifetime - ) -``` - -### Type Checking - -```cython -def some_method(self, other): - if not isinstance(other, SyclDevice): - raise TypeError( - f"Expected SyclDevice, got {type(other).__name__}" - ) - cdef SyclDevice other_dev = other - # Now can access other_dev._device_ref -``` +### Key Rules +- `cimport` for C-level declarations, `import` for Python +- Store C refs as `_*_ref`, clean up in `__dealloc__` +- Use `with nogil:` for blocking C operations +- Check NULL before using C API returns diff --git a/.github/instructions/dpctl.instructions.md b/.github/instructions/dpctl.instructions.md index 227da04bb2..7798f146a0 100644 --- a/.github/instructions/dpctl.instructions.md +++ b/.github/instructions/dpctl.instructions.md @@ -3,133 +3,24 @@ applyTo: - "**/*.py" - "**/*.pyx" - "**/*.pxd" - - "**/*.pxi" - "**/*.cpp" - "**/*.hpp" - "**/*.h" - - "**/*.c" --- # DPCTL General Instructions -## Project Context +See [/AGENTS.md](/AGENTS.md) for project overview and architecture. -DPCTL (Data Parallel Control) is a Python SYCL binding library for heterogeneous computing. It provides: -- Python wrappers for SYCL runtime objects (devices, queues, contexts) -- Array API-compliant tensor operations (`dpctl.tensor`) -- USM memory management (`dpctl.memory`) -- Kernel compilation (`dpctl.program`) +## Key References -## License Header Requirement +- **Code style:** See `.pre-commit-config.yaml` for tool versions, `.clang-format` for C++ style +- **License:** Apache 2.0 with Intel copyright - see existing files for header format +- **Directory guides:** Each major directory has its own `AGENTS.md` -**All source files must include the Apache 2.0 license header.** +## Critical Rules -### Python/Cython Files -```python -# Data Parallel Control (dpctl) -# -# Copyright 2020-2025 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -``` - -### C/C++ Files -```cpp -//===-- filename.hpp - Brief description -*-C++-*-===// -// -// Data Parallel Control (dpctl) -// -// Copyright 2020-2025 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// ... -//===----------------------------------------------------------------------===// -``` - -## Code Style - -### Python -- **Formatter:** Black (line length 80) -- **Import sorting:** isort -- **Linting:** flake8 -- **Max line length:** 80 characters - -### C/C++ -- **Formatter:** clang-format (LLVM-based) -- **Indent:** 4 spaces -- **Brace style:** Break before functions, classes, namespaces - -## Naming Conventions - -### Python/Cython -- Classes: `PascalCase` (e.g., `SyclDevice`, `SyclQueue`) -- Functions/methods: `snake_case` (e.g., `get_device_info`) -- Private: prefix with `_` (e.g., `_validate_queue`) -- Constants: `UPPER_SNAKE_CASE` - -### C API (libsyclinterface) -- Functions: `DPCTL_` (e.g., `DPCTLDevice_Create`) -- Types: `DPCTLRef` (e.g., `DPCTLSyclDeviceRef`) -- Macros: `DPCTL_` prefix - -### C++ Kernels -- Functors: `Functor` (e.g., `AddFunctor`) -- Kernel classes: `_krn` suffix -- Namespaces: `dpctl::tensor::kernels` - -## Error Handling - -### Python Exceptions -- `SyclDeviceCreationError` - Device creation failed -- `SyclQueueCreationError` - Queue creation failed -- `ExecutionPlacementError` - Queue mismatch between arrays -- `TypeError`, `ValueError` - Standard Python errors for invalid input - -### C API Error Handling -- Return `NULL` for failed object creation -- Use `error_handler_callback` for async errors -- Check return values before using - -## Device Compatibility - -**Warning:** Not all SYCL devices support all data types. - -- **fp64 (double):** Check `device.has_aspect_fp64` -- **fp16 (half):** Check `device.has_aspect_fp16` - -Always use `skip_if_dtype_not_supported()` in tests. - -## Common Patterns - -### Queue Validation -```python -def some_operation(x, y): - exec_q = dpctl.utils.get_execution_queue([x.sycl_queue, y.sycl_queue]) - if exec_q is None: - raise ExecutionPlacementError("...") -``` - -### Resource Cleanup -```python -cdef class SyclDevice: - cdef DPCTLSyclDeviceRef _device_ref - - def __dealloc__(self): - DPCTLDevice_Delete(self._device_ref) -``` - -### GIL Release for Blocking Operations -```cython -with nogil: - DPCTLEvent_Wait(event_ref) -``` +1. **Device compatibility:** Not all devices support fp64/fp16 - check capabilities +2. **Queue consistency:** All arrays in an operation must share compatible queues +3. **Resource cleanup:** Always clean up C resources in `__dealloc__` +4. **NULL checks:** Always check C API returns before use diff --git a/.github/instructions/elementwise.instructions.md b/.github/instructions/elementwise.instructions.md index f747c6c926..ac266801f6 100644 --- a/.github/instructions/elementwise.instructions.md +++ b/.github/instructions/elementwise.instructions.md @@ -1,415 +1,25 @@ --- applyTo: - - "dpctl/tensor/libtensor/include/kernels/elementwise_functions/**" - - "dpctl/tensor/libtensor/source/elementwise_functions/**" - - "dpctl/tensor/_elementwise_funcs.py" - - "dpctl/tensor/_elementwise_common.py" + - "dpctl/tensor/libtensor/**/elementwise_functions/**" + - "dpctl/tensor/_elementwise_*.py" - "dpctl/tests/elementwise/**" --- # Elementwise Operations Instructions -## Context +Full stack: C++ kernel → pybind11 → Python wrapper → tests -Elementwise operations span the full stack from C++ SYCL kernels to Python wrappers and tests. This guide covers the complete pattern for implementing elementwise operations. +## References -## Full Stack Overview +- **C++ kernels:** [dpctl/tensor/libtensor/AGENTS.md](/dpctl/tensor/libtensor/AGENTS.md) +- **Python wrappers:** [dpctl/tensor/AGENTS.md](/dpctl/tensor/AGENTS.md) +- **Tests:** [dpctl/tests/AGENTS.md](/dpctl/tests/AGENTS.md) -``` -┌─────────────────────────────────────────────────────────────────┐ -│ dpctl/tensor/_elementwise_funcs.py │ -│ Python wrapper using UnaryElementwiseFunc/BinaryElementwiseFunc│ -└─────────────────────────────────────────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────────────┐ -│ dpctl/tensor/_tensor_impl (pybind11) │ -│ _abs, _abs_result_type, etc. │ -└─────────────────────────────────────────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────────────┐ -│ libtensor/source/elementwise_functions/abs.cpp │ -│ Dispatch tables and pybind11 bindings │ -└─────────────────────────────────────────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────────────┐ -│ libtensor/include/kernels/elementwise_functions/abs.hpp │ -│ SYCL kernel implementation (AbsFunctor) │ -└─────────────────────────────────────────────────────────────────┘ -``` +## Adding New Operation Checklist -## Adding a New Elementwise Operation - -### Step 1: C++ Kernel Header - -Create `libtensor/include/kernels/elementwise_functions/myop.hpp`: - -```cpp -//===-- myop.hpp - Implementation of myop -*-C++-*-===// -// -// Data Parallel Control (dpctl) -// -// Copyright 2020-2025 Intel Corporation -// ... license ... -//===----------------------------------------------------------------------===// - -#pragma once - -#include -#include -#include - -#include "common.hpp" -#include "utils/type_utils.hpp" - -namespace dpctl::tensor::kernels::myop -{ - -namespace td_ns = dpctl::tensor::type_dispatch; -namespace tu_ns = dpctl::tensor::type_utils; - -// Output type definition -template -struct MyOpOutputType -{ - using value_type = typename std::disjunction< - td_ns::TypeMapResultEntry, - td_ns::TypeMapResultEntry, - // ... map all supported input types to output types - td_ns::DefaultResultEntry>::result_type; -}; - -// Kernel functor -template -struct MyOpFunctor -{ - using supports_sg_loadstore = - typename std::negation>; - using supports_vec = - typename std::negation>; - - resT operator()(const argT &x) const - { - if constexpr (tu_ns::is_complex::value) { - // Handle complex types - return /* complex implementation */; - } - else { - return /* scalar implementation */; - } - } - - template - sycl::vec - operator()(const sycl::vec &x) const - { - // Vectorized implementation - return /* vectorized implementation */; - } -}; - -// Factory for contiguous arrays -template -struct MyOpContigFactory -{ - fnT get() - { - if constexpr (std::is_same_v< - typename MyOpOutputType::value_type, void>) - { - return nullptr; // Unsupported type - } - else { - using dpctl::tensor::kernels::unary_contig_impl; - return unary_contig_impl; - } - } -}; - -// Factory for strided arrays -template -struct MyOpStridedFactory -{ - fnT get() - { - if constexpr (std::is_same_v< - typename MyOpOutputType::value_type, void>) - { - return nullptr; - } - else { - using dpctl::tensor::kernels::unary_strided_impl; - return unary_strided_impl; - } - } -}; - -// Type support check factory -template -struct MyOpTypeSupportFactory -{ - fnT get() - { - if constexpr (std::is_same_v< - typename MyOpOutputType::value_type, void>) - { - return nullptr; - } - else { - return [](void) -> bool { return true; }; - } - } -}; - -} // namespace dpctl::tensor::kernels::myop -``` - -### Step 2: C++ Source File - -Create `libtensor/source/elementwise_functions/myop.cpp`: - -```cpp -//===----------- myop.cpp - Implementation of myop -*-C++-*-===// -// -// Data Parallel Control (dpctl) -// -// Copyright 2020-2025 Intel Corporation -// ... license ... -//===----------------------------------------------------------------------===// - -#include "myop.hpp" -#include "elementwise_functions.hpp" - -#include "dpctl4pybind11.hpp" -#include - -namespace py = pybind11; - -namespace dpctl::tensor::py_internal -{ - -namespace td_ns = dpctl::tensor::type_dispatch; - -// Dispatch tables -static unary_contig_impl_fn_ptr_t - myop_contig_dispatch_table[td_ns::num_types]; - -static unary_strided_impl_fn_ptr_t - myop_strided_dispatch_table[td_ns::num_types]; - -void init_myop_dispatch_tables(void) -{ - using dpctl::tensor::kernels::myop::MyOpContigFactory; - using dpctl::tensor::kernels::myop::MyOpStridedFactory; - - td_ns::DispatchTableBuilder - dtb_contig; - dtb_contig.populate_dispatch_table(myop_contig_dispatch_table); - - td_ns::DispatchTableBuilder - dtb_strided; - dtb_strided.populate_dispatch_table(myop_strided_dispatch_table); -} - -// Result type function -py::object myop_result_type(const py::dtype &input_dtype, - const py::object &device_obj) -{ - // Implementation using type lookup -} - -// Main function -std::pair -myop_func(const dpctl::tensor::usm_ndarray &src, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends) -{ - // Dispatch to appropriate kernel based on src dtype -} - -void init_myop(py::module_ m) -{ - init_myop_dispatch_tables(); - - m.def("_myop", &myop_func, "MyOp implementation", - py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), - py::arg("depends") = py::list()); - - m.def("_myop_result_type", &myop_result_type); -} - -} // namespace dpctl::tensor::py_internal -``` - -### Step 3: Register in tensor_elementwise.cpp - -In `libtensor/source/tensor_elementwise.cpp`: - -```cpp -#include "elementwise_functions/myop.hpp" - -void init_elementwise_functions(py::module_ m) -{ - // ... existing operations ... - init_myop(m); -} -``` - -### Step 4: Python Wrapper - -In `dpctl/tensor/_elementwise_funcs.py`: - -```python -import dpctl.tensor._tensor_impl as ti -from ._elementwise_common import UnaryElementwiseFunc - -_myop_docstring_ = """ -myop(x, /, out=None, order="K") - -Computes myop element-wise. - -Args: - x (usm_ndarray): Input array. - out (usm_ndarray, optional): Output array to use. - order ({"K", "C", "F", "A"}, optional): Memory order for output. - -Returns: - usm_ndarray: Result array. - -See Also: - :func:`related_func`: Description. -""" - -myop = UnaryElementwiseFunc( - "myop", - ti._myop_result_type, - ti._myop, - _myop_docstring_, -) -``` - -### Step 5: Export in __init__.py - -In `dpctl/tensor/__init__.py`: - -```python -from ._elementwise_funcs import ( - # ... existing exports ... - myop, -) - -__all__ = [ - # ... existing exports ... - "myop", -] -``` - -### Step 6: Create Tests - -Create `dpctl/tests/elementwise/test_myop.py`: - -```python -import numpy as np -import pytest - -import dpctl.tensor as dpt -from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported - -from .utils import _all_dtypes, _usm_types - - -class TestMyOp: - @pytest.mark.parametrize("dtype", _all_dtypes) - def test_myop_dtype(self, dtype): - q = get_queue_or_skip() - skip_if_dtype_not_supported(dtype, q) - - x = dpt.linspace(0, 1, 100, dtype=dtype, sycl_queue=q) - result = dpt.myop(x) - - expected = np.myop(dpt.asnumpy(x)) - assert np.allclose(dpt.asnumpy(result), expected) - - @pytest.mark.parametrize("usm_type", _usm_types) - def test_myop_usm_type(self, usm_type): - q = get_queue_or_skip() - - x = dpt.ones(100, dtype="f4", usm_type=usm_type, sycl_queue=q) - result = dpt.myop(x) - - assert result.usm_type == usm_type - - def test_myop_empty(self): - q = get_queue_or_skip() - x = dpt.empty((0,), dtype="f4", sycl_queue=q) - result = dpt.myop(x) - assert result.shape == (0,) - - def test_myop_scalar(self): - q = get_queue_or_skip() - x = dpt.asarray(5.0, sycl_queue=q) - result = dpt.myop(x) - assert result.ndim == 0 - - def test_myop_out(self): - q = get_queue_or_skip() - x = dpt.ones(100, dtype="f4", sycl_queue=q) - out = dpt.empty_like(x) - result = dpt.myop(x, out=out) - assert result is out - - @pytest.mark.parametrize("order", ["C", "F"]) - def test_myop_order(self, order): - q = get_queue_or_skip() - x = dpt.ones((10, 10), dtype="f4", order=order, sycl_queue=q) - result = dpt.myop(x, order=order) - - if order == "C": - assert result.flags.c_contiguous - else: - assert result.flags.f_contiguous -``` - -## Type Support Matrix - -Common type mappings for unary operations: - -| Operation | Input Types | Output Type | -|-----------|-------------|-------------| -| abs | int, float | same | -| abs | complex | real component type | -| negative | int, float, complex | same | -| sin, cos, etc. | float, complex | same | -| isnan, isinf | float, complex | bool | -| sign | int, float, complex | same | - -## Binary Operation Pattern - -For binary operations, use `BinaryElementwiseFunc`: - -```python -from ._elementwise_common import BinaryElementwiseFunc - -add = BinaryElementwiseFunc( - "add", - ti._add_result_type, - ti._add, - _add_docstring_, - # Binary operations may need additional parameters - ti._add_inplace, # In-place variant -) -``` - -## Vectorization Notes - -- `supports_sg_loadstore` - Can use sub-group load/store -- `supports_vec` - Can use sycl::vec for vectorization -- Complex types typically don't support vectorization -- Check device capabilities for half-precision (fp16) +1. `libtensor/include/kernels/elementwise_functions/op.hpp` - functor +2. `libtensor/source/elementwise_functions/op.cpp` - dispatch tables +3. Register in `tensor_elementwise.cpp` +4. `_elementwise_funcs.py` - Python wrapper +5. Export in `__init__.py` +6. `tests/elementwise/test_op.py` - full coverage diff --git a/.github/instructions/libsyclinterface.instructions.md b/.github/instructions/libsyclinterface.instructions.md index 9aa23c09e1..bd5ffdb122 100644 --- a/.github/instructions/libsyclinterface.instructions.md +++ b/.github/instructions/libsyclinterface.instructions.md @@ -3,288 +3,23 @@ applyTo: - "libsyclinterface/**/*.h" - "libsyclinterface/**/*.hpp" - "libsyclinterface/**/*.cpp" - - "libsyclinterface/**/*.c" --- -# C API Layer Instructions (libsyclinterface) +# C API Instructions -## Context +See [libsyclinterface/AGENTS.md](/libsyclinterface/AGENTS.md) for conventions. -The libsyclinterface directory provides a C API wrapper around SYCL C++ runtime objects. This enables language interoperability with Python (via Cython) and other languages. +## Quick Reference -## Directory Structure +### Naming +`DPCTL_` (e.g., `DPCTLDevice_Create`) -``` -libsyclinterface/ -├── include/syclinterface/ -│ ├── Support/ -│ │ ├── DllExport.h # DLL export macros -│ │ ├── ExternC.h # extern "C" wrapper macros -│ │ └── MemOwnershipAttrs.h # Memory ownership annotations -│ ├── dpctl_sycl_context_interface.h -│ ├── dpctl_sycl_device_interface.h -│ ├── dpctl_sycl_device_manager.h -│ ├── dpctl_sycl_device_selector_interface.h -│ ├── dpctl_sycl_event_interface.h -│ ├── dpctl_sycl_kernel_bundle_interface.h -│ ├── dpctl_sycl_kernel_interface.h -│ ├── dpctl_sycl_platform_interface.h -│ ├── dpctl_sycl_platform_manager.h -│ ├── dpctl_sycl_queue_interface.h -│ ├── dpctl_sycl_queue_manager.h -│ ├── dpctl_sycl_usm_interface.h -│ └── dpctl_sycl_types.h -├── helper/ -│ ├── include/ -│ └── source/ -├── source/ # Implementation files -└── tests/ # C interface tests -``` +### Ownership annotations (from `Support/MemOwnershipAttrs.h`) +- `__dpctl_give` - caller must free +- `__dpctl_take` - function takes ownership +- `__dpctl_keep` - function only observes -## Function Naming Convention - -``` -DPCTL_ -``` - -### Examples -```c -DPCTLDevice_Create() -DPCTLDevice_Delete() -DPCTLDevice_GetName() -DPCTLDevice_GetVendor() -DPCTLDevice_HasAspect() - -DPCTLQueue_Create() -DPCTLQueue_Delete() -DPCTLQueue_Submit() -DPCTLQueue_Memcpy() -DPCTLQueue_Wait() - -DPCTLContext_Create() -DPCTLContext_Delete() -DPCTLContext_GetDeviceCount() -``` - -## Memory Ownership Annotations - -Defined in `Support/MemOwnershipAttrs.h`: - -| Annotation | Meaning | Caller Action | -|------------|---------|---------------| -| `__dpctl_give` | Function returns ownership | Caller must eventually free | -| `__dpctl_take` | Function takes ownership | Caller must not use after call | -| `__dpctl_keep` | Function only observes | Caller retains ownership | -| `__dpctl_null` | NULL is valid | Check for NULL returns | - -### Usage Examples - -```c -// Caller receives ownership - must call Delete later -DPCTL_API -__dpctl_give DPCTLSyclContextRef -DPCTLContext_Create(__dpctl_keep const DPCTLSyclDeviceRef DRef, - error_handler_callback *handler, - int properties); - -// Function takes ownership - don't use CRef after this -DPCTL_API -void DPCTLContext_Delete(__dpctl_take DPCTLSyclContextRef CRef); - -// Function only observes - QRef still valid after call -DPCTL_API -size_t DPCTLQueue_GetBackend(__dpctl_keep const DPCTLSyclQueueRef QRef); - -// May return NULL on failure -DPCTL_API -__dpctl_give __dpctl_null DPCTLSyclDeviceRef -DPCTLDevice_Create(__dpctl_keep DPCTLSyclDeviceSelectorRef DSRef); -``` - -## Opaque Pointer Types - -```c -// Forward declarations - actual struct defined in implementation -typedef struct DPCTLOpaqueSyclContext *DPCTLSyclContextRef; -typedef struct DPCTLOpaqueSyclDevice *DPCTLSyclDeviceRef; -typedef struct DPCTLOpaqueSyclEvent *DPCTLSyclEventRef; -typedef struct DPCTLOpaqueSyclKernel *DPCTLSyclKernelRef; -typedef struct DPCTLOpaqueSyclPlatform *DPCTLSyclPlatformRef; -typedef struct DPCTLOpaqueSyclQueue *DPCTLSyclQueueRef; -``` - -## Extern C Wrapper - -Use macros from `Support/ExternC.h`: - -```c -#include "Support/ExternC.h" - -DPCTL_C_EXTERN_C_BEGIN - -// All C API declarations here -DPCTL_API -__dpctl_give DPCTLSyclDeviceRef -DPCTLDevice_Create(...); - -DPCTL_C_EXTERN_C_END -``` - -## DLL Export - -Use `DPCTL_API` macro from `Support/DllExport.h`: - -```c -DPCTL_API -__dpctl_give DPCTLSyclContextRef -DPCTLContext_Create(...); -``` - -## Error Handling - -### Error Callback - -```c -// Type definition -typedef void error_handler_callback(int err_code); - -// Usage in async error handling -DPCTL_API -__dpctl_give DPCTLSyclQueueRef -DPCTLQueue_Create(__dpctl_keep DPCTLSyclContextRef CRef, - __dpctl_keep DPCTLSyclDeviceRef DRef, - error_handler_callback *handler, - int properties); -``` - -### NULL Return Convention - -Functions that create objects return `NULL` on failure: - -```c -DPCTLSyclDeviceRef dref = DPCTLDevice_Create(selector); -if (dref == NULL) { - // Handle error -} -``` - -## Header File Template - -```c -//===-- dpctl_sycl_foo_interface.h - C API for Foo -*-C-*-===// -// -// Data Parallel Control (dpctl) -// -// Copyright 2020-2025 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// ... -//===----------------------------------------------------------------------===// -/// -/// \file -/// This header declares a C API for sycl::foo class. -/// -//===----------------------------------------------------------------------===// - -#pragma once - -#include "Support/DllExport.h" -#include "Support/ExternC.h" -#include "Support/MemOwnershipAttrs.h" -#include "dpctl_sycl_types.h" - -DPCTL_C_EXTERN_C_BEGIN - -/** - * @defgroup FooInterface Foo class C wrapper - */ - -/*! - * @brief Creates a new Foo object. - * - * @param param1 Description of param1. - * @param param2 Description of param2. - * @return A new DPCTLSyclFooRef on success, NULL on failure. - * @ingroup FooInterface - */ -DPCTL_API -__dpctl_give DPCTLSyclFooRef -DPCTLFoo_Create(__dpctl_keep DPCTLSyclBarRef BRef); - -/*! - * @brief Deletes a Foo object. - * - * @param FRef The Foo reference to delete. - * @ingroup FooInterface - */ -DPCTL_API -void DPCTLFoo_Delete(__dpctl_take DPCTLSyclFooRef FRef); - -DPCTL_C_EXTERN_C_END -``` - -## Implementation Pattern - -```cpp -// In source/dpctl_sycl_foo_interface.cpp - -#include "dpctl_sycl_foo_interface.h" -#include - -namespace -{ -// Internal helpers in anonymous namespace -struct DPCTLOpaqueSyclFoo -{ - sycl::foo *ptr; -}; - -inline sycl::foo *unwrap(DPCTLSyclFooRef ref) -{ - return ref ? ref->ptr : nullptr; -} - -inline DPCTLSyclFooRef wrap(sycl::foo *ptr) -{ - if (ptr) { - auto ref = new DPCTLOpaqueSyclFoo; - ref->ptr = ptr; - return ref; - } - return nullptr; -} -} // namespace - -__dpctl_give DPCTLSyclFooRef -DPCTLFoo_Create(__dpctl_keep DPCTLSyclBarRef BRef) -{ - try { - auto bar = unwrap(BRef); - if (!bar) return nullptr; - - auto foo = new sycl::foo(*bar); - return wrap(foo); - } - catch (const std::exception &e) { - // Log error - return nullptr; - } -} - -void DPCTLFoo_Delete(__dpctl_take DPCTLSyclFooRef FRef) -{ - if (FRef) { - delete FRef->ptr; - delete FRef; - } -} -``` - -## Best Practices - -1. **Always use ownership annotations** - Every function parameter and return value should have appropriate annotation -2. **Check for NULL** - Always validate input pointers before use -3. **Use try/catch** - Catch C++ exceptions and return NULL or error codes -4. **Wrap/unwrap consistently** - Use helper functions for opaque pointer conversion -5. **Document with Doxygen** - Every public function needs documentation -6. **Thread safety** - Document thread safety guarantees +### Key Rules +- Annotate all parameters and returns +- Return NULL on failure +- Use `DPCTL_API` for exports diff --git a/.github/instructions/libtensor-cpp.instructions.md b/.github/instructions/libtensor-cpp.instructions.md index bbb695b68d..bfd1379771 100644 --- a/.github/instructions/libtensor-cpp.instructions.md +++ b/.github/instructions/libtensor-cpp.instructions.md @@ -4,313 +4,17 @@ applyTo: - "dpctl/tensor/libtensor/**/*.cpp" --- -# C++ SYCL Kernel Instructions (libtensor) +# C++ SYCL Kernel Instructions -## Context +See [dpctl/tensor/libtensor/AGENTS.md](/dpctl/tensor/libtensor/AGENTS.md) for patterns and structure. -The libtensor directory contains C++ SYCL kernel implementations for tensor operations. These kernels are called from Python via pybind11 bindings. +## Quick Reference -## Directory Structure +### Type dispatch +See `include/utils/type_dispatch_building.hpp` for the 14 supported types. -``` -libtensor/ -├── include/ -│ ├── kernels/ -│ │ ├── elementwise_functions/ # Unary/binary operations -│ │ │ ├── common.hpp # Base iteration patterns -│ │ │ ├── common_inplace.hpp # In-place operation patterns -│ │ │ ├── add.hpp, sin.hpp, etc. -│ │ ├── linalg_functions/ # Linear algebra (gemm, dot) -│ │ ├── sorting/ # Sort, argsort, searchsorted -│ │ └── reductions.hpp # Reduction operations -│ └── utils/ -│ ├── type_dispatch.hpp # Type lookup tables -│ ├── type_dispatch_building.hpp # Dispatch table generation -│ ├── offset_utils.hpp # Stride calculations -│ └── sycl_alloc_utils.hpp # USM allocation helpers -└── source/ - ├── elementwise_functions/ # Implementation files - ├── reductions/ - ├── sorting/ - └── tensor_*.cpp # pybind11 entry points -``` - -## File Header Format - -```cpp -//===-- operation.hpp - Brief description -*-C++-*-===// -// -// Data Parallel Control (dpctl) -// -// Copyright 2020-2025 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// ... -//===----------------------------------------------------------------------===// -/// -/// \file -/// Detailed description of the file's purpose and contents. -/// -//===----------------------------------------------------------------------===// - -#pragma once - -#include -#include -// ... other includes -``` - -## Type Dispatch System - -### Type Enumeration - -```cpp -// In type_dispatch_building.hpp -enum class typenum_t : int { - BOOL = 0, - INT8, UINT8, INT16, UINT16, INT32, UINT32, - INT64, UINT64, HALF, FLOAT, DOUBLE, - CFLOAT, CDOUBLE, -}; -inline constexpr int num_types = 14; -``` - -### Dispatch Table Pattern - -```cpp -// 2D dispatch table: output_type_fn_ptr_t[dst_type][src_type] -template typename factory, - int _num_types> -class DispatchTableBuilder -{ -public: - DispatchTableBuilder() = default; - - void populate_dispatch_table(funcPtrT table[][_num_types]) const - { - // Populates table with type-specific function pointers - // using the factory template - } -}; -``` - -### Using Dispatch Tables - -```cpp -// In operation.cpp -static unary_contig_impl_fn_ptr_t abs_contig_dispatch_table[num_types]; - -void init_abs_dispatch_tables() -{ - using dpctl::tensor::kernels::abs::AbsContigFactory; - - DispatchTableBuilder - dtb; - dtb.populate_dispatch_table(abs_contig_dispatch_table); -} -``` - -## Kernel Functor Pattern - -### Unary Operation - -```cpp -template -struct AbsFunctor -{ - // Type traits for optimization - using is_constant = std::false_type; - using supports_sg_loadstore = std::true_type; - using supports_vec = std::true_type; - - // Scalar operation - resT operator()(const argT &x) const - { - if constexpr (std::is_same_v) { - return x; - } - else if constexpr (is_complex::value) { - return cabs(x); - } - else if constexpr (std::is_unsigned_v) { - return x; - } - else { - return (x >= argT(0)) ? x : -x; - } - } - - // Vectorized operation (when supports_vec is true) - template - sycl::vec - operator()(const sycl::vec &x) const - { - return sycl::abs(x); - } -}; -``` - -### Binary Operation - -```cpp -template -struct AddFunctor -{ - using supports_sg_loadstore = std::negation< - std::disjunction, is_complex>>; - using supports_vec = std::negation< - std::disjunction, is_complex>>; - - resT operator()(const argT1 &x, const argT2 &y) const - { - if constexpr (is_complex::value || is_complex::value) { - using rT1 = typename argT1::value_type; - using rT2 = typename argT2::value_type; - return exprm_ns::complex(x) + exprm_ns::complex(y); - } - else { - return x + y; - } - } - - template - sycl::vec - operator()(const sycl::vec &x, - const sycl::vec &y) const - { - return x + y; - } -}; -``` - -## Kernel Naming - -Kernel class names must be unique across the entire codebase: - -```cpp -template -class abs_contig_kernel; // Unique name - -template -class abs_strided_kernel; // Different name for different pattern -``` - -## Type Support Matrix - -Define which type combinations are supported: - -```cpp -template -struct AbsOutputType -{ - // Default: not defined (unsupported) -}; - -template <> -struct AbsOutputType -{ - using value_type = float; -}; - -template <> -struct AbsOutputType, void> -{ - using value_type = float; // abs of complex returns real -}; -``` - -## Factory Pattern - -```cpp -template -struct AbsContigFactory -{ - fnT get() - { - if constexpr (std::is_same_v< - typename AbsOutputType::value_type, - void>) - { - // Unsupported type combination - return nullptr; - } - else { - return abs_contig_impl; - } - } -}; -``` - -## Memory Patterns - -### Contiguous Arrays - -```cpp -template -void process_contig(sycl::queue &q, - size_t nelems, - const T *src, - T *dst, - const std::vector &depends) -{ - sycl::event e = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.parallel_for(sycl::range<1>(nelems), - [=](sycl::id<1> id) { - dst[id] = process(src[id]); - }); - }); -} -``` - -### Strided Arrays - -```cpp -template -void process_strided(sycl::queue &q, - int nd, - const ssize_t *shape, - const T *src, - ssize_t src_offset, - const ssize_t *src_strides, - T *dst, - ssize_t dst_offset, - const ssize_t *dst_strides) -{ - // Use indexing utilities from offset_utils.hpp - using dpctl::tensor::offset_utils::StridedIndexer; - - StridedIndexer src_indexer(nd, src_offset, shape, src_strides); - StridedIndexer dst_indexer(nd, dst_offset, shape, dst_strides); - - // Submit kernel using indexers -} -``` - -## Common Utilities - -### offset_utils.hpp -- `StridedIndexer` - Convert linear index to strided offset -- `UnpackedStridedIndexer` - Optimized for specific dimensions -- `TwoOffsets_StridedIndexer` - For binary operations - -### sycl_alloc_utils.hpp -- `usm_allocator` - SYCL USM allocator wrapper -- Temporary allocation helpers - -### type_utils.hpp -- `is_complex` - Check if type is complex -- Type promotion utilities - -## Best Practices - -1. **Use `if constexpr`** for compile-time type branching -2. **Mark kernel classes** with unique names -3. **Support vectorization** where possible via `sycl::vec` -4. **Handle complex types** separately (no vectorization) -5. **Check device capabilities** before using fp64/fp16 -6. **Use dispatch tables** for runtime type resolution -7. **Avoid raw new/delete** - use SYCL allocators +### Key Rules +- Kernel class names must be unique +- Use `if constexpr` for compile-time type branching +- Complex types don't support vectorization +- Return `nullptr` from factory for unsupported types diff --git a/.github/instructions/memory.instructions.md b/.github/instructions/memory.instructions.md index 1be336aa23..9a012356da 100644 --- a/.github/instructions/memory.instructions.md +++ b/.github/instructions/memory.instructions.md @@ -1,279 +1,21 @@ --- applyTo: - "dpctl/memory/**" - - "dpctl/memory/*.py" - - "dpctl/memory/*.pyx" - - "dpctl/memory/*.pxd" - - "**/usm*.py" - "**/test_sycl_usm*.py" --- # USM Memory Instructions -## Context +See [dpctl/memory/AGENTS.md](/dpctl/memory/AGENTS.md) for details. -The `dpctl.memory` module provides Python classes for SYCL Unified Shared Memory (USM) allocation and management. USM allows allocating memory that can be accessed by both host and device. +## Quick Reference -## USM Types +### USM Types +- `MemoryUSMDevice` - device-only (fastest) +- `MemoryUSMShared` - host and device accessible +- `MemoryUSMHost` - host memory, device accessible -| Type | Class | Description | -|------|-------|-------------| -| Device | `MemoryUSMDevice` | Memory on device only, fastest device access | -| Shared | `MemoryUSMShared` | Accessible from host and device, automatic migration | -| Host | `MemoryUSMHost` | Host memory accessible from device | - -### When to Use Each Type - -- **Device**: Best performance for device-only data -- **Shared**: Convenient for data accessed by both host and device -- **Host**: When data primarily lives on host but device needs access - -## Memory Classes - -### MemoryUSMDevice - -```python -import dpctl -from dpctl.memory import MemoryUSMDevice - -q = dpctl.SyclQueue() - -# Allocate 1024 bytes of device memory -mem = MemoryUSMDevice(1024, queue=q) - -# Properties -mem.nbytes # Size in bytes -mem.sycl_queue # Associated queue -mem.sycl_device # Associated device -mem.sycl_context # Associated context - -# Copy from host -import numpy as np -host_data = np.array([1, 2, 3, 4], dtype=np.float32) -mem.copy_from_host(host_data.view(np.uint8)) - -# Copy to host -result = np.empty(4, dtype=np.float32) -mem.copy_to_host(result.view(np.uint8)) -``` - -### MemoryUSMShared - -```python -from dpctl.memory import MemoryUSMShared - -# Shared memory - accessible from both host and device -mem = MemoryUSMShared(1024, queue=q) - -# Can be used similarly to MemoryUSMDevice -``` - -### MemoryUSMHost - -```python -from dpctl.memory import MemoryUSMHost - -# Host memory with device accessibility -mem = MemoryUSMHost(1024, queue=q) -``` - -## __sycl_usm_array_interface__ - -All memory classes implement the `__sycl_usm_array_interface__` protocol for interoperability: - -```python -# The interface is a dictionary -interface = mem.__sycl_usm_array_interface__ - -# Structure: -{ - "data": (pointer, readonly_flag), # Tuple of (int, bool) - "shape": (nbytes,), # Tuple of sizes - "strides": None, # Strides (None for contiguous) - "typestr": "|u1", # Data type string (bytes) - "version": 1, # Interface version - "syclobj": queue # Associated SYCL queue -} -``` - -### Using the Interface - -```python -def accepts_usm_memory(obj): - """Accept any object with __sycl_usm_array_interface__.""" - if hasattr(obj, "__sycl_usm_array_interface__"): - iface = obj.__sycl_usm_array_interface__ - ptr, readonly = iface["data"] - nbytes = iface["shape"][0] - queue = iface["syclobj"] - return ptr, nbytes, queue - raise TypeError("Object does not support __sycl_usm_array_interface__") -``` - -## Memory Lifetime Rules - -### Rule 1: Memory is Queue-Bound - -Memory allocations are associated with a specific queue/context: - -```python -q1 = dpctl.SyclQueue("gpu:0") -q2 = dpctl.SyclQueue("gpu:1") - -mem1 = MemoryUSMDevice(1024, queue=q1) - -# Cannot directly use mem1 with q2 if different context -# Must copy through host or peer-to-peer if supported -``` - -### Rule 2: Memory Outlives Operations - -Ensure memory remains valid until all operations complete: - -```python -# BAD - memory might be freed before kernel finishes -def bad_example(): - mem = MemoryUSMDevice(1024, queue=q) - submit_kernel(mem) # Async operation - # mem goes out of scope - danger! - -# GOOD - wait for completion -def good_example(): - mem = MemoryUSMDevice(1024, queue=q) - event = submit_kernel(mem) - event.wait() # Ensure completion before mem freed -``` - -### Rule 3: View Keeps Base Alive - -Creating views extends lifetime: - -```python -base = MemoryUSMDevice(1024, queue=q) -view = base[100:200] # View into base - -# base remains valid as long as view exists -del base # view still holds reference -``` - -## Cython Memory Management - -### Allocation Pattern - -```cython -cdef class MemoryUSMDevice: - cdef DPCTLSyclUSMRef _memory_ref - cdef size_t _nbytes - cdef SyclQueue _queue - - def __cinit__(self, size_t nbytes, SyclQueue queue): - cdef DPCTLSyclQueueRef qref = queue.get_queue_ref() - - self._memory_ref = DPCTLmalloc_device(nbytes, qref) - if self._memory_ref is NULL: - raise MemoryError(f"Failed to allocate {nbytes} bytes") - - self._nbytes = nbytes - self._queue = queue - - def __dealloc__(self): - if self._memory_ref is not NULL: - DPCTLfree_with_queue( - self._memory_ref, - self._queue.get_queue_ref() - ) -``` - -### Copy Operations - -```cython -def copy_from_host(self, host_buffer): - """Copy data from host to device memory.""" - cdef unsigned char[::1] buf = host_buffer - - if buf.shape[0] > self._nbytes: - raise ValueError("Source buffer larger than allocation") - - cdef DPCTLSyclEventRef event = DPCTLQueue_Memcpy( - self._queue.get_queue_ref(), - self._memory_ref, - &buf[0], - buf.shape[0] - ) - - with nogil: - DPCTLEvent_Wait(event) - DPCTLEvent_Delete(event) -``` - -## Testing Memory Operations - -```python -import pytest -import numpy as np -from dpctl.memory import MemoryUSMDevice, MemoryUSMShared, MemoryUSMHost -from dpctl.tests.helper import get_queue_or_skip - - -class TestUSMMemory: - @pytest.mark.parametrize("mem_class", [ - MemoryUSMDevice, MemoryUSMShared, MemoryUSMHost - ]) - def test_allocation(self, mem_class): - q = get_queue_or_skip() - mem = mem_class(1024, queue=q) - assert mem.nbytes == 1024 - assert mem.sycl_queue == q - - @pytest.mark.parametrize("mem_class", [ - MemoryUSMDevice, MemoryUSMShared, MemoryUSMHost - ]) - def test_copy_roundtrip(self, mem_class): - q = get_queue_or_skip() - - src = np.array([1, 2, 3, 4], dtype=np.float32) - mem = mem_class(src.nbytes, queue=q) - - mem.copy_from_host(src.view(np.uint8)) - - dst = np.empty_like(src) - mem.copy_to_host(dst.view(np.uint8)) - - assert np.array_equal(src, dst) - - def test_interface(self): - q = get_queue_or_skip() - mem = MemoryUSMDevice(1024, queue=q) - - iface = mem.__sycl_usm_array_interface__ - assert "data" in iface - assert "shape" in iface - assert iface["shape"] == (1024,) - assert iface["syclobj"] == q -``` - -## Integration with usm_ndarray - -`dpctl.tensor.usm_ndarray` uses USM memory internally: - -```python -import dpctl.tensor as dpt - -# Creates usm_ndarray with underlying USM allocation -x = dpt.ones((100, 100), dtype=dpt.float32, usm_type="device") - -# Access memory properties -x.usm_type # "device", "shared", or "host" -x.sycl_queue # Associated queue - -# Get raw memory (if needed) -# Note: usm_ndarray manages its own memory -``` - -## Best Practices - -1. **Use usm_ndarray for arrays** - Prefer `dpctl.tensor` over raw memory -2. **Match queues** - Ensure arrays/memory share compatible queues -3. **Wait before access** - Wait for async operations before host access -4. **Use appropriate USM type** - Device for performance, shared for convenience -5. **Check allocation success** - Handle MemoryError for large allocations +### Lifetime Rules +1. Memory is queue-bound +2. Keep memory alive until operations complete +3. Views extend base memory lifetime diff --git a/.github/instructions/tensor-python.instructions.md b/.github/instructions/tensor-python.instructions.md index f294321ffc..3044b6197f 100644 --- a/.github/instructions/tensor-python.instructions.md +++ b/.github/instructions/tensor-python.instructions.md @@ -4,258 +4,18 @@ applyTo: - "dpctl/tensor/**/*.py" --- -# Tensor Python Code Instructions +# Tensor Python Instructions -## Context +See [dpctl/tensor/AGENTS.md](/dpctl/tensor/AGENTS.md) for patterns. -The `dpctl.tensor` module provides Array API-compliant tensor operations. It wraps C++ SYCL kernels with Python interfaces that follow the Python Array API standard. - -## Key Files - -| File | Purpose | -|------|---------| -| `_usmarray.pyx` | `usm_ndarray` extension type | -| `_elementwise_funcs.py` | Elementwise operation wrappers | -| `_elementwise_common.py` | Base classes for elementwise ops | -| `_reduction.py` | Reduction operations | -| `_manipulation_functions.py` | reshape, stack, concat, etc. | -| `_ctors.py` | Array constructors | -| `_copy_utils.py` | Copy and cast utilities | -| `_type_utils.py` | Type promotion logic | -| `_device.py` | Device object for Array API | - -## Array API Compliance - -DPCTL tensor aims for Array API compatibility. Key requirements: - -1. **Immutable dtype and device** - Arrays don't change type or device implicitly -2. **Explicit data movement** - Use `asarray(x, device=...)` for device transfers -3. **Type promotion rules** - Follow Array API promotion table -4. **No implicit broadcasting** - Some operations require explicit broadcast - -## Queue Validation Pattern - -All operations must validate that input arrays share a compatible execution context: - -```python -def some_operation(x, y): - """Example binary operation.""" - if not isinstance(x, usm_ndarray): - raise TypeError(f"Expected usm_ndarray, got {type(x)}") - if not isinstance(y, usm_ndarray): - raise TypeError(f"Expected usm_ndarray, got {type(y)}") - - # Get common queue (returns None if incompatible) - exec_q = dpctl.utils.get_execution_queue( - [x.sycl_queue, y.sycl_queue] - ) - if exec_q is None: - raise ExecutionPlacementError( - "Input arrays must share the same SYCL queue or be on " - "compatible queues." - ) - - # Proceed with operation - return _impl(x, y, sycl_queue=exec_q) -``` - -## Order Manager - -Use `OrderManager` for handling memory order (C/F contiguous): - -```python -from dpctl.utils import OrderManager - -def operation_with_order(x, order="K"): - """ - Args: - order: Memory order for output. - - "C": C-contiguous (row-major) - - "F": Fortran-contiguous (column-major) - - "A": F if input is F-contiguous, else C - - "K": Keep input order - """ - om = OrderManager(order) - - # Determine output order based on input - out_order = om.get_order(x) - - # Create output array with determined order - result = empty(x.shape, dtype=x.dtype, order=out_order, - sycl_queue=x.sycl_queue) - return result -``` - -## Elementwise Operation Wrapper - -```python -# In _elementwise_funcs.py - -from ._elementwise_common import ( - UnaryElementwiseFunc, - BinaryElementwiseFunc, -) -import dpctl.tensor._tensor_impl as ti - -_abs_docstring_ = """ -abs(x, /, out=None, order="K") - -Computes the absolute value element-wise. - -Args: - x (usm_ndarray): Input array. - out (usm_ndarray, optional): Output array. - order ({"K", "C", "F", "A"}): Memory order of output. - -Returns: - usm_ndarray: Absolute values. -""" - -abs = UnaryElementwiseFunc( - "abs", # Operation name - ti._abs_result_type, # Result type inference function - ti._abs, # Kernel implementation - _abs_docstring_ # Docstring -) -``` - -## Docstring Format - -Follow NumPy-style docstrings: +## Quick Reference +### Queue validation (required for all operations) ```python -def reshape(x, /, shape, *, order="C", copy=None): - """ - reshape(x, shape, order="C", copy=None) - - Gives a new shape to an array without changing its data. - - Args: - x (usm_ndarray): - Input array to reshape. - shape (tuple of ints): - New shape. One dimension may be -1, which is inferred. - order ({"C", "F"}, optional): - Read elements in this order. Default: "C". - copy (bool, optional): - If True, always copy. If False, never copy (raise if needed). - If None, copy only when necessary. Default: None. - - Returns: - usm_ndarray: - Reshaped array. May be a view or a copy. - - Raises: - ValueError: - If the new shape is incompatible with the original shape. - TypeError: - If input is not a usm_ndarray. - - Examples: - >>> import dpctl.tensor as dpt - >>> x = dpt.arange(6) - >>> dpt.reshape(x, (2, 3)) - usm_ndarray([[0, 1, 2], - [3, 4, 5]]) - """ -``` - -## Type Utilities - -```python -from ._type_utils import ( - _to_device_supported_dtype, - _resolve_one_strong_one_weak_types, - _resolve_one_strong_two_weak_types, -) - -def some_operation(x, scalar): - # Resolve type when mixing array and scalar - res_dtype = _resolve_one_strong_one_weak_types( - x.dtype, - type(scalar), - x.sycl_device - ) - - # Ensure dtype is supported on device - res_dtype = _to_device_supported_dtype(res_dtype, x.sycl_device) -``` - -## Common Imports - -```python -import dpctl -import dpctl.tensor as dpt -from dpctl.tensor import usm_ndarray -from dpctl.tensor._type_utils import _to_device_supported_dtype -from dpctl.utils import ExecutionPlacementError -import dpctl.tensor._tensor_impl as ti # C++ bindings - -import numpy as np -from numpy.core.numeric import normalize_axis_tuple +exec_q = dpctl.utils.get_execution_queue([x.sycl_queue, y.sycl_queue]) +if exec_q is None: + raise ExecutionPlacementError("...") ``` -## Error Messages - -Provide clear, actionable error messages: - -```python -# Good -raise TypeError( - f"Expected usm_ndarray for argument 'x', got {type(x).__name__}" -) - -# Good -raise ValueError( - f"Cannot reshape array of size {x.size} into shape {shape}" -) - -# Good -raise ExecutionPlacementError( - "Input arrays are allocated on incompatible devices. " - "Use dpctl.tensor.asarray(x, device=target_device) to copy." -) -``` - -## Output Array Pattern - -Many operations accept an optional `out` parameter: - -```python -def operation(x, /, out=None, order="K"): - # Validate input - if not isinstance(x, usm_ndarray): - raise TypeError(...) - - # Determine output properties - res_dtype = _compute_result_type(x.dtype) - res_shape = x.shape - - if out is not None: - # Validate output array - if not isinstance(out, usm_ndarray): - raise TypeError("out must be usm_ndarray") - if out.shape != res_shape: - raise ValueError(f"out shape {out.shape} != {res_shape}") - if out.dtype != res_dtype: - raise ValueError(f"out dtype {out.dtype} != {res_dtype}") - if out.sycl_queue != x.sycl_queue: - raise ValueError("out must be on same queue as input") - else: - # Create output array - out = empty(res_shape, dtype=res_dtype, order=order, - sycl_queue=x.sycl_queue) - - # Call implementation - _impl(x, out) - return out -``` - -## Best Practices - -1. **Validate inputs early** - Check types and shapes before computation -2. **Use get_execution_queue** - Ensure queue compatibility -3. **Support order parameter** - Allow C/F/A/K memory orders -4. **Return same queue** - Output arrays on same queue as inputs -5. **Follow Array API** - Match standard function signatures -6. **Document thoroughly** - Include types, shapes, exceptions in docstrings +### Adding operations +See checklist in [dpctl/tensor/AGENTS.md](/dpctl/tensor/AGENTS.md). diff --git a/.github/instructions/testing.instructions.md b/.github/instructions/testing.instructions.md index ce02e675c5..423b2709bd 100644 --- a/.github/instructions/testing.instructions.md +++ b/.github/instructions/testing.instructions.md @@ -2,256 +2,24 @@ applyTo: - "dpctl/tests/**/*.py" - "**/test_*.py" - - "**/*_test.py" --- # Testing Instructions -## Context +See [dpctl/tests/AGENTS.md](/dpctl/tests/AGENTS.md) for patterns and dtype lists. -DPCTL uses pytest for testing. Tests must handle device availability, dtype support variations, and multiple memory configurations. - -## Test Location - -``` -dpctl/tests/ -├── conftest.py # Fixtures and pytest configuration -├── helper/ # Test helper utilities -│ └── _helper.py -├── elementwise/ # Elementwise operation tests -│ ├── utils.py # Shared dtype lists -│ └── test_*.py -├── test_sycl_*.py # Core SYCL object tests -├── test_tensor_*.py # Tensor operation tests -└── test_usm_ndarray_*.py # Array tests -``` - -## Essential Fixtures and Helpers - -### Device/Queue Creation - -```python -from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported - -def test_something(): - # Skip test if no device available - q = get_queue_or_skip() - - # Skip if specific device type needed - q = get_queue_or_skip(("gpu",)) -``` - -### Dtype Support Check - -```python -def test_with_dtype(): - q = get_queue_or_skip() - dtype = "f8" # float64 - - # Skip if device doesn't support this dtype - skip_if_dtype_not_supported(dtype, q) - - # Test implementation - x = dpt.ones(10, dtype=dtype, sycl_queue=q) -``` - -## Standard Dtype Lists - -From `dpctl/tests/elementwise/utils.py`: - -```python -# Integer types (always supported) -_integral_dtypes = ["i1", "u1", "i2", "u2", "i4", "u4", "i8", "u8"] - -# Floating point types (may need device support check) -_real_fp_dtypes = ["f2", "f4", "f8"] # f8 (float64) needs fp64 support - -# Complex types -_complex_fp_dtypes = ["c8", "c16"] # c16 needs fp64 support - -# All numeric types -_all_dtypes = _integral_dtypes + _real_fp_dtypes + _complex_fp_dtypes - -# Boolean -_boolean_dtypes = ["?"] - -# USM allocation types -_usm_types = ["device", "shared", "host"] - -# Memory orders -_orders = ["C", "F", "A", "K"] -``` - -## Test Patterns - -### Basic Parametrized Test - -```python -import pytest -import dpctl.tensor as dpt -from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported - -@pytest.mark.parametrize("dtype", ["f4", "f8", "c8", "c16"]) -def test_operation(dtype): - q = get_queue_or_skip() - skip_if_dtype_not_supported(dtype, q) - - x = dpt.linspace(0, 1, 100, dtype=dtype, sycl_queue=q) - result = dpt.some_operation(x) - - expected = np.some_operation(dpt.asnumpy(x)) - assert_allclose(dpt.asnumpy(result), expected) -``` - -### Dtype Matrix Test (Binary Operations) - -```python -@pytest.mark.parametrize("dtype1", _all_dtypes) -@pytest.mark.parametrize("dtype2", _all_dtypes) -def test_binary_dtype_matrix(dtype1, dtype2): - q = get_queue_or_skip() - skip_if_dtype_not_supported(dtype1, q) - skip_if_dtype_not_supported(dtype2, q) - - x = dpt.ones(10, dtype=dtype1, sycl_queue=q) - y = dpt.ones(10, dtype=dtype2, sycl_queue=q) - - result = dpt.add(x, y) - - # Verify result dtype matches NumPy promotion - expected_dtype = np.result_type(np.dtype(dtype1), np.dtype(dtype2)) - assert result.dtype == expected_dtype -``` - -### USM Type Test +## Quick Reference +### Essential helpers (from `helper/_helper.py`) ```python -@pytest.mark.parametrize("usm_type", ["device", "shared", "host"]) -def test_usm_types(usm_type): - q = get_queue_or_skip() - - x = dpt.ones(10, dtype="f4", usm_type=usm_type, sycl_queue=q) - result = dpt.abs(x) - - assert result.usm_type == usm_type -``` - -### Memory Order Test - -```python -@pytest.mark.parametrize("order", ["C", "F"]) -def test_memory_order(order): - q = get_queue_or_skip() - - x = dpt.ones((10, 10), dtype="f4", order=order, sycl_queue=q) - - if order == "C": - assert x.flags.c_contiguous - else: - assert x.flags.f_contiguous -``` - -### Edge Case Tests - -```python -def test_empty_array(): - q = get_queue_or_skip() - x = dpt.empty((0,), dtype="f4", sycl_queue=q) - result = dpt.abs(x) - assert result.shape == (0,) - -def test_scalar(): - q = get_queue_or_skip() - x = dpt.asarray(5.0, dtype="f4", sycl_queue=q) - result = dpt.abs(x) - assert result.ndim == 0 - -def test_broadcast(): - q = get_queue_or_skip() - x = dpt.ones((10, 1), dtype="f4", sycl_queue=q) - y = dpt.ones((1, 10), dtype="f4", sycl_queue=q) - result = dpt.add(x, y) - assert result.shape == (10, 10) -``` - -## Test Naming Convention - -```python -# Function being tested: dpt.abs() - -def test_abs_basic(): - """Basic functionality test""" - -def test_abs_dtype_matrix(): - """Test all dtype combinations""" - -def test_abs_usm_types(): - """Test all USM allocation types""" - -def test_abs_order(): - """Test memory order preservation""" - -def test_abs_empty(): - """Test with empty array""" - -def test_abs_scalar(): - """Test with 0-d array""" - -def test_abs_broadcast(): - """Test broadcasting behavior""" - -def test_abs_out_param(): - """Test output array parameter""" - -def test_abs_inplace(): - """Test in-place operation (if supported)""" -``` - -## Assertions - -```python -import numpy as np -from numpy.testing import assert_allclose, assert_array_equal - -# For exact integer comparisons -assert_array_equal(dpt.asnumpy(result), expected) - -# For floating point with tolerance -assert_allclose(dpt.asnumpy(result), expected, rtol=1e-5, atol=1e-8) - -# For dtype checking -assert result.dtype == dpt.float32 - -# For shape checking -assert result.shape == (10, 10) - -# For USM type -assert result.usm_type == "device" - -# For queue equality -assert result.sycl_queue == x.sycl_queue -``` - -## Conftest Markers - -```python -# Mark test as having known complex number issues -@pytest.mark.broken_complex - -# Skip on CI -@pytest.mark.skip(reason="Known issue #123") - -# Expected failure -@pytest.mark.xfail(reason="Not yet implemented") +get_queue_or_skip() # Create queue or skip +skip_if_dtype_not_supported() # Skip if device lacks dtype ``` -## Best Practices +### Dtype lists (from `elementwise/utils.py`) +Use `_all_dtypes`, `_usm_types` for parametrization. -1. **Always use fixtures** - Use `get_queue_or_skip()` instead of direct queue creation -2. **Check dtype support** - Use `skip_if_dtype_not_supported()` for fp64/fp16 -3. **Test all dtypes** - Use parametrization to cover dtype matrix -4. **Test USM types** - Cover device, shared, and host memory -5. **Test edge cases** - Empty arrays, scalars, broadcasts -6. **Compare with NumPy** - Verify results match NumPy behavior -7. **Clean up resources** - Let arrays go out of scope naturally -8. **Use descriptive names** - Test names should indicate what's being tested +### Coverage requirements +- All supported dtypes +- All USM types (device, shared, host) +- Edge cases: empty arrays, scalars diff --git a/AGENTS.md b/AGENTS.md index 090acbab4c..a66f63b09c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -2,436 +2,54 @@ ## Overview -**DPCTL** (Data Parallel Control) is a Python library providing bindings for SYCL, enabling heterogeneous computing on CPUs, GPUs, and accelerators. It implements the Python Array API standard for tensor operations on SYCL devices. +**DPCTL** (Data Parallel Control) is a Python SYCL binding library for heterogeneous computing. It provides Python wrappers for SYCL runtime objects and implements the Python Array API standard for tensor operations. -- **License:** Apache 2.0 +- **License:** Apache 2.0 (see `LICENSE`) - **Copyright:** Intel Corporation -- **Primary Language:** Python with Cython bindings and C++ SYCL kernels ## Architecture ``` -┌─────────────────────────────────────────────────────────────────┐ -│ Python API Layer │ -│ dpctl, dpctl.tensor, dpctl.memory, dpctl.program, dpctl.utils │ -└─────────────────────────────────────────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────────────┐ -│ Cython Bindings │ -│ _sycl_device.pyx, _sycl_queue.pyx, _usmarray.pyx, etc. │ -└─────────────────────────────────────────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────────────┐ -│ C API Layer │ -│ libsyclinterface (dpctl_*.h) │ -└─────────────────────────────────────────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────────────┐ -│ C++ Kernel Layer │ -│ dpctl/tensor/libtensor (SYCL kernels) │ -└─────────────────────────────────────────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────────────┐ -│ SYCL Runtime │ -│ (Intel oneAPI, OpenSYCL, etc.) │ -└─────────────────────────────────────────────────────────────────┘ +Python API → Cython Bindings → C API → C++ Kernels → SYCL Runtime + dpctl/ _sycl_*.pyx libsycl- libtensor/ + interface/ ``` -### Module Relationships +## Directory Guide -- **dpctl**: Core SYCL object wrappers (Device, Queue, Context, Event, Platform) -- **dpctl.tensor**: Array API-compliant tensor operations using `usm_ndarray` -- **dpctl.memory**: USM (Unified Shared Memory) allocation classes -- **dpctl.program**: SYCL kernel compilation and program management -- **dpctl.utils**: Utility functions for device queries and order management - ---- - -## Module Reference - -### dpctl/ (Core SYCL Bindings) - -**Purpose:** Python wrappers for fundamental SYCL runtime objects. - -**Key Files:** -| File | Purpose | -|------|---------| -| `_sycl_device.pyx` | `SyclDevice` - wraps `sycl::device` | -| `_sycl_queue.pyx` | `SyclQueue` - wraps `sycl::queue` | -| `_sycl_context.pyx` | `SyclContext` - wraps `sycl::context` | -| `_sycl_event.pyx` | `SyclEvent` - wraps `sycl::event` | -| `_sycl_platform.pyx` | `SyclPlatform` - wraps `sycl::platform` | -| `_backend.pxd` | C API declarations for Cython | -| `enum_types.py` | Python enums for SYCL types | - -**Conventions:** -- Extension types use `cdef class` with C reference as `_*_ref` attribute -- All C resources cleaned up in `__dealloc__` -- Use `nogil` when calling blocking C operations -- Exceptions: `SyclDeviceCreationError`, `SyclQueueCreationError`, etc. - -### dpctl/tensor/ (Array API) - -**Purpose:** Python Array API-compliant tensor operations on SYCL devices. - -**Key Files:** -| File | Purpose | -|------|---------| -| `_usmarray.pyx` | `usm_ndarray` extension type | -| `_elementwise_funcs.py` | Elementwise operation wrappers | -| `_elementwise_common.py` | `UnaryElementwiseFunc`, `BinaryElementwiseFunc` | -| `_reduction.py` | Reduction operations (sum, prod, etc.) | -| `_manipulation_functions.py` | reshape, concat, stack, etc. | -| `_ctors.py` | Array constructors (empty, zeros, ones, etc.) | -| `_copy_utils.py` | Copy and type casting utilities | -| `_type_utils.py` | Type promotion and validation | - -**Type Dispatch Pattern:** -```python -# In _elementwise_funcs.py -abs = UnaryElementwiseFunc( - "abs", # Operation name - ti._abs_result_type, # Type inference function - ti._abs, # Kernel implementation - _abs_docstring_ # Documentation -) -``` - -**Queue Validation Pattern:** -```python -def validate_queues(q1, q2): - """Ensure arrays share execution context""" - if q1 != q2: - raise ExecutionPlacementError(...) -``` - -### dpctl/tensor/libtensor/ (C++ Kernels) - -**Purpose:** SYCL kernel implementations for tensor operations. - -**Directory Structure:** -``` -libtensor/ -├── include/ -│ ├── kernels/ -│ │ ├── elementwise_functions/ # 100+ operations -│ │ │ ├── common.hpp # Base patterns -│ │ │ ├── add.hpp, sin.hpp, etc. -│ │ ├── linalg_functions/ # GEMM, dot -│ │ ├── sorting/ # Sort algorithms -│ │ └── reductions.hpp -│ └── utils/ -│ ├── type_dispatch.hpp -│ ├── type_dispatch_building.hpp -│ └── offset_utils.hpp -└── source/ - ├── elementwise_functions/ - ├── reductions/ - └── tensor_*.cpp # Entry points -``` - -**Type Enumeration (14 types):** -```cpp -enum class typenum_t : int { - BOOL = 0, - INT8, UINT8, INT16, UINT16, INT32, UINT32, - INT64, UINT64, HALF, FLOAT, DOUBLE, - CFLOAT, CDOUBLE, -}; -``` - -**Kernel Functor Pattern:** -```cpp -template -struct MyFunctor { - using supports_sg_loadstore = std::true_type; // Vectorization hint - - resT operator()(const argT &x) const { - return /* computation */; - } - - template - sycl::vec operator()( - const sycl::vec &x) const { - return /* vectorized computation */; - } -}; -``` - -### dpctl/memory/ (USM Memory) - -**Purpose:** Unified Shared Memory allocation and management. - -**Key Classes:** -- `MemoryUSMDevice` - Device-only memory -- `MemoryUSMShared` - Host and device accessible -- `MemoryUSMHost` - Host memory accessible from device - -**Interface:** -All classes implement `__sycl_usm_array_interface__`: -```python -{ - "data": (ptr, readonly_flag), - "shape": (nbytes,), - "strides": None, - "typestr": "|u1", - "version": 1, - "syclobj": queue -} -``` - -### dpctl/program/ (Kernel Compilation) - -**Purpose:** Compile and manage SYCL kernels from source. - -**Key Classes:** -- `SyclProgram` - Compiled SYCL program -- `SyclKernel` - Individual kernel extracted from program - -### libsyclinterface/ (C API) - -**Purpose:** C wrapper around SYCL C++ runtime for language interoperability. - -**Naming Convention:** `DPCTL_` -```c -DPCTLDevice_Create() -DPCTLQueue_Submit() -DPCTLContext_Delete() -``` - -**Memory Ownership Annotations:** -| Annotation | Meaning | -|------------|---------| -| `__dpctl_give` | Caller receives ownership, must free | -| `__dpctl_take` | Function takes ownership, caller must not use after | -| `__dpctl_keep` | Function observes only, does not take ownership | -| `__dpctl_null` | NULL value expected or may be returned | - -**Example:** -```c -DPCTL_API -__dpctl_give DPCTLSyclContextRef -DPCTLContext_Create(__dpctl_keep const DPCTLSyclDeviceRef DRef, - error_handler_callback *handler, - int properties); -``` - ---- +| Directory | AGENTS.md | Purpose | +|-----------|-----------|---------| +| `dpctl/` | [dpctl/AGENTS.md](dpctl/AGENTS.md) | Core SYCL bindings (Device, Queue, Context) | +| `dpctl/tensor/` | [dpctl/tensor/AGENTS.md](dpctl/tensor/AGENTS.md) | Array API tensor operations | +| `dpctl/tensor/libtensor/` | [dpctl/tensor/libtensor/AGENTS.md](dpctl/tensor/libtensor/AGENTS.md) | C++ SYCL kernels | +| `dpctl/memory/` | [dpctl/memory/AGENTS.md](dpctl/memory/AGENTS.md) | USM memory management | +| `dpctl/tests/` | [dpctl/tests/AGENTS.md](dpctl/tests/AGENTS.md) | Test suite | +| `libsyclinterface/` | [libsyclinterface/AGENTS.md](libsyclinterface/AGENTS.md) | C API layer | ## Code Style -### Python/Cython -- **Formatter:** Black (line length 80) -- **Import Sort:** isort -- **Linting:** flake8, cython-lint -- **String Quotes:** Double quotes for Cython - -### C/C++ -- **Formatter:** clang-format (LLVM-based style) -- **Indent:** 4 spaces -- **Braces:** Break before functions, classes, namespaces - -### License Header (Required in ALL files) - -**Python/Cython:** -```python -# Data Parallel Control (dpctl) -# -# Copyright 2020-2025 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# ... -``` - -**C/C++:** -```cpp -//===-- filename.hpp - Description -*-C++-*-===// -// -// Data Parallel Control (dpctl) -// -// Copyright 2020-2025 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// ... -//===----------------------------------------------------------------------===// -``` - ---- - -## Testing Guidelines - -### Test Location -- Main tests: `dpctl/tests/` -- Elementwise tests: `dpctl/tests/elementwise/` -- Libtensor tests: `dpctl/tensor/libtensor/tests/` - -### Key Fixtures (`conftest.py`) -```python -get_queue_or_skip() # Create queue or skip test -skip_if_dtype_not_supported() # Skip if device lacks dtype support -``` - -### Dtype Coverage -All operations should be tested with: -```python -_integral_dtypes = ["i1", "u1", "i2", "u2", "i4", "u4", "i8", "u8"] -_real_fp_dtypes = ["f2", "f4", "f8"] -_complex_fp_dtypes = ["c8", "c16"] -``` - -### USM Type Coverage -```python -_usm_types = ["device", "shared", "host"] -``` - -### Memory Order Coverage -```python -_orders = ["C", "F", "A", "K"] -``` - -### Test Pattern -```python -@pytest.mark.parametrize("dtype", _all_dtypes) -@pytest.mark.parametrize("usm_type", _usm_types) -def test_operation(dtype, usm_type): - q = get_queue_or_skip() - skip_if_dtype_not_supported(dtype, q) - - x = dpt.ones(100, dtype=dtype, usm_type=usm_type, sycl_queue=q) - result = dpt.operation(x) - - expected = np.operation(dpt.asnumpy(x)) - assert_allclose(dpt.asnumpy(result), expected) -``` - ---- - -## Adding New Elementwise Operations - -### Checklist - -1. **C++ Kernel** (`libtensor/include/kernels/elementwise_functions/`) - - [ ] Create `operation.hpp` with functor template - - [ ] Define type support matrix - - [ ] Implement scalar and vector operations - -2. **C++ Source** (`libtensor/source/elementwise_functions/`) - - [ ] Create `operation.cpp` - - [ ] Instantiate dispatch tables - - [ ] Register in `tensor_elementwise.cpp` +Style tools and configuration are defined in: +- **Python/Cython:** `.pre-commit-config.yaml` (black, isort, flake8, cython-lint) +- **C/C++:** `.clang-format` +- **Linting config:** `.flake8` -3. **Python Wrapper** (`dpctl/tensor/`) - - [ ] Add to `_elementwise_funcs.py` - - [ ] Export in `__init__.py` +## License Header -4. **Tests** (`dpctl/tests/elementwise/`) - - [ ] Create `test_operation.py` - - [ ] Cover all dtypes, USM types, orders - - [ ] Test edge cases (empty, scalar, broadcast) - -5. **Documentation** (`docs/`) - - [ ] Add to API reference - - [ ] Include in elementwise functions list - ---- - -## Common Pitfalls - -### Memory Management -- **Always** clean up C resources in `__dealloc__` -- Use `__dpctl_give`/`__dpctl_take` annotations correctly -- Check for NULL returns from C API functions -- Release GIL (`with nogil:`) during blocking operations - -### Type Promotion -- Follow Array API type promotion rules -- Handle mixed real/complex carefully -- Check device support for fp16/fp64 before operations - -### Device Capabilities -- Not all devices support fp64 (double precision) -- Not all devices support fp16 (half precision) -- Use `skip_if_dtype_not_supported()` in tests - -### Queue Consistency -- All arrays in an operation must share the same queue -- Use `dpctl.utils.get_execution_queue()` to validate - -### Broadcasting -- Follow NumPy broadcasting rules -- Handle 0-d arrays (scalars) correctly - ---- - -## Code Review Checklist - -### All Code -- [ ] License header present and correct year -- [ ] Code formatted (black/clang-format) -- [ ] No hardcoded device assumptions (fp64, etc.) - -### Python/Cython -- [ ] Docstrings for public functions -- [ ] Type hints where applicable -- [ ] Exceptions have descriptive messages -- [ ] Resources cleaned up properly - -### C++ Kernels -- [ ] Functor handles all required types -- [ ] Vectorization hints correct (`supports_sg_loadstore`, etc.) -- [ ] No raw `new`/`delete` (use SYCL allocators) -- [ ] Kernel names are unique - -### C API -- [ ] Memory ownership annotations on all functions -- [ ] NULL checks for parameters -- [ ] Error handling via callbacks -- [ ] Extern C wrapper for C++ code - -### Tests -- [ ] Covers all supported dtypes -- [ ] Covers all USM types -- [ ] Handles device capability limitations -- [ ] Tests edge cases (empty arrays, scalars) -- [ ] Uses appropriate fixtures - ---- +All source files require Apache 2.0 header with Intel copyright. See existing files for format. ## Quick Reference -### Filter String Syntax -``` -backend:device_type:device_num -``` -Examples: `"opencl:gpu:0"`, `"level_zero:gpu"`, `"cpu"` - -### Common Imports ```python import dpctl import dpctl.tensor as dpt -from dpctl.tensor import usm_ndarray -from dpctl.memory import MemoryUSMShared -``` -### Create Queue -```python -q = dpctl.SyclQueue() # Default device -q = dpctl.SyclQueue("gpu") # First GPU -q = dpctl.SyclQueue("level_zero:gpu:0") # Specific device +q = dpctl.SyclQueue("gpu") # Create queue +x = dpt.ones((100, 100), dtype="f4", device=q) # Create array +np_array = dpt.asnumpy(x) # Transfer to host ``` -### Create Array -```python -x = dpt.empty((100, 100), dtype=dpt.float32, sycl_queue=q) -x = dpt.asarray(np_array, device=q) -``` +## Key Concepts -### Transfer Data -```python -np_array = dpt.asnumpy(x) # Device to host -x = dpt.asarray(np_array, device=q) # Host to device -``` +- **Queue:** Execution context binding device + context +- **USM:** Unified Shared Memory (device/shared/host types) +- **Filter string:** Device selector syntax `"backend:device_type:num"` diff --git a/dpctl/AGENTS.md b/dpctl/AGENTS.md new file mode 100644 index 0000000000..b4549a938d --- /dev/null +++ b/dpctl/AGENTS.md @@ -0,0 +1,60 @@ +# dpctl/ - Core SYCL Bindings + +## Purpose + +Python/Cython wrappers for SYCL runtime objects: Device, Queue, Context, Event, Platform. + +## Key Files + +| File | Purpose | +|------|---------| +| `_sycl_device.pyx` | `SyclDevice` wrapping `sycl::device` | +| `_sycl_queue.pyx` | `SyclQueue` wrapping `sycl::queue` | +| `_sycl_context.pyx` | `SyclContext` wrapping `sycl::context` | +| `_sycl_event.pyx` | `SyclEvent` wrapping `sycl::event` | +| `_sycl_platform.pyx` | `SyclPlatform` wrapping `sycl::platform` | +| `_backend.pxd` | C API declarations from libsyclinterface | +| `enum_types.py` | Python enums for SYCL types | + +## Cython Conventions + +### Required Directives +```cython +# distutils: language = c++ +# cython: language_level=3 +# cython: linetrace=True +``` + +### Extension Type Pattern +```cython +cdef class SyclDevice: + cdef DPCTLSyclDeviceRef _device_ref # C reference + + def __dealloc__(self): + if self._device_ref is not NULL: + DPCTLDevice_Delete(self._device_ref) + + cdef DPCTLSyclDeviceRef get_device_ref(self): + return self._device_ref +``` + +### Key Rules +- Store C references as `_*_ref` attributes +- Always clean up in `__dealloc__` +- Use `with nogil:` for blocking C calls +- Check NULL before using C API returns + +### Exceptions +- `SyclDeviceCreationError` +- `SyclQueueCreationError` +- `SyclContextCreationError` + +## cimport vs import + +```cython +# cimport - C-level declarations (compile-time) +from ._backend cimport DPCTLSyclDeviceRef, DPCTLDevice_Create + +# import - Python-level (runtime) +from . import _device_selection +``` diff --git a/dpctl/memory/AGENTS.md b/dpctl/memory/AGENTS.md new file mode 100644 index 0000000000..ee1f232402 --- /dev/null +++ b/dpctl/memory/AGENTS.md @@ -0,0 +1,41 @@ +# dpctl/memory/ - USM Memory Management + +## Purpose + +Python classes for SYCL Unified Shared Memory (USM) allocation. + +## USM Types + +| Class | USM Type | Description | +|-------|----------|-------------| +| `MemoryUSMDevice` | Device | Device-only, fastest access | +| `MemoryUSMShared` | Shared | Host and device accessible | +| `MemoryUSMHost` | Host | Host memory, device accessible | + +## __sycl_usm_array_interface__ + +All memory classes implement this protocol: +```python +{ + "data": (ptr, readonly_flag), + "shape": (nbytes,), + "strides": None, + "typestr": "|u1", + "version": 1, + "syclobj": queue +} +``` + +## Memory Lifetime Rules + +1. **Queue-bound:** Memory tied to specific queue/context +2. **Outlive operations:** Keep memory alive until operations complete +3. **Views extend lifetime:** Views keep base memory alive + +## Key Files + +| File | Purpose | +|------|---------| +| `_memory.pyx` | Memory class implementations | +| `_memory.pxd` | Cython declarations | +| `__init__.py` | Public API exports | diff --git a/dpctl/tensor/AGENTS.md b/dpctl/tensor/AGENTS.md new file mode 100644 index 0000000000..d032d63132 --- /dev/null +++ b/dpctl/tensor/AGENTS.md @@ -0,0 +1,52 @@ +# dpctl/tensor/ - Array API Tensor Operations + +## Purpose + +Python Array API-compliant tensor operations using `usm_ndarray` on SYCL devices. + +## Key Files + +| File | Purpose | +|------|---------| +| `_usmarray.pyx` | `usm_ndarray` extension type | +| `_elementwise_funcs.py` | Elementwise operation wrappers | +| `_elementwise_common.py` | `UnaryElementwiseFunc`, `BinaryElementwiseFunc` | +| `_reduction.py` | Reduction operations (sum, prod, etc.) | +| `_manipulation_functions.py` | reshape, concat, stack, etc. | +| `_ctors.py` | Array constructors (empty, zeros, ones) | +| `_type_utils.py` | Type promotion and validation | + +See [libtensor/AGENTS.md](libtensor/AGENTS.md) for C++ kernel implementation. + +## Elementwise Wrapper Pattern + +```python +from ._elementwise_common import UnaryElementwiseFunc +import dpctl.tensor._tensor_impl as ti + +abs = UnaryElementwiseFunc( + "abs", # Operation name + ti._abs_result_type, # Type inference + ti._abs, # Kernel implementation + _abs_docstring_ +) +``` + +## Queue Validation + +All operations must validate queue compatibility: + +```python +exec_q = dpctl.utils.get_execution_queue([x.sycl_queue, y.sycl_queue]) +if exec_q is None: + raise ExecutionPlacementError("Arrays on incompatible queues") +``` + +## Adding New Operations + +1. C++ kernel in `libtensor/include/kernels/` +2. C++ source in `libtensor/source/` +3. Register in `libtensor/source/tensor_elementwise.cpp` +4. Python wrapper in `_elementwise_funcs.py` +5. Export in `__init__.py` +6. Tests in `../tests/elementwise/` diff --git a/dpctl/tests/AGENTS.md b/dpctl/tests/AGENTS.md new file mode 100644 index 0000000000..93469b0f28 --- /dev/null +++ b/dpctl/tests/AGENTS.md @@ -0,0 +1,51 @@ +# dpctl/tests/ - Test Suite + +## Purpose + +pytest-based test suite for dpctl functionality. + +## Key Files + +| File | Purpose | +|------|---------| +| `conftest.py` | Fixtures and pytest configuration | +| `helper/_helper.py` | Test utilities | +| `elementwise/utils.py` | Dtype lists for parametrization | + +## Essential Fixtures + +From `helper/_helper.py`: +```python +get_queue_or_skip() # Create queue or skip test +skip_if_dtype_not_supported() # Skip if device lacks dtype +``` + +## Standard Dtype Lists + +Defined in `elementwise/utils.py`: +```python +_integral_dtypes = ["i1", "u1", "i2", "u2", "i4", "u4", "i8", "u8"] +_real_fp_dtypes = ["f2", "f4", "f8"] +_complex_fp_dtypes = ["c8", "c16"] +_usm_types = ["device", "shared", "host"] +``` + +## Test Pattern + +```python +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_operation(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.ones(100, dtype=dtype, sycl_queue=q) + result = dpt.operation(x) + # ... assertions +``` + +## Coverage Requirements + +- All supported dtypes +- All USM types (device, shared, host) +- Memory orders (C, F) where applicable +- Edge cases: empty arrays, scalars, broadcasting diff --git a/libsyclinterface/AGENTS.md b/libsyclinterface/AGENTS.md new file mode 100644 index 0000000000..b724de8679 --- /dev/null +++ b/libsyclinterface/AGENTS.md @@ -0,0 +1,58 @@ +# libsyclinterface/ - C API Layer + +## Purpose + +C wrapper around SYCL C++ runtime for language interoperability (used by Cython bindings). + +## Directory Structure + +``` +libsyclinterface/ +├── include/syclinterface/ +│ ├── Support/ +│ │ ├── DllExport.h # DPCTL_API macro +│ │ ├── ExternC.h # extern "C" macros +│ │ └── MemOwnershipAttrs.h # Ownership annotations +│ └── dpctl_sycl_*.h # API headers +├── source/ # Implementation +└── tests/ # C interface tests +``` + +## Naming Convention + +``` +DPCTL_ +``` + +Examples: `DPCTLDevice_Create`, `DPCTLQueue_Submit`, `DPCTLContext_Delete` + +## Memory Ownership Annotations + +Defined in `Support/MemOwnershipAttrs.h`: + +| Annotation | Meaning | +|------------|---------| +| `__dpctl_give` | Caller receives ownership, must free | +| `__dpctl_take` | Function takes ownership, caller must not use after | +| `__dpctl_keep` | Function only observes, does not take ownership | +| `__dpctl_null` | NULL is valid value | + +## Function Pattern + +```c +DPCTL_API +__dpctl_give DPCTLSyclContextRef +DPCTLContext_Create(__dpctl_keep const DPCTLSyclDeviceRef DRef, + error_handler_callback *handler, + int properties); + +DPCTL_API +void DPCTLContext_Delete(__dpctl_take DPCTLSyclContextRef CRef); +``` + +## Key Rules + +- Always annotate ownership on parameters and returns +- Return NULL on failure +- Use `DPCTL_C_EXTERN_C_BEGIN/END` for C++ implementations +- Use `DPCTL_API` for exported functions From a4d95a026080e7d490291c7e9efba9a4e89b7f53 Mon Sep 17 00:00:00 2001 From: Nikolay Petrov Date: Fri, 23 Jan 2026 18:00:42 -0800 Subject: [PATCH 3/3] instruction improvements --- .github/instructions/cython.instructions.md | 13 ++- .github/instructions/dpctl.instructions.md | 14 ++-- .../instructions/elementwise.instructions.md | 15 ++-- .../libsyclinterface.instructions.md | 10 +-- .../libtensor-cpp.instructions.md | 12 +-- .github/instructions/memory.instructions.md | 8 +- .../tensor-python.instructions.md | 10 +-- .github/instructions/testing.instructions.md | 18 ++-- AGENTS.md | 24 +++--- dpctl/AGENTS.md | 6 +- dpctl/program/AGENTS.md | 39 +++++++++ dpctl/tensor/AGENTS.md | 30 +++---- dpctl/tensor/libtensor/AGENTS.md | 83 +++++++++++++++++++ dpctl/tests/AGENTS.md | 25 +++--- dpctl/utils/AGENTS.md | 34 ++++++++ 15 files changed, 243 insertions(+), 98 deletions(-) create mode 100644 dpctl/program/AGENTS.md create mode 100644 dpctl/tensor/libtensor/AGENTS.md create mode 100644 dpctl/utils/AGENTS.md diff --git a/.github/instructions/cython.instructions.md b/.github/instructions/cython.instructions.md index acc673bd89..a68c2c9c2c 100644 --- a/.github/instructions/cython.instructions.md +++ b/.github/instructions/cython.instructions.md @@ -7,19 +7,16 @@ applyTo: # Cython Instructions -See [dpctl/AGENTS.md](/dpctl/AGENTS.md) for conventions and patterns. +See `dpctl/AGENTS.md` for full conventions. -## Quick Reference - -### Required Directives (after license header) +## Required Directives (after license) ```cython # distutils: language = c++ # cython: language_level=3 # cython: linetrace=True ``` -### Key Rules -- `cimport` for C-level declarations, `import` for Python -- Store C refs as `_*_ref`, clean up in `__dealloc__` +## Key Rules +- `cimport` for C-level, `import` for Python-level +- Store C refs as `_*_ref`, clean up in `__dealloc__` with NULL check - Use `with nogil:` for blocking C operations -- Check NULL before using C API returns diff --git a/.github/instructions/dpctl.instructions.md b/.github/instructions/dpctl.instructions.md index 7798f146a0..b97eb0e0a4 100644 --- a/.github/instructions/dpctl.instructions.md +++ b/.github/instructions/dpctl.instructions.md @@ -10,17 +10,17 @@ applyTo: # DPCTL General Instructions -See [/AGENTS.md](/AGENTS.md) for project overview and architecture. +See `AGENTS.md` at repository root for project overview and architecture. +Each major directory has its own `AGENTS.md` with specific conventions. ## Key References -- **Code style:** See `.pre-commit-config.yaml` for tool versions, `.clang-format` for C++ style -- **License:** Apache 2.0 with Intel copyright - see existing files for header format -- **Directory guides:** Each major directory has its own `AGENTS.md` +- **Code style:** `.pre-commit-config.yaml`, `.clang-format`, `.flake8` +- **License:** Apache 2.0 with Intel copyright - match existing file headers ## Critical Rules -1. **Device compatibility:** Not all devices support fp64/fp16 - check capabilities -2. **Queue consistency:** All arrays in an operation must share compatible queues -3. **Resource cleanup:** Always clean up C resources in `__dealloc__` +1. **Device compatibility:** Not all devices support fp64/fp16 - never assume availability +2. **Queue consistency:** Arrays in same operation must share compatible queues +3. **Resource cleanup:** Clean up C resources in `__dealloc__` with NULL check 4. **NULL checks:** Always check C API returns before use diff --git a/.github/instructions/elementwise.instructions.md b/.github/instructions/elementwise.instructions.md index ac266801f6..0b8c392fe1 100644 --- a/.github/instructions/elementwise.instructions.md +++ b/.github/instructions/elementwise.instructions.md @@ -1,6 +1,7 @@ --- applyTo: - - "dpctl/tensor/libtensor/**/elementwise_functions/**" + - "dpctl/tensor/libtensor/include/kernels/elementwise_functions/**" + - "dpctl/tensor/libtensor/source/elementwise_functions/**" - "dpctl/tensor/_elementwise_*.py" - "dpctl/tests/elementwise/**" --- @@ -10,16 +11,14 @@ applyTo: Full stack: C++ kernel → pybind11 → Python wrapper → tests ## References +- C++ kernels: `dpctl/tensor/libtensor/AGENTS.md` +- Python wrappers: `dpctl/tensor/AGENTS.md` +- Tests: `dpctl/tests/AGENTS.md` -- **C++ kernels:** [dpctl/tensor/libtensor/AGENTS.md](/dpctl/tensor/libtensor/AGENTS.md) -- **Python wrappers:** [dpctl/tensor/AGENTS.md](/dpctl/tensor/AGENTS.md) -- **Tests:** [dpctl/tests/AGENTS.md](/dpctl/tests/AGENTS.md) - -## Adding New Operation Checklist - +## Adding New Operation 1. `libtensor/include/kernels/elementwise_functions/op.hpp` - functor 2. `libtensor/source/elementwise_functions/op.cpp` - dispatch tables 3. Register in `tensor_elementwise.cpp` 4. `_elementwise_funcs.py` - Python wrapper 5. Export in `__init__.py` -6. `tests/elementwise/test_op.py` - full coverage +6. `tests/elementwise/test_op.py` - full dtype/usm coverage diff --git a/.github/instructions/libsyclinterface.instructions.md b/.github/instructions/libsyclinterface.instructions.md index bd5ffdb122..9b9d738d71 100644 --- a/.github/instructions/libsyclinterface.instructions.md +++ b/.github/instructions/libsyclinterface.instructions.md @@ -7,19 +7,17 @@ applyTo: # C API Instructions -See [libsyclinterface/AGENTS.md](/libsyclinterface/AGENTS.md) for conventions. +See `libsyclinterface/AGENTS.md` for conventions. -## Quick Reference - -### Naming +## Naming `DPCTL_` (e.g., `DPCTLDevice_Create`) -### Ownership annotations (from `Support/MemOwnershipAttrs.h`) +## Ownership annotations (see `include/syclinterface/Support/MemOwnershipAttrs.h`) - `__dpctl_give` - caller must free - `__dpctl_take` - function takes ownership - `__dpctl_keep` - function only observes -### Key Rules +## Key Rules - Annotate all parameters and returns - Return NULL on failure - Use `DPCTL_API` for exports diff --git a/.github/instructions/libtensor-cpp.instructions.md b/.github/instructions/libtensor-cpp.instructions.md index bfd1379771..6629139103 100644 --- a/.github/instructions/libtensor-cpp.instructions.md +++ b/.github/instructions/libtensor-cpp.instructions.md @@ -6,15 +6,11 @@ applyTo: # C++ SYCL Kernel Instructions -See [dpctl/tensor/libtensor/AGENTS.md](/dpctl/tensor/libtensor/AGENTS.md) for patterns and structure. +See `dpctl/tensor/libtensor/AGENTS.md` for patterns and directory structure. -## Quick Reference - -### Type dispatch -See `include/utils/type_dispatch_building.hpp` for the 14 supported types. - -### Key Rules -- Kernel class names must be unique +## Key Rules +- Kernel class names must be globally unique - Use `if constexpr` for compile-time type branching - Complex types don't support vectorization - Return `nullptr` from factory for unsupported types +- Check `include/kernels/elementwise_functions/common.hpp` for base patterns diff --git a/.github/instructions/memory.instructions.md b/.github/instructions/memory.instructions.md index 9a012356da..ff99ac359f 100644 --- a/.github/instructions/memory.instructions.md +++ b/.github/instructions/memory.instructions.md @@ -6,16 +6,14 @@ applyTo: # USM Memory Instructions -See [dpctl/memory/AGENTS.md](/dpctl/memory/AGENTS.md) for details. +See `dpctl/memory/AGENTS.md` for details. -## Quick Reference - -### USM Types +## USM Types - `MemoryUSMDevice` - device-only (fastest) - `MemoryUSMShared` - host and device accessible - `MemoryUSMHost` - host memory, device accessible -### Lifetime Rules +## Lifetime Rules 1. Memory is queue-bound 2. Keep memory alive until operations complete 3. Views extend base memory lifetime diff --git a/.github/instructions/tensor-python.instructions.md b/.github/instructions/tensor-python.instructions.md index 3044b6197f..b7dcefe840 100644 --- a/.github/instructions/tensor-python.instructions.md +++ b/.github/instructions/tensor-python.instructions.md @@ -6,16 +6,14 @@ applyTo: # Tensor Python Instructions -See [dpctl/tensor/AGENTS.md](/dpctl/tensor/AGENTS.md) for patterns. +See `dpctl/tensor/AGENTS.md` for patterns. -## Quick Reference - -### Queue validation (required for all operations) +## Queue validation (required) ```python exec_q = dpctl.utils.get_execution_queue([x.sycl_queue, y.sycl_queue]) if exec_q is None: raise ExecutionPlacementError("...") ``` -### Adding operations -See checklist in [dpctl/tensor/AGENTS.md](/dpctl/tensor/AGENTS.md). +## Adding operations +See checklist in `dpctl/tensor/AGENTS.md`. diff --git a/.github/instructions/testing.instructions.md b/.github/instructions/testing.instructions.md index 423b2709bd..560ec89b80 100644 --- a/.github/instructions/testing.instructions.md +++ b/.github/instructions/testing.instructions.md @@ -6,20 +6,18 @@ applyTo: # Testing Instructions -See [dpctl/tests/AGENTS.md](/dpctl/tests/AGENTS.md) for patterns and dtype lists. +See `dpctl/tests/AGENTS.md` for patterns. -## Quick Reference - -### Essential helpers (from `helper/_helper.py`) +## Essential helpers (from `helper/_helper.py`) ```python get_queue_or_skip() # Create queue or skip skip_if_dtype_not_supported() # Skip if device lacks dtype ``` -### Dtype lists (from `elementwise/utils.py`) -Use `_all_dtypes`, `_usm_types` for parametrization. +## Dtype/USM lists +Import from `elementwise/utils.py` - do not hardcode. -### Coverage requirements -- All supported dtypes -- All USM types (device, shared, host) -- Edge cases: empty arrays, scalars +## Coverage +- All dtypes from `_all_dtypes` +- All USM types: device, shared, host +- Edge cases: empty, scalar, broadcast diff --git a/AGENTS.md b/AGENTS.md index a66f63b09c..e64054f200 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -10,9 +10,10 @@ ## Architecture ``` -Python API → Cython Bindings → C API → C++ Kernels → SYCL Runtime - dpctl/ _sycl_*.pyx libsycl- libtensor/ - interface/ +Python API → Cython Bindings → C API → SYCL Runtime + dpctl/ _sycl_*.pyx libsyclinterface/ + +dpctl.tensor → pybind11 → C++ Kernels (libtensor/) → SYCL Runtime ``` ## Directory Guide @@ -23,19 +24,21 @@ Python API → Cython Bindings → C API → C++ Kernels → SYCL Runtim | `dpctl/tensor/` | [dpctl/tensor/AGENTS.md](dpctl/tensor/AGENTS.md) | Array API tensor operations | | `dpctl/tensor/libtensor/` | [dpctl/tensor/libtensor/AGENTS.md](dpctl/tensor/libtensor/AGENTS.md) | C++ SYCL kernels | | `dpctl/memory/` | [dpctl/memory/AGENTS.md](dpctl/memory/AGENTS.md) | USM memory management | +| `dpctl/program/` | [dpctl/program/AGENTS.md](dpctl/program/AGENTS.md) | SYCL kernel compilation | +| `dpctl/utils/` | [dpctl/utils/AGENTS.md](dpctl/utils/AGENTS.md) | Utility functions | | `dpctl/tests/` | [dpctl/tests/AGENTS.md](dpctl/tests/AGENTS.md) | Test suite | | `libsyclinterface/` | [libsyclinterface/AGENTS.md](libsyclinterface/AGENTS.md) | C API layer | ## Code Style -Style tools and configuration are defined in: -- **Python/Cython:** `.pre-commit-config.yaml` (black, isort, flake8, cython-lint) +Configuration files (do not hardcode versions - check these files): +- **Python/Cython:** `.pre-commit-config.yaml` - **C/C++:** `.clang-format` -- **Linting config:** `.flake8` +- **Linting:** `.flake8` ## License Header -All source files require Apache 2.0 header with Intel copyright. See existing files for format. +All source files require Apache 2.0 header with Intel copyright. Reference existing files for exact format. ## Quick Reference @@ -43,9 +46,9 @@ All source files require Apache 2.0 header with Intel copyright. See existing fi import dpctl import dpctl.tensor as dpt -q = dpctl.SyclQueue("gpu") # Create queue -x = dpt.ones((100, 100), dtype="f4", device=q) # Create array -np_array = dpt.asnumpy(x) # Transfer to host +q = dpctl.SyclQueue("gpu") # Create queue +x = dpt.ones((100, 100), dtype="f4", sycl_queue=q) # Create array +np_array = dpt.asnumpy(x) # Transfer to host ``` ## Key Concepts @@ -53,3 +56,4 @@ np_array = dpt.asnumpy(x) # Transfer to host - **Queue:** Execution context binding device + context - **USM:** Unified Shared Memory (device/shared/host types) - **Filter string:** Device selector syntax `"backend:device_type:num"` +- **Array API:** Python standard for array operations (https://data-apis.org/array-api/) diff --git a/dpctl/AGENTS.md b/dpctl/AGENTS.md index b4549a938d..23fbbf2d69 100644 --- a/dpctl/AGENTS.md +++ b/dpctl/AGENTS.md @@ -13,12 +13,14 @@ Python/Cython wrappers for SYCL runtime objects: Device, Queue, Context, Event, | `_sycl_context.pyx` | `SyclContext` wrapping `sycl::context` | | `_sycl_event.pyx` | `SyclEvent` wrapping `sycl::event` | | `_sycl_platform.pyx` | `SyclPlatform` wrapping `sycl::platform` | +| `_sycl_device_factory.pyx` | Device enumeration and selection | +| `_sycl_queue_manager.pyx` | Queue management utilities | | `_backend.pxd` | C API declarations from libsyclinterface | | `enum_types.py` | Python enums for SYCL types | ## Cython Conventions -### Required Directives +### Required Directives (after license header) ```cython # distutils: language = c++ # cython: language_level=3 @@ -40,7 +42,7 @@ cdef class SyclDevice: ### Key Rules - Store C references as `_*_ref` attributes -- Always clean up in `__dealloc__` +- Always clean up in `__dealloc__` with NULL check - Use `with nogil:` for blocking C calls - Check NULL before using C API returns diff --git a/dpctl/program/AGENTS.md b/dpctl/program/AGENTS.md new file mode 100644 index 0000000000..3f24023452 --- /dev/null +++ b/dpctl/program/AGENTS.md @@ -0,0 +1,39 @@ +# dpctl/program/ - SYCL Kernel Compilation + +## Purpose + +Compile and manage SYCL kernels from OpenCL C or SPIR-V source. + +## Key Files + +| File | Purpose | +|------|---------| +| `_program.pyx` | `SyclProgram`, `SyclKernel` extension types | +| `_program.pxd` | Cython declarations | +| `__init__.py` | Public API exports | + +## Classes + +- **`SyclProgram`** - Compiled SYCL program containing one or more kernels +- **`SyclKernel`** - Individual kernel extracted from a program + +## Usage Pattern + +```python +from dpctl.program import create_program_from_source + +source = """ +__kernel void add(__global float* a, __global float* b, __global float* c) { + int i = get_global_id(0); + c[i] = a[i] + b[i]; +} +""" + +program = create_program_from_source(queue, source) +kernel = program.get_sycl_kernel("add") +``` + +## Notes + +- Programs are context-bound +- Follows same Cython patterns as core dpctl (see `../AGENTS.md`) diff --git a/dpctl/tensor/AGENTS.md b/dpctl/tensor/AGENTS.md index d032d63132..594809960e 100644 --- a/dpctl/tensor/AGENTS.md +++ b/dpctl/tensor/AGENTS.md @@ -10,11 +10,15 @@ Python Array API-compliant tensor operations using `usm_ndarray` on SYCL devices |------|---------| | `_usmarray.pyx` | `usm_ndarray` extension type | | `_elementwise_funcs.py` | Elementwise operation wrappers | -| `_elementwise_common.py` | `UnaryElementwiseFunc`, `BinaryElementwiseFunc` | -| `_reduction.py` | Reduction operations (sum, prod, etc.) | -| `_manipulation_functions.py` | reshape, concat, stack, etc. | -| `_ctors.py` | Array constructors (empty, zeros, ones) | +| `_elementwise_common.py` | `UnaryElementwiseFunc`, `BinaryElementwiseFunc` base classes | +| `_reduction.py` | Reduction operations (sum, prod, mean, etc.) | +| `_manipulation_functions.py` | reshape, concat, stack, split, etc. | +| `_ctors.py` | Array constructors (empty, zeros, ones, arange, etc.) | | `_type_utils.py` | Type promotion and validation | +| `_sorting.py` | sort, argsort | +| `_searchsorted.py` | searchsorted, digitize | +| `_dlpack.pyx` | DLPack interoperability | +| `_copy_utils.py` | Copy and type casting | See [libtensor/AGENTS.md](libtensor/AGENTS.md) for C++ kernel implementation. @@ -26,15 +30,13 @@ import dpctl.tensor._tensor_impl as ti abs = UnaryElementwiseFunc( "abs", # Operation name - ti._abs_result_type, # Type inference - ti._abs, # Kernel implementation + ti._abs_result_type, # Type inference function + ti._abs, # Kernel implementation (from pybind11) _abs_docstring_ ) ``` -## Queue Validation - -All operations must validate queue compatibility: +## Queue Validation (required for all operations) ```python exec_q = dpctl.utils.get_execution_queue([x.sycl_queue, y.sycl_queue]) @@ -44,9 +46,9 @@ if exec_q is None: ## Adding New Operations -1. C++ kernel in `libtensor/include/kernels/` -2. C++ source in `libtensor/source/` -3. Register in `libtensor/source/tensor_elementwise.cpp` -4. Python wrapper in `_elementwise_funcs.py` +1. C++ kernel header: `libtensor/include/kernels//op.hpp` +2. C++ source: `libtensor/source//op.cpp` +3. Register in appropriate `tensor_*.cpp` entry point +4. Python wrapper in appropriate `_*.py` module 5. Export in `__init__.py` -6. Tests in `../tests/elementwise/` +6. Tests in `../tests/` with full dtype/usm coverage diff --git a/dpctl/tensor/libtensor/AGENTS.md b/dpctl/tensor/libtensor/AGENTS.md new file mode 100644 index 0000000000..29790f7299 --- /dev/null +++ b/dpctl/tensor/libtensor/AGENTS.md @@ -0,0 +1,83 @@ +# dpctl/tensor/libtensor/ - C++ SYCL Kernels + +## Purpose + +C++ SYCL kernel implementations for tensor operations, exposed to Python via pybind11. + +## Directory Structure + +``` +libtensor/ +├── include/ +│ ├── kernels/ +│ │ ├── elementwise_functions/ # Unary/binary operations +│ │ │ ├── common.hpp # Base iteration patterns +│ │ │ ├── common_inplace.hpp # In-place patterns +│ │ │ └── .hpp # Individual operations +│ │ ├── linalg_functions/ # GEMM, dot product +│ │ ├── sorting/ # Sort, searchsorted +│ │ ├── reductions.hpp +│ │ └── accumulators.hpp +│ └── utils/ +│ ├── type_dispatch.hpp # Runtime type lookup +│ ├── type_dispatch_building.hpp # Dispatch table generation +│ ├── offset_utils.hpp # Stride/offset calculations +│ └── sycl_alloc_utils.hpp # USM allocation helpers +├── source/ +│ ├── elementwise_functions/ +│ ├── reductions/ +│ ├── sorting/ +│ ├── linalg_functions/ +│ └── tensor_*.cpp # pybind11 module definitions +└── tests/ # C++ unit tests +``` + +## Type Dispatch + +14 supported types - see `include/utils/type_dispatch_building.hpp` for `typenum_t` enum. + +## Kernel Functor Pattern + +```cpp +template +struct MyOpFunctor { + // Vectorization hints (check common.hpp for usage) + using supports_sg_loadstore = std::true_type; + using supports_vec = std::true_type; + + // Scalar operation (required) + resT operator()(const argT &x) const { + return /* computation */; + } + + // Vectorized operation (optional, when supports_vec is true) + template + sycl::vec operator()( + const sycl::vec &x) const { + return /* vectorized computation */; + } +}; +``` + +## Factory Pattern + +```cpp +template +struct MyOpContigFactory { + fnT get() { + if constexpr (/* unsupported type combination */) { + return nullptr; // Signals unsupported + } else { + return my_op_contig_impl; + } + } +}; +``` + +## Key Rules + +- Kernel class names must be globally unique +- Use `if constexpr` for compile-time type branching +- Complex types typically don't support vectorization +- Return `nullptr` from factory for unsupported type combinations +- Check `common.hpp` and existing operations for patterns diff --git a/dpctl/tests/AGENTS.md b/dpctl/tests/AGENTS.md index 93469b0f28..46e0a9c378 100644 --- a/dpctl/tests/AGENTS.md +++ b/dpctl/tests/AGENTS.md @@ -9,25 +9,22 @@ pytest-based test suite for dpctl functionality. | File | Purpose | |------|---------| | `conftest.py` | Fixtures and pytest configuration | -| `helper/_helper.py` | Test utilities | -| `elementwise/utils.py` | Dtype lists for parametrization | +| `helper/_helper.py` | `get_queue_or_skip()`, `skip_if_dtype_not_supported()` | +| `elementwise/utils.py` | Dtype and USM type lists for parametrization | -## Essential Fixtures +## Essential Helpers From `helper/_helper.py`: ```python get_queue_or_skip() # Create queue or skip test -skip_if_dtype_not_supported() # Skip if device lacks dtype +skip_if_dtype_not_supported() # Skip if device lacks dtype (fp64/fp16) ``` -## Standard Dtype Lists +## Dtype/USM Lists -Defined in `elementwise/utils.py`: +**Do not hardcode** - import from `elementwise/utils.py`: ```python -_integral_dtypes = ["i1", "u1", "i2", "u2", "i4", "u4", "i8", "u8"] -_real_fp_dtypes = ["f2", "f4", "f8"] -_complex_fp_dtypes = ["c8", "c16"] -_usm_types = ["device", "shared", "host"] +from .utils import _all_dtypes, _usm_types, _no_complex_dtypes ``` ## Test Pattern @@ -45,7 +42,7 @@ def test_operation(dtype): ## Coverage Requirements -- All supported dtypes -- All USM types (device, shared, host) -- Memory orders (C, F) where applicable -- Edge cases: empty arrays, scalars, broadcasting +- All supported dtypes (see `elementwise/utils.py`) +- All USM types: device, shared, host +- Memory orders: C, F where applicable +- Edge cases: empty arrays, 0-d arrays (scalars), broadcasting diff --git a/dpctl/utils/AGENTS.md b/dpctl/utils/AGENTS.md new file mode 100644 index 0000000000..8ab88fff5f --- /dev/null +++ b/dpctl/utils/AGENTS.md @@ -0,0 +1,34 @@ +# dpctl/utils/ - Utility Functions + +## Purpose + +Helper utilities for device queries, execution context management, and ordering. + +## Key Files + +| File | Purpose | +|------|---------| +| `_compute_follows_data.pyx` | `get_execution_queue()` for queue compatibility | +| `_order_manager.py` | `OrderManager` for memory layout handling | +| `_intel_device_info.py` | Intel-specific device information | +| `_onetrace_context.py` | Tracing/profiling context manager | + +## Key Functions + +### get_execution_queue() +Validates queue compatibility between arrays: +```python +from dpctl.utils import get_execution_queue + +exec_q = get_execution_queue([x.sycl_queue, y.sycl_queue]) +if exec_q is None: + raise ExecutionPlacementError("Incompatible queues") +``` + +### ExecutionPlacementError +Exception raised when arrays are on incompatible queues. + +## Notes + +- `get_execution_queue()` is critical for all tensor operations +- See usage examples in `dpctl/tensor/` modules