diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index 6d04a43ce6..dbbbb0e1b5 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -6,6 +6,8 @@ on:
- master
pull_request:
+permissions: read-all
+
env:
PACKAGE_NAME: dpctl
MODULE_NAME: dpctl
@@ -20,7 +22,7 @@ jobs:
matrix:
python: ['3.9', '3.10', '3.11']
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
with:
fetch-depth: 0
@@ -28,7 +30,7 @@ jobs:
run: |
echo "pkgs_dirs: [~/.conda/pkgs]" >> ~/.condarc
- name: Cache conda packages
- uses: actions/cache@v3
+ uses: actions/cache@v4
env:
CACHE_NUMBER: 3 # Increase to reset cache
with:
@@ -58,12 +60,12 @@ jobs:
$CHANNELS \
conda-recipe
- name: Upload artifact
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }}
path: /usr/share/miniconda/conda-bld/linux-64/${{ env.PACKAGE_NAME }}-*.tar.bz2
- name: Upload wheels artifact
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Wheels Python ${{ matrix.python }}
path: ${{ env.WHEELS_OUTPUT_FOLDER }}${{ env.PACKAGE_NAME }}-*.whl
@@ -77,10 +79,10 @@ jobs:
env:
conda-bld: C:\Miniconda\conda-bld\win-64\
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
with:
fetch-depth: 0
- - uses: conda-incubator/setup-miniconda@v2
+ - uses: conda-incubator/setup-miniconda@v3
with:
auto-activate-base: true
conda-build-version: "*"
@@ -88,7 +90,7 @@ jobs:
python-version: ${{ matrix.python }}
- name: Cache conda packages
- uses: actions/cache@v3
+ uses: actions/cache@v4
env:
CACHE_NUMBER: 3 # Increase to reset cache
with:
@@ -107,12 +109,12 @@ jobs:
OVERRIDE_INTEL_IPO: 1 # IPO requires more resources that GH actions VM provides
run: conda build --no-test --python ${{ matrix.python }} -c intel -c conda-forge --override-channels conda-recipe
- name: Upload artifact
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }}
path: ${{ env.conda-bld }}${{ env.PACKAGE_NAME }}-*.tar.bz2
- name: Upload wheels artifact
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Wheels Python ${{ matrix.python }}
path: ${{ env.WHEELS_OUTPUT_FOLDER }}${{ env.PACKAGE_NAME }}-*.whl
@@ -132,7 +134,7 @@ jobs:
steps:
- name: Download artifact
- uses: actions/download-artifact@v3
+ uses: actions/download-artifact@v4
with:
name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }}
- name: Add conda to system path
@@ -159,7 +161,7 @@ jobs:
run: |
echo "pkgs_dirs: [~/.conda/pkgs]" >> ~/.condarc
- name: Cache conda packages
- uses: actions/cache@v3
+ uses: actions/cache@v4
env:
CACHE_NUMBER: 3 # Increase to reset cache
with:
@@ -217,10 +219,10 @@ jobs:
steps:
- name: Download artifact
- uses: actions/download-artifact@v3
+ uses: actions/download-artifact@v4
with:
name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }}
- - uses: conda-incubator/setup-miniconda@v2
+ - uses: conda-incubator/setup-miniconda@v3
with:
auto-update-conda: true
conda-build-version: '*'
@@ -260,7 +262,7 @@ jobs:
shell: pwsh
run: Get-Content -Path .\lockfile
- name: Cache conda packages
- uses: actions/cache@v3
+ uses: actions/cache@v4
env:
CACHE_NUMBER: 3 # Increase to reset cache
with:
@@ -324,12 +326,12 @@ jobs:
python: ['3.9', '3.10', '3.11']
steps:
- name: Download conda artifact
- uses: actions/download-artifact@v3
+ uses: actions/download-artifact@v4
with:
name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }}
- name: Download wheel artifact
- uses: actions/download-artifact@v3
+ uses: actions/download-artifact@v4
with:
name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Wheels Python ${{ matrix.python }}
@@ -360,16 +362,16 @@ jobs:
python: ['3.9', '3.10', '3.11']
steps:
- name: Download artifact
- uses: actions/download-artifact@v3
+ uses: actions/download-artifact@v4
with:
name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }}
- name: Download wheel artifact
- uses: actions/download-artifact@v3
+ uses: actions/download-artifact@v4
with:
name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Wheels Python ${{ matrix.python }}
- - uses: conda-incubator/setup-miniconda@v2
+ - uses: conda-incubator/setup-miniconda@v3
with:
auto-activate-base: true
activate-environment: ""
@@ -409,11 +411,11 @@ jobs:
# Needed to be able to run conda index
run: conda install conda-build python=${{ matrix.python }}
- name: Checkout dpctl repo
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Download artifact
- uses: actions/download-artifact@v3
+ uses: actions/download-artifact@v4
with:
name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }}
- name: Add conda to system path
@@ -435,7 +437,7 @@ jobs:
run: |
echo "pkgs_dirs: [~/.conda/pkgs]" >> ~/.condarc
- name: Cache conda packages
- uses: actions/cache@v3
+ uses: actions/cache@v4
env:
CACHE_NUMBER: 3 # Increase to reset cache
with:
@@ -539,6 +541,8 @@ jobs:
array-api-conformity:
needs: build_linux
runs-on: ${{ matrix.runner }}
+ permissions:
+ pull-requests: write
strategy:
matrix:
@@ -550,12 +554,12 @@ jobs:
CHANNELS: -c intel -c conda-forge --override-channels
steps:
- name: Checkout dpctl repo
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Cache array API tests
id: cache-array-api-tests
- uses: actions/cache@v3
+ uses: actions/cache@v4
env:
ARRAY_CACHE: 3
with:
@@ -574,7 +578,7 @@ jobs:
git clone --recurse-submodules https://github.com/data-apis/array-api-tests array-api-tests
cd array-api-tests
- name: Download artifact
- uses: actions/download-artifact@v3
+ uses: actions/download-artifact@v4
with:
name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }}
- name: Add conda to system path
@@ -601,7 +605,7 @@ jobs:
run: |
echo "pkgs_dirs: [~/.conda/pkgs]" >> ~/.condarc
- name: Cache conda packages
- uses: actions/cache@v3
+ uses: actions/cache@v4
env:
CACHE_NUMBER: 3 # Increase to reset cache
with:
@@ -642,7 +646,7 @@ jobs:
python -c "import dpctl; dpctl.lsplatform()"
export ARRAY_API_TESTS_MODULE=dpctl.tensor
cd /home/runner/work/array-api-tests
- pytest --json-report --json-report-file=$FILE --skips-file ${GITHUB_WORKSPACE}/.github/workflows/array-api-skips.txt array_api_tests/ || true
+ pytest --json-report --json-report-file=$FILE --disable-deadline --skips-file ${GITHUB_WORKSPACE}/.github/workflows/array-api-skips.txt array_api_tests/ || true
- name: Set Github environment variables
shell: bash -l {0}
run: |
@@ -668,7 +672,7 @@ jobs:
run: echo "::notice ${{ env.MESSAGE }}"
- name: Post result to PR
if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork }}
- uses: mshick/add-pr-comment@v1
+ uses: mshick/add-pr-comment@v2
with:
message: |
${{ env.MESSAGE }}
@@ -684,7 +688,7 @@ jobs:
run:
shell: bash -el {0}
steps:
- - uses: conda-incubator/setup-miniconda@v2
+ - uses: conda-incubator/setup-miniconda@v3
with:
run-post: false
channel-priority: "disabled"
@@ -695,7 +699,7 @@ jobs:
run: conda install anaconda-client
- name: Checkout repo
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
repository: IntelPython/devops-tools
fetch-depth: 0
diff --git a/.github/workflows/cpp_style_checks.yml b/.github/workflows/cpp_style_checks.yml
index a450bff627..facf85651b 100644
--- a/.github/workflows/cpp_style_checks.yml
+++ b/.github/workflows/cpp_style_checks.yml
@@ -9,19 +9,21 @@ on:
push:
branches: [master]
+permissions: read-all
+
jobs:
formatting-check:
name: clang-format
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- name: Run clang-format style check for C/C++ programs.
- uses: jidicula/clang-format-action@v3.5.1
+ uses: jidicula/clang-format-action@v4.11.0
with:
clang-format-version: '11'
check-path: 'libsyclinterface'
- name: Run clang-format style check for api headers.
- uses: jidicula/clang-format-action@v3.5.1
+ uses: jidicula/clang-format-action@v4.11.0
with:
clang-format-version: '11'
check-path: 'dpctl/apis'
diff --git a/.github/workflows/generate-coverage.yaml b/.github/workflows/generate-coverage.yaml
index edf03bc8f6..7ec430331f 100644
--- a/.github/workflows/generate-coverage.yaml
+++ b/.github/workflows/generate-coverage.yaml
@@ -4,10 +4,14 @@ on:
push:
branches: [master]
+permissions: read-all
+
jobs:
generate-coverage:
name: Generate coverage and push to Coveralls.io
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-latest
+ permissions:
+ pull-requests: write
env:
ONEAPI_ROOT: /opt/intel/oneapi
@@ -17,7 +21,7 @@ jobs:
steps:
- name: Cancel Previous Runs
- uses: styfle/cancel-workflow-action@0.11.0
+ uses: styfle/cancel-workflow-action@0.12.1
with:
access_token: ${{ github.token }}
@@ -46,14 +50,14 @@ jobs:
sudo apt-get install ninja-build
- name: Setup Python
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v5
with:
python-version: '3.11'
architecture: x64
- name: Cache Gtest
id: cache-gtest
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: |
/home/runner/work/googletest-1.13.0/install
@@ -77,7 +81,7 @@ jobs:
make && make install
- name: Checkout repo
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
diff --git a/.github/workflows/generate-docs.yml b/.github/workflows/generate-docs.yml
index 84bbed4622..1faa18713c 100644
--- a/.github/workflows/generate-docs.yml
+++ b/.github/workflows/generate-docs.yml
@@ -6,13 +6,18 @@ on:
pull_request:
types: [opened, synchronize, reopened, closed]
+permissions: read-all
+
jobs:
build-and-deploy:
name: Build and Deploy Documentation
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-latest
+ permissions:
+ contents: write
+ pull-requests: write
steps:
- name: Cancel Previous Runs
- uses: styfle/cancel-workflow-action@0.11.0
+ uses: styfle/cancel-workflow-action@0.12.1
with:
access_token: ${{ github.token }}
- name: Add Intel repository
@@ -41,7 +46,7 @@ jobs:
sudo apt-get install ninja-build
- name: Setup Python
if: ${{ !github.event.pull_request || github.event.action != 'closed' }}
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v5
with:
python-version: '3.10'
architecture: x64
@@ -51,7 +56,7 @@ jobs:
run: |
pip install numpy cython setuptools scikit-build cmake sphinx"<7.2" sphinx_rtd_theme pydot graphviz sphinxcontrib-programoutput sphinxcontrib-googleanalytics
- name: Checkout repo
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
@@ -76,7 +81,7 @@ jobs:
mv ../cmake-install/docs/docs ~/docs
git clean -dfx
- name: Publish docs
- if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork && github.ref == 'refs/heads/master' }}
+ if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork && github.ref == 'refs/heads/master' && github.event.action != 'closed' }}
shell: bash -l {0}
run: |
git remote add tokened_docs https://IntelPython:${{ secrets.GITHUB_TOKEN }}@github.com/IntelPython/dpctl.git
@@ -93,7 +98,7 @@ jobs:
git push tokened_docs gh-pages
- name: Save built docs as an artifact
if: ${{ github.event.pull_request && github.event.pull_request.head.repo.fork && github.event.action != 'closed'}}
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: ${{ env.PACKAGE_NAME }} rendered documentation
path: ~/docs
@@ -138,7 +143,7 @@ jobs:
if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork && github.event.action != 'closed' }}
env:
PR_NUM: ${{ github.event.number }}
- uses: mshick/add-pr-comment@v1
+ uses: mshick/add-pr-comment@v2
with:
message: |
View rendered docs @ https://intelpython.github.io/dpctl/pulls/${{ env.PR_NUM }}/index.html
@@ -148,7 +153,7 @@ jobs:
if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork && github.event.action == 'closed' }}
env:
PR_NUM: ${{ github.event.number }}
- uses: mshick/add-pr-comment@v1
+ uses: mshick/add-pr-comment@v2
with:
message: |
Deleted rendered PR docs from intelpython.github.com/dpctl, latest should be updated shortly. :crossed_fingers:
diff --git a/.github/workflows/openssf-scorecard.yml b/.github/workflows/openssf-scorecard.yml
new file mode 100644
index 0000000000..fbd16a4f28
--- /dev/null
+++ b/.github/workflows/openssf-scorecard.yml
@@ -0,0 +1,73 @@
+# This workflow uses actions that are not certified by GitHub. They are provided
+# by a third-party and are governed by separate terms of service, privacy
+# policy, and support documentation.
+
+name: Scorecard supply-chain security
+on:
+ # For Branch-Protection check. Only the default branch is supported. See
+ # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection
+ branch_protection_rule:
+ # To guarantee Maintained check is occasionally updated. See
+ # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained
+ schedule:
+ - cron: '28 2 * * 1'
+ - cron: '28 2 * * 4'
+ push:
+ branches: [ "master" ]
+
+# Declare default permissions as read only.
+permissions: read-all
+
+jobs:
+ analysis:
+ name: Scorecard analysis
+ runs-on: ubuntu-latest
+ permissions:
+ # Needed to upload the results to code-scanning dashboard.
+ security-events: write
+ # Needed to publish results and get a badge (see publish_results below).
+ id-token: write
+ # Uncomment the permissions below if installing in a private repository.
+ # contents: read
+ # actions: read
+
+ steps:
+ - name: "Checkout code"
+ uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 # v3.1.0
+ with:
+ persist-credentials: false
+
+ - name: "Run analysis"
+ uses: ossf/scorecard-action@e38b1902ae4f44df626f11ba0734b14fb91f8f86 # v2.1.2
+ with:
+ results_file: results.sarif
+ results_format: sarif
+ # (Optional) "write" PAT token. Uncomment the `repo_token` line below if:
+ # - you want to enable the Branch-Protection check on a *public* repository, or
+ # - you are installing Scorecard on a *private* repository
+ # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-pat.
+ # repo_token: ${{ secrets.SCORECARD_TOKEN }}
+
+ # Public repositories:
+ # - Publish results to OpenSSF REST API for easy access by consumers
+ # - Allows the repository to include the Scorecard badge.
+ # - See https://github.com/ossf/scorecard-action#publishing-results.
+ # For private repositories:
+ # - `publish_results` will always be set to `false`, regardless
+ # of the value entered here.
+ publish_results: true
+
+ # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
+ # format to the repository Actions tab.
+ - name: "Upload artifact"
+ uses: actions/upload-artifact@3cea5372237819ed00197afe530f5a7ea3e805c8 # v3.1.0
+ with:
+ name: SARIF file
+ path: results.sarif
+ retention-days: 14
+
+ # Upload the results to GitHub's code scanning dashboard.
+ - name: "Upload to code-scanning"
+ uses: github/codeql-action/upload-sarif@17573ee1cc1b9d061760f3a006fc4aac4f944fd5 # v2.2.4
+ with:
+ sarif_file: results.sarif
diff --git a/.github/workflows/os-llvm-sycl-build.yml b/.github/workflows/os-llvm-sycl-build.yml
index 3731a3fb77..78e825c9bf 100644
--- a/.github/workflows/os-llvm-sycl-build.yml
+++ b/.github/workflows/os-llvm-sycl-build.yml
@@ -4,6 +4,8 @@ on:
push:
branches: [master]
+permissions: read-all
+
jobs:
install-compiler:
name: Build with nightly build of DPC++ toolchain
@@ -20,13 +22,13 @@ jobs:
steps:
- name: Cancel Previous Runs
- uses: styfle/cancel-workflow-action@0.11.0
+ uses: styfle/cancel-workflow-action@0.12.1
with:
access_token: ${{ github.token }}
- name: Cache sycl bundle
id: cache-sycl-bundle
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: |
/home/runner/work/sycl_bundle
@@ -100,7 +102,7 @@ jobs:
sudo apt-get install libtinfo5
- name: Setup Python
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v5
with:
python-version: '3.11'
architecture: x64
@@ -111,7 +113,7 @@ jobs:
pip install numpy"<1.26.0" cython setuptools pytest scikit-build cmake ninja
- name: Checkout repo
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index f7d799463d..c9925da2df 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -5,12 +5,14 @@ on:
push:
branches: [master]
+permissions: read-all
+
jobs:
pre-commit:
runs-on: ubuntu-20.04
steps:
- - uses: actions/checkout@v3
- - uses: actions/setup-python@v4
+ - uses: actions/checkout@v4
+ - uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Version of clang-format
diff --git a/.github/workflows/python_style_checks.yml b/.github/workflows/python_style_checks.yml
index 3afd5acbd9..9059e90aec 100644
--- a/.github/workflows/python_style_checks.yml
+++ b/.github/workflows/python_style_checks.yml
@@ -9,16 +9,18 @@ on:
push:
branches: [master]
+permissions: read-all
+
# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
# The isort job sorts all imports in .py, .pyx, .pxd files
isort:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v3
- - uses: actions/setup-python@v4
+ - uses: actions/checkout@v4
+ - uses: actions/setup-python@v5
with:
- python-version: '3.10'
+ python-version: '3.11'
- uses: jamescurtin/isort-action@master
with:
configuration: "--check-only"
@@ -30,11 +32,11 @@ jobs:
# Steps represent a sequence of tasks that will be executed as part of the job
steps:
# Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
# Set up a Python environment for use in actions
- - uses: actions/setup-python@v4
+ - uses: actions/setup-python@v5
with:
- python-version: '3.10'
+ python-version: '3.11'
# Run black code formatter
- uses: psf/black@stable
@@ -47,11 +49,11 @@ jobs:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- name: Set up Python
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v5
with:
- python-version: '3.10'
+ python-version: '3.11'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
diff --git a/CHANGELOG.md b/CHANGELOG.md
index fc5bb4db36..d4da4189ec 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,7 +4,39 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
-## [0.15.0]
+## [0.16.0] - MMM. DD, 2024
+
+This release reaches the milestone of 100% compliance of `dpctl.tensor` functions with the Python Array API 2022.12 standard for the main namespace.
+
+### Added
+
+* Added reduction functions `dpctl.tensor.min`, `dpctl.tensor.max`, `dpctl.tensor.argmin`, `dpctl.tensor.argmax`, and `dpctl.tensor.prod` per Python Array API specifications: [#1399](https://github.com/IntelPython/dpctl/pull/1399)
+* Added dedicated in-place operations for binary elementwise operations and deployed them in Python operators of `dpctl.tensor.usm_ndarray` type: [#1431](https://github.com/IntelPython/dpctl/pull/1431), [#1447](https://github.com/IntelPython/dpctl/pull/1447)
+* Added new elementwise functions `dpctl.tensor.cbrt`, `dpctl.tensor.rsqrt`, `dpctl.tensor.exp2`, `dpctl.tensor.copysign`, `dpctl.tensor.angle`, and `dpctl.tensor.reciprocal`: [#1443](https://github.com/IntelPython/dpctl/pull/1443), [#1474](https://github.com/IntelPython/dpctl/pull/1474)
+* Added statistical functions `dpctl.tensor.mean`, `dpctl.tensor.std`, `dpctl.tensor.var` per Python Array API specifications: [#1465](https://github.com/IntelPython/dpctl/pull/1465)
+* Added sorting functions `dpctl.tensor.sort` and `dpctl.tensor.argsort`, and set functions `dpctl.tensor.unique_values`, `dpctl.tensor.unique_counts`, `dpctl.tensor.unique_inverse`, `dpctl.tensor.unique_all`: [#1483](https://github.com/IntelPython/dpctl/pull/1483)
+* Added linear algebra functions from the Array API namespace `dpctl.tensor.matrix_transpose`, `dpctl.tensor.matmul`, `dpctl.tensor.vecdot`, and `dpctl.tensor.tensordot`: [#1490](https://github.com/IntelPython/dpctl/pull/1490)
+* Added `dpctl.tensor.clip` function: [#1444](https://github.com/IntelPython/dpctl/pull/1444), [#1505](https://github.com/IntelPython/dpctl/pull/1505)
+* Added custom reduction functions `dpt.logsumexp` (reduction using binary function `dpctl.tensor.logaddexp`), `dpt.reduce_hypot` (reduction using binary function `dpctl.tensor.hypot`): [#1446](https://github.com/IntelPython/dpctl/pull/1446)
+* Added inspection API to query capabilities of Python Array API specification implementation: [#1469](https://github.com/IntelPython/dpctl/pull/1469)
+* Support for compilation for NVIDIA(R) SYCL target with use of [CodePlay oneAPI plug-in](https://developer.codeplay.com/products/oneapi/nvidia/home/): [#1411](https://github.com/IntelPython/dpctl/pull/1411), [#1124](https://github.com/IntelPython/dpctl/discussions/1124)
+* Added `dpctl.utils.intel_device_info` function to query additional information about Intel(R) GPU devices: [gh-1428](https://github.com/IntelPython/dpctl/pull/1428) and [gh-1445](https://github.com/IntelPython/dpctl/pull/1445)
+
+### Changed
+
+* Functions `dpctl.tensor.result_type` and `dpctl.tensor.can_cast` became device-aware: [#1488](https://github.com/IntelPython/dpctl/pull/1488), [#1473](https://github.com/IntelPython/dpctl/pull/1473)
+* Implementation of method `dpctl.SyclEvent.wait_for` changed to use ``sycl::event::wait`` instead of ``sycl::event::wait_and_throw``: [gh-1436](https://github.com/IntelPython/dpctl/pull/1436)
+* `dpctl.tensor.astype` was changed to support `device` keyword as per Python Array API specification: [#1511](https://github.com/IntelPython/dpctl/pull/1511)
+* C++ header files in `libtensor/include/kernels` containing implementations of SYCL kernels no longer depends on "pybind11.h": [#1516](https://github.com/IntelPython/dpctl/pull/1516)
+
+### Fixed
+
+* Fixed issues with `dpctl.tensor.repeat` support for `axis` keyword: [#1427](https://github.com/IntelPython/dpctl/pull/1427), [#1433](https://github.com/IntelPython/dpctl/pull/1433)
+* Fix for gh-1503 for bug `usm_ndarray.__setitem__`: [#1504](https://github.com/IntelPython/dpctl/pull/1504)
+* Other bug fixes: [#1485](https://github.com/IntelPython/dpctl/pull/1485), [#1477](https://github.com/IntelPython/dpctl/pull/1477), [#1512](https://github.com/IntelPython/dpctl/pull/1512)
+
+
+## [0.15.0] - Sep. 29, 2023
### Added
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7688ff040c..eb1346a423 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -71,6 +71,10 @@ file(GLOB _cmake_scripts ${CMAKE_SOURCE_DIR}/cmake/*.cmake)
install(FILES ${_cmake_scripts}
DESTINATION dpctl/resources/cmake
)
+install(FILES
+ ${CMAKE_SOURCE_DIR}/cmake/dpctl-config.cmake
+ DESTINATION lib/cmake/dpctl
+)
if (DPCTL_GENERATE_DOCS)
add_subdirectory(docs)
diff --git a/README.md b/README.md
index d26b4c97af..19d2eca840 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,7 @@
[![Coverage Status](https://coveralls.io/repos/github/IntelPython/dpctl/badge.svg?branch=master)](https://coveralls.io/github/IntelPython/dpctl?branch=master)
![Generate Documentation](https://github.com/IntelPython/dpctl/actions/workflows/generate-docs.yml/badge.svg?branch=master)
[![Join the chat at https://matrix.to/#/#Data-Parallel-Python_community:gitter.im](https://badges.gitter.im/Join%20Chat.svg)](https://app.gitter.im/#/room/#Data-Parallel-Python_community:gitter.im)
+[![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/IntelPython/dpctl/badge)](https://securityscorecards.dev/viewer/?uri=github.com/IntelPython/dpctl)
diff --git a/cmake/FindDpctl.cmake b/cmake/dpctl-config.cmake
similarity index 83%
rename from cmake/FindDpctl.cmake
rename to cmake/dpctl-config.cmake
index 149c75bd51..fa3f136b47 100644
--- a/cmake/FindDpctl.cmake
+++ b/cmake/dpctl-config.cmake
@@ -6,14 +6,17 @@
#
# ``Dpctl_FOUND``
# True if DPCTL was found.
-# ``Dpctl_INCLUDE_DIRS``
-# The include directories needed to use Dpctl.
+# ``Dpctl_INCLUDE_DIR``
+# The include directory needed to use dpctl.
+# ``Dpctl_TENSOR_INCLUDE_DIR``
+# The include directory for tensor kernels implementation.
# ``Dpctl_VERSION``
-# The version of DPCTL found.
+# The version of dpctl found.
#
-# The module will also explicitly define one cache variable:
+# The module will also explicitly define two cache variables:
#
# ``Dpctl_INCLUDE_DIR``
+# ``Dpctl_TENSOR_INCLUDE_DIR``
#
if(NOT Dpctl_FOUND)
@@ -22,7 +25,7 @@ if(NOT Dpctl_FOUND)
if(Python_EXECUTABLE)
execute_process(COMMAND "${Python_EXECUTABLE}"
- -c "import dpctl; print(dpctl.get_include())"
+ -m dpctl --include-dir
OUTPUT_VARIABLE _dpctl_include_dir
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
index b5c6f9a7e1..c99bdb9545 100644
--- a/conda-recipe/meta.yaml
+++ b/conda-recipe/meta.yaml
@@ -31,7 +31,7 @@ requirements:
run:
- python
- dpcpp-cpp-rt ={{ required_compiler_version }}
- - {{ pin_compatible('numpy', min_pin='x.x', upper_bound='1.26') }}
+ - {{ pin_compatible('numpy', min_pin='x.x', max_pin='x') }}
- level-zero # [linux]
test:
diff --git a/dpctl/__main__.py b/dpctl/__main__.py
index 78c5f7fde0..9b51d74903 100644
--- a/dpctl/__main__.py
+++ b/dpctl/__main__.py
@@ -15,42 +15,57 @@
# limitations under the License.
import argparse
+import importlib
import os
import os.path
import platform
import sys
import warnings
-import dpctl
-
def _dpctl_dir() -> str:
- abs_path = os.path.abspath(dpctl.__file__)
- dpctl_dir = os.path.dirname(abs_path)
- return dpctl_dir
+ dpctl_dir = importlib.util.find_spec("dpctl").submodule_search_locations[0]
+ abs_dpctl_dir = os.path.abspath(dpctl_dir)
+ return abs_dpctl_dir
-def print_includes() -> None:
+def get_include_dir() -> str:
"Prints include flags for dpctl and SyclInterface library"
- print("-I " + dpctl.get_include())
+ return os.path.join(_dpctl_dir(), "include")
-def print_tensor_includes() -> None:
+def print_include_flags() -> None:
"Prints include flags for dpctl and SyclInterface library"
+ print("-I " + get_include_dir())
+
+
+def get_tensor_include_dir() -> str:
dpctl_dir = _dpctl_dir()
libtensor_dir = os.path.join(dpctl_dir, "tensor", "libtensor", "include")
+ return libtensor_dir
+
+
+def print_tensor_include_flags() -> None:
+ "Prints include flags for dpctl and SyclInterface library"
+ libtensor_dir = get_tensor_include_dir()
print("-I " + libtensor_dir)
def print_cmake_dir() -> None:
"Prints directory with FindDpctl.cmake"
dpctl_dir = _dpctl_dir()
- print(os.path.join(dpctl_dir, "resources", "cmake"))
+ cmake_dir = os.path.join(dpctl_dir, "resources", "cmake")
+ print(cmake_dir)
+
+
+def get_library_dir() -> str:
+ dpctl_dir = _dpctl_dir()
+ return dpctl_dir
def print_library() -> None:
"Prints linker flags for SyclInterface library"
- dpctl_dir = _dpctl_dir()
+ dpctl_dir = get_library_dir()
plt = platform.platform()
ld_flags = "-L " + dpctl_dir
if plt != "Windows":
@@ -73,6 +88,8 @@ def _warn_if_any_set(args, li) -> None:
def print_lsplatform(verbosity: int) -> None:
+ import dpctl
+
dpctl.lsplatform(verbosity=verbosity)
@@ -84,11 +101,21 @@ def main() -> None:
action="store_true",
help="Include flags for dpctl headers.",
)
+ parser.add_argument(
+ "--include-dir",
+ action="store_true",
+ help="Path to dpctl include directory.",
+ )
parser.add_argument(
"--tensor-includes",
action="store_true",
help="Include flags for dpctl libtensor headers.",
)
+ parser.add_argument(
+ "--tensor-include-dir",
+ action="store_true",
+ help="Path to dpctl libtensor include directory.",
+ )
parser.add_argument(
"--cmakedir",
action="store_true",
@@ -99,6 +126,11 @@ def main() -> None:
action="store_true",
help="Linker flags for SyclInterface library.",
)
+ parser.add_argument(
+ "--library-dir",
+ action="store_true",
+ help="Path to directory containing DPCTLSyclInterface library",
+ )
parser.add_argument(
"-f",
"--full-list",
@@ -139,13 +171,19 @@ def main() -> None:
print_lsplatform(0)
return
if args.includes:
- print_includes()
+ print_include_flags()
+ if args.include_dir:
+ print(get_include_dir())
if args.tensor_includes:
- print_tensor_includes()
+ print_tensor_include_flags()
+ if args.tensor_include_dir:
+ print(get_tensor_include_dir())
if args.cmakedir:
print_cmake_dir()
if args.library:
print_library()
+ if args.library_dir:
+ print(get_library_dir())
if __name__ == "__main__":
diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt
index d2947aa772..d23142473e 100644
--- a/dpctl/tensor/CMakeLists.txt
+++ b/dpctl/tensor/CMakeLists.txt
@@ -156,6 +156,15 @@ set(_tensor_sorting_impl_sources
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_sorting.cpp
${_sorting_sources}
)
+set(_linalg_sources
+ ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linalg_functions/dot.cpp
+)
+set(_tensor_linalg_impl_sources
+ ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_linalg.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp
+ ${_linalg_sources}
+)
set(_py_trgts)
@@ -179,6 +188,11 @@ pybind11_add_module(${python_module_name} MODULE ${_tensor_sorting_impl_sources}
add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_sorting_impl_sources})
list(APPEND _py_trgts ${python_module_name})
+set(python_module_name _tensor_linalg_impl)
+pybind11_add_module(${python_module_name} MODULE ${_tensor_linalg_impl_sources})
+add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_linalg_impl_sources})
+list(APPEND _py_trgts ${python_module_name})
+
set(_clang_prefix "")
if (WIN32)
set(_clang_prefix "/clang:")
@@ -193,6 +207,7 @@ list(APPEND _no_fast_math_sources
${_elementwise_sources}
${_reduction_sources}
${_sorting_sources}
+ ${_linalg_sources}
)
foreach(_src_fn ${_no_fast_math_sources})
@@ -208,7 +223,11 @@ set(_compiler_definitions "USE_SYCL_FOR_COMPLEX_TYPES")
foreach(_src_fn ${_elementwise_sources})
get_source_file_property(_cmpl_options_defs ${_src_fn} COMPILE_DEFINITIONS)
- set(_combined_options_defs ${_cmpl_options_defs} "${_compiler_definitions}")
+ if(${_cmpl_options_defs})
+ set(_combined_options_defs ${_cmpl_options_defs} "${_compiler_definitions}")
+ else()
+ set(_combined_options_defs "${_compiler_definitions}")
+ endif()
set_source_files_properties(
${_src_fn}
PROPERTIES COMPILE_DEFINITIONS "${_combined_options_defs}"
@@ -219,10 +238,6 @@ set(_linker_options "LINKER:${DPCTL_LDFLAGS}")
foreach(python_module_name ${_py_trgts})
target_compile_options(${python_module_name} PRIVATE -fno-sycl-id-queries-fit-in-int)
target_link_options(${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel)
- if(UNIX)
- # this option is supported on Linux only
- target_link_options(${python_module_name} PRIVATE -fsycl-link-huge-device-code)
- endif()
target_include_directories(${python_module_name}
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/../include
diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py
index 81fc152e7a..77c4e23d8c 100644
--- a/dpctl/tensor/__init__.py
+++ b/dpctl/tensor/__init__.py
@@ -51,7 +51,6 @@
int16,
int32,
int64,
- isdtype,
uint8,
uint16,
uint32,
@@ -60,7 +59,12 @@
from dpctl.tensor._device import Device
from dpctl.tensor._dlpack import from_dlpack
from dpctl.tensor._indexing_functions import extract, nonzero, place, put, take
-from dpctl.tensor._linear_algebra_functions import matrix_transpose
+from dpctl.tensor._linear_algebra_functions import (
+ matmul,
+ matrix_transpose,
+ tensordot,
+ vecdot,
+)
from dpctl.tensor._manipulation_functions import (
broadcast_arrays,
broadcast_to,
@@ -183,7 +187,7 @@
)
from ._sorting import argsort, sort
from ._testing import allclose
-from ._type_utils import can_cast, finfo, iinfo, result_type
+from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type
__all__ = [
"Device",
@@ -356,4 +360,7 @@
"unique_counts",
"unique_inverse",
"unique_values",
+ "matmul",
+ "tensordot",
+ "vecdot",
]
diff --git a/dpctl/tensor/_clip.py b/dpctl/tensor/_clip.py
index f2bc326e82..d95c0fa764 100644
--- a/dpctl/tensor/_clip.py
+++ b/dpctl/tensor/_clip.py
@@ -168,9 +168,9 @@ def _resolve_one_strong_one_weak_types(st_dtype, dtype, dev):
return dpt.dtype(ti.default_device_int_type(dev))
if isinstance(dtype, WeakComplexType):
if st_dtype is dpt.float16 or st_dtype is dpt.float32:
- return st_dtype, dpt.complex64
+ return dpt.complex64
return _to_device_supported_dtype(dpt.complex128, dev)
- return (_to_device_supported_dtype(dpt.float64, dev),)
+ return _to_device_supported_dtype(dpt.float64, dev)
else:
return st_dtype
else:
@@ -197,8 +197,6 @@ def _check_clip_dtypes(res_dtype, arg1_dtype, arg2_dtype, sycl_dev):
def _clip_none(x, val, out, order, _binary_fn):
- if order not in ["K", "C", "F", "A"]:
- order = "K"
q1, x_usm_type = x.sycl_queue, x.usm_type
q2, val_usm_type = _get_queue_usm_type(val)
if q2 is None:
@@ -391,9 +389,8 @@ def _clip_none(x, val, out, order, _binary_fn):
return out
-# need to handle logic for min or max being None
-def clip(x, min=None, max=None, out=None, order="K"):
- """clip(x, min, max, out=None, order="K")
+def clip(x, /, min=None, max=None, out=None, order="K"):
+ """clip(x, min=None, max=None, out=None, order="K")
Clips to the range [`min_i`, `max_i`] for each element `x_i`
in `x`.
@@ -402,14 +399,14 @@ def clip(x, min=None, max=None, out=None, order="K"):
x (usm_ndarray): Array containing elements to clip.
Must be compatible with `min` and `max` according
to broadcasting rules.
- min ({None, usm_ndarray}, optional): Array containing minimum values.
+ min ({None, Union[usm_ndarray, bool, int, float, complex]}, optional):
+ Array containing minimum values.
Must be compatible with `x` and `max` according
to broadcasting rules.
- Only one of `min` and `max` can be `None`.
- max ({None, usm_ndarray}, optional): Array containing maximum values.
+ max ({None, Union[usm_ndarray, bool, int, float, complex]}, optional):
+ Array containing maximum values.
Must be compatible with `x` and `min` according
to broadcasting rules.
- Only one of `min` and `max` can be `None`.
out ({None, usm_ndarray}, optional):
Output array to populate.
Array must have the correct shape and the expected data type.
@@ -428,10 +425,67 @@ def clip(x, min=None, max=None, out=None, order="K"):
"Expected `x` to be of dpctl.tensor.usm_ndarray type, got "
f"{type(x)}"
)
+ if order not in ["K", "C", "F", "A"]:
+ order = "K"
if min is None and max is None:
- raise ValueError(
- "only one of `min` and `max` is permitted to be `None`"
+ exec_q = x.sycl_queue
+ orig_out = out
+ if out is not None:
+ if not isinstance(out, dpt.usm_ndarray):
+ raise TypeError(
+ "output array must be of usm_ndarray type, got "
+ f"{type(out)}"
+ )
+
+ if out.shape != x.shape:
+ raise ValueError(
+ "The shape of input and output arrays are "
+ f"inconsistent. Expected output shape is {x.shape}, "
+ f"got {out.shape}"
+ )
+
+ if x.dtype != out.dtype:
+ raise ValueError(
+ f"Output array of type {x.dtype} is needed, "
+ f"got {out.dtype}"
+ )
+
+ if (
+ dpctl.utils.get_execution_queue((exec_q, out.sycl_queue))
+ is None
+ ):
+ raise ExecutionPlacementError(
+ "Input and output allocation queues are not compatible"
+ )
+
+ if ti._array_overlap(x, out):
+ if not ti._same_logical_tensors(x, out):
+ out = dpt.empty_like(out)
+ else:
+ return out
+ else:
+ if order == "K":
+ out = _empty_like_orderK(x, x.dtype)
+ else:
+ if order == "A":
+ order = "F" if x.flags.f_contiguous else "C"
+ out = dpt.empty_like(x, order=order)
+
+ ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+ src=x, dst=out, sycl_queue=exec_q
)
+ if not (orig_out is None or orig_out is out):
+ # Copy the out data from temporary buffer to original memory
+ ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray(
+ src=out,
+ dst=orig_out,
+ sycl_queue=exec_q,
+ depends=[copy_ev],
+ )
+ ht_copy_out_ev.wait()
+ out = orig_out
+ ht_copy_ev.wait()
+ return out
elif max is None:
return _clip_none(x, min, out, order, tei._maximum)
elif min is None:
diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py
index 81928692a6..ecf3eade35 100644
--- a/dpctl/tensor/_copy_utils.py
+++ b/dpctl/tensor/_copy_utils.py
@@ -26,6 +26,7 @@
import dpctl.utils
from dpctl.tensor._data_types import _get_dtype
from dpctl.tensor._device import normalize_queue_device
+from dpctl.tensor._type_utils import _dtype_supported_by_device_impl
__doc__ = (
"Implementation module for copy- and cast- operations on "
@@ -121,7 +122,7 @@ def from_numpy(np_ary, device=None, usm_type="device", sycl_queue=None):
output array is created. Device can be specified by a
a filter selector string, an instance of
:class:`dpctl.SyclDevice`, an instance of
- :class:`dpctl.SyclQueue`, an instance of
+ :class:`dpctl.SyclQueue`, or an instance of
:class:`dpctl.tensor.Device`. If the value is `None`,
returned array is created on the default-selected device.
Default: `None`.
@@ -300,14 +301,22 @@ def _copy_from_usm_ndarray_to_usm_ndarray(dst, src):
src.shape, src.strides, len(common_shape)
)
src_same_shape = dpt.usm_ndarray(
- common_shape, dtype=src.dtype, buffer=src, strides=new_src_strides
+ common_shape,
+ dtype=src.dtype,
+ buffer=src,
+ strides=new_src_strides,
+ offset=src._element_offset,
)
elif src.ndim == len(common_shape):
new_src_strides = _broadcast_strides(
src.shape, src.strides, len(common_shape)
)
src_same_shape = dpt.usm_ndarray(
- common_shape, dtype=src.dtype, buffer=src, strides=new_src_strides
+ common_shape,
+ dtype=src.dtype,
+ buffer=src,
+ strides=new_src_strides,
+ offset=src._element_offset,
)
else:
# since broadcasting succeeded, src.ndim is greater because of
@@ -523,7 +532,7 @@ def copy(usm_ary, order="K"):
)
order = order[0].upper()
if not isinstance(usm_ary, dpt.usm_ndarray):
- return TypeError(
+ raise TypeError(
f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}"
)
copy_order = "C"
@@ -556,9 +565,11 @@ def copy(usm_ary, order="K"):
return R
-def astype(usm_ary, newdtype, order="K", casting="unsafe", copy=True):
+def astype(
+ usm_ary, newdtype, /, order="K", casting="unsafe", *, copy=True, device=None
+):
""" astype(array, new_dtype, order="K", casting="unsafe", \
- copy=True)
+ copy=True, device=None)
Returns a copy of the :class:`dpctl.tensor.usm_ndarray`, cast to a
specified type.
@@ -568,7 +579,8 @@ def astype(usm_ary, newdtype, order="K", casting="unsafe", copy=True):
An input array.
new_dtype (dtype):
The data type of the resulting array. If `None`, gives default
- floating point type supported by device where `array` is allocated.
+ floating point type supported by device where the resulting array
+ will be located.
order ({"C", "F", "A", "K"}, optional):
Controls memory layout of the resulting array if a copy
is returned.
@@ -579,6 +591,14 @@ def astype(usm_ary, newdtype, order="K", casting="unsafe", copy=True):
By default, `astype` always returns a newly allocated array.
If this keyword is set to `False`, a view of the input array
may be returned when possible.
+ device (object): array API specification of device where the
+ output array is created. Device can be specified by a
+ filter selector string, an instance of
+ :class:`dpctl.SyclDevice`, an instance of
+ :class:`dpctl.SyclQueue`, or an instance of
+ :class:`dpctl.tensor.Device`. If the value is `None`,
+ returned array is created on the same device as `array`.
+ Default: `None`.
Returns:
usm_ndarray:
@@ -596,7 +616,25 @@ def astype(usm_ary, newdtype, order="K", casting="unsafe", copy=True):
)
order = order[0].upper()
ary_dtype = usm_ary.dtype
- target_dtype = _get_dtype(newdtype, usm_ary.sycl_queue)
+ if device is not None:
+ if not isinstance(device, dpctl.SyclQueue):
+ if isinstance(device, dpt.Device):
+ device = device.sycl_queue
+ else:
+ device = dpt.Device.create_device(device).sycl_queue
+ d = device.sycl_device
+ target_dtype = _get_dtype(newdtype, device)
+ if not _dtype_supported_by_device_impl(
+ target_dtype, d.has_aspect_fp16, d.has_aspect_fp64
+ ):
+ raise ValueError(
+ f"Requested dtype `{target_dtype}` is not supported by the "
+ "target device"
+ )
+ usm_ary = usm_ary.to_device(device)
+ else:
+ target_dtype = _get_dtype(newdtype, usm_ary.sycl_queue)
+
if not dpt.can_cast(ary_dtype, target_dtype, casting=casting):
raise TypeError(
f"Can not cast from {ary_dtype} to {newdtype} "
diff --git a/dpctl/tensor/_ctors.py b/dpctl/tensor/_ctors.py
index ba16f0f1fc..5c5c7279db 100644
--- a/dpctl/tensor/_ctors.py
+++ b/dpctl/tensor/_ctors.py
@@ -632,17 +632,13 @@ def asarray(
usm_type=usm_type,
order=order,
)
-
- raise NotImplementedError(
- "Converting Python sequences is not implemented"
- )
if copy is False:
raise ValueError(
f"Converting {type(obj)} to usm_ndarray requires a copy"
)
# obj is a scalar, create 0d array
return _asarray_from_numpy_ndarray(
- np.asarray(obj),
+ np.asarray(obj, dtype=dtype),
dtype=dtype,
usm_type=usm_type,
sycl_queue=sycl_queue,
diff --git a/dpctl/tensor/_data_types.py b/dpctl/tensor/_data_types.py
index bee557cf18..78e8714607 100644
--- a/dpctl/tensor/_data_types.py
+++ b/dpctl/tensor/_data_types.py
@@ -50,48 +50,6 @@
complex128 = dtype("complex128")
-def isdtype(dtype_, kind):
- """isdtype(dtype, kind)
-
- Returns a boolean indicating whether a provided `dtype` is
- of a specified data type `kind`.
-
- See [array API](array_api) for more information.
-
- [array_api]: https://data-apis.org/array-api/latest/
- """
-
- if not isinstance(dtype_, dtype):
- raise TypeError(f"Expected instance of `dpt.dtype`, got {dtype_}")
-
- if isinstance(kind, dtype):
- return dtype_ == kind
-
- elif isinstance(kind, str):
- if kind == "bool":
- return dtype_ == dtype("bool")
- elif kind == "signed integer":
- return dtype_.kind == "i"
- elif kind == "unsigned integer":
- return dtype_.kind == "u"
- elif kind == "integral":
- return dtype_.kind in "iu"
- elif kind == "real floating":
- return dtype_.kind == "f"
- elif kind == "complex floating":
- return dtype_.kind == "c"
- elif kind == "numeric":
- return dtype_.kind in "iufc"
- else:
- raise ValueError(f"Unrecognized data type kind: {kind}")
-
- elif isinstance(kind, tuple):
- return any(isdtype(dtype_, k) for k in kind)
-
- else:
- raise TypeError(f"Unsupported data type kind: {kind}")
-
-
def _get_dtype(inp_dt, sycl_obj, ref_type=None):
"""
Type inference utility to construct data type
@@ -121,7 +79,6 @@ def _get_dtype(inp_dt, sycl_obj, ref_type=None):
__all__ = [
"dtype",
"_get_dtype",
- "isdtype",
"bool",
"int8",
"uint8",
diff --git a/dpctl/tensor/_linear_algebra_functions.py b/dpctl/tensor/_linear_algebra_functions.py
index fd2c58b08a..0894ac2077 100644
--- a/dpctl/tensor/_linear_algebra_functions.py
+++ b/dpctl/tensor/_linear_algebra_functions.py
@@ -14,7 +14,23 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import operator
+
+from numpy.core.numeric import normalize_axis_index, normalize_axis_tuple
+
+import dpctl
import dpctl.tensor as dpt
+import dpctl.tensor._tensor_elementwise_impl as tei
+import dpctl.tensor._tensor_impl as ti
+import dpctl.tensor._tensor_linalg_impl as tli
+from dpctl.tensor._copy_utils import _empty_like_orderK, _empty_like_pair_orderK
+from dpctl.tensor._manipulation_functions import _broadcast_shape_impl
+from dpctl.tensor._type_utils import (
+ _acceptance_fn_default_binary,
+ _find_buf_dtype2,
+ _to_device_supported_dtype,
+)
+from dpctl.utils import ExecutionPlacementError
def matrix_transpose(x):
@@ -46,3 +62,921 @@ def matrix_transpose(x):
)
return x.mT
+
+
+def tensordot(x1, x2, axes=2):
+ """tensordot(x1, x2, axes=2)
+
+ Returns a tensor contraction of `x1` and `x2` over specific axes.
+
+ Args:
+ x1 (usm_ndarray):
+ first input array, expected to have numeric data type.
+ x2 (usm_ndarray):
+ second input array, expected to have numeric data type.
+ Corresponding contracted axes of `x1` and `x2` must be equal.
+ axes (Union[int, Tuple[Sequence[int], Sequence[int]]]):
+ number of axes to contract or explicit sequences of axes for
+ `x1` and `x2`, respectively. If `axes` is an integer equal to `N`,
+ then the contraction is performed over last `N` axes of `x1` and
+ the first `N` axis of `x2` in order. The size of each corresponding
+ axis must match and must be non-negative.
+ * if `N` equals `0`, the result is the tensor outer product
+ * if `N` equals `1`, the result is the tensor dot product
+ * if `N` equals `2`, the result is the tensor double
+ contraction (default).
+ If `axes` is a tuple of two sequences `(x1_axes, x2_axes)`, the
+ first sequence applies to `x1` and the second sequence applies
+ to `x2`. Both sequences must have equal length, and each axis
+ `x1_axes[i]` for `x1` must have the same size as the respective
+ axis `x2_axes[i]` for `x2`. Each sequence must consist of unique
+ non-negative integers that specify valid axes for each respective
+ array.
+ Returns:
+ usm_ndarray:
+ an array containing the tensor contraction whose shape consists of
+ the non-contracted axes of the first array `x1`, followed by the
+ non-contracted axes of the second array `x2`. The returned array
+ must have a data type determined by Type Promotion Rules.
+ """
+ if not isinstance(x1, dpt.usm_ndarray):
+ raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x1)}")
+ if not isinstance(x2, dpt.usm_ndarray):
+ raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x2)}")
+ q1, x1_usm_type = x1.sycl_queue, x1.usm_type
+ q2, x2_usm_type = x2.sycl_queue, x2.usm_type
+ exec_q = dpctl.utils.get_execution_queue((q1, q2))
+ if exec_q is None:
+ raise ExecutionPlacementError(
+ "Execution placement can not be unambiguously inferred "
+ "from input arguments."
+ )
+ res_usm_type = dpctl.utils.get_coerced_usm_type(
+ (
+ x1_usm_type,
+ x2_usm_type,
+ )
+ )
+ dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+ # handle axes and shapes validation
+ x1_nd = x1.ndim
+ x2_nd = x2.ndim
+ x1_shape = x1.shape
+ x2_shape = x2.shape
+ if isinstance(axes, int):
+ if axes < 0:
+ raise ValueError("`axes` integer is expected to be non-negative")
+ n_axes1 = axes
+ n_axes2 = axes
+ axes1 = normalize_axis_tuple(tuple(range(-axes, 0)), x1_nd)
+ axes2 = tuple(range(0, axes))
+ elif isinstance(axes, tuple):
+ if len(axes) != 2:
+ raise ValueError(
+ "`axes` tuple is expected to contain two sequences"
+ )
+ axes1 = tuple(axes[0])
+ axes2 = tuple(axes[1])
+ n_axes1 = len(axes1)
+ n_axes2 = len(axes2)
+ else:
+ raise TypeError("`axes` must be an integer or a tuple of sequences")
+ if n_axes1 != n_axes2:
+ raise ValueError(
+ "number of axes contracted must be the same for each array"
+ )
+ if n_axes1 == 0:
+ arr1 = x1[..., dpt.newaxis]
+ arr2 = x2[dpt.newaxis, ...]
+ n_axes1 = 1
+ n_axes2 = 1
+ else:
+ same_shapes = True
+ for i in range(n_axes1):
+ axis1 = axes1[i]
+ if axis1 < 0:
+ raise ValueError("`axes` must be non-negative")
+ axis2 = axes2[i]
+ if axis2 < 0:
+ raise ValueError("`axes` must be non-negative")
+ same_shapes = same_shapes and (x1_shape[axis1] == x2_shape[axis2])
+ if not same_shapes:
+ raise ValueError("shape mismatch in contracted `tensordot` axes")
+ axes1 = normalize_axis_tuple(axes1, x1_nd)
+ axes2 = normalize_axis_tuple(axes2, x2_nd)
+ perm1 = [i for i in range(x1_nd) if i not in axes1] + list(axes1)
+ perm2 = list(axes2) + [i for i in range(x2_nd) if i not in axes2]
+ arr1 = dpt.permute_dims(x1, perm1)
+ arr2 = dpt.permute_dims(x2, perm2)
+ arr1_outer_nd = arr1.ndim - n_axes1
+ arr2_outer_nd = arr2.ndim - n_axes2
+ res_shape = arr1.shape[:arr1_outer_nd] + arr2.shape[n_axes2:]
+ # type validation
+ sycl_dev = exec_q.sycl_device
+ x1_dtype = x1.dtype
+ x2_dtype = x2.dtype
+ buf1_dt, buf2_dt, res_dt = _find_buf_dtype2(
+ x1_dtype,
+ x2_dtype,
+ tli._dot_result_type,
+ sycl_dev,
+ acceptance_fn=_acceptance_fn_default_binary,
+ )
+ if res_dt is None:
+ raise TypeError(
+ "function 'tensordot' does not support input types "
+ f"({x1_dtype}, {x2_dtype}), "
+ "and the inputs could not be safely coerced to any "
+ "supported types according to the casting rule ''safe''."
+ )
+
+ if buf1_dt is None and buf2_dt is None:
+ out = dpt.empty(
+ res_shape,
+ dtype=res_dt,
+ usm_type=res_usm_type,
+ sycl_queue=exec_q,
+ order="C",
+ )
+ ht_dot_ev, _ = tli._dot(
+ x1=arr1,
+ x2=arr2,
+ batch_dims=0,
+ x1_outer_dims=arr1_outer_nd,
+ x2_outer_dims=arr2_outer_nd,
+ inner_dims=n_axes1,
+ dst=out,
+ sycl_queue=exec_q,
+ )
+ ht_dot_ev.wait()
+
+ return out
+
+ elif buf1_dt is None:
+ buf2 = _empty_like_orderK(arr2, buf2_dt)
+ ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+ src=arr2, dst=buf2, sycl_queue=exec_q
+ )
+ out = dpt.empty(
+ res_shape,
+ dtype=res_dt,
+ usm_type=res_usm_type,
+ sycl_queue=exec_q,
+ order="C",
+ )
+ ht_dot_ev, _ = tli._dot(
+ x1=arr1,
+ x2=buf2,
+ batch_dims=0,
+ x1_outer_dims=arr1_outer_nd,
+ x2_outer_dims=arr2_outer_nd,
+ inner_dims=n_axes1,
+ dst=out,
+ sycl_queue=exec_q,
+ depends=[copy_ev],
+ )
+ ht_copy_ev.wait()
+ ht_dot_ev.wait()
+
+ return out
+
+ elif buf2_dt is None:
+ buf1 = _empty_like_orderK(arr1, buf1_dt)
+ ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+ src=arr1, dst=buf1, sycl_queue=exec_q
+ )
+ out = dpt.empty(
+ res_shape,
+ dtype=res_dt,
+ usm_type=res_usm_type,
+ sycl_queue=exec_q,
+ order="C",
+ )
+ ht_dot_ev, _ = tli._dot(
+ x1=buf1,
+ x2=arr2,
+ batch_dims=0,
+ x1_outer_dims=arr1_outer_nd,
+ x2_outer_dims=arr2_outer_nd,
+ inner_dims=n_axes1,
+ dst=out,
+ sycl_queue=exec_q,
+ depends=[copy_ev],
+ )
+ ht_copy_ev.wait()
+ ht_dot_ev.wait()
+
+ return out
+
+ buf1 = _empty_like_orderK(arr1, buf1_dt)
+ ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+ src=arr1, dst=buf1, sycl_queue=exec_q
+ )
+ buf2 = _empty_like_orderK(arr2, buf2_dt)
+ ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+ src=arr2, dst=buf2, sycl_queue=exec_q
+ )
+ out = dpt.empty(
+ res_shape,
+ dtype=res_dt,
+ usm_type=res_usm_type,
+ sycl_queue=exec_q,
+ order="C",
+ )
+ ht_, _ = tli._dot(
+ x1=buf1,
+ x2=buf2,
+ batch_dims=0,
+ x1_outer_dims=arr1_outer_nd,
+ x2_outer_dims=arr2_outer_nd,
+ inner_dims=n_axes1,
+ dst=out,
+ sycl_queue=exec_q,
+ depends=[copy1_ev, copy2_ev],
+ )
+ dpctl.SyclEvent.wait_for([ht_copy1_ev, ht_copy2_ev, ht_])
+
+ return out
+
+
+def vecdot(x1, x2, axis=-1):
+ """vecdot(x1, x2, axis=-1)
+
+ Computes the (vector) dot product of two arrays.
+
+ Args:
+ x1 (usm_ndarray):
+ first input array.
+ x2 (usm_ndarray):
+ second input array. Input arrays must have compatible
+ shapes along non-contract axes according to broadcasting
+ rules, and must have the same size along the contracted
+ axis. Input arrays should be of numeric type.
+ axis (Optional[int]):
+ axis over which to compute the dot product. The axis must
+ be an integer on the interval `[-N, N)`, where `N` is the
+ array rank of input arrays after broadcasting rules are
+ applied. If specified as a negative integer, the axis along
+ which dot product is performed is counted backward from
+ the last axes (that is `-1` refers to the last axis). By
+ default, dot product is computed over the last axis.
+ Default: `-1`.
+
+ Returns:
+ usm_ndarray:
+ if `x1` and `x2` are both one-dimensional arrays, a
+ zero-dimensional array containing the dot product value
+ is returned; otherwise, a non-zero-dimensional array containing
+ the dot products and having rank `N-1`, where `N` is the rank
+ of the shape of input arrays after broadcasting rules are applied
+ to non-contracted axes.
+ """
+ if not isinstance(x1, dpt.usm_ndarray):
+ raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x1)}")
+ if not isinstance(x2, dpt.usm_ndarray):
+ raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x2)}")
+ q1, x1_usm_type = x1.sycl_queue, x1.usm_type
+ q2, x2_usm_type = x2.sycl_queue, x2.usm_type
+ exec_q = dpctl.utils.get_execution_queue((q1, q2))
+ if exec_q is None:
+ raise ExecutionPlacementError(
+ "Execution placement can not be unambiguously inferred "
+ "from input arguments."
+ )
+ res_usm_type = dpctl.utils.get_coerced_usm_type(
+ (
+ x1_usm_type,
+ x2_usm_type,
+ )
+ )
+ dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+ # axis and shape validation
+ x1_nd = x1.ndim
+ x2_nd = x2.ndim
+ x1_shape = x1.shape
+ x2_shape = x2.shape
+ if x1_nd > x2_nd:
+ x2_shape = (1,) * (x1_nd - x2_nd) + x2_shape
+ x2_nd = len(x2_shape)
+ elif x2_nd > x1_nd:
+ x1_shape = (1,) * (x2_nd - x1_nd) + x1_shape
+ x1_nd = len(x1_shape)
+ axis = normalize_axis_index(operator.index(axis), x1_nd)
+ if x1_shape[axis] != x2_shape[axis]:
+ raise ValueError(
+ "given axis must have the same shape for `x1` and `x2`"
+ )
+ try:
+ broadcast_sh = _broadcast_shape_impl(
+ [
+ x1_shape,
+ x2_shape,
+ ]
+ )
+ except ValueError:
+ raise ValueError("mismatch in `vecdot` dimensions")
+ res_sh = tuple(
+ [broadcast_sh[i] for i in range(len(broadcast_sh)) if i != axis]
+ )
+ # type validation
+ sycl_dev = exec_q.sycl_device
+ x1_dtype = x1.dtype
+ x2_dtype = x2.dtype
+ buf1_dt, buf2_dt, res_dt = _find_buf_dtype2(
+ x1_dtype,
+ x2_dtype,
+ tli._dot_result_type,
+ sycl_dev,
+ acceptance_fn=_acceptance_fn_default_binary,
+ )
+ if res_dt is None:
+ raise TypeError(
+ "function 'vecdot' does not support input types "
+ f"({x1_dtype}, {x2_dtype}), "
+ "and the inputs could not be safely coerced to any "
+ "supported types according to the casting rule ''safe''."
+ )
+
+ ht_list = []
+ deps = []
+ if buf1_dt is None and buf2_dt is None:
+ if x1.dtype.kind == "c":
+ x1_tmp = _empty_like_orderK(x1, x1.dtype)
+ ht_conj_ev, conj_ev = tei._conj(
+ src=x1,
+ dst=x1_tmp,
+ sycl_queue=exec_q,
+ )
+ ht_list.append(ht_conj_ev)
+ deps.append(conj_ev)
+ x1 = x1_tmp
+ if x1.shape != broadcast_sh:
+ x1 = dpt.broadcast_to(x1, broadcast_sh)
+ if x2.shape != broadcast_sh:
+ x2 = dpt.broadcast_to(x2, broadcast_sh)
+ x1 = dpt.moveaxis(x1, axis, -1)
+ x2 = dpt.moveaxis(x2, axis, -1)
+
+ out = dpt.empty(
+ res_sh,
+ dtype=res_dt,
+ usm_type=res_usm_type,
+ sycl_queue=exec_q,
+ order="C",
+ )
+ ht_dot_ev, _ = tli._dot(
+ x1=x1,
+ x2=x2,
+ batch_dims=len(x1.shape[:-1]),
+ x1_outer_dims=0,
+ x2_outer_dims=0,
+ inner_dims=1,
+ dst=out,
+ sycl_queue=exec_q,
+ depends=deps,
+ )
+ ht_list.append(ht_dot_ev)
+ dpctl.SyclEvent.wait_for(ht_list)
+
+ return dpt.reshape(out, res_sh)
+
+ elif buf1_dt is None:
+ if x1.dtype.kind == "c":
+ x1_tmp = _empty_like_orderK(x1, x1.dtype)
+ ht_conj_ev, conj_e = tei._conj(
+ src=x1, dst=x1_tmp, sycl_queue=exec_q
+ )
+ ht_list.append(ht_conj_ev)
+ deps.append(conj_e)
+ x1 = x1_tmp
+ buf2 = _empty_like_orderK(x2, buf2_dt)
+ ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+ src=x2, dst=buf2, sycl_queue=exec_q
+ )
+ ht_list.append(ht_copy_ev)
+ deps.append(copy_ev)
+ if x1.shape != broadcast_sh:
+ x1 = dpt.broadcast_to(x1, broadcast_sh)
+ if buf2.shape != broadcast_sh:
+ buf2 = dpt.broadcast_to(buf2, broadcast_sh)
+ x1 = dpt.moveaxis(x1, axis, -1)
+ buf2 = dpt.moveaxis(buf2, axis, -1)
+ out = dpt.empty(
+ res_sh,
+ dtype=res_dt,
+ usm_type=res_usm_type,
+ sycl_queue=exec_q,
+ order="C",
+ )
+ ht_dot_ev, _ = tli._dot(
+ x1=x1,
+ x2=buf2,
+ batch_dims=len(x1.shape[:-1]),
+ x1_outer_dims=0,
+ x2_outer_dims=0,
+ inner_dims=1,
+ dst=out,
+ sycl_queue=exec_q,
+ depends=deps,
+ )
+ ht_list.append(ht_dot_ev)
+ dpctl.SyclEvent.wait_for(ht_list)
+
+ return dpt.reshape(out, res_sh)
+
+ elif buf2_dt is None:
+ buf1 = _empty_like_orderK(x1, buf1_dt)
+ ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+ src=x1, dst=buf1, sycl_queue=exec_q
+ )
+ ht_list.append(ht_copy_ev)
+ deps.append(copy_ev)
+ if buf1.dtype.kind == "c":
+ ht_conj_ev, conj_ev = tei._conj(
+ src=buf1, dst=buf1, sycl_queue=exec_q, depends=[copy_ev]
+ )
+ ht_list.append(ht_conj_ev)
+ deps.append(conj_ev)
+ if buf1.shape != broadcast_sh:
+ buf1 = dpt.broadcast_to(buf1, broadcast_sh)
+ if x2.shape != broadcast_sh:
+ x2 = dpt.broadcast_to(x2, broadcast_sh)
+ buf1 = dpt.moveaxis(buf1, axis, -1)
+ x2 = dpt.moveaxis(x2, axis, -1)
+ out = dpt.empty(
+ res_sh,
+ dtype=res_dt,
+ usm_type=res_usm_type,
+ sycl_queue=exec_q,
+ order="C",
+ )
+ ht_dot_ev, _ = tli._dot(
+ x1=buf1,
+ x2=x2,
+ batch_dims=len(x1.shape[:-1]),
+ x1_outer_dims=0,
+ x2_outer_dims=0,
+ inner_dims=1,
+ dst=out,
+ sycl_queue=exec_q,
+ depends=deps,
+ )
+ ht_list.append(ht_dot_ev)
+ dpctl.SyclEvent.wait_for(ht_list)
+
+ return dpt.reshape(out, res_sh)
+
+ buf1 = _empty_like_orderK(x1, buf1_dt)
+ ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+ src=x1, dst=buf1, sycl_queue=exec_q
+ )
+ ht_list.append(ht_copy1_ev)
+ deps.append(copy1_ev)
+ if buf1.dtype.kind == "c":
+ ht_conj_ev, conj_ev = tei._conj(
+ src=buf1, dst=buf1, sycl_queue=exec_q, depends=[copy1_ev]
+ )
+ ht_list.append(ht_conj_ev)
+ deps.append(conj_ev)
+ buf2 = _empty_like_orderK(x2, buf2_dt)
+ ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+ src=x2, dst=buf2, sycl_queue=exec_q
+ )
+ ht_list.append(ht_copy2_ev)
+ deps.append(copy2_ev)
+ if buf1.shape != broadcast_sh:
+ buf1 = dpt.broadcast_to(buf1, broadcast_sh)
+ if buf2.shape != broadcast_sh:
+ buf2 = dpt.broadcast_to(buf2, broadcast_sh)
+ buf1 = dpt.moveaxis(buf1, axis, -1)
+ buf2 = dpt.moveaxis(buf2, axis, -1)
+ out = dpt.empty(
+ res_sh,
+ dtype=res_dt,
+ usm_type=res_usm_type,
+ sycl_queue=exec_q,
+ order="C",
+ )
+ ht_dot_ev, _ = tli._dot(
+ x1=buf1,
+ x2=buf2,
+ batch_dims=len(x1.shape[:-1]),
+ x1_outer_dims=0,
+ x2_outer_dims=0,
+ inner_dims=1,
+ dst=out,
+ sycl_queue=exec_q,
+ depends=deps,
+ )
+ ht_list.append(ht_dot_ev)
+ dpctl.SyclEvent.wait_for(ht_list)
+
+ return out
+
+
+def matmul(x1, x2, out=None, dtype=None, order="K"):
+ """matmul(x1, x2, out=None, dtype=None, order="K")
+
+ Computes the matrix product. Implements the same semantics
+ as the built-in operator `@`.
+
+ Args:
+ x1 (usm_ndarray):
+ first input array. Expected to have numeric data type, and
+ at least one dimension. If `x1` is one-dimensional having
+ shape `(M,)`, and `x2` has more than one dimension, `x1` is
+ effectively treated as a two-dimensional array with shape `(1, M)`,
+ although the prepended dimension is removed from the output array.
+ If `x1` has shape `(..., M, K)`, the innermost two dimensions form
+ matrices on which to perform matrix multiplication.
+ x2 (usm_ndarray):
+ second input array. Expected to have numeric data type, and
+ at least one dimension. If `x2` is one-dimensional having
+ shape `(N,)`, and `x1` has more than one dimension, `x2` is
+ effectively treated as a two-dimensional array with shape `(N, 1)`,
+ although the appended dimension is removed from the output array.
+ If `x2` has shape `(..., K, N)`, the innermost two dimensions form
+ matrices on which to perform matrix multiplication.
+ out (Optional[usm_ndarray]):
+ the array into which the result of the matrix product is written.
+ If `None` then a new array is returned.
+ order (["K", "C", "F", "A"]):
+ memory layout of the output array, if `out` is `None`, otherwise
+ the `order` parameter value is not used.
+
+ Returns:
+ usm_ndarray:
+ * if both `x1` and `x2` are one-dimensional arrays with shape
+ `(N,)`, returned array is a zero-dimensional array containing
+ inner product as its only element.
+ * if `x1` is two-dimensional array with shape `(M, K)` and `x2` is
+ a two-dimensional array with shape `(K, N)`, returned array is a
+ two-dimensional array with shape `(M, N)` and contains the
+ conventional matrix product.
+ * if `x1` is a one-dimensional array with shape `(K,)` and `x2` is an
+ array with shape `(..., K, N)`, returned array contains the
+ conventional matrix product and has shape `(..., N)`.
+ * if `x1` is an array with shape `(..., M, K)` and `x2` is a
+ one-dimensional array with shape `(K,)`, returned array has shape
+ `(..., M)` and contains the conventional matrix product.
+ * if `x1` is a two-dimensional array with shape `(M, K)` and `x2`
+ is an array with shape `(..., K, N)`, returned array contains
+ conventional matrix product for each stacked matrix and has shape
+ `(..., M, N)`.
+ * if `x1` has shape `(..., M, K)` and `x2` is a two-dimensional
+ array with shape `(K, N)`, returned array contains conventional
+ matrix product for each stacked matrix and has shape
+ `(..., M, N)`.
+ * if both `x1` and `x2` have more than two dimensions, returned
+ array contains conventional matrix product for each stacked
+ matrix and has shape determined by broadcasting rules for
+ `x1.shape[:-2]` and `x2.shape[:-2]`.
+
+ The data type of the returned array is determined by the Type
+ Promotion Rules. If either `x1` or `x2` has a complex floating
+ point type, neither argument is complex conjugated or transposed.
+ """
+ if not isinstance(x1, dpt.usm_ndarray):
+ raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x1)}")
+ if not isinstance(x2, dpt.usm_ndarray):
+ raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x2)}")
+ if order not in ["K", "C", "F", "A"]:
+ order = "K"
+ q1, x1_usm_type = x1.sycl_queue, x1.usm_type
+ q2, x2_usm_type = x2.sycl_queue, x2.usm_type
+ exec_q = dpctl.utils.get_execution_queue((q1, q2))
+ if exec_q is None:
+ raise ExecutionPlacementError(
+ "Execution placement can not be unambiguously inferred "
+ "from input arguments."
+ )
+ res_usm_type = dpctl.utils.get_coerced_usm_type(
+ (
+ x1_usm_type,
+ x2_usm_type,
+ )
+ )
+ dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+
+ x1_nd = x1.ndim
+ x2_nd = x2.ndim
+ if x1_nd == 0 or x2_nd == 0:
+ raise ValueError("one or more operands to `matmul` is 0 dimensional")
+ x1_shape = x1.shape
+ x2_shape = x2.shape
+ appended_axes = []
+ if x1_nd == 1:
+ x1 = x1[dpt.newaxis, :]
+ x1_shape = x1.shape
+ appended_axes.append(-2)
+ if x2_nd == 1:
+ x2 = x2[:, dpt.newaxis]
+ x2_shape = x2.shape
+ appended_axes.append(-1)
+ if x1_shape[-1] != x2_shape[-2]:
+ raise ValueError("mismatch in `matmul` inner dimension")
+ x1_outer_sh = x1_shape[:-2]
+ x2_outer_sh = x2_shape[:-2]
+ try:
+ res_outer_sh = _broadcast_shape_impl(
+ [
+ x1_outer_sh,
+ x2_outer_sh,
+ ]
+ )
+ except ValueError:
+ raise ValueError("mismatch in `matmul` batching dimensions")
+ x1_broadcast_shape = res_outer_sh + x1_shape[-2:]
+ x2_broadcast_shape = res_outer_sh + x2_shape[-2:]
+ res_shape = res_outer_sh + x1_shape[-2:-1] + x2_shape[-1:]
+
+ sycl_dev = exec_q.sycl_device
+ x1_dtype = x1.dtype
+ x2_dtype = x2.dtype
+ if dtype is None:
+ buf1_dt, buf2_dt, res_dt = _find_buf_dtype2(
+ x1_dtype,
+ x2_dtype,
+ tli._dot_result_type,
+ sycl_dev,
+ acceptance_fn=_acceptance_fn_default_binary,
+ )
+ if res_dt is None:
+ raise ValueError(
+ "function 'matmul' does not support input types "
+ f"({x1_dtype}, {x2_dtype}), "
+ "and the inputs could not be safely coerced to any "
+ "supported types according to the casting rule ''safe''."
+ )
+ else:
+ res_dt = dpt.dtype(dtype)
+ res_dt = _to_device_supported_dtype(res_dt, sycl_dev)
+ buf1_dt, buf2_dt = None, None
+ if x1_dtype != res_dt:
+ if dpt.can_cast(x1_dtype, res_dt, casting="same_kind"):
+ buf1_dt = res_dt
+ else:
+ raise ValueError(
+ f"`matmul` input `x1` cannot be cast from {x1_dtype} to "
+ f"requested type {res_dt} according to the casting rule "
+ "''same_kind''."
+ )
+ if x2_dtype != res_dt:
+ if dpt.can_cast(x2_dtype, res_dt, casting="same_kind"):
+ buf2_dt = res_dt
+ else:
+ raise ValueError(
+ f"`matmul` input `x2` cannot be cast from {x2_dtype} to "
+ f"requested type {res_dt} according to the casting rule "
+ "''same_kind''."
+ )
+
+ orig_out = out
+ if out is not None:
+ if not isinstance(out, dpt.usm_ndarray):
+ raise TypeError(
+ f"output array must be of usm_ndarray type, got {type(out)}"
+ )
+
+ if out.shape != res_shape:
+ raise ValueError(
+ "The shape of input and output arrays are inconsistent. "
+ f"Expected output shape is {res_shape}, got {out.shape}"
+ )
+
+ if res_dt != out.dtype:
+ raise ValueError(
+                f"Output array of type {res_dt} is needed, got {out.dtype}"
+ )
+
+ if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
+ raise ExecutionPlacementError(
+ "Input and output allocation queues are not compatible"
+ )
+
+ if ti._array_overlap(x1, out) and buf1_dt is None:
+ out = dpt.empty_like(out)
+
+ if ti._array_overlap(x2, out) and buf2_dt is None:
+ # should not reach if out is reallocated
+ # after being checked against x1
+ out = dpt.empty_like(out)
+
+ if buf1_dt is None and buf2_dt is None:
+ if out is None:
+ if order == "K":
+ out = _empty_like_pair_orderK(
+ x1, x2, res_dt, res_shape, res_usm_type, exec_q
+ )
+ else:
+ if order == "A":
+ order = (
+ "F"
+ if all(
+ arr.flags.f_contiguous
+ for arr in (
+ x1,
+ x2,
+ )
+ )
+ else "C"
+ )
+ out = dpt.empty(
+ res_shape,
+ dtype=res_dt,
+ usm_type=res_usm_type,
+ sycl_queue=exec_q,
+ order=order,
+ )
+ if x1.shape != x1_broadcast_shape:
+ x1 = dpt.broadcast_to(x1, x1_broadcast_shape)
+ if x2.shape != x2_broadcast_shape:
+ x2 = dpt.broadcast_to(x2, x2_broadcast_shape)
+ ht_dot_ev, dot_ev = tli._dot(
+ x1=x1,
+ x2=x2,
+ batch_dims=len(res_shape[:-2]),
+ x1_outer_dims=1,
+ x2_outer_dims=1,
+ inner_dims=1,
+ dst=out,
+ sycl_queue=exec_q,
+ )
+ if not (orig_out is None or orig_out is out):
+ # Copy the out data from temporary buffer to original memory
+ ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray(
+ src=out,
+ dst=orig_out,
+ sycl_queue=exec_q,
+ depends=[dot_ev],
+ )
+ ht_copy_out_ev.wait()
+ out = orig_out
+ ht_dot_ev.wait()
+ if appended_axes:
+ out = dpt.squeeze(out, tuple(appended_axes))
+ return out
+ elif buf1_dt is None:
+ if order == "K":
+ buf2 = _empty_like_orderK(x2, buf2_dt)
+ else:
+ if order == "A":
+ order = "F" if x1.flags.f_contiguous else "C"
+ buf2 = dpt.empty_like(x2, dtype=buf2_dt, order=order)
+ ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+ src=x2, dst=buf2, sycl_queue=exec_q
+ )
+ if out is None:
+ if order == "K":
+ out = _empty_like_pair_orderK(
+ x1, buf2, res_dt, res_shape, res_usm_type, exec_q
+ )
+ else:
+ out = dpt.empty(
+ res_shape,
+ dtype=res_dt,
+ usm_type=res_usm_type,
+ sycl_queue=exec_q,
+ order=order,
+ )
+
+ if x1.shape != x1_broadcast_shape:
+ x1 = dpt.broadcast_to(x1, x1_broadcast_shape)
+ if buf2.shape != x2_broadcast_shape:
+ buf2 = dpt.broadcast_to(buf2, x2_broadcast_shape)
+ ht_dot_ev, dot_ev = tli._dot(
+ x1=x1,
+ x2=buf2,
+ batch_dims=len(res_shape[:-2]),
+ x1_outer_dims=1,
+ x2_outer_dims=1,
+ inner_dims=1,
+ dst=out,
+ sycl_queue=exec_q,
+ depends=[copy_ev],
+ )
+ if not (orig_out is None or orig_out is out):
+ # Copy the out data from temporary buffer to original memory
+ ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray(
+ src=out,
+ dst=orig_out,
+ sycl_queue=exec_q,
+ depends=[dot_ev],
+ )
+ ht_copy_out_ev.wait()
+ out = orig_out
+ ht_copy_ev.wait()
+ ht_dot_ev.wait()
+ if appended_axes:
+ out = dpt.squeeze(out, tuple(appended_axes))
+ return out
+
+ elif buf2_dt is None:
+ if order == "K":
+ buf1 = _empty_like_orderK(x1, buf1_dt)
+ else:
+ if order == "A":
+ order = "F" if x1.flags.f_contiguous else "C"
+ buf1 = dpt.empty_like(x1, dtype=buf1_dt, order=order)
+ ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+ src=x1, dst=buf1, sycl_queue=exec_q
+ )
+ if out is None:
+ if order == "K":
+ out = _empty_like_pair_orderK(
+ buf1, x2, res_dt, res_shape, res_usm_type, exec_q
+ )
+ else:
+ out = dpt.empty(
+ res_shape,
+ dtype=res_dt,
+ usm_type=res_usm_type,
+ sycl_queue=exec_q,
+ order=order,
+ )
+
+ if buf1.shape != x1_broadcast_shape:
+ buf1 = dpt.broadcast_to(buf1, x1_broadcast_shape)
+ if x2.shape != x2_broadcast_shape:
+ x2 = dpt.broadcast_to(x2, x2_broadcast_shape)
+ ht_dot_ev, dot_ev = tli._dot(
+ x1=buf1,
+ x2=x2,
+ batch_dims=len(res_shape[:-2]),
+ x1_outer_dims=1,
+ x2_outer_dims=1,
+ inner_dims=1,
+ dst=out,
+ sycl_queue=exec_q,
+ depends=[copy_ev],
+ )
+ if not (orig_out is None or orig_out is out):
+ # Copy the out data from temporary buffer to original memory
+ ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray(
+ src=out,
+ dst=orig_out,
+ sycl_queue=exec_q,
+ depends=[dot_ev],
+ )
+ ht_copy_out_ev.wait()
+ out = orig_out
+ ht_copy_ev.wait()
+ ht_dot_ev.wait()
+ if appended_axes:
+ out = dpt.squeeze(out, tuple(appended_axes))
+ return out
+
+ if order in ["K", "A"]:
+ if x1.flags.f_contiguous and x2.flags.f_contiguous:
+ order = "F"
+ elif x1.flags.c_contiguous and x2.flags.c_contiguous:
+ order = "C"
+ else:
+ order = "C" if order == "A" else "K"
+ if order == "K":
+ buf1 = _empty_like_orderK(x1, buf1_dt)
+ else:
+ buf1 = dpt.empty_like(x1, dtype=buf1_dt, order=order)
+ ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+ src=x1, dst=buf1, sycl_queue=exec_q
+ )
+ if order == "K":
+ buf2 = _empty_like_orderK(x2, buf2_dt)
+ else:
+ buf2 = dpt.empty_like(x2, dtype=buf2_dt, order=order)
+ ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+ src=x2, dst=buf2, sycl_queue=exec_q
+ )
+ if out is None:
+ if order == "K":
+ out = _empty_like_pair_orderK(
+ buf1, buf2, res_dt, res_shape, res_usm_type, exec_q
+ )
+ else:
+ out = dpt.empty(
+ res_shape,
+ dtype=res_dt,
+ usm_type=res_usm_type,
+ sycl_queue=exec_q,
+ order=order,
+ )
+
+ if buf1.shape != x1_broadcast_shape:
+ buf1 = dpt.broadcast_to(buf1, x1_broadcast_shape)
+ if buf2.shape != x2_broadcast_shape:
+ buf2 = dpt.broadcast_to(buf2, x2_broadcast_shape)
+ ht_, _ = tli._dot(
+ x1=buf1,
+ x2=buf2,
+ batch_dims=len(res_shape[:-2]),
+ x1_outer_dims=1,
+ x2_outer_dims=1,
+ inner_dims=1,
+ dst=out,
+ sycl_queue=exec_q,
+ depends=[copy1_ev, copy2_ev],
+ )
+ dpctl.SyclEvent.wait_for([ht_copy1_ev, ht_copy2_ev, ht_])
+ if appended_axes:
+ out = dpt.squeeze(out, tuple(appended_axes))
+ return out
diff --git a/dpctl/tensor/_type_utils.py b/dpctl/tensor/_type_utils.py
index 3021db1841..144215e2d6 100644
--- a/dpctl/tensor/_type_utils.py
+++ b/dpctl/tensor/_type_utils.py
@@ -13,6 +13,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+from __future__ import annotations
import numpy as np
@@ -662,6 +663,48 @@ def _supported_dtype(dtypes):
return True
+def isdtype(dtype, kind):
+ """isdtype(dtype, kind)
+
+ Returns a boolean indicating whether a provided `dtype` is
+ of a specified data type `kind`.
+
+    See the [array API][array_api] specification for more information.
+
+ [array_api]: https://data-apis.org/array-api/latest/
+ """
+
+ if not isinstance(dtype, np.dtype):
+ raise TypeError(f"Expected instance of `dpt.dtype`, got {dtype}")
+
+ if isinstance(kind, np.dtype):
+ return dtype == kind
+
+ elif isinstance(kind, str):
+ if kind == "bool":
+ return dtype == np.dtype("bool")
+ elif kind == "signed integer":
+ return dtype.kind == "i"
+ elif kind == "unsigned integer":
+ return dtype.kind == "u"
+ elif kind == "integral":
+ return dtype.kind in "iu"
+ elif kind == "real floating":
+ return dtype.kind == "f"
+ elif kind == "complex floating":
+ return dtype.kind == "c"
+ elif kind == "numeric":
+ return dtype.kind in "iufc"
+ else:
+ raise ValueError(f"Unrecognized data type kind: {kind}")
+
+ elif isinstance(kind, tuple):
+ return any(isdtype(dtype, k) for k in kind)
+
+ else:
+ raise TypeError(f"Unsupported data type kind: {kind}")
+
+
__all__ = [
"_find_buf_dtype",
"_find_buf_dtype2",
@@ -676,6 +719,7 @@ def _supported_dtype(dtypes):
"can_cast",
"finfo",
"iinfo",
+ "isdtype",
"result_type",
"WeakBooleanType",
"WeakIntegralType",
diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx
index 284de1cbe1..67e144f798 100644
--- a/dpctl/tensor/_usmarray.pyx
+++ b/dpctl/tensor/_usmarray.pyx
@@ -907,15 +907,15 @@ cdef class usm_ndarray:
def __abs__(self):
return dpctl.tensor.abs(self)
- def __add__(first, other):
+ def __add__(self, other):
"""
Implementation for operator.add
"""
- return dpctl.tensor.add(first, other)
+ return dpctl.tensor.add(self, other)
- def __and__(first, other):
+ def __and__(self, other):
"Implementation for operator.and"
- return dpctl.tensor.bitwise_and(first, other)
+ return dpctl.tensor.bitwise_and(self, other)
def __dlpack__(self, stream=None):
"""
@@ -963,8 +963,8 @@ cdef class usm_ndarray:
def __eq__(self, other):
return dpctl.tensor.equal(self, other)
- def __floordiv__(first, other):
- return dpctl.tensor.floor_divide(first, other)
+ def __floordiv__(self, other):
+ return dpctl.tensor.floor_divide(self, other)
def __ge__(self, other):
return dpctl.tensor.greater_equal(self, other)
@@ -984,21 +984,20 @@ cdef class usm_ndarray:
else:
raise TypeError("len() of unsized object")
- def __lshift__(first, other):
- "See comment in __add__"
- return dpctl.tensor.bitwise_left_shift(first, other)
+ def __lshift__(self, other):
+ return dpctl.tensor.bitwise_left_shift(self, other)
def __lt__(self, other):
return dpctl.tensor.less(self, other)
- def __matmul__(first, other):
- return NotImplemented
+ def __matmul__(self, other):
+ return dpctl.tensor.matmul(self, other)
- def __mod__(first, other):
- return dpctl.tensor.remainder(first, other)
+ def __mod__(self, other):
+ return dpctl.tensor.remainder(self, other)
- def __mul__(first, other):
- return dpctl.tensor.multiply(first, other)
+ def __mul__(self, other):
+ return dpctl.tensor.multiply(self, other)
def __ne__(self, other):
return dpctl.tensor.not_equal(self, other)
@@ -1006,20 +1005,17 @@ cdef class usm_ndarray:
def __neg__(self):
return dpctl.tensor.negative(self)
- def __or__(first, other):
- return dpctl.tensor.bitwise_or(first, other)
+ def __or__(self, other):
+ return dpctl.tensor.bitwise_or(self, other)
def __pos__(self):
return dpctl.tensor.positive(self)
- def __pow__(first, other, mod):
- if mod is None:
- return dpctl.tensor.pow(first, other)
- else:
- return NotImplemented
+ def __pow__(self, other):
+ return dpctl.tensor.pow(self, other)
- def __rshift__(first, other):
- return dpctl.tensor.bitwise_right_shift(first, other)
+ def __rshift__(self, other):
+ return dpctl.tensor.bitwise_right_shift(self, other)
def __setitem__(self, key, rhs):
cdef tuple _meta
@@ -1109,14 +1105,14 @@ cdef class usm_ndarray:
return
- def __sub__(first, other):
- return dpctl.tensor.subtract(first, other)
+ def __sub__(self, other):
+ return dpctl.tensor.subtract(self, other)
- def __truediv__(first, other):
- return dpctl.tensor.divide(first, other)
+ def __truediv__(self, other):
+ return dpctl.tensor.divide(self, other)
- def __xor__(first, other):
- return dpctl.tensor.bitwise_xor(first, other)
+ def __xor__(self, other):
+ return dpctl.tensor.bitwise_xor(self, other)
def __radd__(self, other):
return dpctl.tensor.add(other, self)
@@ -1131,7 +1127,7 @@ cdef class usm_ndarray:
return dpctl.tensor.bitwise_left_shift(other, self)
def __rmatmul__(self, other):
- return NotImplemented
+ return dpctl.tensor.matmul(other, self)
def __rmod__(self, other):
return dpctl.tensor.remainder(other, self)
@@ -1170,11 +1166,7 @@ cdef class usm_ndarray:
return dpctl.tensor.bitwise_left_shift(self, other, out=self)
def __imatmul__(self, other):
- res = self.__matmul__(other)
- if res is NotImplemented:
- return res
- self.__setitem__(Ellipsis, res)
- return self
+ return dpctl.tensor.matmul(self, other, out=self)
def __imod__(self, other):
return dpctl.tensor.remainder(self, other, out=self)
diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp
index 491fb12126..ed06d9a774 100644
--- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp
@@ -26,13 +26,13 @@
#include
#include
#include
-#include
#include
#include
#include
+#include "dpctl_tensor_types.hpp"
#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
namespace dpctl
{
@@ -43,8 +43,6 @@ namespace kernels
namespace accumulators
{
-namespace py = pybind11;
-
using namespace dpctl::tensor::offset_utils;
template T ceiling_quotient(T n, T m)
@@ -437,7 +435,7 @@ typedef size_t (*accumulate_strided_impl_fn_ptr_t)(
size_t,
const char *,
int,
- const py::ssize_t *,
+ const ssize_t *,
char *,
std::vector &,
const std::vector &);
@@ -447,7 +445,7 @@ size_t accumulate_strided_impl(sycl::queue &q,
size_t n_elems,
const char *mask,
int nd,
- const py::ssize_t *shape_strides,
+ const ssize_t *shape_strides,
char *cumsum,
std::vector &host_tasks,
const std::vector &depends = {})
diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
index 522baadc6d..46468de2e0 100644
--- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
@@ -25,13 +25,13 @@
#pragma once
#include
#include
-#include
#include
#include
#include
+#include "dpctl_tensor_types.hpp"
#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
namespace dpctl
{
@@ -42,8 +42,6 @@ namespace kernels
namespace indexing
{
-namespace py = pybind11;
-
using namespace dpctl::tensor::offset_utils;
template (orthog_i));
+ orthog_src_dst_indexer(static_cast(orthog_i));
size_t total_src_offset = masked_src_indexer(masked_i) +
orthog_offsets.get_first_offset();
@@ -161,7 +159,7 @@ struct MaskedPlaceStridedFunctor
// + 1 : 1)
if (mask_set) {
auto orthog_offsets =
- orthog_dst_rhs_indexer(static_cast(orthog_i));
+ orthog_dst_rhs_indexer(static_cast(orthog_i));
size_t total_dst_offset = masked_dst_indexer(masked_i) +
orthog_offsets.get_first_offset();
@@ -199,28 +197,28 @@ class masked_extract_all_slices_strided_impl_krn;
typedef sycl::event (*masked_extract_all_slices_strided_impl_fn_ptr_t)(
sycl::queue &,
- py::ssize_t,
+ ssize_t,
const char *,
const char *,
char *,
int,
- py::ssize_t const *,
- py::ssize_t,
- py::ssize_t,
+ ssize_t const *,
+ ssize_t,
+ ssize_t,
const std::vector &);
template
sycl::event masked_extract_all_slices_strided_impl(
sycl::queue &exec_q,
- py::ssize_t iteration_size,
+ ssize_t iteration_size,
const char *src_p,
const char *cumsum_p,
char *dst_p,
int nd,
- const py::ssize_t
+ const ssize_t
*packed_src_shape_strides, // [src_shape, src_strides], length 2*nd
- py::ssize_t dst_size, // dst is 1D
- py::ssize_t dst_stride,
+ ssize_t dst_size, // dst is 1D
+ ssize_t dst_stride,
const std::vector &depends = {})
{
// using MaskedExtractStridedFunctor;
@@ -230,7 +228,7 @@ sycl::event masked_extract_all_slices_strided_impl(
TwoZeroOffsets_Indexer orthog_src_dst_indexer{};
- /* StridedIndexer(int _nd, py::ssize_t _offset, py::ssize_t const
+ /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const
* *_packed_shape_strides) */
StridedIndexer masked_src_indexer(nd, 0, packed_src_shape_strides);
Strided1DIndexer masked_dst_indexer(0, dst_size, dst_stride);
@@ -254,19 +252,19 @@ sycl::event masked_extract_all_slices_strided_impl(
typedef sycl::event (*masked_extract_some_slices_strided_impl_fn_ptr_t)(
sycl::queue &,
- py::ssize_t,
- py::ssize_t,
+ ssize_t,
+ ssize_t,
const char *,
const char *,
char *,
int,
- py::ssize_t const *,
- py::ssize_t,
- py::ssize_t,
+ ssize_t const *,
+ ssize_t,
+ ssize_t,
int,
- py::ssize_t const *,
- py::ssize_t,
- py::ssize_t,
+ ssize_t const *,
+ ssize_t,
+ ssize_t,
const std::vector &);
template
sycl::event masked_extract_some_slices_strided_impl(
sycl::queue &exec_q,
- py::ssize_t orthog_nelems,
- py::ssize_t masked_nelems,
+ ssize_t orthog_nelems,
+ ssize_t masked_nelems,
const char *src_p,
const char *cumsum_p,
char *dst_p,
int orthog_nd,
- const py::ssize_t
+ const ssize_t
*packed_ortho_src_dst_shape_strides, // [ortho_shape, ortho_src_strides,
// ortho_dst_strides], length
// 3*ortho_nd
- py::ssize_t ortho_src_offset,
- py::ssize_t ortho_dst_offset,
+ ssize_t ortho_src_offset,
+ ssize_t ortho_dst_offset,
int masked_nd,
- const py::ssize_t *packed_masked_src_shape_strides, // [masked_src_shape,
- // masked_src_strides],
- // length 2*masked_nd
- py::ssize_t masked_dst_size, // mask_dst is 1D
- py::ssize_t masked_dst_stride,
+ const ssize_t *packed_masked_src_shape_strides, // [masked_src_shape,
+ // masked_src_strides],
+ // length 2*masked_nd
+ ssize_t masked_dst_size, // mask_dst is 1D
+ ssize_t masked_dst_stride,
const std::vector &depends = {})
{
// using MaskedExtractStridedFunctor;
@@ -381,33 +379,33 @@ class masked_place_all_slices_strided_impl_krn;
typedef sycl::event (*masked_place_all_slices_strided_impl_fn_ptr_t)(
sycl::queue &,
- py::ssize_t,
+ ssize_t,
char *,
const char *,
const char *,
int,
- py::ssize_t const *,
- py::ssize_t,
- py::ssize_t,
+ ssize_t const *,
+ ssize_t,
+ ssize_t,
const std::vector &);
template
sycl::event masked_place_all_slices_strided_impl(
sycl::queue &exec_q,
- py::ssize_t iteration_size,
+ ssize_t iteration_size,
char *dst_p,
const char *cumsum_p,
const char *rhs_p,
int nd,
- const py::ssize_t
+ const ssize_t
*packed_dst_shape_strides, // [dst_shape, dst_strides], length 2*nd
- py::ssize_t rhs_size, // rhs is 1D
- py::ssize_t rhs_stride,
+ ssize_t rhs_size, // rhs is 1D
+ ssize_t rhs_stride,
const std::vector &depends = {})
{
TwoZeroOffsets_Indexer orthog_dst_rhs_indexer{};
- /* StridedIndexer(int _nd, py::ssize_t _offset, py::ssize_t const
+ /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const
* *_packed_shape_strides) */
StridedIndexer masked_dst_indexer(nd, 0, packed_dst_shape_strides);
Strided1DCyclicIndexer masked_rhs_indexer(0, rhs_size, rhs_stride);
@@ -431,19 +429,19 @@ sycl::event masked_place_all_slices_strided_impl(
typedef sycl::event (*masked_place_some_slices_strided_impl_fn_ptr_t)(
sycl::queue &,
- py::ssize_t,
- py::ssize_t,
+ ssize_t,
+ ssize_t,
char *,
const char *,
const char *,
int,
- py::ssize_t const *,
- py::ssize_t,
- py::ssize_t,
+ ssize_t const *,
+ ssize_t,
+ ssize_t,
int,
- py::ssize_t const *,
- py::ssize_t,
- py::ssize_t,
+ ssize_t const *,
+ ssize_t,
+ ssize_t,
const std::vector &);
template
sycl::event masked_place_some_slices_strided_impl(
sycl::queue &exec_q,
- py::ssize_t orthog_nelems,
- py::ssize_t masked_nelems,
+ ssize_t orthog_nelems,
+ ssize_t masked_nelems,
char *dst_p,
const char *cumsum_p,
const char *rhs_p,
int orthog_nd,
- const py::ssize_t
+ const ssize_t
*packed_ortho_dst_rhs_shape_strides, // [ortho_shape, ortho_dst_strides,
// ortho_rhs_strides], length
// 3*ortho_nd
- py::ssize_t ortho_dst_offset,
- py::ssize_t ortho_rhs_offset,
+ ssize_t ortho_dst_offset,
+ ssize_t ortho_rhs_offset,
int masked_nd,
- const py::ssize_t *packed_masked_dst_shape_strides, // [masked_dst_shape,
- // masked_dst_strides],
- // length 2*masked_nd
- py::ssize_t masked_rhs_size, // mask_dst is 1D
- py::ssize_t masked_rhs_stride,
+ const ssize_t *packed_masked_dst_shape_strides, // [masked_dst_shape,
+ // masked_dst_strides],
+ // length 2*masked_nd
+ ssize_t masked_rhs_size, // mask_dst is 1D
+ ssize_t masked_rhs_stride,
const std::vector &depends = {})
{
TwoOffsets_StridedIndexer orthog_dst_rhs_indexer{
orthog_nd, ortho_dst_offset, ortho_rhs_offset,
packed_ortho_dst_rhs_shape_strides};
- /* StridedIndexer(int _nd, py::ssize_t _offset, py::ssize_t const
+ /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const
* *_packed_shape_strides) */
StridedIndexer masked_dst_indexer{masked_nd, 0,
packed_masked_dst_shape_strides};
@@ -550,22 +548,22 @@ template class non_zero_indexes_krn;
typedef sycl::event (*non_zero_indexes_fn_ptr_t)(
sycl::queue &,
- py::ssize_t,
- py::ssize_t,
+ ssize_t,
+ ssize_t,
int,
const char *,
char *,
- const py::ssize_t *,
+ const ssize_t *,
std::vector const &);
template
sycl::event non_zero_indexes_impl(sycl::queue &exec_q,
- py::ssize_t iter_size,
- py::ssize_t nz_elems,
+ ssize_t iter_size,
+ ssize_t nz_elems,
int nd,
const char *cumsum_cp,
char *indexes_cp,
- const py::ssize_t *mask_shape,
+ const ssize_t *mask_shape,
std::vector const &depends)
{
const indT1 *cumsum_data = reinterpret_cast(cumsum_cp);
@@ -582,11 +580,11 @@ sycl::event non_zero_indexes_impl(sycl::queue &exec_q,
auto cs_prev_val = (i > 0) ? cumsum_data[i - 1] : indT1(0);
bool cond = (cs_curr_val == cs_prev_val);
- py::ssize_t i_ = static_cast(i);
+ ssize_t i_ = static_cast(i);
for (int dim = nd; --dim > 0;) {
auto sd = mask_shape[dim];
- py::ssize_t q = i_ / sd;
- py::ssize_t r = (i_ - q * sd);
+ ssize_t q = i_ / sd;
+ ssize_t r = (i_ - q * sd);
if (cond) {
indexes_data[cs_curr_val + dim * nz_elems] =
static_cast(r);
diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp
index 877680c8bf..ee64bd2e44 100644
--- a/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp
@@ -31,15 +31,12 @@
#include
#include
-#include "pybind11/pybind11.h"
-
+#include "dpctl_tensor_types.hpp"
#include "utils/offset_utils.hpp"
#include "utils/sycl_utils.hpp"
-#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
#include "utils/type_utils.hpp"
-namespace py = pybind11;
-
namespace dpctl
{
namespace tensor
@@ -179,16 +176,16 @@ struct SequentialBooleanReduction
{
auto const &inp_out_iter_offsets_ = inp_out_iter_indexer_(id[0]);
- const py::ssize_t &inp_iter_offset =
+ const ssize_t &inp_iter_offset =
inp_out_iter_offsets_.get_first_offset();
- const py::ssize_t &out_iter_offset =
+ const ssize_t &out_iter_offset =
inp_out_iter_offsets_.get_second_offset();
outT red_val(identity_);
for (size_t m = 0; m < reduction_max_gid_; ++m) {
- py::ssize_t inp_reduction_offset =
- static_cast(inp_reduced_dims_indexer_(m));
- py::ssize_t inp_offset = inp_iter_offset + inp_reduction_offset;
+ ssize_t inp_reduction_offset =
+ static_cast(inp_reduced_dims_indexer_(m));
+ ssize_t inp_offset = inp_iter_offset + inp_reduction_offset;
// must convert to boolean first to handle nans
using dpctl::tensor::type_utils::convert_impl;
@@ -249,9 +246,9 @@ typedef sycl::event (*boolean_reduction_contig_impl_fn_ptr)(
size_t,
const char *,
char *,
- py::ssize_t,
- py::ssize_t,
- py::ssize_t,
+ ssize_t,
+ ssize_t,
+ ssize_t,
const std::vector &);
template
@@ -269,9 +266,9 @@ boolean_reduction_axis1_contig_impl(sycl::queue &exec_q,
size_t reduction_nelems,
const char *arg_cp,
char *res_cp,
- py::ssize_t iter_arg_offset,
- py::ssize_t iter_res_offset,
- py::ssize_t red_arg_offset,
+ ssize_t iter_arg_offset,
+ ssize_t iter_res_offset,
+ ssize_t red_arg_offset,
const std::vector &depends)
{
const argTy *arg_tp = reinterpret_cast(arg_cp) +
@@ -298,8 +295,8 @@ boolean_reduction_axis1_contig_impl(sycl::queue &exec_q,
using ReductionIndexerT = NoOpIndexerT;
InputOutputIterIndexerT in_out_iter_indexer{
- InputIterIndexerT{0, static_cast(iter_nelems),
- static_cast(reduction_nelems)},
+ InputIterIndexerT{0, static_cast(iter_nelems),
+ static_cast(reduction_nelems)},
NoOpIndexerT{}};
ReductionIndexerT reduction_indexer{};
@@ -425,9 +422,9 @@ struct StridedBooleanReduction
const size_t wg_size = it.get_local_range(0);
auto inp_out_iter_offsets_ = inp_out_iter_indexer_(reduction_id);
- const py::ssize_t &inp_iter_offset =
+ const ssize_t &inp_iter_offset =
inp_out_iter_offsets_.get_first_offset();
- const py::ssize_t &out_iter_offset =
+ const ssize_t &out_iter_offset =
inp_out_iter_offsets_.get_second_offset();
outT local_red_val(identity_);
@@ -438,9 +435,9 @@ struct StridedBooleanReduction
for (size_t arg_reduce_gid = arg_reduce_gid0;
arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg_size)
{
- py::ssize_t inp_reduction_offset = static_cast(
- inp_reduced_dims_indexer_(arg_reduce_gid));
- py::ssize_t inp_offset = inp_iter_offset + inp_reduction_offset;
+ ssize_t inp_reduction_offset =
+ static_cast(inp_reduced_dims_indexer_(arg_reduce_gid));
+ ssize_t inp_offset = inp_iter_offset + inp_reduction_offset;
// must convert to boolean first to handle nans
using dpctl::tensor::type_utils::convert_impl;
@@ -470,9 +467,9 @@ boolean_reduction_axis0_contig_impl(sycl::queue &exec_q,
size_t reduction_nelems,
const char *arg_cp,
char *res_cp,
- py::ssize_t iter_arg_offset,
- py::ssize_t iter_res_offset,
- py::ssize_t red_arg_offset,
+ ssize_t iter_arg_offset,
+ ssize_t iter_res_offset,
+ ssize_t red_arg_offset,
const std::vector &depends)
{
const argTy *arg_tp = reinterpret_cast(arg_cp) +
@@ -507,8 +504,8 @@ boolean_reduction_axis0_contig_impl(sycl::queue &exec_q,
InputOutputIterIndexerT in_out_iter_indexer{columns_indexer,
result_indexer};
ReductionIndexerT reduction_indexer{
- 0, static_cast(reduction_nelems),
- static_cast(iter_nelems)};
+ 0, static_cast(reduction_nelems),
+ static_cast(iter_nelems)};
constexpr size_t preferred_reductions_per_wi = 4;
size_t reductions_per_wi =
@@ -582,12 +579,12 @@ typedef sycl::event (*boolean_reduction_strided_impl_fn_ptr)(
const char *,
char *,
int,
- const py::ssize_t *,
- py::ssize_t,
- py::ssize_t,
+ const ssize_t *,
+ ssize_t,
+ ssize_t,
int,
- const py::ssize_t *,
- py::ssize_t,
+ const ssize_t *,
+ ssize_t,
const std::vector &);
template
@@ -598,12 +595,12 @@ boolean_reduction_strided_impl(sycl::queue &exec_q,
const char *arg_cp,
char *res_cp,
int iter_nd,
- const py::ssize_t *iter_shape_and_strides,
- py::ssize_t iter_arg_offset,
- py::ssize_t iter_res_offset,
+ const ssize_t *iter_shape_and_strides,
+ ssize_t iter_arg_offset,
+ ssize_t iter_res_offset,
int red_nd,
- const py::ssize_t *reduction_shape_stride,
- py::ssize_t reduction_arg_offset,
+ const ssize_t *reduction_shape_stride,
+ ssize_t reduction_arg_offset,
const std::vector &depends)
{
const argTy *arg_tp = reinterpret_cast(arg_cp);
@@ -647,8 +644,8 @@ boolean_reduction_strided_impl(sycl::queue &exec_q,
using IndexerT =
dpctl::tensor::offset_utils::UnpackedStridedIndexer;
- const py::ssize_t *const &res_shape = iter_shape_and_strides;
- const py::ssize_t *const &res_strides =
+ const ssize_t *const &res_shape = iter_shape_and_strides;
+ const ssize_t *const &res_strides =
iter_shape_and_strides + 2 * iter_nd;
IndexerT res_indexer(iter_nd, iter_res_offset, res_shape,
res_strides);
diff --git a/dpctl/tensor/libtensor/include/kernels/clip.hpp b/dpctl/tensor/libtensor/include/kernels/clip.hpp
index aff1acb071..6d9bae6ed5 100644
--- a/dpctl/tensor/libtensor/include/kernels/clip.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/clip.hpp
@@ -23,19 +23,16 @@
//===----------------------------------------------------------------------===//
#pragma once
-#include "pybind11/numpy.h"
-#include "pybind11/stl.h"
-#include
#include
#include
#include
-#include
+#include
#include
+#include "dpctl_tensor_types.hpp"
#include "kernels/alignment.hpp"
#include "utils/math_utils.hpp"
#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch.hpp"
#include "utils/type_utils.hpp"
namespace dpctl
@@ -47,9 +44,6 @@ namespace kernels
namespace clip
{
-namespace py = pybind11;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
using namespace dpctl::tensor::offset_utils;
using dpctl::tensor::kernels::alignment_utils::
@@ -257,7 +251,7 @@ template class ClipStridedFunctor
void operator()(sycl::id<1> id) const
{
size_t gid = id[0];
- auto offsets = indexer(static_cast(gid));
+ auto offsets = indexer(static_cast(gid));
dst_p[offsets.get_fourth_offset()] = clip(
x_p[offsets.get_first_offset()], min_p[offsets.get_second_offset()],
max_p[offsets.get_third_offset()]);
@@ -274,11 +268,11 @@ typedef sycl::event (*clip_strided_impl_fn_ptr_t)(
const char *,
const char *,
char *,
- const py::ssize_t *,
- py::ssize_t,
- py::ssize_t,
- py::ssize_t,
- py::ssize_t,
+ const ssize_t *,
+ ssize_t,
+ ssize_t,
+ ssize_t,
+ ssize_t,
const std::vector &);
template
@@ -289,11 +283,11 @@ sycl::event clip_strided_impl(sycl::queue &q,
const char *min_cp,
const char *max_cp,
char *dst_cp,
- const py::ssize_t *shape_strides,
- py::ssize_t x_offset,
- py::ssize_t min_offset,
- py::ssize_t max_offset,
- py::ssize_t dst_offset,
+ const ssize_t *shape_strides,
+ ssize_t x_offset,
+ ssize_t min_offset,
+ ssize_t max_offset,
+ ssize_t dst_offset,
const std::vector &depends)
{
const T *x_tp = reinterpret_cast(x_cp);
diff --git a/dpctl/tensor/libtensor/include/kernels/constructors.hpp b/dpctl/tensor/libtensor/include/kernels/constructors.hpp
index c28033d23d..4cab7c213c 100644
--- a/dpctl/tensor/libtensor/include/kernels/constructors.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/constructors.hpp
@@ -24,11 +24,11 @@
//===----------------------------------------------------------------------===//
#pragma once
+#include "dpctl_tensor_types.hpp"
#include "utils/offset_utils.hpp"
#include "utils/strided_iters.hpp"
#include "utils/type_utils.hpp"
#include
-#include
#include
namespace dpctl
@@ -48,37 +48,8 @@ template class linear_sequence_step_kernel;
template class linear_sequence_affine_kernel;
template class eye_kernel;
-namespace py = pybind11;
using namespace dpctl::tensor::offset_utils;
-/* =========== Unboxing Python scalar =============== */
-
-/*!
- * @brief Cast pybind11 class managing Python object to specified type `T`.
- * @defgroup CtorKernels
- */
-template T unbox_py_scalar(const py::object &o)
-{
- return py::cast(o);
-}
-
-template <> inline sycl::half unbox_py_scalar(const py::object &o)
-{
- float tmp = py::cast(o);
- return static_cast(tmp);
-}
-
-// Constructor to populate tensor with linear sequence defined by
-// start and step data
-
-typedef sycl::event (*lin_space_step_fn_ptr_t)(
- sycl::queue &,
- size_t, // num_elements
- const py::object &start,
- const py::object &step,
- char *, // dst_data_ptr
- const std::vector &);
-
template class LinearSequenceStepFunctor
{
private:
@@ -142,74 +113,9 @@ sycl::event lin_space_step_impl(sycl::queue &exec_q,
return lin_space_step_event;
}
-/*!
- * @brief Function to submit kernel to populate given contiguous memory
- * allocation with linear sequence specified by starting value and increment
- * given as Python objects.
- *
- * @param q Sycl queue to which the kernel is submitted
- * @param nelems Length of the sequence
- * @param start Starting value of the sequence as Python object. Must be
- * convertible to array element data type `Ty`.
- * @param step Increment of the sequence as Python object. Must be convertible
- * to array element data type `Ty`.
- * @param array_data Kernel accessible USM pointer to the start of array to be
- * populated.
- * @param depends List of events to wait for before starting computations, if
- * any.
- *
- * @return Event to wait on to ensure that computation completes.
- * @defgroup CtorKernels
- */
-template
-sycl::event lin_space_step_impl(sycl::queue &exec_q,
- size_t nelems,
- const py::object &start,
- const py::object &step,
- char *array_data,
- const std::vector &depends)
-{
- Ty start_v;
- Ty step_v;
- try {
- start_v = unbox_py_scalar(start);
- step_v = unbox_py_scalar(step);
- } catch (const py::error_already_set &e) {
- throw;
- }
-
- auto lin_space_step_event = lin_space_step_impl(
- exec_q, nelems, start_v, step_v, array_data, depends);
-
- return lin_space_step_event;
-}
-
-/*!
- * @brief Factor to get function pointer of type `fnT` for array with elements
- * of type `Ty`.
- * @defgroup CtorKernels
- */
-template struct LinSpaceStepFactory
-{
- fnT get()
- {
- fnT f = lin_space_step_impl;
- return f;
- }
-};
-
// Constructor to populate tensor with linear sequence defined by
// start and and data
-typedef sycl::event (*lin_space_affine_fn_ptr_t)(
- sycl::queue &,
- size_t, // num_elements
- const py::object &start,
- const py::object &end,
- bool include_endpoint,
- char *, // dst_data_ptr
- const std::vector &);
-
template class LinearSequenceAffineFunctor
{
private:
@@ -312,70 +218,8 @@ sycl::event lin_space_affine_impl(sycl::queue &exec_q,
return lin_space_affine_event;
}
-/*!
- * @brief Function to submit kernel to populate given contiguous memory
- * allocation with linear sequence specified by starting and end values given
- * as Python objects.
- *
- * @param exec_q Sycl queue to which kernel is submitted for execution.
- * @param nelems Length of the sequence
- * @param start Stating value of the sequence as Python object. Must be
- * convertible to array data element type `Ty`.
- * @param end End-value of the sequence as Python object. Must be convertible
- * to array data element type `Ty`.
- * @param include_endpoint Whether the end-value is included in the sequence
- * @param array_data Kernel accessible USM pointer to the start of array to be
- * populated.
- * @param depends List of events to wait for before starting computations, if
- * any.
- *
- * @return Event to wait on to ensure that computation completes.
- * @defgroup CtorKernels
- */
-template
-sycl::event lin_space_affine_impl(sycl::queue &exec_q,
- size_t nelems,
- const py::object &start,
- const py::object &end,
- bool include_endpoint,
- char *array_data,
- const std::vector &depends)
-{
- Ty start_v, end_v;
- try {
- start_v = unbox_py_scalar(start);
- end_v = unbox_py_scalar(end);
- } catch (const py::error_already_set &e) {
- throw;
- }
-
- auto lin_space_affine_event = lin_space_affine_impl(
- exec_q, nelems, start_v, end_v, include_endpoint, array_data, depends);
-
- return lin_space_affine_event;
-}
-
-/*!
- * @brief Factory to get function pointer of type `fnT` for array data type
- * `Ty`.
- */
-template struct LinSpaceAffineFactory
-{
- fnT get()
- {
- fnT f = lin_space_affine_impl;
- return f;
- }
-};
-
/* ================ Full ================== */
-typedef sycl::event (*full_contig_fn_ptr_t)(sycl::queue &,
- size_t,
- const py::object &,
- char *,
- const std::vector &);
-
/*!
* @brief Function to submit kernel to fill given contiguous memory allocation
* with specified value.
@@ -408,58 +252,13 @@ sycl::event full_contig_impl(sycl::queue &q,
return fill_ev;
}
-/*!
- * @brief Function to submit kernel to fill given contiguous memory allocation
- * with specified value.
- *
- * @param exec_q Sycl queue to which kernel is submitted for execution.
- * @param nelems Length of the sequence
- * @param py_value Python object representing the value to fill the array with.
- * Must be convertible to `dstTy`.
- * @param dst_p Kernel accessible USM pointer to the start of array to be
- * populated.
- * @param depends List of events to wait for before starting computations, if
- * any.
- *
- * @return Event to wait on to ensure that computation completes.
- * @defgroup CtorKernels
- */
-template
-sycl::event full_contig_impl(sycl::queue &exec_q,
- size_t nelems,
- const py::object &py_value,
- char *dst_p,
- const std::vector &depends)
-{
- dstTy fill_v;
- try {
- fill_v = unbox_py_scalar(py_value);
- } catch (const py::error_already_set &e) {
- throw;
- }
-
- sycl::event fill_ev =
- full_contig_impl(exec_q, nelems, fill_v, dst_p, depends);
-
- return fill_ev;
-}
-
-template struct FullContigFactory
-{
- fnT get()
- {
- fnT f = full_contig_impl;
- return f;
- }
-};
-
/* ================ Eye ================== */
typedef sycl::event (*eye_fn_ptr_t)(sycl::queue &,
size_t nelems, // num_elements
- py::ssize_t start,
- py::ssize_t end,
- py::ssize_t step,
+ ssize_t start,
+ ssize_t end,
+ ssize_t step,
char *, // dst_data_ptr
const std::vector &);
@@ -467,15 +266,15 @@ template class EyeFunctor
{
private:
Ty *p = nullptr;
- py::ssize_t start_v;
- py::ssize_t end_v;
- py::ssize_t step_v;
+ ssize_t start_v;
+ ssize_t end_v;
+ ssize_t step_v;
public:
EyeFunctor(char *dst_p,
- const py::ssize_t v0,
- const py::ssize_t v1,
- const py::ssize_t dv)
+ const ssize_t v0,
+ const ssize_t v1,
+ const ssize_t dv)
: p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), step_v(dv)
{
}
@@ -483,7 +282,7 @@ template class EyeFunctor
void operator()(sycl::id<1> wiid) const
{
Ty set_v = 0;
- py::ssize_t i = static_cast(wiid.get(0));
+ ssize_t i = static_cast(wiid.get(0));
if (i >= start_v and i <= end_v) {
if ((i - start_v) % step_v == 0) {
set_v = 1;
@@ -511,9 +310,9 @@ template class EyeFunctor
template
sycl::event eye_impl(sycl::queue &exec_q,
size_t nelems,
- const py::ssize_t start,
- const py::ssize_t end,
- const py::ssize_t step,
+ const ssize_t start,
+ const ssize_t end,
+ const ssize_t step,
char *array_data,
const std::vector &depends)
{
@@ -545,13 +344,13 @@ template struct EyeFactory
// define function type
typedef sycl::event (*tri_fn_ptr_t)(sycl::queue &,
- py::ssize_t, // inner_range //py::ssize_t
- py::ssize_t, // outer_range
- char *, // src_data_ptr
- char *, // dst_data_ptr
- py::ssize_t, // nd
- py::ssize_t *, // shape_and_strides
- py::ssize_t, // k
+ ssize_t, // inner_range //ssize_t
+ ssize_t, // outer_range
+ char *, // src_data_ptr
+ char *, // dst_data_ptr
+ ssize_t, // nd
+ ssize_t *, // shape_and_strides
+ ssize_t, // k
const std::vector &,
const std::vector &);
@@ -580,21 +379,21 @@ typedef sycl::event (*tri_fn_ptr_t)(sycl::queue &,
template class tri_kernel;
template
sycl::event tri_impl(sycl::queue &exec_q,
- py::ssize_t inner_range,
- py::ssize_t outer_range,
+ ssize_t inner_range,
+ ssize_t outer_range,
char *src_p,
char *dst_p,
- py::ssize_t nd,
- py::ssize_t *shape_and_strides,
- py::ssize_t k,
+ ssize_t nd,
+ ssize_t *shape_and_strides,
+ ssize_t k,
const std::vector &depends,
const std::vector &additional_depends)
{
constexpr int d2 = 2;
- py::ssize_t src_s = nd;
- py::ssize_t dst_s = 2 * nd;
- py::ssize_t nd_1 = nd - 1;
- py::ssize_t nd_2 = nd - 2;
+ ssize_t src_s = nd;
+ ssize_t dst_s = 2 * nd;
+ ssize_t nd_1 = nd - 1;
+ ssize_t nd_2 = nd - 2;
Ty *src = reinterpret_cast(src_p);
Ty *dst = reinterpret_cast(dst_p);
@@ -606,18 +405,18 @@ sycl::event tri_impl(sycl::queue &exec_q,
cgh.parallel_for>(
sycl::range<1>(inner_range * outer_range), [=](sycl::id<1> idx) {
- py::ssize_t outer_gid = idx[0] / inner_range;
- py::ssize_t inner_gid = idx[0] - inner_range * outer_gid;
+ ssize_t outer_gid = idx[0] / inner_range;
+ ssize_t inner_gid = idx[0] - inner_range * outer_gid;
- py::ssize_t src_inner_offset = 0, dst_inner_offset = 0;
+ ssize_t src_inner_offset = 0, dst_inner_offset = 0;
bool to_copy(true);
{
using dpctl::tensor::strides::CIndexer_array;
- CIndexer_array indexer_i(
+ CIndexer_array indexer_i(
{shape_and_strides[nd_2], shape_and_strides[nd_1]});
indexer_i.set(inner_gid);
- const std::array &inner = indexer_i.get();
+ const std::array &inner = indexer_i.get();
src_inner_offset =
inner[0] * shape_and_strides[src_s + nd_2] +
inner[1] * shape_and_strides[src_s + nd_1];
@@ -631,11 +430,11 @@ sycl::event tri_impl(sycl::queue &exec_q,
to_copy = (inner[0] + k <= inner[1]);
}
- py::ssize_t src_offset = 0;
- py::ssize_t dst_offset = 0;
+ ssize_t src_offset = 0;
+ ssize_t dst_offset = 0;
{
using dpctl::tensor::strides::CIndexer_vector;
- CIndexer_vector outer(nd - d2);
+ CIndexer_vector outer(nd - d2);
outer.get_displacement(
outer_gid, shape_and_strides, shape_and_strides + src_s,
shape_and_strides + dst_s, src_offset, dst_offset);
diff --git a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
index ef24b58ef2..9bf86e560b 100644
--- a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
@@ -25,10 +25,10 @@
#pragma once
#include
#include
-#include
#include
#include
+#include "dpctl_tensor_types.hpp"
#include "kernels/alignment.hpp"
#include "utils/offset_utils.hpp"
#include "utils/type_utils.hpp"
@@ -42,7 +42,6 @@ namespace kernels
namespace copy_and_cast
{
-namespace py = pybind11;
using namespace dpctl::tensor::offset_utils;
using dpctl::tensor::kernels::alignment_utils::
@@ -89,9 +88,9 @@ class GenericCopyFunctor
void operator()(sycl::id<1> wiid) const
{
- const auto &offsets = indexer_(static_cast(wiid.get(0)));
- const py::ssize_t &src_offset = offsets.get_first_offset();
- const py::ssize_t &dst_offset = offsets.get_second_offset();
+ const auto &offsets = indexer_(static_cast(wiid.get(0)));
+ const ssize_t &src_offset = offsets.get_first_offset();
+ const ssize_t &dst_offset = offsets.get_second_offset();
CastFnT fn{};
dst_[dst_offset] = fn(src_[src_offset]);
@@ -109,11 +108,11 @@ typedef sycl::event (*copy_and_cast_generic_fn_ptr_t)(
sycl::queue &,
size_t,
int,
- const py::ssize_t *,
+ const ssize_t *,
const char *,
- py::ssize_t,
+ ssize_t,
char *,
- py::ssize_t,
+ ssize_t,
const std::vector &,
const std::vector &);
@@ -155,11 +154,11 @@ sycl::event
copy_and_cast_generic_impl(sycl::queue &q,
size_t nelems,
int nd,
- const py::ssize_t *shape_and_strides,
+ const ssize_t *shape_and_strides,
const char *src_p,
- py::ssize_t src_offset,
+ ssize_t src_offset,
char *dst_p,
- py::ssize_t dst_offset,
+ ssize_t dst_offset,
const std::vector &depends,
const std::vector &additional_depends)
{
@@ -389,13 +388,13 @@ template struct CopyAndCastContigFactory
typedef sycl::event (*copy_and_cast_1d_fn_ptr_t)(
sycl::queue &,
size_t,
- const std::array,
- const std::array,
- const std::array,
+ const std::array,
+ const std::array,
+ const std::array,
const char *,
- py::ssize_t,
+ ssize_t,
char *,
- py::ssize_t,
+ ssize_t,
const std::vector &);
/*!
@@ -405,13 +404,13 @@ typedef sycl::event (*copy_and_cast_1d_fn_ptr_t)(
typedef sycl::event (*copy_and_cast_2d_fn_ptr_t)(
sycl::queue &,
size_t,
- const std::array,
- const std::array,
- const std::array,
+ const std::array,
+ const std::array,
+ const std::array,
const char *,
- py::ssize_t,
+ ssize_t,
char *,
- py::ssize_t,
+ ssize_t,
const std::vector &);
/*!
@@ -447,13 +446,13 @@ template
sycl::event
copy_and_cast_nd_specialized_impl(sycl::queue &q,
size_t nelems,
- const std::array shape,
- const std::array src_strides,
- const std::array dst_strides,
+ const std::array shape,
+ const std::array src_strides,
+ const std::array dst_strides,
const char *src_p,
- py::ssize_t src_offset,
+ ssize_t src_offset,
char *dst_p,
- py::ssize_t dst_offset,
+ ssize_t dst_offset,
const std::vector &depends)
{
dpctl::tensor::type_utils::validate_type_for_device(q);
@@ -528,9 +527,9 @@ class GenericCopyFromHostFunctor
void operator()(sycl::id<1> wiid) const
{
- const auto &offsets = indexer_(static_cast(wiid.get(0)));
- const py::ssize_t &src_offset = offsets.get_first_offset();
- const py::ssize_t &dst_offset = offsets.get_second_offset();
+ const auto &offsets = indexer_(static_cast(wiid.get(0)));
+ const ssize_t &src_offset = offsets.get_first_offset();
+ const ssize_t &dst_offset = offsets.get_second_offset();
CastFnT fn{};
dst_[dst_offset] = fn(src_acc_[src_offset]);
@@ -541,13 +540,13 @@ typedef void (*copy_and_cast_from_host_blocking_fn_ptr_t)(
sycl::queue &,
size_t,
int,
- py::ssize_t *,
+ ssize_t *,
const char *,
- py::ssize_t,
- py::ssize_t,
- py::ssize_t,
+ ssize_t,
+ ssize_t,
+ ssize_t,
char *,
- py::ssize_t,
+ ssize_t,
const std::vector &,
const std::vector &);
@@ -594,17 +593,17 @@ void copy_and_cast_from_host_impl(
sycl::queue &q,
size_t nelems,
int nd,
- py::ssize_t *shape_and_strides,
+ ssize_t *shape_and_strides,
const char *host_src_p,
- py::ssize_t src_offset,
- py::ssize_t src_min_nelem_offset,
- py::ssize_t src_max_nelem_offset,
+ ssize_t src_offset,
+ ssize_t src_min_nelem_offset,
+ ssize_t src_max_nelem_offset,
char *dst_p,
- py::ssize_t dst_offset,
+ ssize_t dst_offset,
const std::vector &depends,
const std::vector &additional_depends)
{
- py::ssize_t nelems_range = src_max_nelem_offset - src_min_nelem_offset + 1;
+ ssize_t nelems_range = src_max_nelem_offset - src_min_nelem_offset + 1;
dpctl::tensor::type_utils::validate_type_for_device(q);
dpctl::tensor::type_utils::validate_type_for_device(q);
@@ -621,7 +620,7 @@ void copy_and_cast_from_host_impl(
TwoOffsets_StridedIndexer indexer{
nd, src_offset - src_min_nelem_offset, dst_offset,
- const_cast(shape_and_strides)};
+ const_cast(shape_and_strides)};
dstTy *dst_tp = reinterpret_cast(dst_p);
@@ -683,8 +682,8 @@ class GenericCopyForReshapeFunctor
void operator()(sycl::id<1> wiid) const
{
- const py::ssize_t src_offset = src_indexer_(wiid.get(0));
- const py::ssize_t dst_offset = dst_indexer_(wiid.get(0));
+ const ssize_t src_offset = src_indexer_(wiid.get(0));
+ const ssize_t dst_offset = dst_indexer_(wiid.get(0));
dst_p[dst_offset] = src_p[src_offset];
}
@@ -693,12 +692,12 @@ class GenericCopyForReshapeFunctor
// define function type
typedef sycl::event (*copy_for_reshape_fn_ptr_t)(
sycl::queue &,
- size_t, // num_elements
- int, // src_nd
- int, // dst_nd
- py::ssize_t *, // packed shapes and strides
- const char *, // src_data_ptr
- char *, // dst_data_ptr
+ size_t, // num_elements
+ int, // src_nd
+ int, // dst_nd
+ ssize_t *, // packed shapes and strides
+ const char *, // src_data_ptr
+ char *, // dst_data_ptr
const std::vector &);
/*!
@@ -728,7 +727,7 @@ copy_for_reshape_generic_impl(sycl::queue &q,
size_t nelems,
int src_nd,
int dst_nd,
- py::ssize_t *packed_shapes_and_strides,
+ ssize_t *packed_shapes_and_strides,
const char *src_p,
char *dst_p,
const std::vector &depends)
@@ -742,12 +741,11 @@ copy_for_reshape_generic_impl(sycl::queue &q,
// USM array of size 2*(src_nd + dst_nd)
// [ src_shape; src_strides; dst_shape; dst_strides ]
- const py::ssize_t *src_shape_and_strides =
- const_cast(packed_shapes_and_strides);
+ const ssize_t *src_shape_and_strides =
+ const_cast(packed_shapes_and_strides);
- const py::ssize_t *dst_shape_and_strides =
- const_cast(packed_shapes_and_strides +
- (2 * src_nd));
+ const ssize_t *dst_shape_and_strides = const_cast(
+ packed_shapes_and_strides + (2 * src_nd));
StridedIndexer src_indexer{src_nd, 0, src_shape_and_strides};
StridedIndexer dst_indexer{dst_nd, 0, dst_shape_and_strides};
@@ -820,35 +818,34 @@ template struct CompositionIndexer
struct RolledNDIndexer
{
RolledNDIndexer(int nd,
- const py::ssize_t *shape,
- const py::ssize_t *strides,
- const py::ssize_t *ndshifts,
- py::ssize_t starting_offset)
+ const ssize_t *shape,
+ const ssize_t *strides,
+ const ssize_t *ndshifts,
+ ssize_t starting_offset)
: nd_(nd), shape_(shape), strides_(strides), ndshifts_(ndshifts),
starting_offset_(starting_offset)
{
}
- py::ssize_t operator()(size_t gid) const
+ ssize_t operator()(size_t gid) const
{
return compute_offset(gid);
}
private:
int nd_ = -1;
- const py::ssize_t *shape_ = nullptr;
- const py::ssize_t *strides_ = nullptr;
- const py::ssize_t *ndshifts_ = nullptr;
- py::ssize_t starting_offset_ = 0;
+ const ssize_t *shape_ = nullptr;
+ const ssize_t *strides_ = nullptr;
+ const ssize_t *ndshifts_ = nullptr;
+ ssize_t starting_offset_ = 0;
- py::ssize_t compute_offset(py::ssize_t gid) const
+ ssize_t compute_offset(ssize_t gid) const
{
using dpctl::tensor::strides::CIndexer_vector;
CIndexer_vector _ind(nd_);
- py::ssize_t relative_offset_(0);
- _ind.get_left_rolled_displacement(
+ ssize_t relative_offset_(0);
+ _ind.get_left_rolled_displacement(
gid,
shape_, // shape ptr
strides_, // strides ptr
@@ -884,8 +881,8 @@ class StridedCopyForRollFunctor
{
const size_t gid = wiid.get(0);
- const py::ssize_t src_offset = src_indexer_(gid);
- const py::ssize_t dst_offset = dst_indexer_(gid);
+ const ssize_t src_offset = src_indexer_(gid);
+ const ssize_t dst_offset = dst_indexer_(gid);
dst_p[dst_offset] = src_p[src_offset];
}
@@ -894,14 +891,14 @@ class StridedCopyForRollFunctor
// define function type
typedef sycl::event (*copy_for_roll_strided_fn_ptr_t)(
sycl::queue &,
- size_t, // shift
- size_t, // num_elements
- int, // common_nd
- const py::ssize_t *, // packed shapes and strides
- const char *, // src_data_ptr
- py::ssize_t, // src_offset
- char *, // dst_data_ptr
- py::ssize_t, // dst_offset
+ size_t, // shift
+ size_t, // num_elements
+ int, // common_nd
+ const ssize_t *, // packed shapes and strides
+ const char *, // src_data_ptr
+ ssize_t, // src_offset
+ char *, // dst_data_ptr
+ ssize_t, // dst_offset
const std::vector &);
/*!
@@ -929,17 +926,16 @@ typedef sycl::event (*copy_for_roll_strided_fn_ptr_t)(
* @ingroup CopyAndCastKernels
*/
template
-sycl::event
-copy_for_roll_strided_impl(sycl::queue &q,
- size_t shift,
- size_t nelems,
- int nd,
- const py::ssize_t *packed_shapes_and_strides,
- const char *src_p,
- py::ssize_t src_offset,
- char *dst_p,
- py::ssize_t dst_offset,
- const std::vector &depends)
+sycl::event copy_for_roll_strided_impl(sycl::queue &q,
+ size_t shift,
+ size_t nelems,
+ int nd,
+ const ssize_t *packed_shapes_and_strides,
+ const char *src_p,
+ ssize_t src_offset,
+ char *dst_p,
+ ssize_t dst_offset,
+ const std::vector &depends)
{
dpctl::tensor::type_utils::validate_type_for_device(q);
@@ -985,9 +981,9 @@ typedef sycl::event (*copy_for_roll_contig_fn_ptr_t)(
size_t, // shift
size_t, // num_elements
const char *, // src_data_ptr
- py::ssize_t, // src_offset
+ ssize_t, // src_offset
char *, // dst_data_ptr
- py::ssize_t, // dst_offset
+ ssize_t, // dst_offset
const std::vector &);
template class copy_for_roll_contig_kernel;
@@ -1018,9 +1014,9 @@ sycl::event copy_for_roll_contig_impl(sycl::queue &q,
size_t shift,
size_t nelems,
const char *src_p,
- py::ssize_t src_offset,
+ ssize_t src_offset,
char *dst_p,
- py::ssize_t dst_offset,
+ ssize_t dst_offset,
const std::vector &depends)
{
dpctl::tensor::type_utils::validate_type_for_device(q);
@@ -1085,13 +1081,13 @@ class copy_for_roll_ndshift_strided_kernel;
// define function type
typedef sycl::event (*copy_for_roll_ndshift_strided_fn_ptr_t)(
sycl::queue &,
- size_t, // num_elements
- int, // common_nd
- const py::ssize_t *, // packed shape, strides, shifts
- const char *, // src_data_ptr
- py::ssize_t, // src_offset
- char *, // dst_data_ptr
- py::ssize_t, // dst_offset
+ size_t, // num_elements
+ int, // common_nd
+ const ssize_t *, // packed shape, strides, shifts
+ const char *, // src_data_ptr
+ ssize_t, // src_offset
+ char *, // dst_data_ptr
+ ssize_t, // dst_offset
const std::vector &);
template
@@ -1099,11 +1095,11 @@ sycl::event copy_for_roll_ndshift_strided_impl(
sycl::queue &q,
size_t nelems,
int nd,
- const py::ssize_t *packed_shapes_and_strides_and_shifts,
+ const ssize_t *packed_shapes_and_strides_and_shifts,
const char *src_p,
- py::ssize_t src_offset,
+ ssize_t src_offset,
char *dst_p,
- py::ssize_t dst_offset,
+ ssize_t dst_offset,
const std::vector &depends)
{
dpctl::tensor::type_utils::validate_type_for_device(q);
@@ -1115,12 +1111,12 @@ sycl::event copy_for_roll_ndshift_strided_impl(
// USM array of size 4 * nd
// [ common_shape; src_strides; dst_strides; shifts ]
- const py::ssize_t *shape_ptr = packed_shapes_and_strides_and_shifts;
- const py::ssize_t *src_strides_ptr =
+ const ssize_t *shape_ptr = packed_shapes_and_strides_and_shifts;
+ const ssize_t *src_strides_ptr =
packed_shapes_and_strides_and_shifts + nd;
- const py::ssize_t *dst_strides_ptr =
+ const ssize_t *dst_strides_ptr =
packed_shapes_and_strides_and_shifts + 2 * nd;
- const py::ssize_t *shifts_ptr =
+ const ssize_t *shifts_ptr =
packed_shapes_and_strides_and_shifts + 3 * nd;
RolledNDIndexer src_indexer{nd, shape_ptr, src_strides_ptr, shifts_ptr,
diff --git a/dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp b/dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp
new file mode 100644
index 0000000000..c88d838abf
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp
@@ -0,0 +1,37 @@
+//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+
+#include
+
+namespace dpctl
+{
+namespace tensor
+{
+
+typedef std::ptrdiff_t ssize_t;
+
+}
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
index 9e13648163..591f9cb24f 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
@@ -34,10 +34,10 @@
#include "kernels/elementwise_functions/common.hpp"
#include "sycl_complex.hpp"
+#include "kernels/dpctl_tensor_types.hpp"
#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
#include "utils/type_utils.hpp"
-#include
namespace dpctl
{
@@ -48,7 +48,6 @@ namespace kernels
namespace abs
{
-namespace py = pybind11;
namespace td_ns = dpctl::tensor::type_dispatch;
using dpctl::tensor::type_utils::is_complex;
@@ -214,11 +213,11 @@ template
sycl::event abs_strided_impl(sycl::queue &exec_q,
size_t nelems,
int nd,
- const py::ssize_t *shape_and_strides,
+ const ssize_t *shape_and_strides,
const char *arg_p,
- py::ssize_t arg_offset,
+ ssize_t arg_offset,
char *res_p,
- py::ssize_t res_offset,
+ ssize_t res_offset,
const std::vector &depends,
const std::vector &additional_depends)
{
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
index cf6875c341..236999404e 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
@@ -32,10 +32,10 @@
#include "kernels/elementwise_functions/common.hpp"
#include "sycl_complex.hpp"
+#include "kernels/dpctl_tensor_types.hpp"
#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
#include "utils/type_utils.hpp"
-#include
namespace dpctl
{
@@ -46,7 +46,6 @@ namespace kernels
namespace acos
{
-namespace py = pybind11;
namespace td_ns = dpctl::tensor::type_dispatch;
using dpctl::tensor::type_utils::is_complex;
@@ -219,11 +218,11 @@ sycl::event
acos_strided_impl(sycl::queue &exec_q,
size_t nelems,
int nd,
- const py::ssize_t *shape_and_strides,
+ const ssize_t *shape_and_strides,
const char *arg_p,
- py::ssize_t arg_offset,
+ ssize_t arg_offset,
char *res_p,
- py::ssize_t res_offset,
+ ssize_t res_offset,
const std::vector &depends,
const std::vector &additional_depends)
{
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
index a6ffa805d7..76d28ae92b 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
@@ -32,10 +32,10 @@
#include "kernels/elementwise_functions/common.hpp"
#include "sycl_complex.hpp"
+#include "kernels/dpctl_tensor_types.hpp"
#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
#include "utils/type_utils.hpp"
-#include
namespace dpctl
{
@@ -46,7 +46,6 @@ namespace kernels
namespace acosh
{
-namespace py = pybind11;
namespace td_ns = dpctl::tensor::type_dispatch;
using dpctl::tensor::type_utils::is_complex;
@@ -241,11 +240,11 @@ sycl::event
acosh_strided_impl(sycl::queue &exec_q,
size_t nelems,
int nd,
- const py::ssize_t *shape_and_strides,
+ const ssize_t *shape_and_strides,
const char *arg_p,
- py::ssize_t arg_offset,
+ ssize_t arg_offset,
char *res_p,
- py::ssize_t res_offset,
+ ssize_t res_offset,
const std::vector &depends,
const std::vector &additional_depends)
{
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
index aae69d98ea..77bb3c4d67 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
@@ -31,12 +31,12 @@
#include "sycl_complex.hpp"
#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
#include "utils/type_utils.hpp"
+#include "kernels/dpctl_tensor_types.hpp"
#include "kernels/elementwise_functions/common.hpp"
#include "kernels/elementwise_functions/common_inplace.hpp"
-#include
namespace dpctl
{
@@ -47,7 +47,6 @@ namespace kernels
namespace add
{
-namespace py = pybind11;
namespace td_ns = dpctl::tensor::type_dispatch;
namespace tu_ns = dpctl::tensor::type_utils;
@@ -218,11 +217,11 @@ template
sycl::event add_contig_impl(sycl::queue &exec_q,
size_t nelems,
const char *arg1_p,
- py::ssize_t arg1_offset,
+ ssize_t arg1_offset,
const char *arg2_p,
- py::ssize_t arg2_offset,
+ ssize_t arg2_offset,
char *res_p,
- py::ssize_t res_offset,
+ ssize_t res_offset,
const std::vector &depends = {})
{
return elementwise_common::binary_contig_impl<
@@ -264,13 +263,13 @@ template
sycl::event add_strided_impl(sycl::queue &exec_q,
size_t nelems,
int nd,
- const py::ssize_t *shape_and_strides,
+ const ssize_t *shape_and_strides,
const char *arg1_p,
- py::ssize_t arg1_offset,
+ ssize_t arg1_offset,
const char *arg2_p,
- py::ssize_t arg2_offset,
+ ssize_t arg2_offset,
char *res_p,
- py::ssize_t res_offset,
+ ssize_t res_offset,
const std::vector &depends,
const std::vector &additional_depends)
{
@@ -314,12 +313,12 @@ sycl::event add_contig_matrix_contig_row_broadcast_impl(
size_t n0,
size_t n1,
const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
- py::ssize_t mat_offset,
+ ssize_t mat_offset,
const char *vec_p, // typeless pointer to (n1,) contiguous row
- py::ssize_t vec_offset,
+ ssize_t vec_offset,
char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix,
// res[i,j] = mat[i,j] + vec[j]
- py::ssize_t res_offset,
+ ssize_t res_offset,
const std::vector &depends = {})
{
return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl<
@@ -363,12 +362,12 @@ sycl::event add_contig_row_contig_matrix_broadcast_impl(
size_t n0,
size_t n1,
const char *vec_p, // typeless pointer to (n1,) contiguous row
- py::ssize_t vec_offset,
+ ssize_t vec_offset,
const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
- py::ssize_t mat_offset,
+ ssize_t mat_offset,
char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix,
// res[i,j] = mat[i,j] + vec[j]
- py::ssize_t res_offset,
+ ssize_t res_offset,
const std::vector &depends = {})
{
return add_contig_matrix_contig_row_broadcast_impl(
@@ -456,9 +455,9 @@ sycl::event
add_inplace_contig_impl(sycl::queue &exec_q,
size_t nelems,
const char *arg_p,
- py::ssize_t arg_offset,
+ ssize_t arg_offset,
char *res_p,
- py::ssize_t res_offset,
+ ssize_t res_offset,
const std::vector &depends = {})
{
return elementwise_common::binary_inplace_contig_impl<
@@ -490,11 +489,11 @@ sycl::event
add_inplace_strided_impl(sycl::queue &exec_q,
size_t nelems,
int nd,
- const py::ssize_t *shape_and_strides,
+ const ssize_t *shape_and_strides,
const char *arg_p,
- py::ssize_t arg_offset,
+ ssize_t arg_offset,
char *res_p,
- py::ssize_t res_offset,
+ ssize_t res_offset,
const std::vector &depends,
const std::vector &additional_depends)
{
@@ -538,9 +537,9 @@ sycl::event add_inplace_row_matrix_broadcast_impl(
size_t n0,
size_t n1,
const char *vec_p, // typeless pointer to (n1,) contiguous row
- py::ssize_t vec_offset,
+ ssize_t vec_offset,
char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
- py::ssize_t mat_offset,
+ ssize_t mat_offset,
const std::vector &depends = {})
{
return elementwise_common::binary_inplace_row_matrix_broadcast_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
index 2759974b93..75512d80b8 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
@@ -33,10 +33,10 @@
#include "kernels/elementwise_functions/common.hpp"
#include "sycl_complex.hpp"
+#include "kernels/dpctl_tensor_types.hpp"
#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
#include "utils/type_utils.hpp"
-#include
namespace dpctl
{
@@ -47,7 +47,6 @@ namespace kernels
namespace angle
{
-namespace py = pybind11;
namespace td_ns = dpctl::tensor::type_dispatch;
using dpctl::tensor::type_utils::is_complex;
@@ -151,11 +150,11 @@ sycl::event
angle_strided_impl(sycl::queue &exec_q,
size_t nelems,
int nd,
- const py::ssize_t *shape_and_strides,
+ const ssize_t *shape_and_strides,
const char *arg_p,
- py::ssize_t arg_offset,
+ ssize_t arg_offset,
char *res_p,
- py::ssize_t res_offset,
+ ssize_t res_offset,
const std::vector &depends,
const std::vector &additional_depends)
{
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
index dc5f2c2b18..0e27841d1e 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
@@ -32,10 +32,10 @@
#include "kernels/elementwise_functions/common.hpp"
#include "sycl_complex.hpp"
+#include "kernels/dpctl_tensor_types.hpp"
#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
#include "utils/type_utils.hpp"
-#include
namespace dpctl
{
@@ -46,7 +46,6 @@ namespace kernels
namespace asin
{
-namespace py = pybind11;
namespace td_ns = dpctl::tensor::type_dispatch;
using dpctl::tensor::type_utils::is_complex;
@@ -243,11 +242,11 @@ sycl::event
asin_strided_impl(sycl::queue &exec_q,
size_t nelems,
int nd,
- const py::ssize_t *shape_and_strides,
+ const ssize_t *shape_and_strides,
const char *arg_p,
- py::ssize_t arg_offset,
+ ssize_t arg_offset,
char *res_p,
- py::ssize_t res_offset,
+ ssize_t res_offset,
const std::vector &depends,
const std::vector &additional_depends)
{
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
index 6d712165a9..b774de27da 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
@@ -32,10 +32,10 @@
#include "kernels/elementwise_functions/common.hpp"
#include "sycl_complex.hpp"
+#include "kernels/dpctl_tensor_types.hpp"
#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
#include "utils/type_utils.hpp"
-#include
namespace dpctl
{
@@ -46,7 +46,6 @@ namespace kernels
namespace asinh
{
-namespace py = pybind11;
namespace td_ns = dpctl::tensor::type_dispatch;
using dpctl::tensor::type_utils::is_complex;
@@ -217,11 +216,11 @@ sycl::event
asinh_strided_impl(sycl::queue &exec_q,
size_t nelems,
int nd,
- const py::ssize_t *shape_and_strides,
+ const ssize_t *shape_and_strides,
const char *arg_p,
- py::ssize_t arg_offset,
+ ssize_t arg_offset,
char *res_p,
- py::ssize_t res_offset,
+ ssize_t res_offset,
const std::vector &depends,
const std::vector &additional_depends)
{
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
index 93c9a6696d..c71498c196 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
@@ -33,10 +33,10 @@
#include "kernels/elementwise_functions/common.hpp"
#include "sycl_complex.hpp"
+#include "kernels/dpctl_tensor_types.hpp"
#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
#include "utils/type_utils.hpp"
-#include <pybind11/pybind11.h>
namespace dpctl
{
@@ -47,7 +47,6 @@ namespace kernels
namespace atan
{
-namespace py = pybind11;
namespace td_ns = dpctl::tensor::type_dispatch;
using dpctl::tensor::type_utils::is_complex;
@@ -219,11 +218,11 @@ sycl::event
atan_strided_impl(sycl::queue &exec_q,
size_t nelems,
int nd,
- const py::ssize_t *shape_and_strides,
+ const ssize_t *shape_and_strides,
const char *arg_p,
- py::ssize_t arg_offset,
+ ssize_t arg_offset,
char *res_p,
- py::ssize_t res_offset,
+ ssize_t res_offset,
const std::vector<sycl::event> &depends,
const std::vector<sycl::event> &additional_depends)
{
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
index ac8c0483c4..012eaa7ce4 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
@@ -30,11 +30,11 @@
#include
#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
#include "utils/type_utils.hpp"
+#include "kernels/dpctl_tensor_types.hpp"
#include "kernels/elementwise_functions/common.hpp"
-#include <pybind11/pybind11.h>
namespace dpctl
{
@@ -45,7 +45,6 @@ namespace kernels
namespace atan2
{
-namespace py = pybind11;
namespace td_ns = dpctl::tensor::type_dispatch;
namespace tu_ns = dpctl::tensor::type_utils;
@@ -114,11 +113,11 @@ template
sycl::event atan2_contig_impl(sycl::queue &exec_q,
size_t nelems,
const char *arg1_p,
- py::ssize_t arg1_offset,
+ ssize_t arg1_offset,
const char *arg2_p,
- py::ssize_t arg2_offset,
+ ssize_t arg2_offset,
char *res_p,
- py::ssize_t res_offset,
+ ssize_t res_offset,
const std::vector<sycl::event> &depends = {})
{
return elementwise_common::binary_contig_impl<
@@ -163,13 +162,13 @@ sycl::event
atan2_strided_impl(sycl::queue &exec_q,
size_t nelems,
int nd,
- const py::ssize_t *shape_and_strides,
+ const ssize_t *shape_and_strides,
const char *arg1_p,
- py::ssize_t arg1_offset,
+ ssize_t arg1_offset,
const char *arg2_p,
- py::ssize_t arg2_offset,
+ ssize_t arg2_offset,
char *res_p,
- py::ssize_t res_offset,
+ ssize_t res_offset,
const std::vector<sycl::event> &depends,
const std::vector<sycl::event> &additional_depends)
{
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
index 4a26cd92b4..d227047c51 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
@@ -33,10 +33,10 @@
#include "kernels/elementwise_functions/common.hpp"
#include "sycl_complex.hpp"
+#include "kernels/dpctl_tensor_types.hpp"
#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
#include "utils/type_utils.hpp"
-#include <pybind11/pybind11.h>
namespace dpctl
{
@@ -47,7 +47,6 @@ namespace kernels
namespace atanh
{
-namespace py = pybind11;
namespace td_ns = dpctl::tensor::type_dispatch;
using dpctl::tensor::type_utils::is_complex;
@@ -212,11 +211,11 @@ sycl::event
atanh_strided_impl(sycl::queue &exec_q,
size_t nelems,
int nd,
- const py::ssize_t *shape_and_strides,
+ const ssize_t *shape_and_strides,
const char *arg_p,
- py::ssize_t arg_offset,
+ ssize_t arg_offset,
char *res_p,
- py::ssize_t res_offset,
+ ssize_t res_offset,
const std::vector<sycl::event> &depends,
const std::vector<sycl::event> &additional_depends)
{
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
index e4da56cd9e..2e3647ec9c 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
@@ -29,12 +29,12 @@
#include