From 13b4019cad3fa8fbd409e5f45aa7340882e4652e Mon Sep 17 00:00:00 2001
From: serge-sans-paille
Date: Sat, 5 Oct 2024 19:41:24 +0200
Subject: [PATCH] Add support for xsimd::transpose

Fix #107 (6 years old!)
---
 docs/source/api/data_transfer.rst          |  2 ++
 .../arch/generic/xsimd_generic_memory.hpp  | 21 +++++++++++++++
 include/xsimd/arch/xsimd_sse2.hpp          | 26 +++++++++++++++++++
 include/xsimd/types/xsimd_api.hpp          | 17 ++++++++++++
 test/test_shuffle.cpp                      | 22 ++++++++++++++++
 5 files changed, 88 insertions(+)

diff --git a/docs/source/api/data_transfer.rst b/docs/source/api/data_transfer.rst
index d8102b1a4..5691073ec 100644
--- a/docs/source/api/data_transfer.rst
+++ b/docs/source/api/data_transfer.rst
@@ -61,6 +61,8 @@ In place:
 
 Between batches:
 
++---------------------------------------+----------------------------------------------------+
+| :cpp:func:`transpose`                 | transpose a matrix as an array of batches          |
 +---------------------------------------+----------------------------------------------------+
 | :cpp:func:`zip_lo`                    | interleave low halves of two batches               |
 +---------------------------------------+----------------------------------------------------+
diff --git a/include/xsimd/arch/generic/xsimd_generic_memory.hpp b/include/xsimd/arch/generic/xsimd_generic_memory.hpp
index 18c9c80ad..dbeca0c62 100644
--- a/include/xsimd/arch/generic/xsimd_generic_memory.hpp
+++ b/include/xsimd/arch/generic/xsimd_generic_memory.hpp
@@ -639,6 +639,27 @@ namespace xsimd
             hi.store_unaligned(buffer + real_batch::size);
         }
 
+        // transpose
+        template <class A, class T>
+        XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<generic>) noexcept
+        {
+            assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
+            alignas(A::alignment()) T scratch_buffer[batch<T, A>::size * batch<T, A>::size];
+            for (size_t i = 0; i < batch<T, A>::size; ++i) {
+                matrix_begin[i].store_aligned(&scratch_buffer[i * batch<T, A>::size]);
+            }
+            // FIXME: this is super naive; we can probably do better.
+            for (size_t i = 0; i < batch<T, A>::size; ++i) {
+                for (size_t j = 0; j < i; ++j) {
+                    std::swap(scratch_buffer[i * batch<T, A>::size + j],
+                              scratch_buffer[j * batch<T, A>::size + i]);
+                }
+            }
+            for (size_t i = 0; i < batch<T, A>::size; ++i) {
+                matrix_begin[i] = batch<T, A>::load_aligned(&scratch_buffer[i * batch<T, A>::size]);
+            }
+        }
+
     }
 
 }
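
Side note for reviewers, not part of the patch: the generic kernel above is
the textbook in-place transpose of a square matrix, staged through an aligned
scratch buffer. A scalar equivalent, with n standing in for batch<T, A>::size
and naive_transpose a hypothetical name, is:

    #include <cstddef>
    #include <utility>

    // Swap element (i, j) with (j, i) for all j < i; iterating only over
    // the lower triangle visits each off-diagonal pair exactly once, and
    // the diagonal stays in place.
    template <class T, std::size_t n>
    void naive_transpose(T (&m)[n][n])
    {
        for (std::size_t i = 0; i < n; ++i)
            for (std::size_t j = 0; j < i; ++j)
                std::swap(m[i][j], m[j][i]);
    }

The SSE2 specializations below replace this with _MM_TRANSPOSE4_PS for the
4x4 32-bit cases.
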
diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp
index 67b74f548..93f4ff9a8 100644
--- a/include/xsimd/arch/xsimd_sse2.hpp
+++ b/include/xsimd/arch/xsimd_sse2.hpp
@@ -1640,6 +1640,32 @@ namespace xsimd
             return bitwise_cast<T>(swizzle(bitwise_cast<uint16_t>(self), mask, sse2 {}));
         }
 
+        // transpose
+        template <class A>
+        XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<sse2>) noexcept
+        {
+            assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
+            _MM_TRANSPOSE4_PS(matrix_begin[0], matrix_begin[1], matrix_begin[2], matrix_begin[3]);
+        }
+
+        template <class A>
+        XSIMD_INLINE void transpose(batch<uint32_t, A>* matrix_begin, batch<uint32_t, A>* matrix_end, requires_arch<sse2>) noexcept
+        {
+            assert((matrix_end - matrix_begin == batch<uint32_t, A>::size) && "correctly sized matrix");
+            transpose(reinterpret_cast<batch<float, A>*>(matrix_begin),
+                      reinterpret_cast<batch<float, A>*>(matrix_end),
+                      A {});
+        }
+
+        template <class A>
+        XSIMD_INLINE void transpose(batch<int32_t, A>* matrix_begin, batch<int32_t, A>* matrix_end, requires_arch<sse2>) noexcept
+        {
+            assert((matrix_end - matrix_begin == batch<int32_t, A>::size) && "correctly sized matrix");
+            transpose(reinterpret_cast<batch<float, A>*>(matrix_begin),
+                      reinterpret_cast<batch<float, A>*>(matrix_end),
+                      A {});
+        }
+
         // zip_hi
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp
index 6e9ec094c..5cd87c205 100644
--- a/include/xsimd/types/xsimd_api.hpp
+++ b/include/xsimd/types/xsimd_api.hpp
@@ -2516,6 +2516,23 @@ namespace xsimd
         return batch_cast<as_integer_t<T>>(x);
     }
 
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Transposes in place the matrix whose lines are the batches passed as
+     * arguments.
+     * @param matrix_begin pointer to the first line of the matrix to transpose
+     * @param matrix_end pointer to one element after the last line of the matrix to transpose
+     *
+     */
+    template <class T, class A>
+    XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end) noexcept
+    {
+        assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
+        detail::static_check_supported_config<T, A>();
+        return kernel::transpose(matrix_begin, matrix_end, A {});
+    }
+
     /**
      * @ingroup batch_rounding
      *
diff --git a/test/test_shuffle.cpp b/test/test_shuffle.cpp
index 3ea9dca4d..5e36e3733 100644
--- a/test/test_shuffle.cpp
+++ b/test/test_shuffle.cpp
@@ -605,6 +605,24 @@ struct shuffle_test
         }
     }
 
+    void transpose()
+    {
+        B b_lhs = B::load_unaligned(lhs.data());
+        std::array<B, size> b_matrix;
+        for (size_t i = 0; i < size; ++i)
+            b_matrix[i] = b_lhs;
+        std::array<typename B::value_type, size * size> ref_matrix;
+        for (size_t i = 0; i < size; ++i)
+            for (size_t j = 0; j < size; ++j)
+                ref_matrix[i * size + j] = lhs[i];
+
+        INFO("transpose");
+        xsimd::transpose(b_matrix.begin(), b_matrix.end());
+        for (size_t i = 0; i < size; ++i) {
+            CHECK_BATCH_EQ(b_matrix[i], B::load_unaligned(&ref_matrix[i * size]));
+        }
+    }
+
     void select()
     {
         B b_lhs = B::load_unaligned(lhs.data());
@@ -694,6 +712,10 @@ TEST_CASE_TEMPLATE("[shuffle]", B, BATCH_FLOAT_TYPES, xsimd::batch<uint32_t>, xs
     {
         Test.swizzle();
     }
+    SUBCASE("transpose")
+    {
+        Test.transpose();
+    }
     SUBCASE("zip")
    {
        Test.zip();
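
For reviewers, not part of the patch: a minimal usage sketch of the new entry
point. It assumes the default architecture for xsimd::batch<float>; main, row,
out, and n are illustrative names only, and any batch width works, since
everything but the 4x4 32-bit SSE2 cases falls back to the generic kernel.

    #include <xsimd/xsimd.hpp>
    #include <cstddef>
    #include <cstdio>

    int main()
    {
        using batch = xsimd::batch<float>;     // default architecture
        constexpr std::size_t n = batch::size; // the matrix must be n x n

        // One batch per matrix row; element (i, j) starts out as i * n + j.
        batch matrix[n];
        for (std::size_t i = 0; i < n; ++i)
        {
            float row[n];
            for (std::size_t j = 0; j < n; ++j)
                row[j] = static_cast<float>(i * n + j);
            matrix[i] = batch::load_unaligned(row);
        }

        // In-place transpose: afterwards element (i, j) holds j * n + i.
        xsimd::transpose(matrix, matrix + n);

        for (std::size_t i = 0; i < n; ++i)
        {
            float out[n];
            matrix[i].store_unaligned(out);
            for (std::size_t j = 0; j < n; ++j)
                std::printf("%5.0f", out[j]);
            std::printf("\n");
        }
    }

On SSE2 with float batches (n == 4) the call resolves to the
_MM_TRANSPOSE4_PS kernel added above; other type/width combinations go
through the generic scratch-buffer kernel.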