diff --git a/docs/source/api/data_transfer.rst b/docs/source/api/data_transfer.rst index d8102b1a4..5691073ec 100644 --- a/docs/source/api/data_transfer.rst +++ b/docs/source/api/data_transfer.rst @@ -61,6 +61,8 @@ In place: Between batches: ++---------------------------------------+----------------------------------------------------+ +| :cpp:func:`transpose` | transpose a matrix as an array of batches | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`zip_lo` | interleave low halves of two batches | +---------------------------------------+----------------------------------------------------+ diff --git a/include/xsimd/arch/generic/xsimd_generic_memory.hpp b/include/xsimd/arch/generic/xsimd_generic_memory.hpp index 18c9c80ad..10c1ffe66 100644 --- a/include/xsimd/arch/generic/xsimd_generic_memory.hpp +++ b/include/xsimd/arch/generic/xsimd_generic_memory.hpp @@ -639,6 +639,32 @@ namespace xsimd hi.store_unaligned(buffer + real_batch::size); } + // transpose + template + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + { + assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); + (void)matrix_end; + alignas(A::alignment()) T scratch_buffer[batch::size * batch::size]; + for (size_t i = 0; i < batch::size; ++i) + { + matrix_begin[i].store_aligned(&scratch_buffer[i * batch::size]); + } + // FIXME: this is super naive we can probably do better. 
+ for (size_t i = 0; i < batch::size; ++i) + { + for (size_t j = 0; j < i; ++j) + { + std::swap(scratch_buffer[i * batch::size + j], + scratch_buffer[j * batch::size + i]); + } + } + for (size_t i = 0; i < batch::size; ++i) + { + matrix_begin[i] = batch::load_aligned(&scratch_buffer[i * batch::size]); + } + } + } } diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp index 67b74f548..40b811394 100644 --- a/include/xsimd/arch/xsimd_sse2.hpp +++ b/include/xsimd/arch/xsimd_sse2.hpp @@ -1640,6 +1640,35 @@ namespace xsimd return bitwise_cast(swizzle(bitwise_cast(self), mask, sse2 {})); } + // transpose + template + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + { + assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); + (void)matrix_end; + _MM_TRANSPOSE4_PS(matrix_begin[0], matrix_begin[1], matrix_begin[2], matrix_begin[3]); + } + template + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + { + assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); + (void)matrix_end; + _MM_TRANSPOSE4_PS(bitwise_cast(matrix_begin[0]), + bitwise_cast(matrix_begin[1]), + bitwise_cast(matrix_begin[2]), + bitwise_cast(matrix_begin[3])); + } + template + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + { + assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); + (void)matrix_end; + _MM_TRANSPOSE4_PS(bitwise_cast(matrix_begin[0]), + bitwise_cast(matrix_begin[1]), + bitwise_cast(matrix_begin[2]), + bitwise_cast(matrix_begin[3])); + } + // zip_hi template XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp index 6e9ec094c..5cd87c205 100644 --- a/include/xsimd/types/xsimd_api.hpp +++ b/include/xsimd/types/xsimd_api.hpp @@ -2516,6 +2516,23 
@@ namespace xsimd return batch_cast>(x); } + /** + * @ingroup batch_data_transfer + * + * Transposes in place the matrix whose line are each of the batch passed as + * argument. + * @param matrix_begin pointer to the first line of the matrix to transpose + * @param matrix_end pointer to one element after the last line of the matrix to transpose + * + */ + template + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end) noexcept + { + assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); + detail::static_check_supported_config(); + return kernel::transpose(matrix_begin, matrix_end, A {}); + } + /** * @ingroup batch_rounding * diff --git a/test/test_shuffle.cpp b/test/test_shuffle.cpp index 3ea9dca4d..a4caa747d 100644 --- a/test/test_shuffle.cpp +++ b/test/test_shuffle.cpp @@ -605,6 +605,25 @@ struct shuffle_test } } + void transpose() + { + B b_lhs = B::load_unaligned(lhs.data()); + std::array b_matrix; + for (size_t i = 0; i < size; ++i) + b_matrix[i] = b_lhs; + std::array ref_matrix; + for (size_t i = 0; i < size; ++i) + for (size_t j = 0; j < size; ++j) + ref_matrix[i * size + j] = lhs[i]; + + INFO("transpose"); + xsimd::transpose(b_matrix.begin(), b_matrix.end()); + for (size_t i = 0; i < size; ++i) + { + CHECK_BATCH_EQ(b_matrix[i], B::load_unaligned(&ref_matrix[i * size])); + } + } + void select() { B b_lhs = B::load_unaligned(lhs.data()); @@ -694,6 +713,10 @@ TEST_CASE_TEMPLATE("[shuffle]", B, BATCH_FLOAT_TYPES, xsimd::batch, xs { Test.swizzle(); } + SUBCASE("transpose") + { + Test.transpose(); + } SUBCASE("zip") { Test.zip();