Skip to content

Commit

Permalink
Add support for xsimd::transpose
Browse files Browse the repository at this point in the history
Fix #107 (6 years old!)
  • Loading branch information
serge-sans-paille committed Oct 5, 2024
1 parent 3747986 commit 13b4019
Show file tree
Hide file tree
Showing 5 changed files with 88 additions and 0 deletions.
2 changes: 2 additions & 0 deletions docs/source/api/data_transfer.rst
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ In place:

Between batches:

+---------------------------------------+----------------------------------------------------+
| :cpp:func:`transpose` | transpose a matrix stored as an array of batches |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`zip_lo` | interleave low halves of two batches |
+---------------------------------------+----------------------------------------------------+
Expand Down
21 changes: 21 additions & 0 deletions include/xsimd/arch/generic/xsimd_generic_memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -639,6 +639,27 @@ namespace xsimd
hi.store_unaligned(buffer + real_batch::size);
}

// transpose
// Generic in-place transpose of the square matrix whose rows are the batches
// in [matrix_begin, matrix_end). The range must hold exactly
// batch<T, A>::size batches (checked by assert only).
//
// The rows are spilled to an aligned scalar buffer, the buffer is transposed
// element by element, and the rows are reloaded from it.
template <class A, class T>
XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<generic>) noexcept
{
    constexpr std::size_t n = batch<T, A>::size;
    assert((static_cast<std::size_t>(matrix_end - matrix_begin) == n) && "correctly sized matrix");
    // matrix_end is only read by the assert above; keep builds with NDEBUG
    // free of unused-parameter warnings.
    (void)matrix_end;

    alignas(A::alignment()) T scratch_buffer[n * n];
    for (std::size_t i = 0; i < n; ++i)
    {
        matrix_begin[i].store_aligned(&scratch_buffer[i * n]);
    }
    // FIXME: this is super naive we can probably do better.
    // Swap every element below the diagonal with its mirror above it.
    for (std::size_t i = 0; i < n; ++i)
    {
        for (std::size_t j = 0; j < i; ++j)
        {
            std::swap(scratch_buffer[i * n + j],
                      scratch_buffer[j * n + i]);
        }
    }
    for (std::size_t i = 0; i < n; ++i)
    {
        matrix_begin[i] = batch<T, A>::load_aligned(&scratch_buffer[i * n]);
    }
}

}

}
Expand Down
26 changes: 26 additions & 0 deletions include/xsimd/arch/xsimd_sse2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1640,6 +1640,32 @@ namespace xsimd
return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, sse2 {}));
}

// transpose
// In-place transpose of the 4x4 float matrix whose rows are
// matrix_begin[0..3]. The range [matrix_begin, matrix_end) must hold exactly
// batch<float, A>::size batches (checked by assert only).
template <class A>
XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<sse2>) noexcept
{
    // This overload has no template parameter T: the element type is float.
    assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
    // matrix_end is only read by the assert above.
    (void)matrix_end;
    // _MM_TRANSPOSE4_PS is a macro that reads and then assigns each of its
    // arguments, so hand it plain named locals and copy the rows back.
    batch<float, A> r0 = matrix_begin[0];
    batch<float, A> r1 = matrix_begin[1];
    batch<float, A> r2 = matrix_begin[2];
    batch<float, A> r3 = matrix_begin[3];
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
    matrix_begin[0] = r0;
    matrix_begin[1] = r1;
    matrix_begin[2] = r2;
    matrix_begin[3] = r3;
}
// In-place transpose of the 4x4 uint32_t matrix whose rows are
// matrix_begin[0..3]. The range [matrix_begin, matrix_end) must hold exactly
// batch<uint32_t, A>::size batches (checked by assert only).
template <class A>
XSIMD_INLINE void transpose(batch<uint32_t, A>* matrix_begin, batch<uint32_t, A>* matrix_end, requires_arch<sse2>) noexcept
{
    // This overload has no template parameter T: the element type is uint32_t.
    assert((matrix_end - matrix_begin == batch<uint32_t, A>::size) && "correctly sized matrix");
    // matrix_end is only read by the assert above.
    (void)matrix_end;
    // _MM_TRANSPOSE4_PS assigns to its arguments. Passing the temporaries
    // returned by bitwise_cast would discard the result, so reinterpret the
    // rows into named float batches, transpose those, and cast them back.
    batch<float, A> r0 = bitwise_cast<float>(matrix_begin[0]);
    batch<float, A> r1 = bitwise_cast<float>(matrix_begin[1]);
    batch<float, A> r2 = bitwise_cast<float>(matrix_begin[2]);
    batch<float, A> r3 = bitwise_cast<float>(matrix_begin[3]);
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
    matrix_begin[0] = bitwise_cast<uint32_t>(r0);
    matrix_begin[1] = bitwise_cast<uint32_t>(r1);
    matrix_begin[2] = bitwise_cast<uint32_t>(r2);
    matrix_begin[3] = bitwise_cast<uint32_t>(r3);
}
// In-place transpose of the 4x4 int32_t matrix whose rows are
// matrix_begin[0..3]. The range [matrix_begin, matrix_end) must hold exactly
// batch<int32_t, A>::size batches (checked by assert only).
template <class A>
XSIMD_INLINE void transpose(batch<int32_t, A>* matrix_begin, batch<int32_t, A>* matrix_end, requires_arch<sse2>) noexcept
{
    // This overload has no template parameter T: the element type is int32_t.
    assert((matrix_end - matrix_begin == batch<int32_t, A>::size) && "correctly sized matrix");
    // matrix_end is only read by the assert above.
    (void)matrix_end;
    // _MM_TRANSPOSE4_PS assigns to its arguments. Passing the temporaries
    // returned by bitwise_cast would discard the result, so reinterpret the
    // rows into named float batches, transpose those, and cast them back.
    batch<float, A> r0 = bitwise_cast<float>(matrix_begin[0]);
    batch<float, A> r1 = bitwise_cast<float>(matrix_begin[1]);
    batch<float, A> r2 = bitwise_cast<float>(matrix_begin[2]);
    batch<float, A> r3 = bitwise_cast<float>(matrix_begin[3]);
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
    matrix_begin[0] = bitwise_cast<int32_t>(r0);
    matrix_begin[1] = bitwise_cast<int32_t>(r1);
    matrix_begin[2] = bitwise_cast<int32_t>(r2);
    matrix_begin[3] = bitwise_cast<int32_t>(r3);
}

// zip_hi
template <class A>
XSIMD_INLINE batch<float, A> zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
Expand Down
17 changes: 17 additions & 0 deletions include/xsimd/types/xsimd_api.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2516,6 +2516,23 @@ namespace xsimd
return batch_cast<as_integer_t<T>>(x);
}

/**
 * @ingroup batch_data_transfer
 *
 * Transposes, in place, the square matrix whose rows are the batches in the
 * range [\c matrix_begin, \c matrix_end). The range must contain exactly
 * \c batch<T, A>::size batches; this is only checked through an assertion.
 * @param matrix_begin pointer to the first row of the matrix to transpose
 * @param matrix_end pointer to one past the last row of the matrix to transpose
 *
 */
template <class T, class A>
XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end) noexcept
{
    assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
    detail::static_check_supported_config<T, A>();
    // Forward to the kernel overload selected by the architecture tag A.
    kernel::transpose(matrix_begin, matrix_end, A {});
}

/**
* @ingroup batch_rounding
*
Expand Down
22 changes: 22 additions & 0 deletions test/test_shuffle.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -605,6 +605,24 @@ struct shuffle_test
}
}

void transpose()
{
B b_lhs = B::load_unaligned(lhs.data());
std::array<B, size> b_matrix;
for(size_t i = 0; i < size; ++i)
b_matrix[i] = b_lhs;
std::array<value_type, size * size> ref_matrix;
for(size_t i = 0; i < size; ++i)
for(size_t j = 0; j < size; ++j)
ref_matrix[i * size + j] = lhs[i];

INFO("transpose");
xsimd::transpose(b_matrix.begin(), b_matrix.end());
for(size_t i = 0; i < size; ++i) {
CHECK_BATCH_EQ(b_matrix[i], B::load_unaligned(&ref_matrix[i * size]));
}
}

void select()
{
B b_lhs = B::load_unaligned(lhs.data());
Expand Down Expand Up @@ -694,6 +712,10 @@ TEST_CASE_TEMPLATE("[shuffle]", B, BATCH_FLOAT_TYPES, xsimd::batch<uint32_t>, xs
{
Test.swizzle();
}
SUBCASE("transpose")
{
Test.transpose();
}
SUBCASE("zip")
{
Test.zip();
Expand Down

0 comments on commit 13b4019

Please sign in to comment.