Add: u8 kernels & FMA
ashvardanian committed Oct 17, 2024
1 parent 71c68fe commit 252fba7
Showing 9 changed files with 603 additions and 57 deletions.
19 changes: 10 additions & 9 deletions README.md
@@ -91,15 +91,16 @@ You can learn more about the technical implementation details in the following b
## Benchmarks

For reference, we use 1536-dimensional vectors, like the embeddings produced by the OpenAI Ada API.
Comparing the serial code throughput produced by GCC 12 to hand-optimized kernels in SimSIMD, we see the following single-core improvements:
Comparing the serial code throughput produced by GCC 12 to hand-optimized kernels in SimSIMD, we see the following single-core improvements for the two most common vector-vector similarity metrics: cosine similarity and Euclidean distance.

| Type | Apple M2 Pro | AMD Genoa | AWS Graviton 4 |
| :----- | ---------------------------------: | ---------------------------------: | ---------------------------------: |
| `f64` | 18.5 → 28.8 GB/s <br/> + 56 % | 21.9 → 41.4 GB/s <br/> + 89 % | 20.7 → 41.3 GB/s <br/> + 99 % |
| `f32` | 9.2 → 29.6 GB/s <br/> + 221 % | 10.9 → 95.8 GB/s <br/> + 779 % | 4.9 → 41.9 GB/s <br/> + 755 % |
| `f16` | 4.6 → 14.6 GB/s <br/> + 217 % | 3.1 → 108.4 GB/s <br/> + 3,397 % | 5.4 → 39.3 GB/s <br/> + 627 % |
| `bf16` | 4.6 → 26.3 GB/s <br/> + 472 % | 0.8 → 59.5 GB/s <br/> +7,437 % | 2.5 → 29.9 GB/s <br/> + 1,096 % |
| `i8` | 25.8 → 47.1 GB/s <br/> + 83 % | 33.1 → 65.3 GB/s <br/> + 97 % | 35.2 → 43.5 GB/s <br/> + 24 % |
| Type | Apple M2 Pro | Intel Sapphire Rapids | AWS Graviton 4 |
| :----- | ----------------------------: | -------------------------------: | ------------------------------: |
| `f64` | 18.5 → 28.8 GB/s <br/> + 56 % | 21.9 → 41.4 GB/s <br/> + 89 % | 20.7 → 41.3 GB/s <br/> + 99 % |
| `f32` | 9.2 → 29.6 GB/s <br/> + 221 % | 10.9 → 95.8 GB/s <br/> + 779 % | 4.9 → 41.9 GB/s <br/> + 755 % |
| `f16` | 4.6 → 14.6 GB/s <br/> + 217 % | 3.1 → 108.4 GB/s <br/> + 3,397 % | 5.4 → 39.3 GB/s <br/> + 627 % |
| `bf16` | 4.6 → 26.3 GB/s <br/> + 472 % | 0.8 → 59.5 GB/s <br/> +7,437 % | 2.5 → 29.9 GB/s <br/> + 1,096 % |
| `i8` | 25.8 → 47.1 GB/s <br/> + 83 % | 33.1 → 65.3 GB/s <br/> + 97 % | 35.2 → 43.5 GB/s <br/> + 24 % |
| `u8` | | 32.5 → 66.5 GB/s <br/> + 105 % | |
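
For reference, the two metrics above follow their textbook definitions:

$$\cos(a, b) = \frac{a \cdot b}{\|a\| \, \|b\|}, \qquad d_{L2}(a, b) = \sqrt{\sum_i (a_i - b_i)^2}$$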

Similar speedups are often observed even when compared to the BLAS and LAPACK implementations underlying most numerical computing libraries, including NumPy and SciPy in Python.
Broader benchmarking results:
@@ -112,7 +113,7 @@

The package is intended to replace the usage of `numpy.inner`, `numpy.dot`, and `scipy.spatial.distance`.
Aside from drastic performance improvements, SimSIMD significantly improves accuracy in mixed precision setups.
NumPy and SciPy, processing `i8` or `f16` vectors, will use the same types for accumulators, while SimSIMD can combine `i8` enumeration, `i16` multiplication, and `i32` accumulation to avoid overflows entirely.
NumPy and SciPy, processing `i8`, `u8`, or `f16` vectors, will use the same types for accumulators, while SimSIMD can combine `i8` inputs, `i16` multiplication, and `i32` accumulation to avoid overflows entirely.
The same applies to processing `f16` and `bf16` values with `f32` precision.
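
For illustration, a minimal scalar sketch of that widening scheme (not the library's actual vectorized kernel):

```c
#include <stddef.h>
#include <stdint.h>

// Widen 8-bit inputs before multiplying: the largest possible product,
// (-128) * (-128) = 16384, fits comfortably in 16 bits. In C, the `int16_t`
// operands promote to `int` before the multiplication, mirroring the
// i16-multiply / i32-accumulate scheme, so no intermediate step overflows
// for vectors of practical length.
int32_t dot_i8_widened(int8_t const* a, int8_t const* b, size_t n) {
    int32_t sum = 0;
    for (size_t i = 0; i != n; ++i) sum += (int16_t)a[i] * (int16_t)b[i];
    return sum;
}
```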

### Installation
17 changes: 17 additions & 0 deletions c/lib.c
@@ -108,6 +108,8 @@ extern "C" {
}

// Dot products
SIMSIMD_DECLARATION_DENSE(dot, i8, i8)
SIMSIMD_DECLARATION_DENSE(dot, u8, u8)
SIMSIMD_DECLARATION_DENSE(dot, f16, f16)
SIMSIMD_DECLARATION_DENSE(dot, bf16, bf16)
SIMSIMD_DECLARATION_DENSE(dot, f32, f32)
@@ -123,16 +125,19 @@ SIMSIMD_DECLARATION_DENSE(vdot, f64c, f64)

// Spatial distances
SIMSIMD_DECLARATION_DENSE(cos, i8, i8)
SIMSIMD_DECLARATION_DENSE(cos, u8, u8)
SIMSIMD_DECLARATION_DENSE(cos, f16, f16)
SIMSIMD_DECLARATION_DENSE(cos, bf16, bf16)
SIMSIMD_DECLARATION_DENSE(cos, f32, f32)
SIMSIMD_DECLARATION_DENSE(cos, f64, f64)
SIMSIMD_DECLARATION_DENSE(l2sq, i8, i8)
SIMSIMD_DECLARATION_DENSE(l2sq, u8, u8)
SIMSIMD_DECLARATION_DENSE(l2sq, f16, f16)
SIMSIMD_DECLARATION_DENSE(l2sq, bf16, bf16)
SIMSIMD_DECLARATION_DENSE(l2sq, f32, f32)
SIMSIMD_DECLARATION_DENSE(l2sq, f64, f64)
SIMSIMD_DECLARATION_DENSE(l2, i8, i8)
SIMSIMD_DECLARATION_DENSE(l2, u8, u8)
SIMSIMD_DECLARATION_DENSE(l2, f16, f16)
SIMSIMD_DECLARATION_DENSE(l2, bf16, bf16)
SIMSIMD_DECLARATION_DENSE(l2, f32, f32)
@@ -199,10 +204,13 @@ SIMSIMD_DYNAMIC simsimd_capability_t simsimd_capabilities(void) {
void* dummy = 0;

// Dense:
simsimd_dot_i8((simsimd_i8_t*)dummy, (simsimd_i8_t*)dummy, 0, dummy_results);
simsimd_dot_u8((simsimd_u8_t*)dummy, (simsimd_u8_t*)dummy, 0, dummy_results);
simsimd_dot_f16((simsimd_f16_t*)dummy, (simsimd_f16_t*)dummy, 0, dummy_results);
simsimd_dot_bf16((simsimd_bf16_t*)dummy, (simsimd_bf16_t*)dummy, 0, dummy_results);
simsimd_dot_f32((simsimd_f32_t*)dummy, (simsimd_f32_t*)dummy, 0, dummy_results);
simsimd_dot_f64((simsimd_f64_t*)dummy, (simsimd_f64_t*)dummy, 0, dummy_results);

simsimd_dot_f16c((simsimd_f16_t*)dummy, (simsimd_f16_t*)dummy, 0, dummy_results);
simsimd_dot_bf16c((simsimd_bf16_t*)dummy, (simsimd_bf16_t*)dummy, 0, dummy_results);
simsimd_dot_f32c((simsimd_f32_t*)dummy, (simsimd_f32_t*)dummy, 0, dummy_results);
@@ -211,23 +219,32 @@ SIMSIMD_DYNAMIC simsimd_capability_t simsimd_capabilities(void) {
simsimd_vdot_bf16c((simsimd_bf16_t*)dummy, (simsimd_bf16_t*)dummy, 0, dummy_results);
simsimd_vdot_f32c((simsimd_f32_t*)dummy, (simsimd_f32_t*)dummy, 0, dummy_results);
simsimd_vdot_f64c((simsimd_f64_t*)dummy, (simsimd_f64_t*)dummy, 0, dummy_results);

simsimd_cos_i8((simsimd_i8_t*)dummy, (simsimd_i8_t*)dummy, 0, dummy_results);
simsimd_cos_u8((simsimd_u8_t*)dummy, (simsimd_u8_t*)dummy, 0, dummy_results);
simsimd_cos_f16((simsimd_f16_t*)dummy, (simsimd_f16_t*)dummy, 0, dummy_results);
simsimd_cos_bf16((simsimd_bf16_t*)dummy, (simsimd_bf16_t*)dummy, 0, dummy_results);
simsimd_cos_f32((simsimd_f32_t*)dummy, (simsimd_f32_t*)dummy, 0, dummy_results);
simsimd_cos_f64((simsimd_f64_t*)dummy, (simsimd_f64_t*)dummy, 0, dummy_results);

simsimd_l2sq_i8((simsimd_i8_t*)dummy, (simsimd_i8_t*)dummy, 0, dummy_results);
simsimd_l2sq_u8((simsimd_u8_t*)dummy, (simsimd_u8_t*)dummy, 0, dummy_results);
simsimd_l2sq_f16((simsimd_f16_t*)dummy, (simsimd_f16_t*)dummy, 0, dummy_results);
simsimd_l2sq_bf16((simsimd_bf16_t*)dummy, (simsimd_bf16_t*)dummy, 0, dummy_results);
simsimd_l2sq_f32((simsimd_f32_t*)dummy, (simsimd_f32_t*)dummy, 0, dummy_results);
simsimd_l2sq_f64((simsimd_f64_t*)dummy, (simsimd_f64_t*)dummy, 0, dummy_results);

simsimd_l2_i8((simsimd_i8_t*)dummy, (simsimd_i8_t*)dummy, 0, dummy_results);
simsimd_l2_u8((simsimd_u8_t*)dummy, (simsimd_u8_t*)dummy, 0, dummy_results);
simsimd_l2_f16((simsimd_f16_t*)dummy, (simsimd_f16_t*)dummy, 0, dummy_results);
simsimd_l2_bf16((simsimd_bf16_t*)dummy, (simsimd_bf16_t*)dummy, 0, dummy_results);
simsimd_l2_f32((simsimd_f32_t*)dummy, (simsimd_f32_t*)dummy, 0, dummy_results);
simsimd_l2_f64((simsimd_f64_t*)dummy, (simsimd_f64_t*)dummy, 0, dummy_results);

simsimd_hamming_b8((simsimd_b8_t*)dummy, (simsimd_b8_t*)dummy, 0, dummy_results);
simsimd_jaccard_b8((simsimd_b8_t*)dummy, (simsimd_b8_t*)dummy, 0, dummy_results);

simsimd_kl_f16((simsimd_f16_t*)dummy, (simsimd_f16_t*)dummy, 0, dummy_results);
simsimd_kl_bf16((simsimd_bf16_t*)dummy, (simsimd_bf16_t*)dummy, 0, dummy_results);
simsimd_kl_f32((simsimd_f32_t*)dummy, (simsimd_f32_t*)dummy, 0, dummy_results);
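As context for the new dispatch entries above, a hypothetical usage sketch of the `u8` kernels, assuming the umbrella `<simsimd/simsimd.h>` header and the four-argument signature visible in the probing calls (two input pointers, a length, and an output pointer); the vector contents below are illustrative only:

```c
#include <stdio.h>

#include <simsimd/simsimd.h>

int main(void) {
    // Two 1536-dimensional unsigned 8-bit vectors, like quantized embeddings.
    simsimd_u8_t a[1536], b[1536];
    for (int i = 0; i < 1536; ++i) a[i] = (simsimd_u8_t)(i % 256), b[i] = (simsimd_u8_t)((i * 7) % 256);

    simsimd_distance_t distance;
    simsimd_cos_u8(a, b, 1536, &distance); // Angular distance over unsigned bytes
    printf("cosine distance: %f\n", (double)distance);

    simsimd_l2sq_u8(a, b, 1536, &distance); // Squared Euclidean distance
    printf("squared L2 distance: %f\n", (double)distance);
    return 0;
}
```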
29 changes: 14 additions & 15 deletions include/simsimd/binary.h
@@ -319,7 +319,20 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8_ice(simsimd_b8_t const* a, simsimd_b8_t c
simsimd_distance_t* result) {

simsimd_size_t intersection = 0, union_ = 0;
//? On such vectors we can clearly see that the CPU struggles to perform this many parallel
//? population counts, because the throughput of Jaccard and Hamming in this case starts to differ.
//? One optimization, aside from Harley-Seal transforms, is to use "shuffles" for nibble-popcount
//? lookups, utilizing other ports on the CPU.
//? https://github.com/ashvardanian/SimSIMD/pull/138
//
// - `_mm512_popcnt_epi64` maps to `VPOPCNTQ (ZMM, K, ZMM)`:
// - On Ice Lake: 3 cycles latency, ports: 1*p5
// - On Genoa: 2 cycles latency, ports: 1*FP01
// - `_mm512_shuffle_epi8` maps to `VPSHUFB (ZMM, ZMM, ZMM)`:
//     - On Ice Lake: 1 cycle latency, ports: 1*p5
// - On Genoa: 2 cycles latency, ports: 1*FP12
//
// It's harder to squeeze out performance from tiny representations, so we unroll the loops for binary metrics.
if (n_words <= 64) { // Up to 512 bits.
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words);
__m512i a_vec = _mm512_maskz_loadu_epi8(mask, a);
@@ -341,20 +354,6 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8_ice(simsimd_b8_t const* a, simsimd_b8_t c
intersection = _mm512_reduce_add_epi64(_mm512_add_epi64(and2_count_vec, and1_count_vec));
union_ = _mm512_reduce_add_epi64(_mm512_add_epi64(or2_count_vec, or1_count_vec));
} else if (n_words <= 196) { // Up to 1568 bits.
// TODO: On such vectors we can clearly see that the CPU struggles to perform this many parallel
// population counts, because the throughput of Jaccard and Hamming in this case starts to differ.
// One optimization, aside from Harley-Seal transforms can be using "shuffles" for nibble-popcount
// lookups, to utilize other ports on the CPU.
// https://github.com/ashvardanian/SimSIMD/pull/138
//
// On Ice Lake:
// - `VPOPCNTQ (ZMM, K, ZMM)` can only execute on port 5, which is a bottleneck.
// - `VPSHUFB (ZMM, ZMM, ZMM)` can only run on the same port 5 as well!
// On Zen4:
// - `VPOPCNTQ (ZMM, K, ZMM)` can run on ports: 0, 1.
// - `VPSHUFB (ZMM, ZMM, ZMM)` can run on ports: 1, 2.
// https://uops.info/table.html?search=VPOPCNTQ%20(ZMM%2C%20K%2C%20ZMM)&cb_lat=on&cb_tp=on&cb_uops=on&cb_ports=on&cb_SKX=on&cb_ICL=on&cb_TGL=on&cb_measurements=on&cb_doc=on&cb_avx512=on
// https://uops.info/table.html?search=VPSHUFB%20(ZMM%2C%20ZMM%2C%20ZMM)&cb_lat=on&cb_tp=on&cb_uops=on&cb_ports=on&cb_SKX=on&cb_ICL=on&cb_TGL=on&cb_measurements=on&cb_doc=on&cb_avx512=on
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words - 128);
__m512i a1_vec = _mm512_loadu_epi8(a);
__m512i b1_vec = _mm512_loadu_epi8(b);
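The shuffle-based nibble-popcount referenced in the comment above is not part of this commit. A minimal sketch of the technique, under the assumption of AVX-512BW support (function names are hypothetical), could look as follows:

```c
#include <stdint.h>

#include <immintrin.h>

// Per-byte population count via VPSHUFB nibble lookups: each byte is split
// into two 4-bit halves, each half indexes a 16-entry popcount table, and
// the two lookups are summed. This can offload work from the ports that
// VPOPCNTQ contends for.
static inline __m512i popcount_bytes_via_shuffle(__m512i v) {
    // Popcounts of 0..15, repeated in every 128-bit lane, since VPSHUFB
    // indexes within lanes.
    __m512i const lut = _mm512_set_epi8( //
        4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, //
        4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, //
        4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, //
        4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
    __m512i const low = _mm512_and_si512(v, _mm512_set1_epi8(0x0F));
    __m512i const high = _mm512_and_si512(_mm512_srli_epi16(v, 4), _mm512_set1_epi8(0x0F));
    return _mm512_add_epi8(_mm512_shuffle_epi8(lut, low), _mm512_shuffle_epi8(lut, high));
}

// Reduce the per-byte counts to a scalar: VPSADBW sums each 8-byte group
// against zero, and the eight 64-bit partial sums are added horizontally.
static inline uint64_t popcount_reduce(__m512i byte_counts) {
    return (uint64_t)_mm512_reduce_add_epi64(_mm512_sad_epu8(byte_counts, _mm512_setzero_si512()));
}
```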
