diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c07942..9d7b4c8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.14 FATAL_ERROR) project( simsimd - VERSION 5.7.3 + VERSION 5.8.0 LANGUAGES C CXX DESCRIPTION "Portable mixed-precision BLAS-like vector math library for x86 and ARM" HOMEPAGE_URL "https://github.com/ashvardanian/simsimd" diff --git a/Cargo.lock b/Cargo.lock index d9bdf45..b886976 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -496,7 +496,7 @@ dependencies = [ [[package]] name = "simsimd" -version = "5.7.3" +version = "5.8.0" dependencies = [ "cc", "criterion", diff --git a/Cargo.toml b/Cargo.toml index 0427804..04a08ad 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "simsimd" description = "Portable mixed-precision BLAS-like vector math library for x86 and ARM" -version = "5.7.3" +version = "5.8.0" edition = "2021" license = "Apache-2.0" authors = ["Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>"] diff --git a/VERSION b/VERSION index 23900d6..edb1d39 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -5.7.3 \ No newline at end of file +5.8.0 \ No newline at end of file diff --git a/include/simsimd/simsimd.h b/include/simsimd/simsimd.h index 49b3ee9..400d72a 100644 --- a/include/simsimd/simsimd.h +++ b/include/simsimd/simsimd.h @@ -89,8 +89,8 @@ #define SIMSIMD_H #define SIMSIMD_VERSION_MAJOR 5 -#define SIMSIMD_VERSION_MINOR 7 -#define SIMSIMD_VERSION_PATCH 3 +#define SIMSIMD_VERSION_MINOR 8 +#define SIMSIMD_VERSION_PATCH 0 /** * @brief Removes compile-time dispatching, and replaces it with runtime dispatching. 
diff --git a/package.json b/package.json index ef1f109..c114d02 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "simsimd", - "version": "5.7.3", + "version": "5.8.0", "description": "Portable mixed-precision BLAS-like vector math library for x86 and ARM", "homepage": "https://github.com/ashvardanian/simsimd", "author": "Ash Vardanian", diff --git a/scripts/bench.cxx b/scripts/bench.cxx index 1642ce4..0d94ee2 100644 --- a/scripts/bench.cxx +++ b/scripts/bench.cxx @@ -23,7 +23,6 @@ // are implemented. #define SIMSIMD_NATIVE_F16 1 #define SIMSIMD_NATIVE_BF16 1 -#define SIMSIMD_TARGET_NEON_BF16 0 #include constexpr std::size_t default_seconds = 10; @@ -740,21 +739,6 @@ int main(int argc, char** argv) { constexpr simsimd_datatype_t f16c_k = simsimd_datatype_f16c_k; constexpr simsimd_datatype_t bf16c_k = simsimd_datatype_bf16c_k; - fma_("fma_f32_neon", simsimd_fma_f32_neon, simsimd_fma_f32_accurate, simsimd_l2_f32_accurate); - fma_("wsum_f32_neon", simsimd_wsum_f32_neon, simsimd_wsum_f32_accurate, simsimd_l2_f32_accurate); - fma_("fma_f32_serial", simsimd_fma_f32_serial, simsimd_fma_f32_accurate, simsimd_l2_f32_accurate); - fma_("wsum_f32_serial", simsimd_wsum_f32_serial, simsimd_wsum_f32_accurate, simsimd_l2_f32_accurate); - - fma_("fma_f16_neon", simsimd_fma_f16_neon, simsimd_fma_f16_accurate, simsimd_l2_f16_accurate); - fma_("wsum_f16_neon", simsimd_wsum_f16_neon, simsimd_wsum_f16_accurate, simsimd_l2_f16_accurate); - fma_("fma_f16_serial", simsimd_fma_f16_serial, simsimd_fma_f16_accurate, simsimd_l2_f16_accurate); - fma_("wsum_f16_serial", simsimd_wsum_f16_serial, simsimd_wsum_f16_accurate, simsimd_l2_f16_accurate); - - fma_("fma_u8_neon", simsimd_fma_u8_neon, simsimd_fma_u8_serial, simsimd_l2_u8_serial); - fma_("wsum_u8_neon", simsimd_wsum_u8_neon, simsimd_wsum_u8_serial, simsimd_l2_u8_serial); - fma_("fma_u8_serial", simsimd_fma_u8_serial, simsimd_fma_u8_serial, simsimd_l2_u8_serial); - fma_("wsum_u8_serial", simsimd_wsum_u8_serial, 
simsimd_wsum_u8_serial, simsimd_l2_u8_serial); - #if SIMSIMD_BUILD_BENCHMARKS_WITH_CBLAS dense_("dot_f32_blas", dot_f32_blas, simsimd_dot_f32_accurate); @@ -767,21 +751,37 @@ int main(int argc, char** argv) { #endif #if SIMSIMD_TARGET_NEON + dense_("dot_f16_neon", simsimd_dot_f16_neon, simsimd_dot_f16_accurate); + dense_("cos_f16_neon", simsimd_cos_f16_neon, simsimd_cos_f16_accurate); + dense_("l2sq_f16_neon", simsimd_l2sq_f16_neon, simsimd_l2sq_f16_accurate); + dense_("l2_f16_neon", simsimd_l2_f16_neon, simsimd_l2_f16_accurate); + dense_("kl_f16_neon", simsimd_kl_f16_neon, simsimd_kl_f16_accurate); + dense_("js_f16_neon", simsimd_js_f16_neon, simsimd_js_f16_accurate); + + dense_("dot_bf16_neon", simsimd_dot_bf16_neon, simsimd_dot_bf16_accurate); + dense_("cos_bf16_neon", simsimd_cos_bf16_neon, simsimd_cos_bf16_accurate); + dense_("l2sq_bf16_neon", simsimd_l2sq_bf16_neon, simsimd_l2sq_bf16_accurate); + dense_("l2_bf16_neon", simsimd_l2_bf16_neon, simsimd_l2_bf16_accurate); + dense_("dot_f32_neon", simsimd_dot_f32_neon, simsimd_dot_f32_accurate); dense_("cos_f32_neon", simsimd_cos_f32_neon, simsimd_cos_f32_accurate); dense_("l2sq_f32_neon", simsimd_l2sq_f32_neon, simsimd_l2sq_f32_accurate); + dense_("l2_f32_neon", simsimd_l2_f32_neon, simsimd_l2_f32_accurate); dense_("kl_f32_neon", simsimd_kl_f32_neon, simsimd_kl_f32_accurate); dense_("js_f32_neon", simsimd_js_f32_neon, simsimd_js_f32_accurate); dense_("cos_f64_neon", simsimd_cos_f64_neon, simsimd_cos_f64_serial); dense_("l2sq_f64_neon", simsimd_l2sq_f64_neon, simsimd_l2sq_f64_serial); + dense_("l2_f64_neon", simsimd_l2_f64_neon, simsimd_l2_f64_serial); dense_("cos_i8_neon", simsimd_cos_i8_neon, simsimd_cos_i8_serial); dense_("l2sq_i8_neon", simsimd_l2sq_i8_neon, simsimd_l2sq_i8_serial); + dense_("l2_i8_neon", simsimd_l2_i8_neon, simsimd_l2_i8_serial); dense_("dot_i8_neon", simsimd_dot_i8_neon, simsimd_dot_i8_serial); dense_("cos_u8_neon", simsimd_cos_u8_neon, simsimd_cos_u8_serial); dense_("l2sq_u8_neon", 
simsimd_l2sq_u8_neon, simsimd_l2sq_u8_serial); + dense_("l2_u8_neon", simsimd_l2_u8_neon, simsimd_l2_u8_serial); dense_("dot_u8_neon", simsimd_dot_u8_neon, simsimd_dot_u8_serial); dense_("hamming_b8_neon", simsimd_hamming_b8_neon, simsimd_hamming_b8_serial); @@ -795,6 +795,12 @@ int main(int argc, char** argv) { sparse_("intersect_u16_neon", simsimd_intersect_u16_neon, simsimd_intersect_u16_accurate); sparse_("intersect_u32_neon", simsimd_intersect_u32_neon, simsimd_intersect_u32_accurate); + + fma_("fma_f32_neon", simsimd_fma_f32_neon, simsimd_fma_f32_accurate, simsimd_l2_f32_accurate); + fma_("wsum_f32_neon", simsimd_wsum_f32_neon, simsimd_wsum_f32_accurate, simsimd_l2_f32_accurate); + fma_("fma_f32_serial", simsimd_fma_f32_serial, simsimd_fma_f32_accurate, simsimd_l2_f32_accurate); + fma_("wsum_f32_serial", simsimd_wsum_f32_serial, simsimd_wsum_f32_accurate, simsimd_l2_f32_accurate); + #endif #if SIMSIMD_TARGET_NEON_F16 @@ -804,11 +810,19 @@ int main(int argc, char** argv) { dense_("dot_f16_neon", simsimd_dot_f16_neon, simsimd_dot_f16_accurate); dense_("cos_f16_neon", simsimd_cos_f16_neon, simsimd_cos_f16_accurate); dense_("l2sq_f16_neon", simsimd_l2sq_f16_neon, simsimd_l2sq_f16_accurate); + dense_("l2_f16_neon", simsimd_l2_f16_neon, simsimd_l2_f16_accurate); dense_("kl_f16_neon", simsimd_kl_f16_neon, simsimd_kl_f16_accurate); dense_("js_f16_neon", simsimd_js_f16_neon, simsimd_js_f16_accurate); curved_("bilinear_f16_neon", simsimd_bilinear_f16_neon, simsimd_bilinear_f16_accurate); curved_("mahalanobis_f16_neon", simsimd_mahalanobis_f16_neon, simsimd_mahalanobis_f16_accurate); + + fma_("fma_f16_neon", simsimd_fma_f16_neon, simsimd_fma_f16_accurate, simsimd_l2_f16_accurate); + fma_("wsum_f16_neon", simsimd_wsum_f16_neon, simsimd_wsum_f16_accurate, simsimd_l2_f16_accurate); + + // FMA kernels for `u8` on NEON use `f16` arithmetic + fma_("fma_u8_neon", simsimd_fma_u8_neon, simsimd_fma_u8_serial, simsimd_l2_u8_serial); + fma_("wsum_u8_neon", simsimd_wsum_u8_neon, 
simsimd_wsum_u8_serial, simsimd_l2_u8_serial); #endif #if SIMSIMD_TARGET_NEON_BF16 @@ -818,19 +832,31 @@ int main(int argc, char** argv) { dense_("dot_bf16_neon", simsimd_dot_bf16_neon, simsimd_dot_bf16_accurate); dense_("cos_bf16_neon", simsimd_cos_bf16_neon, simsimd_cos_bf16_accurate); dense_("l2sq_bf16_neon", simsimd_l2sq_bf16_neon, simsimd_l2sq_bf16_accurate); + dense_("l2_bf16_neon", simsimd_l2_bf16_neon, simsimd_l2_bf16_accurate); curved_("bilinear_bf16_neon", simsimd_bilinear_bf16_neon, simsimd_bilinear_bf16_accurate); curved_("mahalanobis_bf16_neon", simsimd_mahalanobis_bf16_neon, simsimd_mahalanobis_bf16_accurate); #endif #if SIMSIMD_TARGET_SVE + dense_("dot_f16_sve", simsimd_dot_f16_sve, simsimd_dot_f16_accurate); + dense_("cos_f16_sve", simsimd_cos_f16_sve, simsimd_cos_f16_accurate); + dense_("l2sq_f16_sve", simsimd_l2sq_f16_sve, simsimd_l2sq_f16_accurate); + dense_("l2_f16_sve", simsimd_l2_f16_sve, simsimd_l2_f16_accurate); + + dense_("cos_bf16_sve", simsimd_cos_bf16_sve, simsimd_cos_bf16_accurate); + dense_("l2sq_bf16_sve", simsimd_l2sq_bf16_sve, simsimd_l2sq_bf16_accurate); + dense_("l2_bf16_sve", simsimd_l2_bf16_sve, simsimd_l2_bf16_accurate); + dense_("dot_f32_sve", simsimd_dot_f32_sve, simsimd_dot_f32_accurate); dense_("cos_f32_sve", simsimd_cos_f32_sve, simsimd_cos_f32_accurate); dense_("l2sq_f32_sve", simsimd_l2sq_f32_sve, simsimd_l2sq_f32_accurate); + dense_("l2_f32_sve", simsimd_l2_f32_sve, simsimd_l2_f32_accurate); dense_("dot_f64_sve", simsimd_dot_f64_sve, simsimd_dot_f64_serial); dense_("cos_f64_sve", simsimd_cos_f64_sve, simsimd_cos_f64_serial); dense_("l2sq_f64_sve", simsimd_l2sq_f64_sve, simsimd_l2sq_f64_serial); + dense_("l2_f64_sve", simsimd_l2_f64_sve, simsimd_l2_f64_serial); dense_("hamming_b8_sve", simsimd_hamming_b8_sve, simsimd_hamming_b8_serial); dense_("jaccard_b8_sve", simsimd_jaccard_b8_sve, simsimd_jaccard_b8_serial); @@ -845,6 +871,7 @@ int main(int argc, char** argv) { dense_("dot_f16_sve", simsimd_dot_f16_sve, 
simsimd_dot_f16_accurate); dense_("cos_f16_sve", simsimd_cos_f16_sve, simsimd_cos_f16_accurate); dense_("l2sq_f16_sve", simsimd_l2sq_f16_sve, simsimd_l2sq_f16_accurate); + dense_("l2_f16_sve", simsimd_l2_f16_sve, simsimd_l2_f16_accurate); dense_("dot_f16c_sve", simsimd_dot_f16c_sve, simsimd_dot_f16c_accurate); dense_("vdot_f16c_sve", simsimd_vdot_f16c_sve, simsimd_vdot_f16c_accurate); #endif @@ -852,6 +879,7 @@ int main(int argc, char** argv) { #if SIMSIMD_TARGET_SVE_BF16 dense_("cos_bf16_sve", simsimd_cos_bf16_sve, simsimd_cos_bf16_accurate); dense_("l2sq_bf16_sve", simsimd_l2sq_bf16_sve, simsimd_l2sq_bf16_accurate); + dense_("l2_bf16_sve", simsimd_l2_bf16_sve, simsimd_l2_bf16_accurate); #endif #if SIMSIMD_TARGET_SVE2 @@ -863,19 +891,23 @@ int main(int argc, char** argv) { dense_("dot_f16_haswell", simsimd_dot_f16_haswell, simsimd_dot_f16_accurate); dense_("cos_f16_haswell", simsimd_cos_f16_haswell, simsimd_cos_f16_accurate); dense_("l2sq_f16_haswell", simsimd_l2sq_f16_haswell, simsimd_l2sq_f16_accurate); + dense_("l2_f16_haswell", simsimd_l2_f16_haswell, simsimd_l2_f16_accurate); dense_("kl_f16_haswell", simsimd_kl_f16_haswell, simsimd_kl_f16_accurate); dense_("js_f16_haswell", simsimd_js_f16_haswell, simsimd_js_f16_accurate); dense_("dot_bf16_haswell", simsimd_dot_bf16_haswell, simsimd_dot_bf16_accurate); dense_("cos_bf16_haswell", simsimd_cos_bf16_haswell, simsimd_cos_bf16_accurate); dense_("l2sq_bf16_haswell", simsimd_l2sq_bf16_haswell, simsimd_l2sq_bf16_accurate); + dense_("l2_bf16_haswell", simsimd_l2_bf16_haswell, simsimd_l2_bf16_accurate); dense_("cos_i8_haswell", simsimd_cos_i8_haswell, simsimd_cos_i8_serial); dense_("l2sq_i8_haswell", simsimd_l2sq_i8_haswell, simsimd_l2sq_i8_serial); + dense_("l2_i8_haswell", simsimd_l2_i8_haswell, simsimd_l2_i8_serial); dense_("dot_i8_haswell", simsimd_dot_i8_haswell, simsimd_dot_i8_serial); dense_("cos_u8_haswell", simsimd_cos_u8_haswell, simsimd_cos_u8_serial); dense_("l2sq_u8_haswell", simsimd_l2sq_u8_haswell, 
simsimd_l2sq_u8_serial); + dense_("l2_u8_haswell", simsimd_l2_u8_haswell, simsimd_l2_u8_serial); dense_("dot_u8_haswell", simsimd_dot_u8_haswell, simsimd_dot_u8_serial); dense_("hamming_b8_haswell", simsimd_hamming_b8_haswell, simsimd_hamming_b8_serial); @@ -897,6 +929,7 @@ int main(int argc, char** argv) { dense_("dot_bf16_genoa", simsimd_dot_bf16_genoa, simsimd_dot_bf16_accurate); dense_("cos_bf16_genoa", simsimd_cos_bf16_genoa, simsimd_cos_bf16_accurate); dense_("l2sq_bf16_genoa", simsimd_l2sq_bf16_genoa, simsimd_l2sq_bf16_accurate); + dense_("l2_bf16_genoa", simsimd_l2_bf16_genoa, simsimd_l2_bf16_accurate); dense_("dot_bf16c_genoa", simsimd_dot_bf16c_genoa, simsimd_dot_bf16c_accurate); dense_("vdot_bf16c_genoa", simsimd_vdot_bf16c_genoa, simsimd_vdot_bf16c_accurate); @@ -909,6 +942,7 @@ int main(int argc, char** argv) { dense_("dot_f16_sapphire", simsimd_dot_f16_sapphire, simsimd_dot_f16_accurate); dense_("cos_f16_sapphire", simsimd_cos_f16_sapphire, simsimd_cos_f16_accurate); dense_("l2sq_f16_sapphire", simsimd_l2sq_f16_sapphire, simsimd_l2sq_f16_accurate); + dense_("l2_f16_sapphire", simsimd_l2_f16_sapphire, simsimd_l2_f16_accurate); dense_("kl_f16_sapphire", simsimd_kl_f16_sapphire, simsimd_kl_f16_accurate); dense_("js_f16_sapphire", simsimd_js_f16_sapphire, simsimd_js_f16_accurate); @@ -919,15 +953,18 @@ int main(int argc, char** argv) { #if SIMSIMD_TARGET_ICE dense_("cos_i8_ice", simsimd_cos_i8_ice, simsimd_cos_i8_serial); dense_("l2sq_i8_ice", simsimd_l2sq_i8_ice, simsimd_l2sq_i8_serial); + dense_("l2_i8_ice", simsimd_l2_i8_ice, simsimd_l2_i8_serial); dense_("dot_i8_ice", simsimd_dot_i8_ice, simsimd_dot_i8_serial); dense_("cos_u8_ice", simsimd_cos_u8_ice, simsimd_cos_u8_serial); dense_("l2sq_u8_ice", simsimd_l2sq_u8_ice, simsimd_l2sq_u8_serial); + dense_("l2_u8_ice", simsimd_l2_u8_ice, simsimd_l2_u8_serial); dense_("dot_u8_ice", simsimd_dot_u8_ice, simsimd_dot_u8_serial); dense_("dot_f64_skylake", simsimd_dot_f64_skylake, simsimd_dot_f64_serial); 
dense_("cos_f64_skylake", simsimd_cos_f64_skylake, simsimd_cos_f64_serial); dense_("l2sq_f64_skylake", simsimd_l2sq_f64_skylake, simsimd_l2sq_f64_serial); + dense_("l2_f64_skylake", simsimd_l2_f64_skylake, simsimd_l2_f64_serial); dense_("hamming_b8_ice", simsimd_hamming_b8_ice, simsimd_hamming_b8_serial); dense_("jaccard_b8_ice", simsimd_jaccard_b8_ice, simsimd_jaccard_b8_serial); @@ -940,6 +977,7 @@ int main(int argc, char** argv) { dense_("dot_f32_skylake", simsimd_dot_f32_skylake, simsimd_dot_f32_accurate); dense_("cos_f32_skylake", simsimd_cos_f32_skylake, simsimd_cos_f32_accurate); dense_("l2sq_f32_skylake", simsimd_l2sq_f32_skylake, simsimd_l2sq_f32_accurate); + dense_("l2_f32_skylake", simsimd_l2_f32_skylake, simsimd_l2_f32_accurate); dense_("kl_f32_skylake", simsimd_kl_f32_skylake, simsimd_kl_f32_accurate); dense_("js_f32_skylake", simsimd_js_f32_skylake, simsimd_js_f32_accurate); @@ -966,31 +1004,37 @@ int main(int argc, char** argv) { dense_("dot_bf16_serial", simsimd_dot_bf16_serial, simsimd_dot_bf16_accurate); dense_("cos_bf16_serial", simsimd_cos_bf16_serial, simsimd_cos_bf16_accurate); dense_("l2sq_bf16_serial", simsimd_l2sq_bf16_serial, simsimd_l2sq_bf16_accurate); + dense_("l2_bf16_serial", simsimd_l2_bf16_serial, simsimd_l2_bf16_accurate); dense_("kl_bf16_serial", simsimd_kl_bf16_serial, simsimd_kl_bf16_accurate); dense_("js_bf16_serial", simsimd_js_bf16_serial, simsimd_js_bf16_accurate); dense_("dot_f16_serial", simsimd_dot_f16_serial, simsimd_dot_f16_accurate); dense_("cos_f16_serial", simsimd_cos_f16_serial, simsimd_cos_f16_accurate); dense_("l2sq_f16_serial", simsimd_l2sq_f16_serial, simsimd_l2sq_f16_accurate); + dense_("l2_f16_serial", simsimd_l2_f16_serial, simsimd_l2_f16_accurate); dense_("kl_f16_serial", simsimd_kl_f16_serial, simsimd_kl_f16_accurate); dense_("js_f16_serial", simsimd_js_f16_serial, simsimd_js_f16_accurate); dense_("dot_f32_serial", simsimd_dot_f32_serial, simsimd_dot_f32_accurate); dense_("cos_f32_serial", 
simsimd_cos_f32_serial, simsimd_cos_f32_accurate); dense_("l2sq_f32_serial", simsimd_l2sq_f32_serial, simsimd_l2sq_f32_accurate); + dense_("l2_f32_serial", simsimd_l2_f32_serial, simsimd_l2_f32_accurate); dense_("kl_f32_serial", simsimd_kl_f32_serial, simsimd_kl_f32_accurate); dense_("js_f32_serial", simsimd_js_f32_serial, simsimd_js_f32_accurate); dense_("dot_f64_serial", simsimd_dot_f64_serial, simsimd_dot_f64_serial); dense_("cos_f64_serial", simsimd_cos_f64_serial, simsimd_cos_f64_serial); dense_("l2sq_f64_serial", simsimd_l2sq_f64_serial, simsimd_l2sq_f64_serial); + dense_("l2_f64_serial", simsimd_l2_f64_serial, simsimd_l2_f64_serial); dense_("cos_i8_serial", simsimd_cos_i8_serial, simsimd_cos_i8_serial); dense_("l2sq_i8_serial", simsimd_l2sq_i8_serial, simsimd_l2sq_i8_serial); + dense_("l2_i8_serial", simsimd_l2_i8_serial, simsimd_l2_i8_serial); dense_("dot_i8_serial", simsimd_dot_i8_serial, simsimd_dot_i8_serial); dense_("cos_u8_serial", simsimd_cos_u8_serial, simsimd_cos_u8_serial); dense_("l2sq_u8_serial", simsimd_l2sq_u8_serial, simsimd_l2sq_u8_serial); + dense_("l2_u8_serial", simsimd_l2_u8_serial, simsimd_l2_u8_serial); dense_("dot_u8_serial", simsimd_dot_u8_serial, simsimd_dot_u8_serial); dense_("dot_f64c_serial", simsimd_dot_f64c_serial, simsimd_dot_f64c_serial); @@ -1008,6 +1052,11 @@ int main(int argc, char** argv) { dense_("hamming_b8_serial", simsimd_hamming_b8_serial, simsimd_hamming_b8_serial); dense_("jaccard_b8_serial", simsimd_jaccard_b8_serial, simsimd_jaccard_b8_serial); + fma_("fma_f16_serial", simsimd_fma_f16_serial, simsimd_fma_f16_accurate, simsimd_l2_f16_accurate); + fma_("wsum_f16_serial", simsimd_wsum_f16_serial, simsimd_wsum_f16_accurate, simsimd_l2_f16_accurate); + fma_("fma_u8_serial", simsimd_fma_u8_serial, simsimd_fma_u8_serial, simsimd_l2_u8_serial); + fma_("wsum_u8_serial", simsimd_wsum_u8_serial, simsimd_wsum_u8_serial, simsimd_l2_u8_serial); + bm::RunSpecifiedBenchmarks(); bm::Shutdown(); return 0;