From 66ec5c9299a6d4edc72ad5eca21db1a9b52d5e6b Mon Sep 17 00:00:00 2001 From: Dmitry Arkhipov Date: Sun, 12 Jun 2022 13:00:46 +0300 Subject: [PATCH] reverse constants on big endian --- include/boost/json/detail/utf8.hpp | 93 +++++++++++++++++++++++------- test/utf8.cpp | 60 ++++++++++++++----- 2 files changed, 117 insertions(+), 36 deletions(-) diff --git a/include/boost/json/detail/utf8.hpp b/include/boost/json/detail/utf8.hpp index 4018c838c..d086e2d61 100644 --- a/include/boost/json/detail/utf8.hpp +++ b/include/boost/json/detail/utf8.hpp @@ -19,6 +19,22 @@ BOOST_JSON_NS_BEGIN namespace detail { +#ifdef BOOST_JSON_BIG_ENDIAN +# define BOOST_JSON_MK_NUM(b1, b2) 0x ## b2 ## b1 +# define BOOST_JSON_MK_NUM2(b1, b2) 0x ## b2 ## b1 ## 0000 +# define BOOST_JSON_MK_NUM3(b1, b2, b3) 0x ## b3 ## b2 ## b1 ## 00 +# define BOOST_JSON_MK_NUM4(b1, b2, b3, b4) 0x ## b4 ## b3 ## b2 ## b1 +# define BOOST_JSON_UTF8_KIND(b) ((b) & 0xFF) +# define BOOST_JSON_UTF8_LENGTH(b) ((b) >> 8) +#else +# define BOOST_JSON_MK_NUM(b1, b2) 0x ## b1 ## b2 +# define BOOST_JSON_MK_NUM2(b1, b2) 0x ## b1 ## b2 +# define BOOST_JSON_MK_NUM3(b1, b2, b3) 0x ## b1 ## b2 ## b3 +# define BOOST_JSON_MK_NUM4(b1, b2, b3, b4) 0x ## b1 ## b2 ## b3 ## b4 +# define BOOST_JSON_UTF8_KIND(b) ((b) >> 8) +# define BOOST_JSON_UTF8_LENGTH(b) ((b) & 0xFF) +#endif + template std::uint32_t load_little_endian(void const* p) @@ -38,6 +54,7 @@ inline uint16_t classify_utf8(char c) { + // for little endian // 0x000 = invalid // 0x102 = 2 bytes, second byte [80, BF] // 0x203 = 3 bytes, second byte [A0, BF] @@ -46,6 +63,7 @@ classify_utf8(char c) // 0x504 = 4 bytes, second byte [90, BF] // 0x604 = 4 bytes, second byte [80, BF] // 0x704 = 4 bytes, second byte [80, 8F] + // for big endian the two bytes are swapped (e.g. 0x102 becomes 0x201) static constexpr uint16_t first[128] { 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, @@ -57,13 +75,41 @@ classify_utf8(char c) 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
0x000, 0x000, - 0x000, 0x000, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, - 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, - 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, - 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, - 0x203, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, - 0x303, 0x303, 0x303, 0x303, 0x303, 0x403, 0x303, 0x303, - 0x504, 0x604, 0x604, 0x604, 0x704, 0x000, 0x000, 0x000, + BOOST_JSON_MK_NUM(00, 00), BOOST_JSON_MK_NUM(00, 00), + BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02), + BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02), + BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02), + + BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02), + BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02), + BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02), + BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02), + + BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02), + BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02), + BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02), + BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02), + + BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02), + BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02), + BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02), + BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02), + + BOOST_JSON_MK_NUM(02, 03), BOOST_JSON_MK_NUM(03, 03), + BOOST_JSON_MK_NUM(03, 03), BOOST_JSON_MK_NUM(03, 03), + BOOST_JSON_MK_NUM(03, 03), BOOST_JSON_MK_NUM(03, 03), + BOOST_JSON_MK_NUM(03, 03), BOOST_JSON_MK_NUM(03, 03), + + BOOST_JSON_MK_NUM(03, 03), BOOST_JSON_MK_NUM(03, 03), + BOOST_JSON_MK_NUM(03, 03), BOOST_JSON_MK_NUM(03, 03), + BOOST_JSON_MK_NUM(03, 03), BOOST_JSON_MK_NUM(04, 03), + BOOST_JSON_MK_NUM(03, 03), BOOST_JSON_MK_NUM(03, 03), + + BOOST_JSON_MK_NUM(05, 04), BOOST_JSON_MK_NUM(06, 04), + BOOST_JSON_MK_NUM(06, 04), BOOST_JSON_MK_NUM(06, 04), + BOOST_JSON_MK_NUM(07, 04), BOOST_JSON_MK_NUM(00, 00), + BOOST_JSON_MK_NUM(00, 00), BOOST_JSON_MK_NUM(00, 00), 
+ 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, }; return first[static_cast(c & 0x7F)]; @@ -74,30 +120,33 @@ bool is_valid_utf8(const char* p, uint16_t first) { uint32_t v; - switch(first >> 8) + switch(BOOST_JSON_UTF8_KIND(first)) { default: return false; // 2 bytes, second byte [80, BF] case 1: - v = load_little_endian<2>(p); - return (v & 0xC000) == 0x8000; + v = 0; std::memcpy(&v, p, 2); + return (v & BOOST_JSON_MK_NUM2(C0,00)) == BOOST_JSON_MK_NUM2(80,00); // 3 bytes, second byte [A0, BF] case 2: - v = load_little_endian<3>(p); - return (v & 0xC0E000) == 0x80A000; + v = 0; std::memcpy(&v, p, 3); + return (v & BOOST_JSON_MK_NUM3(C0,E0,00)) + == BOOST_JSON_MK_NUM3(80,A0,00); // 3 bytes, second byte [80, BF] case 3: - v = load_little_endian<3>(p); - return (v & 0xC0C000) == 0x808000; + v = 0; std::memcpy(&v, p, 3); + return (v & BOOST_JSON_MK_NUM3(C0,C0,00)) + == BOOST_JSON_MK_NUM3(80,80,00); // 3 bytes, second byte [80, 9F] case 4: - v = load_little_endian<3>(p); - return (v & 0xC0E000) == 0x808000; + v = 0; std::memcpy(&v, p, 3); + return (v & BOOST_JSON_MK_NUM3(C0,E0,00)) + == BOOST_JSON_MK_NUM3(80,80,00); // 4 bytes, second byte [90, BF] case 5: @@ -106,13 +155,15 @@ is_valid_utf8(const char* p, uint16_t first) // 4 bytes, second byte [80, BF] case 6: - v = load_little_endian<4>(p); - return (v & 0xC0C0C000) == 0x80808000; + std::memcpy(&v, p, 4); + return (v & BOOST_JSON_MK_NUM4(C0,C0,C0,00)) + == BOOST_JSON_MK_NUM4(80,80,80,00); // 4 bytes, second byte [80, 8F] case 7: - v = load_little_endian<4>(p); - return (v & 0xC0C0F000) == 0x80808000; + std::memcpy(&v, p, 4); + return (v & BOOST_JSON_MK_NUM4(C0,C0,F0,00)) + == BOOST_JSON_MK_NUM4(80,80,80,00); } } @@ -139,7 +190,7 @@ class utf8_sequence uint8_t length() const noexcept { - return first_ & 0xFF; + return BOOST_JSON_UTF8_LENGTH(first_); } bool diff --git a/test/utf8.cpp b/test/utf8.cpp index 76ffd603a..49624ff25 100644 --- a/test/utf8.cpp +++ b/test/utf8.cpp @@ -17,6 +17,8 @@ BOOST_JSON_NS_BEGIN class utf8_test { public: + 
::test_suite::log_type log; + void testLoadLittleEndian() { @@ -50,21 +52,46 @@ class utf8_test void testClassifyUtf8() { - BOOST_TEST((detail::classify_utf8('\x00') & 0xFF) == 0); + BOOST_TEST(detail::classify_utf8('\x00') == 0); // from code point U+0080 (0xC280 in UTF-8) - BOOST_TEST((detail::classify_utf8('\xC2') & 0xFF) == 2); + BOOST_TEST(detail::classify_utf8('\xC2') == BOOST_JSON_MK_NUM(01, 02)); // from code point U+07FF (0xDFBF in UTF-8) - BOOST_TEST((detail::classify_utf8('\xDF') & 0xFF) == 2); + BOOST_TEST(detail::classify_utf8('\xDF') == BOOST_JSON_MK_NUM(01, 02)); // from code point U+0800 (0xE0A080 in UTF-8) - BOOST_TEST((detail::classify_utf8('\xE0') & 0xFF) == 3); - // from code point U+0FFFF (0xEFBFBF in UTF-8) - BOOST_TEST((detail::classify_utf8('\xEF') & 0xFF) == 3); - // from code point U+010000 (0xF0908080 in UTF-8) - BOOST_TEST((detail::classify_utf8('\xF0') & 0xFF) == 4); - // from code point U+010000 (0xF0908080 in UTF-8) - BOOST_TEST((detail::classify_utf8('\xF0') & 0xFF) == 4); - // from code point U+010FFFF (0xF48FBFBF in UTF-8) - BOOST_TEST((detail::classify_utf8('\xF4') & 0xFF) == 4); + BOOST_TEST(detail::classify_utf8('\xE0') == BOOST_JSON_MK_NUM(02, 03)); + // from code point U+D7B0 (0xED9EB0 in UTF-8) + BOOST_TEST(detail::classify_utf8('\xED') == BOOST_JSON_MK_NUM(04, 03)); + // from code point U+FFFF (0xEFBFBF in UTF-8) + BOOST_TEST(detail::classify_utf8('\xEF') == BOOST_JSON_MK_NUM(03, 03)); + // from code point U+10000 (0xF0908080 in UTF-8) + BOOST_TEST(detail::classify_utf8('\xF0') == BOOST_JSON_MK_NUM(05, 04)); + // from code point U+40000 (0xF1808080 in UTF-8) + BOOST_TEST(detail::classify_utf8('\xF1') == BOOST_JSON_MK_NUM(06, 04)); + // from code point U+C0000 (0xF3808080 in UTF-8) + BOOST_TEST(detail::classify_utf8('\xF3') == BOOST_JSON_MK_NUM(06, 04)); + // from code point U+10FFFF (0xF48FBFBF in UTF-8) + BOOST_TEST(detail::classify_utf8('\xF4') == BOOST_JSON_MK_NUM(07, 04)); + + if 
(!BOOST_TEST(BOOST_JSON_UTF8_KIND(detail::classify_utf8('\xC2')) == 1)) + { + uint16_t classifier = detail::classify_utf8('\xC2'); + unsigned char const* bytes + = reinterpret_cast<unsigned char const*>(&classifier); + log << std::hex << std::setfill('0') << "0x" + << std::setw(2) << int(bytes[0]) + << std::setw(2) << int(bytes[1]) + << std::setfill(' ') << std::dec << '\n'; + } + if (!BOOST_TEST(BOOST_JSON_UTF8_LENGTH(detail::classify_utf8('\xC2')) == 2)) + { + uint16_t classifier = detail::classify_utf8('\xC2'); + unsigned char const* bytes + = reinterpret_cast<unsigned char const*>(&classifier); + log << std::hex << std::setfill('0') << "0x" + << std::setw(2) << int(bytes[0]) + << std::setw(2) << int(bytes[1]) + << std::setfill(' ') << std::dec << '\n'; + } } void @@ -78,9 +105,12 @@ class utf8_test BOOST_TEST(is_valid_utf8("\xC2\x80")); // code point U+0080 BOOST_TEST(is_valid_utf8("\xDF\xBF")); // code point U+07FF BOOST_TEST(is_valid_utf8("\xE0\xA0\x80")); // code point U+0800 - BOOST_TEST(is_valid_utf8("\xEF\xBF\xBF")); // from code point U+0FFFF - BOOST_TEST(is_valid_utf8("\xF0\x90\x80\x80")); // code point U+010000 - BOOST_TEST(is_valid_utf8("\xF4\x8F\xBF\xBF")); // code point U+010FFFF + BOOST_TEST(is_valid_utf8("\xED\x9E\xB0")); // code point U+D7B0 + BOOST_TEST(is_valid_utf8("\xEF\xBF\xBF")); // from code point U+FFFF + BOOST_TEST(is_valid_utf8("\xF0\x90\x80\x80")); // code point U+10000 + BOOST_TEST(is_valid_utf8("\xF1\x80\x80\x80")); // code point U+40000 + BOOST_TEST(is_valid_utf8("\xF3\x80\x80\x80")); // code point U+C0000 + BOOST_TEST(is_valid_utf8("\xF4\x8F\xBF\xBF")); // code point U+10FFFF BOOST_TEST(! is_valid_utf8("\x80")); BOOST_TEST(! is_valid_utf8("\xBF"));