diff --git a/CHANGELOG.md b/CHANGELOG.md index ddea6c5ac..def4a184c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,10 +6,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [0.6.0-beta.1] - Unreleased +### Added + +- Support for utf8 encoding to `*_to_atom` and `atom_to_*` functions +- `binary_to_atom/1` and `atom_to_binary/1` that default to utf8 (they were introduced with OTP23) + ### Fixed - ESP32: fix i2c_driver_acquire and i2c_driver_release functions, that were working only once. +### Changed + +- `binary_to_atom/2` validates utf8 strings +- `*_to_atom` and `atom_to_*` properly convert latin1 (not just ASCII) to utf8 and vice versa + ## [0.6.0-beta.0] - 2024-02-08 ### Added diff --git a/LICENSES/MIT.txt b/LICENSES/MIT.txt new file mode 100644 index 000000000..8aa26455d --- /dev/null +++ b/LICENSES/MIT.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) [year] [fullname] + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/libs/estdlib/src/erlang.erl b/libs/estdlib/src/erlang.erl index 689325124..b7901d386 100644 --- a/libs/estdlib/src/erlang.erl +++ b/libs/estdlib/src/erlang.erl @@ -53,9 +53,11 @@ list_to_integer/1, list_to_tuple/1, iolist_to_binary/1, + binary_to_atom/1, binary_to_atom/2, binary_to_integer/1, binary_to_list/1, + atom_to_binary/1, atom_to_binary/2, atom_to_list/1, float_to_binary/1, @@ -117,6 +119,8 @@ %% * review API documentation for timer functions in this module %% +-type atom_encoding() :: latin1 | utf8 | unicode. + -type mem_type() :: binary. -type time_unit() :: second | millisecond | microsecond. -type timestamp() :: { @@ -582,13 +586,22 @@ iolist_to_binary(_IOList) -> %%----------------------------------------------------------------------------- %% @param Binary Binary to convert to atom -%% @param Encoding encoding for conversion +%% @returns an atom from passed binary +%% @doc Convert a binary to atom, defaults to utf8. +%% @end +%%----------------------------------------------------------------------------- +-spec binary_to_atom(Binary :: binary()) -> atom(). +binary_to_atom(_Binary) -> + erlang:nif_error(undefined). + +%%----------------------------------------------------------------------------- +%% @param Binary Binary to convert to atom +%% @param Encoding encoding for conversion (any of latin1, utf8 or unicode) %% @returns an atom from passed binary %% @doc Convert a binary to atom. -%% Only latin1 encoded is supported. %% @end %%----------------------------------------------------------------------------- --spec binary_to_atom(Binary :: binary(), Encoding :: latin1) -> atom(). +-spec binary_to_atom(Binary :: binary(), Encoding :: atom_encoding()) -> atom(). binary_to_atom(_Binary, _Encoding) -> erlang:nif_error(undefined). 
@@ -614,13 +627,24 @@ binary_to_list(_Binary) -> %%----------------------------------------------------------------------------- %% @param Atom Atom to convert -%% @param Encoding Encoding for conversion +%% @returns a binary with the atom's name +%% @doc Convert an atom to a binary, defaults to utf8. +%% Equivalent to atom_to_binary(Atom, utf8). +%% @end +%%----------------------------------------------------------------------------- +-spec atom_to_binary(Atom :: atom()) -> binary(). +atom_to_binary(_Atom) -> + erlang:nif_error(undefined). + +%%----------------------------------------------------------------------------- +%% @param Atom Atom to convert +%% @param Encoding Encoding for conversion (any of latin1, utf8 or unicode) %% @returns a binary with the atom's name %% @doc Convert an atom to a binary. -%% Only latin1 encoding is supported. +%% Fails with badarg when Encoding is latin1 and the atom cannot be represented in latin1. %% @end %%----------------------------------------------------------------------------- --spec atom_to_binary(Atom :: atom(), Encoding :: latin1) -> binary(). +-spec atom_to_binary(Atom :: atom(), Encoding :: atom_encoding()) -> binary(). atom_to_binary(_Atom, _Encoding) -> erlang:nif_error(undefined). 
diff --git a/src/libAtomVM/CMakeLists.txt b/src/libAtomVM/CMakeLists.txt index 8b5a011a2..05c79303b 100644 --- a/src/libAtomVM/CMakeLists.txt +++ b/src/libAtomVM/CMakeLists.txt @@ -62,6 +62,7 @@ set(HEADER_FILES term.h timer_list.h trace.h + unicode.h utils.h valueshashtable.h ${CMAKE_CURRENT_BINARY_DIR}/avm_version.h @@ -94,6 +95,7 @@ set(SOURCE_FILES stacktrace.c term.c timer_list.c + unicode.c valueshashtable.c ) diff --git a/src/libAtomVM/atom_table.c b/src/libAtomVM/atom_table.c index 1c335664d..ed281dcb3 100644 --- a/src/libAtomVM/atom_table.c +++ b/src/libAtomVM/atom_table.c @@ -27,6 +27,7 @@ #include "atom.h" #include "smp.h" +#include "unicode.h" #include "utils.h" #ifndef AVM_NO_SMP @@ -279,6 +280,20 @@ atom_ref_t atom_table_get_atom_ptr_and_len(struct AtomTable *table, long index, return node; } +bool atom_table_is_atom_ref_ascii(struct AtomTable *table, atom_ref_t atom) +{ + SMP_RDLOCK(table); + + struct HNode *node = (struct HNode *) atom; + const uint8_t *data = atom_string_data(node->key); + size_t len = atom_string_len(node->key); + + bool result = unicode_buf_is_ascii(data, len); + + SMP_UNLOCK(table); + return result; +} + void atom_table_write_bytes(struct AtomTable *table, atom_ref_t atom, size_t buf_len, void *outbuf) { SMP_RDLOCK(table); diff --git a/src/libAtomVM/atom_table.h b/src/libAtomVM/atom_table.h index c9f108bd6..f919905d7 100644 --- a/src/libAtomVM/atom_table.h +++ b/src/libAtomVM/atom_table.h @@ -21,6 +21,8 @@ #ifndef _ATOM_TABLE_ #define _ATOM_TABLE_ +#include + #include "atom.h" #define ATOM_TABLE_NOT_FOUND -1 @@ -56,6 +58,7 @@ int atom_table_ensure_atoms( int atom_table_cmp_using_atom_index( struct AtomTable *table, int t_atom_index, int other_atom_index); atom_ref_t atom_table_get_atom_ptr_and_len(struct AtomTable *table, long index, size_t *out_len); +bool atom_table_is_atom_ref_ascii(struct AtomTable *table, atom_ref_t atom); void atom_table_write_bytes(struct AtomTable *table, atom_ref_t atom, size_t buf_len, void *outbuf); 
void atom_table_write_cstring( struct AtomTable *table, atom_ref_t atom, size_t buf_len, char *outbuf); diff --git a/src/libAtomVM/defaultatoms.c b/src/libAtomVM/defaultatoms.c index fff72c04f..a8645adfd 100644 --- a/src/libAtomVM/defaultatoms.c +++ b/src/libAtomVM/defaultatoms.c @@ -158,6 +158,8 @@ static const char *const fibonacci_atom = "\x9" "fibonacci"; static const char *const call_atom = "\x5" "$call"; static const char *const cast_atom = "\x5" "$cast"; +static const char *const unicode_atom = "\x7" "unicode"; + void defaultatoms_init(GlobalContext *glb) { int ok = 1; @@ -300,6 +302,8 @@ void defaultatoms_init(GlobalContext *glb) ok &= globalcontext_insert_atom(glb, call_atom) == CALL_ATOM_INDEX; ok &= globalcontext_insert_atom(glb, cast_atom) == CAST_ATOM_INDEX; + ok &= globalcontext_insert_atom(glb, unicode_atom) == UNICODE_ATOM_INDEX; + if (!ok) { AVM_ABORT(); } diff --git a/src/libAtomVM/defaultatoms.h b/src/libAtomVM/defaultatoms.h index c8a7d41ac..df61dee84 100644 --- a/src/libAtomVM/defaultatoms.h +++ b/src/libAtomVM/defaultatoms.h @@ -167,7 +167,9 @@ extern "C" { #define CALL_ATOM_INDEX 108 #define CAST_ATOM_INDEX 109 -#define PLATFORM_ATOMS_BASE_INDEX 110 +#define UNICODE_ATOM_INDEX 110 + +#define PLATFORM_ATOMS_BASE_INDEX 111 #define FALSE_ATOM TERM_FROM_ATOM_INDEX(FALSE_ATOM_INDEX) #define TRUE_ATOM TERM_FROM_ATOM_INDEX(TRUE_ATOM_INDEX) @@ -309,6 +311,8 @@ extern "C" { #define CALL_ATOM TERM_FROM_ATOM_INDEX(CALL_ATOM_INDEX) #define CAST_ATOM TERM_FROM_ATOM_INDEX(CAST_ATOM_INDEX) +#define UNICODE_ATOM TERM_FROM_ATOM_INDEX(UNICODE_ATOM_INDEX) + void defaultatoms_init(GlobalContext *glb); void platform_defaultatoms_init(GlobalContext *glb); diff --git a/src/libAtomVM/interop.c b/src/libAtomVM/interop.c index 89e7011a9..cc3bfde6a 100644 --- a/src/libAtomVM/interop.c +++ b/src/libAtomVM/interop.c @@ -138,6 +138,63 @@ char *interop_list_to_string(term list, int *ok) return str; } +char *interop_list_to_utf8_string(term list, int *ok) +{ + size_t 
byte_len = 0; + + term t = list; + while (term_is_nonempty_list(t)) { + term head = term_get_list_head(t); + if (UNLIKELY(!term_is_integer(head))) { + *ok = 0; + return NULL; + } + avm_int_t codepoint = term_to_int(head); + if (UNLIKELY(codepoint < 0)) { + *ok = 0; + return NULL; + } else if (codepoint <= 127) { + byte_len++; + } else { + size_t codepoint_size; + bitstring_utf8_encode(codepoint, NULL, &codepoint_size); + byte_len += codepoint_size; + } + t = term_get_list_tail(t); + } + + if (!term_is_nil(t)) { + *ok = 0; + return NULL; + } + + uint8_t *str = malloc(byte_len + 1); + if (IS_NULL_PTR(str)) { + *ok = 0; + return NULL; + } + + t = list; + size_t i = 0; + while (i < byte_len) { + term codepoint_term = term_get_list_head(t); + size_t codepoint_size; + bool success = bitstring_utf8_encode(term_to_int(codepoint_term), &str[i], &codepoint_size); + if (UNLIKELY(!success)) { + free(str); + *ok = 0; + return NULL; + } + + t = term_get_list_tail(t); + i += codepoint_size; + } + str[byte_len] = 0; + + *ok = 1; + return (char *) str; +} + char *interop_atom_to_string(Context *ctx, term atom) { GlobalContext *glb = ctx->global; diff --git a/src/libAtomVM/interop.h b/src/libAtomVM/interop.h index 3a69fc1f5..6cfc7df4b 100644 --- a/src/libAtomVM/interop.h +++ b/src/libAtomVM/interop.h @@ -67,6 +67,7 @@ typedef void (*interop_chardata_rest_fun)(term t, void *accum); char *interop_term_to_string(term t, int *ok); char *interop_binary_to_string(term binary); +char *interop_list_to_utf8_string(term list, int *ok); char *interop_list_to_string(term list, int *ok); char *interop_iolist_to_string(term list, int *ok); char *interop_atom_to_string(Context *ctx, term atom); diff --git a/src/libAtomVM/nifs.c b/src/libAtomVM/nifs.c index 16a03202a..b589a008c 100644 --- a/src/libAtomVM/nifs.c +++ b/src/libAtomVM/nifs.c @@ -35,6 +35,7 @@ #include "avm_version.h" #include "avmpack.h" #include "bif.h" +#include "bitstring.h" #include "context.h" #include "defaultatoms.h" #include 
"dictionary.h" @@ -52,6 +53,7 @@ #include "synclist.h" #include "sys.h" #include "term.h" +#include "unicode.h" #include "utils.h" #define MAX_NIF_NAME_LEN 260 @@ -1940,14 +1942,10 @@ static term nif_erlang_binary_to_existing_atom_2(Context *ctx, int argc, term ar static term binary_to_atom(Context *ctx, int argc, term argv[], int create_new) { - UNUSED(argc); - term a_binary = argv[0]; VALIDATE_VALUE(a_binary, term_is_binary); - if (UNLIKELY(argv[1] != LATIN1_ATOM)) { - RAISE_ERROR(BADARG_ATOM); - } + term encoding = (argc == 2) ? argv[1] : UTF8_ATOM; const char *atom_string = term_binary_data(a_binary); size_t atom_string_len = term_binary_size(a_binary); @@ -1955,9 +1953,49 @@ static term binary_to_atom(Context *ctx, int argc, term argv[], int create_new) RAISE_ERROR(SYSTEM_LIMIT_ATOM); } - AtomString atom = malloc(atom_string_len + 1); - ((uint8_t *) atom)[0] = atom_string_len; - memcpy(((char *) atom) + 1, atom_string, atom_string_len); + bool encode_latin1_to_utf8 = false; + if (UNLIKELY((encoding == LATIN1_ATOM) + && !unicode_buf_is_ascii((const uint8_t *) atom_string, atom_string_len))) { + encode_latin1_to_utf8 = true; + } else if (UNLIKELY((encoding != LATIN1_ATOM) && (encoding != UNICODE_ATOM) + && (encoding != UTF8_ATOM))) { + RAISE_ERROR(BADARG_ATOM); + } + + AtomString atom; + if (!encode_latin1_to_utf8) { + size_t i = 0; + while (i < atom_string_len) { + uint32_t codepoint; + size_t codepoint_size; + if (UNLIKELY(bitstring_utf8_decode( + (uint8_t *) atom_string + i, atom_string_len - i, &codepoint, &codepoint_size) + != UnicodeTransformDecodeSuccess)) { + RAISE_ERROR(BADARG_ATOM); + } + i += codepoint_size; + } + + atom = malloc(atom_string_len + 1); + ((uint8_t *) atom)[0] = atom_string_len; + memcpy(((char *) atom) + 1, atom_string, atom_string_len); + } else { + size_t encoded_len = atom_string_len * 2; + if (encoded_len > 255) { + RAISE_ERROR(SYSTEM_LIMIT_ATOM); + } + atom = malloc(encoded_len + 1); + ((uint8_t *) atom)[0] = encoded_len; + uint8_t 
*atom_data = ((uint8_t *) atom) + 1; + for (size_t i = 0; i < atom_string_len; i++) { + size_t out_size; + bitstring_utf8_encode(((uint8_t) atom_string[i]), &atom_data[i * 2], &out_size); + if (UNLIKELY(out_size != 2)) { + // unreachable, but let's detect implementation bugs + abort(); + } + } + } enum AtomTableCopyOpt atom_opts = AtomTableCopyAtom; if (!create_new) { @@ -1991,7 +2029,7 @@ term list_to_atom(Context *ctx, int argc, term argv[], int create_new) VALIDATE_VALUE(a_list, term_is_list); int ok; - char *atom_string = interop_list_to_string(a_list, &ok); + char *atom_string = interop_list_to_utf8_string(a_list, &ok); if (UNLIKELY(!ok)) { RAISE_ERROR(OUT_OF_MEMORY_ATOM); } @@ -2031,9 +2069,7 @@ static term nif_erlang_atom_to_binary_2(Context *ctx, int argc, term argv[]) term atom_term = argv[0]; VALIDATE_VALUE(atom_term, term_is_atom); - if (UNLIKELY(argv[1] != LATIN1_ATOM)) { - RAISE_ERROR(BADARG_ATOM); - } + term encoding = argv[1]; GlobalContext *glb = ctx->global; @@ -2041,13 +2077,50 @@ static term nif_erlang_atom_to_binary_2(Context *ctx, int argc, term argv[]) size_t atom_len; atom_ref_t atom_ref = atom_table_get_atom_ptr_and_len(glb->atom_table, atom_index, &atom_len); + bool encode_to_latin1 = false; + if (encoding == LATIN1_ATOM) { + if (UNLIKELY(!atom_table_is_atom_ref_ascii(glb->atom_table, atom_ref))) { + encode_to_latin1 = true; + } + } else if (UNLIKELY(encoding != UTF8_ATOM) && (encoding != UNICODE_ATOM)) { + RAISE_ERROR(BADARG_ATOM); + } + if (UNLIKELY(memory_ensure_free_opt(ctx, term_binary_heap_size(atom_len), MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) { RAISE_ERROR(OUT_OF_MEMORY_ATOM); } - term binary = term_create_uninitialized_binary(atom_len, &ctx->heap, glb); - atom_table_write_bytes(glb->atom_table, atom_ref, atom_len, (char *) term_binary_data(binary)); - return binary; + if (!encode_to_latin1) { + term binary = term_create_uninitialized_binary(atom_len, &ctx->heap, glb); + atom_table_write_bytes( + glb->atom_table, atom_ref, atom_len, 
(char *) term_binary_data(binary)); + return binary; + } else { + char *utf8_tmp_buf = malloc(atom_len); + if (IS_NULL_PTR(utf8_tmp_buf)) { + RAISE_ERROR(OUT_OF_MEMORY_ATOM); + } + atom_table_write_bytes(glb->atom_table, atom_ref, atom_len, utf8_tmp_buf); + size_t encoded_len = atom_len / 2; + term binary = term_create_uninitialized_binary(encoded_len, &ctx->heap, glb); + char *binary_data = (char *) term_binary_data(binary); + size_t in_pos = 0; + for (size_t i = 0; i < encoded_len; i++) { + size_t codepoint_size; + uint32_t codepoint; + if (UNLIKELY(bitstring_utf8_decode( + (uint8_t *) &utf8_tmp_buf[i * 2], 2, &codepoint, &codepoint_size) + != UnicodeTransformDecodeSuccess + || (codepoint > 255))) { + free(utf8_tmp_buf); + RAISE_ERROR(BADARG_ATOM); + } + binary_data[i] = codepoint; + in_pos += codepoint_size; + } + free(utf8_tmp_buf); + return binary; + } } static term nif_erlang_atom_to_list_1(Context *ctx, int argc, term argv[]) @@ -2069,18 +2142,50 @@ static term nif_erlang_atom_to_list_1(Context *ctx, int argc, term argv[]) RAISE_ERROR(OUT_OF_MEMORY_ATOM); } - if (UNLIKELY(memory_ensure_free_opt(ctx, atom_len * 2, MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) { + atom_table_write_bytes(ctx->global->atom_table, atom_ref, atom_len, atom_buf); + + size_t u8len = unicode_buf_utf8_len((uint8_t *) atom_buf, atom_len); + bool latin1 = atom_len == u8len; + + size_t list_len = latin1 ? 
atom_len : u8len; + + if (UNLIKELY(memory_ensure_free_opt(ctx, list_len * 2, MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) { + free(atom_buf); RAISE_ERROR(OUT_OF_MEMORY_ATOM); } - atom_table_write_bytes(ctx->global->atom_table, atom_ref, atom_len, atom_buf); - term prev = term_nil(); - for (int i = atom_len - 1; i >= 0; i--) { - char c = atom_buf[i]; - prev = term_list_prepend(term_from_int11(c), prev, &ctx->heap); - } + if (latin1) { + for (int i = atom_len - 1; i >= 0; i--) { + char c = atom_buf[i]; + prev = term_list_prepend(term_from_int11(c), prev, &ctx->heap); + } + } else { + uint32_t *codepoints = malloc(u8len * sizeof(uint32_t)); + if (IS_NULL_PTR(codepoints)) { + free(atom_buf); + RAISE_ERROR(OUT_OF_MEMORY_ATOM); + } + uint8_t *u_in = (uint8_t *) atom_buf; + for (size_t i = 0; i < u8len; i++) { + size_t codepoint_size; + enum UnicodeTransformDecodeResult result + = bitstring_utf8_decode(u_in, atom_len, &codepoints[i], &codepoint_size); + if (UNLIKELY((result != UnicodeTransformDecodeSuccess) + || !unicode_is_valid_codepoint(codepoints[i]))) { + free(codepoints); + free(atom_buf); + RAISE_ERROR(BADARG_ATOM); + } + u_in += codepoint_size; + } + + for (int i = u8len - 1; i >= 0; i--) { + prev = term_list_prepend(term_from_int(codepoints[i]), prev, &ctx->heap); + } + free(codepoints); + } free(atom_buf); return prev; diff --git a/src/libAtomVM/nifs.gperf b/src/libAtomVM/nifs.gperf index 249ab1714..3101a3099 100644 --- a/src/libAtomVM/nifs.gperf +++ b/src/libAtomVM/nifs.gperf @@ -39,10 +39,12 @@ binary:split/2, &binary_split_nif calendar:system_time_to_universal_time/2, &system_time_to_universal_time_nif erlang:atom_to_binary/2, &atom_to_binary_nif erlang:atom_to_list/1, &atom_to_list_nif +erlang:binary_to_atom/1, &binary_to_atom_nif erlang:binary_to_atom/2, &binary_to_atom_nif erlang:binary_to_float/1, &binary_to_float_nif erlang:binary_to_integer/1, &binary_to_integer_nif erlang:binary_to_list/1, &binary_to_list_nif +erlang:binary_to_existing_atom/1, 
&binary_to_existing_atom_nif erlang:binary_to_existing_atom/2, &binary_to_existing_atom_nif erlang:delete_element/2, &delete_element_nif erlang:erase/1, &erase_nif diff --git a/src/libAtomVM/unicode.c b/src/libAtomVM/unicode.c new file mode 100644 index 000000000..dd3a2629f --- /dev/null +++ b/src/libAtomVM/unicode.c @@ -0,0 +1,83 @@ +/* + * This file is part of AtomVM. + * + * Copyright 2024 Davide Bettio + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: (Apache-2.0 OR LGPL-2.1-or-later) AND MIT + */ + +#include +#include + +#include "unicode.h" + +// Copyright (c) 2008-2009 Bjoern Hoehrmann +// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. 
+ +#define UTF8_ACCEPT 0 +#define UTF8_REJECT 1 + +static const uint8_t utf8d[] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df + 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef + 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff + 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 + 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 + 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 + 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 +}; + +uint32_t inline +decode(uint32_t* state, uint32_t* codep, uint32_t byte) { + uint32_t type = utf8d[byte]; + + *codep = (*state != UTF8_ACCEPT) ? 
+ (byte & 0x3fu) | (*codep << 6) : + (0xff >> type) & (byte); + + *state = utf8d[256 + *state*16 + type]; + return *state; +} + +size_t unicode_buf_utf8_len(const uint8_t *buf, size_t buf_len) +{ + size_t count = 0; + + for (size_t i = 0; i < buf_len; i++) { + if ((buf[i] & 0xC0) != 0x80) { + count++; + } + } + + return count; +} + +bool unicode_buf_is_ascii(const uint8_t *buf, size_t len) +{ + for (size_t i = 0; i < len; i++) { + if (buf[i] > 0x7F) { + return false; + } + } + + return true; +} diff --git a/src/libAtomVM/unicode.h b/src/libAtomVM/unicode.h new file mode 100644 index 000000000..087f13126 --- /dev/null +++ b/src/libAtomVM/unicode.h @@ -0,0 +1,46 @@ +/* + * This file is part of AtomVM. + * + * Copyright 2024 Davide Bettio + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later + */ + +#ifndef _UNICODE_H_ +#define _UNICODE_H_ + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +size_t unicode_buf_utf8_len(const uint8_t *buf, size_t buf_len); +bool unicode_buf_is_ascii(const uint8_t *buf, size_t buf_len); + +static inline bool unicode_is_valid_codepoint(uint32_t codepoint) +{ + // 0x110000 - 0x1FFFFF are not valid codepoints + // 0xD800 - 0xDFFF (both bounds included) are surrogates + return (codepoint < 0x110000) && !((codepoint >= 0xD800) && (codepoint <= 0xDFFF)); +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/tests/erlang_tests/CMakeLists.txt b/tests/erlang_tests/CMakeLists.txt index 0e1b9ea6f..61e9336ff 100644 --- a/tests/erlang_tests/CMakeLists.txt +++ b/tests/erlang_tests/CMakeLists.txt @@ -490,6 +490,7 @@ compile_erlang(test_crypto_strong_rand_bytes) compile_erlang(test_atomvm_random) compile_erlang(float_decode) +compile_erlang(test_utf8_atoms) add_custom_target(erlang_test_modules DEPENDS code_load_files @@ -945,4 +946,5 @@ add_custom_target(erlang_test_modules DEPENDS test_atomvm_random.beam float_decode.beam + test_utf8_atoms.beam ) diff --git a/tests/erlang_tests/test_utf8_atoms.erl b/tests/erlang_tests/test_utf8_atoms.erl new file mode 100644 index 000000000..120adae89 --- /dev/null +++ b/tests/erlang_tests/test_utf8_atoms.erl @@ -0,0 +1,232 @@ +% +% This file is part of AtomVM. +% +% Copyright 2024 Davide Bettio +% +% Licensed under the Apache License, Version 2.0 (the "License"); +% you may not use this file except in compliance with the License. +% You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+% See the License for the specific language governing permissions and +% limitations under the License. +% +% SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later +% + +-module(test_utf8_atoms). +-export([start/0, conv/3, get_atom/1, get_list/1, get_binary/1, conv2/3]). + +start() -> + 32767 - test_from_atom() + + 4095 - test_to_atom() + + 63 - test_missing_atom() + + 1 - test_latin1_to_utf8_atom() + + 7 - test_invalid_bins(). + +test_latin1_to_utf8_atom() -> + comp(?MODULE:conv2(binary, l1, ?MODULE:get_binary(l1s)), 'µÃ\230Ã¥'). + +test_from_atom() -> + test_to_list(l1) + + test_to_list(l1s) * 2 + + test_to_list(gr) * 4 + + test_to_list(jp) * 8 + + test_to_list(jp_mixed) * 16 + + test_to_l1bin(l1) * 32 + + test_to_l1bin(l1s_plain) * 64 + + test_to_l1bincatch(gr) * 128 + + test_to_l1bincatch(jp) * 256 + + test_to_l1bincatch(jp_mixed) * 512 + + test_to_u8bin(l1) * 1024 + + test_to_u8bin(l1s) * 2048 + + test_to_u8bin(gr) * 4096 + + test_to_u8bin(jp) * 8192 + + test_to_u8bin(jp_mixed) * 16384. + +test_to_atom() -> + test_from_list(l1) + + test_from_list(l1s) * 2 + + test_from_list(gr) * 4 + + test_from_list(jp) * 8 + + test_from_list(jp_mixed) * 16 + + test_from_l1bin(l1) * 32 + + test_from_l1bin(l1s_plain) * 64 + + test_from_u8bin(l1) * 128 + + test_from_u8bin(l1s) * 256 + + test_from_u8bin(gr) * 512 + + test_from_u8bin(jp) * 1024 + + test_from_u8bin(jp_mixed) * 2048. 
+ +test_missing_atom() -> + comp( + erlang:list_to_atom(get_list(l1s_missing)), + erlang:binary_to_atom(get_binary(l1s_missing), utf8) + ) + + comp( + erlang:list_to_atom(get_list(jp_mixed_missing)), + erlang:binary_to_atom(get_binary(jp_mixed_missing), utf8) + ) * 2 + + comp( + erlang:list_to_atom(get_list(l1s_missing)), + erlang:binary_to_atom(get_binary(l1s_missing), unicode) + ) * 4 + + comp( + erlang:list_to_atom(get_list(jp_mixed_missing)), + erlang:binary_to_atom(get_binary(jp_mixed_missing), unicode) + ) * 8 + + comp_opt( + fun() -> erlang:list_to_atom(get_list(l1s_missing)) end, + fun() -> erlang:binary_to_atom(get_binary(l1s_missing)) end + ) * 16 + + comp_opt( + fun() -> erlang:list_to_atom(get_list(jp_mixed_missing)) end, + fun() -> erlang:binary_to_atom(get_binary(jp_mixed_missing)) end + ) * 32. + +test_invalid_bins() -> + test_from_u8bincatch(invalid1) + + test_from_u8bincatch(invalid2) * 2 + + test_from_u8bincatch(invalid3) * 4. + +test_to_list(Id) -> + case ?MODULE:conv(list, x, ?MODULE:get_atom(Id)) == ?MODULE:get_list(Id) of + true -> + 1; + false -> + erlang:display({list, Id}), + 0 + end. + +test_to_l1bin(Id) -> + case ?MODULE:conv(binary, l1, ?MODULE:get_atom(Id)) == ?MODULE:get_binary(Id) of + true -> + 1; + false -> + erlang:display({l1bin, Id}), + 0 + end. + +test_to_l1bincatch(Id) -> + try ?MODULE:conv(binary, l1, ?MODULE:get_atom(Id)) of + _X -> + erlang:display({err, Id}), + 0 + catch + error:badarg -> + 1 + end. + +test_to_u8bin(Id) -> + case ?MODULE:conv(binary, u8, ?MODULE:get_atom(Id)) == ?MODULE:get_binary(Id) of + true -> + 1; + false -> + erlang:display({u8bin, Id}), + 0 + end. + +test_from_list(Id) -> + case ?MODULE:conv2(list, x, ?MODULE:get_list(Id)) == ?MODULE:get_atom(Id) of + true -> + 1; + false -> + erlang:display({flist, Id}), + 0 + end. + +test_from_l1bin(Id) -> + case ?MODULE:conv2(binary, l1, ?MODULE:get_binary(Id)) == ?MODULE:get_atom(Id) of + true -> + 1; + false -> + erlang:display({fl1bin, Id}), + 0 + end. 
+ +test_from_u8bin(Id) -> + case ?MODULE:conv2(binary, u8, ?MODULE:get_binary(Id)) == ?MODULE:get_atom(Id) of + true -> + 1; + false -> + erlang:display({fu8bin, Id}), + 0 + end. + +test_from_u8bincatch(Id) -> + try ?MODULE:conv2(binary, u8, ?MODULE:get_binary(Id)) of + _X -> + erlang:display({u8err, Id}), + 0 + catch + error:badarg -> + 1 + end. + +conv(list, _Fmt, Atom) -> + erlang:atom_to_list(Atom); +conv(binary, l1, Atom) -> + erlang:atom_to_binary(Atom, latin1); +conv(binary, u8, Atom) -> + erlang:atom_to_binary(Atom, utf8). + +conv2(list, _Fmt, S) -> + erlang:list_to_atom(S); +conv2(binary, l1, S) -> + erlang:binary_to_atom(S, latin1); +conv2(binary, u8, S) -> + erlang:binary_to_atom(S, utf8). + +comp(A, A) -> 1; +comp(_A, _B) -> 0. + +comp_opt(Fun1, Fun2) -> + case erlang:system_info(machine) of + "BEAM" -> + case erlang:list_to_integer(erlang:system_info(otp_release)) of + Version when Version >= 23 -> comp(Fun1(), Fun2()); + _OldVersion -> 1 + end; + _ -> + comp(Fun1(), Fun2()) + end. + +get_atom(Id) -> + case Id of + l1 -> 'abcd'; + l1s -> 'µØå'; + l1s_plain -> 'µØå'; + gr -> 'ΓΔ'; + jp -> 'アーラン'; + jp_mixed -> 'latin1じゃない' + end. + +get_list(Id) -> + case Id of + l1 -> "abcd"; + l1s -> "µØå"; + l1s_missing -> "µ_å"; + gr -> "ΓΔ"; + jp -> "アーラン"; + jp_mixed -> "latin1じゃない"; + jp_mixed_missing -> "latin1_じゃない" + end. + +get_binary(Id) -> + case Id of + l1 -> <<"abcd"/utf8>>; + l1s_plain -> <<"µØå">>; + l1s -> <<"µØå"/utf8>>; + l1s_missing -> <<"µ_å"/utf8>>; + gr -> <<"ΓΔ"/utf8>>; + jp -> <<"アーラン"/utf8>>; + jp_mixed -> <<"latin1じゃない"/utf8>>; + jp_mixed_missing -> <<"latin1_じゃない"/utf8>>; + invalid1 -> <<230>>; + invalid2 -> <<16#f0, 16#90, 16#28, 16#bc>>; + invalid3 -> <<16#fc, 16#a1, 16#a1, 16#a1, 16#a1, 16#a1>> + end. 
diff --git a/tests/test.c b/tests/test.c index 71ca55c38..aecf2ac76 100644 --- a/tests/test.c +++ b/tests/test.c @@ -520,6 +520,7 @@ struct Test tests[] = { #endif TEST_CASE(float_decode), + TEST_CASE(test_utf8_atoms), // TEST CRASHES HERE: TEST_CASE(memlimit), { NULL, 0, false, false }