Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP Optimized utf8 #1051

Open
wants to merge 4 commits into
base: release-0.6
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [0.6.0-beta.1] - Unreleased

### Added

- Support for utf8 encoding in `*_to_atom` and `atom_to_*` functions
- `binary_to_atom/1` and `atom_to_binary/1`, which default to utf8 (they were introduced in OTP 23)

### Fixed

- ESP32: fix `i2c_driver_acquire` and `i2c_driver_release` functions, which worked only once.

### Changed

- `binary_to_atom/2` validates utf8 strings
- `*_to_atom` and `atom_to_*` properly convert latin1 (not just ASCII) to utf8 and vice versa

## [0.6.0-beta.0] - 2024-02-08

### Added
Expand Down
21 changes: 21 additions & 0 deletions LICENSES/MIT.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) [year] [fullname]

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
34 changes: 29 additions & 5 deletions libs/estdlib/src/erlang.erl
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,11 @@
list_to_integer/1,
list_to_tuple/1,
iolist_to_binary/1,
binary_to_atom/1,
binary_to_atom/2,
binary_to_integer/1,
binary_to_list/1,
atom_to_binary/1,
atom_to_binary/2,
atom_to_list/1,
float_to_binary/1,
Expand Down Expand Up @@ -117,6 +119,8 @@
%% * review API documentation for timer functions in this module
%%

%% Encodings accepted by binary_to_atom/2 and atom_to_binary/2.
-type atom_encoding() :: latin1 | utf8 | unicode.

-type mem_type() :: binary.
-type time_unit() :: second | millisecond | microsecond.
-type timestamp() :: {
Expand Down Expand Up @@ -582,13 +586,22 @@ iolist_to_binary(_IOList) ->

%%-----------------------------------------------------------------------------
%% @param Binary Binary to convert to atom
%% @returns an atom from passed binary
%% @doc Convert a binary to atom, defaults to utf8.
%% This arity takes no encoding argument.
%% @end
%%-----------------------------------------------------------------------------
-spec binary_to_atom(Binary :: binary()) -> atom().
binary_to_atom(_Binary) ->
erlang:nif_error(undefined).

%%-----------------------------------------------------------------------------
%% @param Binary Binary to convert to atom
%% @param Encoding encoding for conversion (any of latin1, utf8 or unicode)
%% @returns an atom from passed binary
%% @doc Convert a binary to atom using the given encoding.
%% @end
%%-----------------------------------------------------------------------------
-spec binary_to_atom(Binary :: binary(), Encoding :: atom_encoding()) -> atom().
binary_to_atom(_Binary, _Encoding) ->
    erlang:nif_error(undefined).

Expand All @@ -614,13 +627,24 @@ binary_to_list(_Binary) ->

%%-----------------------------------------------------------------------------
%% @param Atom Atom to convert
%% @returns a binary with the atom's name
%% @doc Convert an atom to a binary, defaults to utf8.
%% This arity takes no encoding argument.
%% @end
%%-----------------------------------------------------------------------------
-spec atom_to_binary(Atom :: atom()) -> binary().
atom_to_binary(_Atom) ->
erlang:nif_error(undefined).

%%-----------------------------------------------------------------------------
%% @param Atom Atom to convert
%% @param Encoding Encoding for conversion (any of latin1, utf8 or unicode)
%% @returns a binary with the atom's name
%% @doc Convert an atom to a binary using the given encoding.
%% @end
%%-----------------------------------------------------------------------------
-spec atom_to_binary(Atom :: atom(), Encoding :: atom_encoding()) -> binary().
atom_to_binary(_Atom, _Encoding) ->
    erlang:nif_error(undefined).

Expand Down
2 changes: 2 additions & 0 deletions src/libAtomVM/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ set(HEADER_FILES
term.h
timer_list.h
trace.h
unicode.h
utils.h
valueshashtable.h
${CMAKE_CURRENT_BINARY_DIR}/avm_version.h
Expand Down Expand Up @@ -94,6 +95,7 @@ set(SOURCE_FILES
stacktrace.c
term.c
timer_list.c
unicode.c
valueshashtable.c
)

Expand Down
15 changes: 15 additions & 0 deletions src/libAtomVM/atom_table.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

#include "atom.h"
#include "smp.h"
#include "unicode.h"
#include "utils.h"

#ifndef AVM_NO_SMP
Expand Down Expand Up @@ -279,6 +280,20 @@ atom_ref_t atom_table_get_atom_ptr_and_len(struct AtomTable *table, long index,
return node;
}

/**
 * Tell whether the name of the given atom passes unicode_buf_is_ascii.
 *
 * The atom reference is treated as a pointer to the table's hash node; the
 * node key is read between SMP_RDLOCK and SMP_UNLOCK on the table.
 *
 * @param table atom table the reference belongs to
 * @param atom  atom reference to inspect
 * @return the result of unicode_buf_is_ascii over the atom's string data
 */
bool atom_table_is_atom_ref_ascii(struct AtomTable *table, atom_ref_t atom)
{
    bool is_ascii;

    SMP_RDLOCK(table);
    {
        struct HNode *entry = (struct HNode *) atom;
        const uint8_t *name_bytes = atom_string_data(entry->key);
        size_t name_len = atom_string_len(entry->key);

        is_ascii = unicode_buf_is_ascii(name_bytes, name_len);
    }
    SMP_UNLOCK(table);

    return is_ascii;
}

void atom_table_write_bytes(struct AtomTable *table, atom_ref_t atom, size_t buf_len, void *outbuf)
{
SMP_RDLOCK(table);
Expand Down
3 changes: 3 additions & 0 deletions src/libAtomVM/atom_table.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
#ifndef _ATOM_TABLE_
#define _ATOM_TABLE_

#include <stdbool.h>

#include "atom.h"

#define ATOM_TABLE_NOT_FOUND -1
Expand Down Expand Up @@ -56,6 +58,7 @@ int atom_table_ensure_atoms(
int atom_table_cmp_using_atom_index(
struct AtomTable *table, int t_atom_index, int other_atom_index);
atom_ref_t atom_table_get_atom_ptr_and_len(struct AtomTable *table, long index, size_t *out_len);
bool atom_table_is_atom_ref_ascii(struct AtomTable *table, atom_ref_t atom);
void atom_table_write_bytes(struct AtomTable *table, atom_ref_t atom, size_t buf_len, void *outbuf);
void atom_table_write_cstring(
struct AtomTable *table, atom_ref_t atom, size_t buf_len, char *outbuf);
Expand Down
4 changes: 4 additions & 0 deletions src/libAtomVM/defaultatoms.c
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,8 @@ static const char *const fibonacci_atom = "\x9" "fibonacci";
static const char *const call_atom = "\x5" "$call";
static const char *const cast_atom = "\x5" "$cast";

static const char *const unicode_atom = "\x7" "unicode";

void defaultatoms_init(GlobalContext *glb)
{
int ok = 1;
Expand Down Expand Up @@ -300,6 +302,8 @@ void defaultatoms_init(GlobalContext *glb)
ok &= globalcontext_insert_atom(glb, call_atom) == CALL_ATOM_INDEX;
ok &= globalcontext_insert_atom(glb, cast_atom) == CAST_ATOM_INDEX;

ok &= globalcontext_insert_atom(glb, unicode_atom) == UNICODE_ATOM_INDEX;

if (!ok) {
AVM_ABORT();
}
Expand Down
6 changes: 5 additions & 1 deletion src/libAtomVM/defaultatoms.h
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,9 @@ extern "C" {
#define CALL_ATOM_INDEX 108
#define CAST_ATOM_INDEX 109

#define UNICODE_ATOM_INDEX 110

/* NOTE(review): presumably the first index handed to platform-specific atoms,
 * so it has to stay one past the last default atom index above - confirm. */
#define PLATFORM_ATOMS_BASE_INDEX 111

#define FALSE_ATOM TERM_FROM_ATOM_INDEX(FALSE_ATOM_INDEX)
#define TRUE_ATOM TERM_FROM_ATOM_INDEX(TRUE_ATOM_INDEX)
Expand Down Expand Up @@ -309,6 +311,8 @@ extern "C" {
#define CALL_ATOM TERM_FROM_ATOM_INDEX(CALL_ATOM_INDEX)
#define CAST_ATOM TERM_FROM_ATOM_INDEX(CAST_ATOM_INDEX)

#define UNICODE_ATOM TERM_FROM_ATOM_INDEX(UNICODE_ATOM_INDEX)

void defaultatoms_init(GlobalContext *glb);

void platform_defaultatoms_init(GlobalContext *glb);
Expand Down
57 changes: 57 additions & 0 deletions src/libAtomVM/interop.c
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,63 @@ char *interop_list_to_string(term list, int *ok)
return str;
}

/**
 * Convert a list of Unicode code points into a newly allocated,
 * NUL-terminated UTF-8 string.
 *
 * The list is walked twice: the first pass validates every element and sums
 * the encoded size, the second pass writes the encoded bytes.
 *
 * @param list term to convert; expected to be a proper list of integers
 * @param ok   set to 1 on success and to 0 on any failure (non-integer
 *             element, negative or non-encodable code point, improper list,
 *             allocation failure)
 * @return a buffer obtained from malloc that the caller must free, or NULL on
 *         failure
 */
char *interop_list_to_utf8_string(term list, int *ok)
{
    size_t byte_len = 0;

    term t = list;
    while (term_is_nonempty_list(t)) {
        term head = term_get_list_head(t);
        if (UNLIKELY(!term_is_integer(head))) {
            *ok = 0;
            return NULL;
        }
        avm_int_t codepoint = term_to_int(head);
        // Reject anything outside the Unicode range up front, so that an
        // oversized value can never be narrowed into a valid code point if the
        // encoder takes a smaller integer type.
        if (UNLIKELY(codepoint < 0 || codepoint > 0x10FFFF)) {
            *ok = 0;
            return NULL;
        } else if (codepoint <= 127) {
            byte_len++;
        } else {
            size_t codepoint_size;
            // The encoder reports failure for code points it cannot encode;
            // codepoint_size must not be read in that case.
            if (UNLIKELY(!bitstring_utf8_encode(codepoint, NULL, &codepoint_size))) {
                *ok = 0;
                return NULL;
            }
            byte_len += codepoint_size;
        }
        t = term_get_list_tail(t);
    }

    // Anything other than nil here means the list was improper.
    if (!term_is_nil(t)) {
        *ok = 0;
        return NULL;
    }

    uint8_t *str = malloc(byte_len + 1);
    if (IS_NULL_PTR(str)) {
        *ok = 0;
        return NULL;
    }

    t = list;
    size_t i = 0;
    // byte_len was computed from this same list, so consuming byte_len bytes
    // consumes exactly its elements.
    while (i < byte_len) {
        term codepoint_term = term_get_list_head(t);
        size_t codepoint_size;
        bool success = bitstring_utf8_encode(term_to_int(codepoint_term), &str[i], &codepoint_size);
        if (UNLIKELY(!success)) {
            free(str);
            *ok = 0;
            return NULL;
        }

        t = term_get_list_tail(t);
        i += codepoint_size;
    }
    str[byte_len] = 0;

    *ok = 1;
    return (char *) str;
}

char *interop_atom_to_string(Context *ctx, term atom)
{
GlobalContext *glb = ctx->global;
Expand Down
1 change: 1 addition & 0 deletions src/libAtomVM/interop.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ typedef void (*interop_chardata_rest_fun)(term t, void *accum);

char *interop_term_to_string(term t, int *ok);
char *interop_binary_to_string(term binary);
char *interop_list_to_utf8_string(term list, int *ok);
char *interop_list_to_string(term list, int *ok);
char *interop_iolist_to_string(term list, int *ok);
char *interop_atom_to_string(Context *ctx, term atom);
Expand Down
Loading
Loading