Skip to content

Commit

Permalink
utf8 validation
Browse files Browse the repository at this point in the history
  • Loading branch information
martinfouilleul committed Oct 4, 2024
1 parent ed100e3 commit 62d64e6
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 30 deletions.
78 changes: 48 additions & 30 deletions src/util/utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,6 @@ static const char trailingBytesForUTF8[256] = {
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};

#define oc_utf8_is_start_byte(c) (((c)&0xc0) != 0x80)

//-----------------------------------------------------------------
//NOTE: getting sizes / offsets / indices
//-----------------------------------------------------------------
Expand Down Expand Up @@ -129,65 +127,85 @@ oc_utf8_dec oc_utf8_decode_at(oc_str8 string, u64 offset)
//NOTE(martin): get the first codepoint in str, and advance index to the
// next oc_utf8 character
//TODO(martin): check for utf-16 surrogate pairs
oc_utf32 cp = 0;
u64 sz = 0;
oc_utf8_dec res = { .status = OC_UTF8_OK };

if(offset >= string.len || !string.ptr[offset])
if(offset >= string.len)
{
cp = 0;
sz = 1;
res.status = OC_UTF8_OUT_OF_BOUNDS;
res.size = 1;
}
else if(!oc_utf8_is_start_byte(string.ptr[offset]))
{
//NOTE(martin): unexpected continuation or invalid character.
cp = 0xfffd;
sz = 1;
if((string.ptr[offset] & 0xc0) == 0x80)
{
res.status = OC_UTF8_UNEXPECTED_CONTINUATION_BYTE;
}
else
{
res.status = OC_UTF8_INVALID_BYTE;
}

res.codepoint = 0xfffd;
res.size = 1;
}
else
{
int expectedSize = oc_utf8_size_from_leading_char(string.ptr[offset]);
do
{
if(offset >= string.len)
{
res.status = OC_UTF8_OUT_OF_BOUNDS;
break;
}
/*NOTE(martin):
we shift 6 bits and add the next byte at each round.
at the end we have our oc_utf8 codepoint, added to the shifted versions
of the oc_utf8 leading bits for each encoded byte. These values are
precomputed in offsetsFromUTF8.
*/
unsigned char b = string.ptr[offset];
cp <<= 6;
cp += b;
res.codepoint <<= 6;
res.codepoint += b;
offset += 1;
sz++;
res.size++;

if(b == 0xc0 || b == 0xc1 || b >= 0xc5)
if(b == 0xc0 || b == 0xc1 || b >= 0xf5)
{
//NOTE(martin): invalid byte encountered
res.status = OC_UTF8_INVALID_BYTE;
break;
}
if(res.size > 1 && oc_utf8_is_start_byte(b))
{
res.status = OC_UTF8_UNEXPECTED_LEADING_BYTE;
break;
}
}
while(offset < string.len
&& string.ptr[offset]
&& !oc_utf8_is_start_byte(string.ptr[offset])
&& sz < expectedSize);
while(res.size < expectedSize);

if(sz != expectedSize)
{
//NOTE(martin): if we encountered an error, we return the replacement codepoint U+FFFD
cp = 0xfffd;
}
else
if(res.status == OC_UTF8_OK)
{
cp -= offsetsFromUTF8[sz - 1];
res.codepoint -= offsetsFromUTF8[res.size - 1];

//NOTE(martin): check for invalid codepoints
if(cp > 0x10ffff || (cp >= 0xd800 && cp <= 0xdfff))
if((res.size == 3 && res.codepoint < 0x800) || (res.size == 4 && res.codepoint < 0x10000))
{
cp = 0xfffd;
res.status = OC_UTF8_OVERLONG_ENCODING;
}
else if(res.codepoint > 0x10ffff || (res.codepoint >= 0xd800 && res.codepoint <= 0xdfff))
{
res.status = OC_UTF8_INVALID_CODEPOINT;
}
}

if(res.status != OC_UTF8_OK)
{
//NOTE(martin): if we encountered an error, we return the replacement codepoint U+FFFD
res.codepoint = 0xfffd;
}
}
oc_utf8_dec res = { .codepoint = cp, .size = sz };
return (res);
}

Expand Down Expand Up @@ -225,7 +243,7 @@ oc_str8 oc_utf8_encode(char* dest, oc_utf32 codePoint)
dest[3] = (codePoint & 0x3F) | 0x80;
sz = 4;
}
oc_str8 res = {.ptr = dest , .len = sz};
oc_str8 res = { .ptr = dest, .len = sz };
return (res);
}

Expand All @@ -239,7 +257,7 @@ oc_str32 oc_utf8_to_codepoints(u64 maxCount, oc_utf32* backing, oc_str8 string)
backing[codePointIndex] = decode.codepoint;
byteOffset += decode.size;
}
oc_str32 res = {.ptr = backing , .len = codePointIndex};
oc_str32 res = { .ptr = backing, .len = codePointIndex };
return (res);
}

Expand All @@ -257,7 +275,7 @@ oc_str8 oc_utf8_from_codepoints(u64 maxBytes, char* backing, oc_str32 codePoints
oc_utf8_encode(backing + byteOffset, codePoint);
byteOffset += byteCount;
}
oc_str8 res = {.ptr = backing , .len = byteOffset};
oc_str8 res = { .ptr = backing, .len = byteOffset };
return (res);
}

Expand Down
15 changes: 15 additions & 0 deletions src/util/utf8.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ typedef u32 oc_utf32;
//-----------------------------------------------------------------
//NOTE: getting sizes / offsets / indices
//-----------------------------------------------------------------
//NOTE: evaluates to non-zero when the top two bits of c are not 10, i.e. when c is
//      not a utf8 continuation byte (0x80..0xbf). This is a bit-pattern test only: it is
//      also true for ascii bytes and for any other byte outside the continuation range.
#define oc_utf8_is_start_byte(c) (((c)&0xc0) != 0x80)

ORCA_API u32 oc_utf8_size_from_leading_char(char leadingChar);
ORCA_API u32 oc_utf8_codepoint_size(oc_utf32 codePoint);

Expand All @@ -33,8 +35,21 @@ ORCA_API u64 oc_utf8_prev_offset(oc_str8 string, u64 byteOffset);
//-----------------------------------------------------------------
//NOTE: encoding / decoding
//-----------------------------------------------------------------

//NOTE: status of a utf8 decoding operation, stored in oc_utf8_dec.status.
//      The per-value notes describe how oc_utf8_decode_at() sets them.
typedef enum oc_utf8_status
{
OC_UTF8_OK, //NOTE: the sequence was decoded without error
OC_UTF8_OUT_OF_BOUNDS, //NOTE: the offset reached the end of the string before the sequence was complete
OC_UTF8_UNEXPECTED_CONTINUATION_BYTE, //NOTE: a continuation byte (10xxxxxx) was found where a sequence should start
OC_UTF8_UNEXPECTED_LEADING_BYTE, //NOTE: a non-continuation byte was found inside a multi-byte sequence
OC_UTF8_INVALID_BYTE, //NOTE: a byte the decoder rejects as never valid in utf8 (e.g. 0xc0, 0xc1)
OC_UTF8_INVALID_CODEPOINT, //NOTE: decoded value is above 0x10ffff or in the surrogate range 0xd800..0xdfff
OC_UTF8_OVERLONG_ENCODING, //NOTE: the codepoint was encoded with more bytes than it requires
} oc_utf8_status;

//NOTE: result of decoding one utf8 sequence, as returned by oc_utf8_decode_at()
typedef struct oc_utf8_dec
{
oc_utf8_status status; //NOTE: OC_UTF8_OK on success, otherwise the reason decoding failed
oc_utf32 codepoint; //NOTE: decoded codepoint (the decoder stores the replacement codepoint U+FFFD for malformed sequences)
u32 size; //NOTE: size of corresponding oc_utf8 sequence
} oc_utf8_dec;
Expand Down

0 comments on commit 62d64e6

Please sign in to comment.