Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve handling of Unicode vs Byte strings #828

Merged
merged 33 commits into from
Sep 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
dd6d275
Always parse \xHH as U+00HH, then decode when needed for Bytes domain…
Scott-Guest Jul 25, 2023
5b1934f
Merge remote-tracking branch 'origin/master' into unicode-strings
Scott-Guest Jul 25, 2023
588d624
Fix bytesStringPatternToBytes -> kllvm::bytesStringPatternToBytes
Scott-Guest Jul 25, 2023
a85c868
bytesStringPatternToBytes(): Add dummy assignment to silence uninitia…
Scott-Guest Jul 25, 2023
adc4d4b
Refactor bytesStringPatternToBytes
Scott-Guest Jul 31, 2023
df8bdc6
Merge remote-tracking branch 'origin/master' into unicode-strings
Scott-Guest Aug 14, 2023
77c77ee
Add a bit to the string representation to indicate whether it is a by…
Scott-Guest Aug 15, 2023
a3c88e7
Add dummy return to silence warning
Scott-Guest Aug 15, 2023
6e31d8e
Set IS_BYTES bit in bytes2string and string2bytes
Scott-Guest Aug 15, 2023
acfae41
Rename missed usage of bytes2string to allocStringCopy
Scott-Guest Aug 15, 2023
cf08f4f
escapeString: Correct lengths passed to snprintf
Scott-Guest Aug 15, 2023
d64ef7a
emitGetToken: Fix type CurrentBlock -> CaseBlock for Bytes case
Scott-Guest Aug 16, 2023
26e4722
Make bytesStringPatternToBytes extern C
Scott-Guest Aug 16, 2023
5dbcb97
Refactor KOREScanner UTF-8 conversion to use UTF8EncodingType
Scott-Guest Aug 17, 2023
0670ca0
Convert Bytes back to UTF-8 encoded version when serializing
Scott-Guest Aug 17, 2023
f53aea9
sfprintf: Correctly va_copy and va_end to avoid undefined behavior
Scott-Guest Aug 18, 2023
f1b34e3
Convert to UTF-8 encoded representation of Bytes when parsing decisio…
Scott-Guest Aug 18, 2023
11270b0
Update test-unicode's output to use a Unicode escape
Scott-Guest Aug 18, 2023
463cb95
Create Bytes SortCategory
Scott-Guest Aug 23, 2023
0165962
Added test-unicode-strings
Scott-Guest Aug 23, 2023
14e71e1
Merge branch 'master' into unicode-strings
Scott-Guest Aug 23, 2023
86e1e0c
Merge remote-tracking branch 'origin/master' into unicode-strings
Scott-Guest Sep 5, 2023
a27d3f2
Add BYTES_LAYOUT
Scott-Guest Sep 6, 2023
92df03a
Add missing calls to set_is_bytes in hooks
Scott-Guest Sep 6, 2023
88ee65d
Set is_bytes bit in hook_BYTES_empty
Scott-Guest Sep 7, 2023
650a5a5
Set is_bytes to false in STRING hooks which delegate to BYTES
Scott-Guest Sep 7, 2023
c78ce03
Correct ! to ~ in set_is_bytes
Scott-Guest Sep 7, 2023
c9a4b1a
Update set_len to not unintentionally clear is_bytes bit
Scott-Guest Sep 7, 2023
11749cd
Fix set_len to also clear original NOT_YOUNG_OBJECT_BIT
Scott-Guest Sep 7, 2023
591f80b
Restore set_len behavior, rename to init_with_len, move all calls to …
Scott-Guest Sep 8, 2023
3decc94
Fix json.cpp formatting, caused by bug in clang-format-14
Scott-Guest Sep 8, 2023
44cf5fc
Reorder SortCategory so that MInt is last
Scott-Guest Sep 9, 2023
06cf769
Merge branch 'master' into unicode-strings
Scott-Guest Sep 26, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion bindings/python/ast.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,8 @@ void bind_ast(py::module_ &m) {
.value("Bool", SortCategory::Bool)
.value("Symbol", SortCategory::Symbol)
.value("Variable", SortCategory::Variable)
.value("MInt", SortCategory::MInt);
.value("MInt", SortCategory::MInt)
.value("Bytes", SortCategory::Bytes);

py::class_<ValueType>(ast, "ValueType")
.def(py::init([](SortCategory cat) {
Expand Down
4 changes: 3 additions & 1 deletion cmake/RuntimeConfig.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ set(VARIABLE_BIT 0x8000000000000)
set(LAYOUT_OFFSET 54)
set(TAG_MASK 0xffffffff)
set(LENGTH_MASK 0xffffffffff)
set(IS_BYTES_BIT 0x10000000000)

if(CMAKE_BUILD_TYPE STREQUAL "GcStats")
set(HDR_MASK -18013298997854209) # 0xffc000ffffffffff
Scott-Guest marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -34,7 +35,8 @@ set(STRINGBUFFER_LAYOUT 6)
set(BOOL_LAYOUT 7)
set(SYMBOL_LAYOUT 8)
set(VARIABLE_LAYOUT 9)
set(RANGEMAP_LAYOUT 11)
set(RANGEMAP_LAYOUT 10)
set(BYTES_LAYOUT 11)

get_filename_component(INSTALL_DIR_ABS_PATH "${CMAKE_INSTALL_PREFIX}"
REALPATH BASE_DIR "${CMAKE_BINARY_DIR}")
Expand Down
2 changes: 2 additions & 0 deletions config/macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#define HDR_MASK @HDR_MASK@
#define TAG_MASK @TAG_MASK@LL
#define LENGTH_MASK @LENGTH_MASK@
#define IS_BYTES_BIT @IS_BYTES_BIT@

#define MAP_LAYOUT @MAP_LAYOUT@
#define LIST_LAYOUT @LIST_LAYOUT@
Expand All @@ -22,6 +23,7 @@
#define SYMBOL_LAYOUT @SYMBOL_LAYOUT@
#define VARIABLE_LAYOUT @VARIABLE_LAYOUT@
#define RANGEMAP_LAYOUT @RANGEMAP_LAYOUT@
#define BYTES_LAYOUT @BYTES_LAYOUT@

#define STRINGIFY(x) #x
#define TOSTRING(X) STRINGIFY(X)
Expand Down
17 changes: 13 additions & 4 deletions debug/kgdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -561,9 +561,14 @@ def append(self, subject, isVar, sort):
if not layout:
string = subject.cast(self.string_ptr)
length = hdr & @LENGTH_MASK@
string_bytes = bytes(int(string.dereference()['data'][i].cast(self.unsigned_char)) for i in range(length))
if hdr & @IS_BYTES_BIT@:
py_str = string_bytes.decode('iso-8859-1')
else:
py_str = string_bytes.decode('utf-8')

self.result += "\\dv{" + sort + "}(\""
for i in range(length):
c = chr(int(string.dereference()['data'][i].cast(self.unsigned_char)))
for c in py_str:
if c == '\\':
self.result += "\\\\"
elif c == '"':
Expand All @@ -576,10 +581,14 @@ def append(self, subject, isVar, sort):
self.result += "\\r"
elif c == '\f':
self.result += "\\f"
elif ord(c) >= 32 and ord(c) < 127:
elif 32 <= ord(c) and ord(c) < 127:
self.result += c
elif ord(c) <= 0xFF:
self.result += "\\x{:02x}".format(ord(c))
elif ord(c) <= 0xFFFF:
self.result += "\\u{:04x}".format(ord(c))
else:
self.result += "{:02x}".format(ord(c))
self.result += "\\U{:08x}".format(ord(c))
var = Variable(string)
stdStr = var.stdStr
if isVar and not var in self.var_names:
Expand Down
35 changes: 33 additions & 2 deletions include/kllvm/ast/AST.h
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,10 @@ enum class SortCategory {
Bool,
Symbol,
Variable,
MInt,
RangeMap
RangeMap,
Bytes,
// WARNING: MInt must be the last value, so that valueType.cat + valueType.bits can uniquely identify the ValueType
MInt
};

// represents the syntactic category of an LLVM backend term at runtime
Expand Down Expand Up @@ -646,6 +648,35 @@ class KOREStringPattern : public KOREPattern {
: contents(Contents) { }
};

// Convert a Unicode codepoint to a UTF-8 encoded string containing that codepoint
std::string codepointToUTF8(uint32_t codepoint);

// Return a representation of str with all special characters replaced by their
// escape sequences.
//
// The provided StringType indicates whether to treat the string as a sequence of bytes
// or as a UTF-8 encoded Unicode string.
//
// For example, U+1F601 (😁) is UTF-8 encoded as the byte sequence 0xF0 0x9F 0x98 0x81, so
// - escapeString("😁", StringType::UTF8) returns "\U0001f601"
// - escapeString("😁", StringType::BYTES) returns "\xf0\x9f\x98\x81"
//
enum class StringType { BYTES, UTF8 };
std::string escapeString(const std::string &str, StringType strType);

// A Bytes domain value is represented as a KOREStringPattern by storing each
// byte 0xHH as the (UTF-8 encoded) Unicode codepoint U+00HH.
//
// Given the contents of such a KOREStringPattern, this function produces the
// actual char[] for the Bytes domain value. In effect, this is just a conversion
// from UTF-8 to latin-1.
//
// The input buffer is overwritten, and the new length is returned.
extern "C" size_t bytesStringPatternToBytes(char *contents, size_t length);

// Given a char[] of bytes, return its representation as the contents of a KOREStringPattern.
std::string bytesToBytesStringPattern(const char *bytes, size_t length);

// KOREDeclaration
class KOREDeclaration {
protected:
Expand Down
2 changes: 1 addition & 1 deletion include/kllvm/parser/KOREScanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ class KOREScanner {
return yylex(lval, loc, scanner);
}
void error(const location &loc, const std::string &err_message);
std::string codepoint_to_utf8(unsigned long int code, const location &loc);
std::string codepointToUTF8(const char *codepointStr, const location &loc);

FILE *in;
std::string stringBuffer;
Expand Down
23 changes: 21 additions & 2 deletions include/runtime/header.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,25 @@ bool hash_enter(void);
void hash_exit(void);
}

// Reports whether the IS_BYTES_BIT flag is set in the raw header word `hdr`.
// Returns true when the bit is set and false otherwise.
__attribute__((always_inline)) constexpr bool is_bytes_hdr(uint64_t hdr) {
  return (hdr & IS_BYTES_BIT) != 0;
}

// Reports whether the object pointed to by `s` has the IS_BYTES_BIT flag set
// in its header word (`s->h.hdr`). T may be any type exposing that member.
template <typename T>
__attribute__((always_inline)) constexpr bool is_bytes(T const *s) {
  const auto header = s->h.hdr;
  return is_bytes_hdr(header);
}

// Sets (is_bytes == true) or clears (is_bytes == false) the IS_BYTES_BIT flag
// in the header word of the object pointed to by `s`. All other header bits
// are left as they were.
template <typename T>
__attribute__((always_inline)) constexpr void
set_is_bytes(T *s, bool is_bytes) {
  s->h.hdr = is_bytes ? (s->h.hdr | IS_BYTES_BIT) : (s->h.hdr & ~IS_BYTES_BIT);
}

// Extracts the length field from the raw header word `hdr` by masking it with
// LENGTH_MASK; every bit outside the mask is discarded.
__attribute__((always_inline)) constexpr uint64_t len_hdr(uint64_t hdr) {
  const uint64_t length = hdr & LENGTH_MASK;
  return length;
}
Expand All @@ -120,7 +139,7 @@ __attribute__((always_inline)) constexpr uint64_t len(T const *s) {
}

template <typename T>
__attribute__((always_inline)) constexpr void set_len(T *s, uint64_t l) {
__attribute__((always_inline)) constexpr void init_with_len(T *s, uint64_t l) {
s->h.hdr = l | (l > BLOCK_SIZE - sizeof(char *) ? NOT_YOUNG_OBJECT_BIT : 0);
}

Expand Down Expand Up @@ -221,7 +240,7 @@ struct kore_alloc_heap {
return ::operator new(size);
} else {
string *result = (string *)koreAllocToken(size + sizeof(blockheader));
set_len(result, size);
init_with_len(result, size);
return result->data;
}
}
Expand Down
Loading
Loading