Skip to content

Commit

Permalink
Revert "Improve handling of Unicode vs Byte strings (#828)"
Browse files Browse the repository at this point in the history
This reverts commit 4c9d010.
  • Loading branch information
Scott-Guest committed Sep 29, 2023
1 parent 0058580 commit 6092863
Show file tree
Hide file tree
Showing 35 changed files with 218 additions and 3,818 deletions.
3 changes: 1 addition & 2 deletions bindings/python/ast.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,8 +210,7 @@ void bind_ast(py::module_ &m) {
.value("Bool", SortCategory::Bool)
.value("Symbol", SortCategory::Symbol)
.value("Variable", SortCategory::Variable)
.value("MInt", SortCategory::MInt)
.value("Bytes", SortCategory::Bytes);
.value("MInt", SortCategory::MInt);

py::class_<ValueType>(ast, "ValueType")
.def(py::init([](SortCategory cat) {
Expand Down
4 changes: 1 addition & 3 deletions cmake/RuntimeConfig.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ set(VARIABLE_BIT 0x8000000000000)
set(LAYOUT_OFFSET 54)
set(TAG_MASK 0xffffffff)
set(LENGTH_MASK 0xffffffffff)
set(IS_BYTES_BIT 0x10000000000)

if(CMAKE_BUILD_TYPE STREQUAL "GcStats")
set(HDR_MASK -18013298997854209) # 0xffc000ffffffffff
Expand All @@ -35,8 +34,7 @@ set(STRINGBUFFER_LAYOUT 6)
set(BOOL_LAYOUT 7)
set(SYMBOL_LAYOUT 8)
set(VARIABLE_LAYOUT 9)
set(RANGEMAP_LAYOUT 10)
set(BYTES_LAYOUT 11)
set(RANGEMAP_LAYOUT 11)

get_filename_component(INSTALL_DIR_ABS_PATH "${CMAKE_INSTALL_PREFIX}"
REALPATH BASE_DIR "${CMAKE_BINARY_DIR}")
Expand Down
2 changes: 0 additions & 2 deletions config/macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
#define HDR_MASK @HDR_MASK@
#define TAG_MASK @TAG_MASK@LL
#define LENGTH_MASK @LENGTH_MASK@
#define IS_BYTES_BIT @IS_BYTES_BIT@

#define MAP_LAYOUT @MAP_LAYOUT@
#define LIST_LAYOUT @LIST_LAYOUT@
Expand All @@ -23,7 +22,6 @@
#define SYMBOL_LAYOUT @SYMBOL_LAYOUT@
#define VARIABLE_LAYOUT @VARIABLE_LAYOUT@
#define RANGEMAP_LAYOUT @RANGEMAP_LAYOUT@
#define BYTES_LAYOUT @BYTES_LAYOUT@

#define STRINGIFY(x) #x
#define TOSTRING(X) STRINGIFY(X)
Expand Down
17 changes: 4 additions & 13 deletions debug/kgdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -561,14 +561,9 @@ def append(self, subject, isVar, sort):
if not layout:
string = subject.cast(self.string_ptr)
length = hdr & @LENGTH_MASK@
string_bytes = bytes(int(string.dereference()['data'][i].cast(self.unsigned_char)) for i in range(length))
if hdr & @IS_BYTES_BIT@:
py_str = string_bytes.decode('iso-8859-1')
else:
py_str = string_bytes.decode('utf-8')

self.result += "\\dv{" + sort + "}(\""
for c in py_str:
for i in range(length):
c = chr(int(string.dereference()['data'][i].cast(self.unsigned_char)))
if c == '\\':
self.result += "\\\\"
elif c == '"':
Expand All @@ -581,14 +576,10 @@ def append(self, subject, isVar, sort):
self.result += "\\r"
elif c == '\f':
self.result += "\\f"
elif 32 <= ord(c) and ord(c) < 127:
elif ord(c) >= 32 and ord(c) < 127:
self.result += c
elif ord(c) <= 0xFF:
self.result += "\\x{:02x}".format(ord(c))
elif ord(c) <= 0xFFFF:
self.result += "\\u{:04x}".format(ord(c))
else:
self.result += "\\U{:08x}".format(ord(c))
self.result += "{:02x}".format(ord(c))
var = Variable(string)
stdStr = var.stdStr
if isVar and not var in self.var_names:
Expand Down
35 changes: 2 additions & 33 deletions include/kllvm/ast/AST.h
Original file line number Diff line number Diff line change
Expand Up @@ -122,10 +122,8 @@ enum class SortCategory {
Bool,
Symbol,
Variable,
RangeMap,
Bytes,
// WARNING: MInt must be the last value, so that valueType.cat + valueType.bits can uniquely identify the ValueType
MInt
MInt,
RangeMap
};

// represents the syntactic category of an LLVM backend term at runtime
Expand Down Expand Up @@ -648,35 +646,6 @@ class KOREStringPattern : public KOREPattern {
: contents(Contents) { }
};

// Convert a Unicode codepoint to a UTF-8 encoded string containing that codepoint
std::string codepointToUTF8(uint32_t codepoint);

// Return a representation of str with all special characters replaced by their
// escape sequences.
//
// The provided StringType indicates whether to treat the string as a sequence of bytes
// or as a UTF-8 encoded Unicode string.
//
// For example, U+1F601 (😁) is UTF-8 encoded as the byte sequence 0xF0 0x9F 0x98 0x81, so
// - escapeString("😁", StringType::UTF8) returns "\U0001f601"
// - escapeString("😁", StringType::BYTES) returns "\xf0\x9f\x98\x81"
//
enum class StringType { BYTES, UTF8 };
std::string escapeString(const std::string &str, StringType strType);

// A Bytes domain value is represented as a KOREStringPattern by storing each
// byte 0xHH as the (UTF-8 encoded) Unicode codepoint U+00HH.
//
// Given the contents of such a KOREStringPattern, this function produces the
// actual char[] for the Bytes domain value. In effect, this is just a conversion
// from UTF-8 to latin-1.
//
// The input buffer is overwritten, and the new length is returned.
extern "C" size_t bytesStringPatternToBytes(char *contents, size_t length);

// Given a char[] of bytes, return its representation as the contents of a KOREStringPattern.
std::string bytesToBytesStringPattern(const char *bytes, size_t length);

// KOREDeclaration
class KOREDeclaration {
protected:
Expand Down
2 changes: 1 addition & 1 deletion include/kllvm/parser/KOREScanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ class KOREScanner {
return yylex(lval, loc, scanner);
}
void error(const location &loc, const std::string &err_message);
std::string codepointToUTF8(const char *codepointStr, const location &loc);
std::string codepoint_to_utf8(unsigned long int code, const location &loc);

FILE *in;
std::string stringBuffer;
Expand Down
23 changes: 2 additions & 21 deletions include/runtime/header.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,25 +110,6 @@ bool hash_enter(void);
void hash_exit(void);
}

// Reports whether the IS_BYTES_BIT flag is set in the raw header word `hdr`.
__attribute__((always_inline)) constexpr bool is_bytes_hdr(uint64_t hdr) {
  return (hdr & IS_BYTES_BIT) != 0;
}

// Reports whether the header word of the object `s` points to (`s->h.hdr`)
// has the bytes flag set, as decided by is_bytes_hdr.
template <typename T>
__attribute__((always_inline)) constexpr bool is_bytes(T const *s) {
  const auto &header_word = s->h.hdr;
  return is_bytes_hdr(header_word);
}

// Sets (is_bytes == true) or clears (is_bytes == false) the IS_BYTES_BIT flag
// in the header word `s->h.hdr`; all other header bits are left untouched.
template <typename T>
__attribute__((always_inline)) constexpr void
set_is_bytes(T *s, bool is_bytes) {
  if (!is_bytes) {
    // Clear only the flag bit.
    s->h.hdr &= ~IS_BYTES_BIT;
    return;
  }
  s->h.hdr |= IS_BYTES_BIT;
}

// Extracts the length field from the raw header word `hdr` by masking it
// with LENGTH_MASK.
__attribute__((always_inline)) constexpr uint64_t len_hdr(uint64_t hdr) {
  const uint64_t length_bits = hdr & LENGTH_MASK;
  return length_bits;
}
Expand All @@ -139,7 +120,7 @@ __attribute__((always_inline)) constexpr uint64_t len(T const *s) {
}

template <typename T>
__attribute__((always_inline)) constexpr void init_with_len(T *s, uint64_t l) {
__attribute__((always_inline)) constexpr void set_len(T *s, uint64_t l) {
s->h.hdr = l | (l > BLOCK_SIZE - sizeof(char *) ? NOT_YOUNG_OBJECT_BIT : 0);
}

Expand Down Expand Up @@ -240,7 +221,7 @@ struct kore_alloc_heap {
return ::operator new(size);
} else {
string *result = (string *)koreAllocToken(size + sizeof(blockheader));
init_with_len(result, size);
set_len(result, size);
return result->data;
}
}
Expand Down
Loading

0 comments on commit 6092863

Please sign in to comment.