From 475d5ee0449fae6e7c869dcfd949a3ef46b571eb Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Wed, 16 Oct 2024 16:06:46 -0300 Subject: [PATCH 1/7] fix make string.join work on stringslice Signed-off-by: martinvuyk --- stdlib/src/builtin/string_literal.mojo | 5 +- stdlib/src/builtin/value.mojo | 8 +- stdlib/src/collections/string.mojo | 22 ++-- stdlib/src/prelude/__init__.mojo | 4 +- stdlib/src/utils/__init__.mojo | 2 +- stdlib/src/utils/string_slice.mojo | 14 ++ stdlib/test/utils/test_string_slice.mojo | 158 +++++++++++++++++++++++ 7 files changed, 194 insertions(+), 19 deletions(-) diff --git a/stdlib/src/builtin/string_literal.mojo b/stdlib/src/builtin/string_literal.mojo index 975963b749..2926e12d4a 100644 --- a/stdlib/src/builtin/string_literal.mojo +++ b/stdlib/src/builtin/string_literal.mojo @@ -23,7 +23,7 @@ from hashlib._hasher import _HashableWithHasher, _Hasher from utils import StringRef, Span, StringSlice, StaticString from utils import Formattable, Formatter from utils._visualizers import lldb_formatter_wrapping_type - +from utils.span import AsBytesRead from collections.string import _atol, _StringSliceIter # ===----------------------------------------------------------------------===# @@ -36,6 +36,7 @@ from collections.string import _atol, _StringSliceIter struct StringLiteral( Boolable, Comparable, + CollectionElement, CollectionElementNew, Formattable, IntableRaising, @@ -44,7 +45,7 @@ struct StringLiteral( Sized, Stringable, FloatableRaising, - BytesCollectionElement, + AsBytesRead, _HashableWithHasher, ): """This type represents a string literal. 
diff --git a/stdlib/src/builtin/value.mojo b/stdlib/src/builtin/value.mojo index 74e033330b..3afe1caf89 100644 --- a/stdlib/src/builtin/value.mojo +++ b/stdlib/src/builtin/value.mojo @@ -221,13 +221,13 @@ trait StringableCollectionElement(CollectionElement, Stringable): pass -trait BytesCollectionElement(CollectionElement, AsBytes): - """The BytesCollectionElement trait denotes a trait composition - of the `CollectionElement` and `AsBytes`. +trait BytesReadCollectionElement(CollectionElement, AsBytesRead): + """The BytesReadCollectionElement trait denotes a trait composition + of the `CollectionElement` and `AsBytesRead`. This is useful to have as a named entity since Mojo does not currently support anonymous trait compositions to constrain - on `CollectionElement & AsBytes` in the parameter. + on `CollectionElement & AsBytesRead` in the parameter. """ pass diff --git a/stdlib/src/collections/string.mojo b/stdlib/src/collections/string.mojo index d365365bfc..46e994ed0f 100644 --- a/stdlib/src/collections/string.mojo +++ b/stdlib/src/collections/string.mojo @@ -1409,14 +1409,16 @@ struct String( # TODO(#3403): Simplify this when the linked conditional conformance # feature is added. Runs a faster algorithm if the concrete types are # able to be converted to a span of bytes. 
+ @parameter if _type_is_eq[T, String](): - return self.fast_join(rebind[List[String]](elems)) + return self.join_bytes(rebind[List[String]](elems)) elif _type_is_eq[T, StringLiteral](): - return self.fast_join(rebind[List[StringLiteral]](elems)) - # FIXME(#3597): once StringSlice conforms to CollectionElement trait: - # if _type_is_eq[T, StringSlice](): - # return self.fast_join(rebind[List[StringSlice]](elems)) + return self.join_bytes(rebind[List[StringLiteral]](elems)) + elif _type_is_eq[T, StringSlice[__origin_of(elems)]](): + return self.join_bytes( + rebind[List[StringSlice[__origin_of(elems)]]](elems) + ) else: var result: String = "" var is_first = True @@ -1430,8 +1432,8 @@ struct String( return result - fn fast_join[ - T: BytesCollectionElement, //, + fn join_bytes[ + T: BytesReadCollectionElement, //, ](self, elems: List[T, *_]) -> String: """Joins string elements using the current string as a delimiter. @@ -1453,7 +1455,7 @@ struct String( # to prevent alloc syscalls as we know the buffer size. 
# This can hugely improve the performance on large lists for e_ref in elems: - len_elems += len(e_ref[].as_bytes()) + len_elems += len(e_ref[].as_bytes_read()) var capacity = len_self * (n_elems - 1) + len_elems var buf = Self._buffer_type(capacity=capacity) var self_ptr = self.unsafe_ptr() @@ -1467,13 +1469,13 @@ struct String( else: memcpy(dest=ptr + offset, src=self_ptr, count=len_self) offset += len_self - var e = elems[i].as_bytes() + var e = elems[i].as_bytes_read() var e_len = len(e) memcpy(dest=ptr + offset, src=e.unsafe_ptr(), count=e_len) offset += e_len i += 1 buf.size = capacity - buf.append(0) + buf.unsafe_set(capacity, 0) return String(buf^) fn _strref_dangerous(self) -> StringRef: diff --git a/stdlib/src/prelude/__init__.mojo b/stdlib/src/prelude/__init__.mojo index 60f47ba5c7..f1607bc937 100644 --- a/stdlib/src/prelude/__init__.mojo +++ b/stdlib/src/prelude/__init__.mojo @@ -91,7 +91,7 @@ from builtin.value import ( Defaultable, CollectionElement, CollectionElementNew, - BytesCollectionElement, + BytesReadCollectionElement, StringableCollectionElement, EqualityComparableCollectionElement, ComparableCollectionElement, @@ -133,4 +133,4 @@ from collections.string import ( ) from hashlib.hash import hash, Hashable from memory import Pointer, AddressSpace -from utils import AsBytes, Formattable, Formatter +from utils import AsBytes, AsBytesRead, Formattable, Formatter diff --git a/stdlib/src/utils/__init__.mojo b/stdlib/src/utils/__init__.mojo index f0290277cf..5ec21074c5 100644 --- a/stdlib/src/utils/__init__.mojo +++ b/stdlib/src/utils/__init__.mojo @@ -15,7 +15,7 @@ from .index import Index, IndexList, product from .inline_string import InlineString from .loop import unroll -from .span import AsBytes, Span +from .span import Span, AsBytes, AsBytesRead from .static_tuple import StaticTuple from .stringref import StringRef from .string_slice import StaticString, StringSlice diff --git a/stdlib/src/utils/string_slice.mojo 
b/stdlib/src/utils/string_slice.mojo index a860c5982e..9d3eb0dfea 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -661,6 +661,20 @@ struct StringSlice[ _ = next_line, unicode_line_sep, unicode_paragraph_sep return True + fn join[T: StringableCollectionElement](self, elems: List[T, *_]) -> String: + """Joins string elements using the current string as a delimiter. + + Parameters: + T: The types of the elements. + + Args: + elems: The input values. + + Returns: + The joined string. + """ + return str(self).join(elems) + fn splitlines(self, keepends: Bool = False) -> List[String]: """Split the string at line boundaries. This corresponds to Python's [universal newlines]( diff --git a/stdlib/test/utils/test_string_slice.mojo b/stdlib/test/utils/test_string_slice.mojo index b753463662..b465beec7a 100644 --- a/stdlib/test/utils/test_string_slice.mojo +++ b/stdlib/test/utils/test_string_slice.mojo @@ -415,6 +415,163 @@ def test_count_utf8_continuation_bytes(): _test(3, List[UInt8](b2, c, b3, c, c)) +def test_split(): + # empty separators default to whitespace + var d = StringSlice("hello world").split() + assert_true(len(d) == 2) + assert_true(d[0] == "hello") + assert_true(d[1] == "world") + d = StringSlice("hello \t\n\n\v\fworld").split("\n") + assert_true(len(d) == 3) + assert_true(d[0] == "hello \t" and d[1] == "" and d[2] == "\v\fworld") + + # Should add all whitespace-like chars as one + # test all unicode separators + # 0 is to build a String with null terminator + alias next_line = List[UInt8](0xC2, 0x85, 0) + """TODO: \\x85""" + alias unicode_line_sep = List[UInt8](0xE2, 0x80, 0xA8, 0) + """TODO: \\u2028""" + alias unicode_paragraph_sep = List[UInt8](0xE2, 0x80, 0xA9, 0) + """TODO: \\u2029""" + # TODO add line and paragraph separator as StringLiteral once unicode + # escape secuences are accepted + var univ_sep_var = ( + StringSlice(" ") + + StringSlice("\t") + + StringSlice("\n") + + StringSlice("\r") + + 
StringSlice("\v") + + StringSlice("\f") + + StringSlice("\x1c") + + StringSlice("\x1d") + + StringSlice("\x1e") + + StringSlice(next_line) + + StringSlice(unicode_line_sep) + + StringSlice(unicode_paragraph_sep) + ) + var s = univ_sep_var + "hello" + univ_sep_var + "world" + univ_sep_var + d = s.split() + assert_true(len(d) == 2) + assert_true(d[0] == "hello" and d[1] == "world") + + # should split into empty strings between separators + d = StringSlice("1,,,3").split(",") + assert_true(len(d) == 4) + assert_true(d[0] == "1" and d[1] == "" and d[2] == "" and d[3] == "3") + d = StringSlice(",,,").split(",") + assert_true(len(d) == 4) + assert_true(d[0] == "" and d[1] == "" and d[2] == "" and d[3] == "") + d = StringSlice(" a b ").split(" ") + assert_true(len(d) == 4) + assert_true(d[0] == "" and d[1] == "a" and d[2] == "b" and d[3] == "") + d = StringSlice("abababaaba").split("aba") + assert_true(len(d) == 4) + assert_true(d[0] == "" and d[1] == "b" and d[2] == "" and d[3] == "") + + # should split into maxsplit + 1 items + d = StringSlice("1,2,3").split(",", 0) + assert_true(len(d) == 1) + assert_true(d[0] == "1,2,3") + d = StringSlice("1,2,3").split(",", 1) + assert_true(len(d) == 2) + assert_true(d[0] == "1" and d[1] == "2,3") + + assert_true(len(StringSlice("").split()) == 0) + assert_true(len(StringSlice(" ").split()) == 0) + assert_true(len(StringSlice("").split(" ")) == 1) + assert_true(len(StringSlice(" ").split(" ")) == 2) + assert_true(len(StringSlice(" ").split(" ")) == 3) + assert_true(len(StringSlice(" ").split(" ")) == 4) + + # Split in middle + var d1 = StringSlice("n") + var in1 = StringSlice("faang") + var res1 = in1.split(d1) + assert_equal(len(res1), 2) + assert_equal(res1[0], "faa") + assert_equal(res1[1], "g") + + # Matches should be properly split in multiple case + var d2 = StringSlice(" ") + var in2 = StringSlice("modcon is coming soon") + var res2 = in2.split(d2) + assert_equal(len(res2), 4) + assert_equal(res2[0], "modcon") + 
assert_equal(res2[1], "is") + assert_equal(res2[2], "coming") + assert_equal(res2[3], "soon") + + # No match from the delimiter + var d3 = StringSlice("x") + var in3 = StringSlice("hello world") + var res3 = in3.split(d3) + assert_equal(len(res3), 1) + assert_equal(res3[0], "hello world") + + # Multiple character delimiter + var d4 = StringSlice("ll") + var in4 = StringSlice("hello") + var res4 = in4.split(d4) + assert_equal(len(res4), 2) + assert_equal(res4[0], "he") + assert_equal(res4[1], "o") + + # related to #2879 + # TODO: replace string comparison when __eq__ is implemented for List + assert_equal( + StringSlice("abbaaaabbba").split("a").__str__(), + "['', 'bb', '', '', '', 'bbb', '']", + ) + assert_equal( + StringSlice("abbaaaabbba").split("a", 8).__str__(), + "['', 'bb', '', '', '', 'bbb', '']", + ) + assert_equal( + StringSlice("abbaaaabbba").split("a", 5).__str__(), + "['', 'bb', '', '', '', 'bbba']", + ) + assert_equal(StringSlice("aaa").split("a", 0).__str__(), "['aaa']") + assert_equal(StringSlice("a").split("a").__str__(), "['', '']") + assert_equal(StringSlice("1,2,3").split("3", 0).__str__(), "['1,2,3']") + assert_equal(StringSlice("1,2,3").split("3", 1).__str__(), "['1,2,', '']") + assert_equal( + StringSlice("1,2,3,3").split("3", 2).__str__(), "['1,2,', ',', '']" + ) + assert_equal( + StringSlice("1,2,3,3,3").split("3", 2).__str__(), "['1,2,', ',', ',3']" + ) + + var in5 = StringSlice("Hello 🔥!") + var res5 = in5.split() + assert_equal(len(res5), 2) + assert_equal(res5[0], "Hello") + assert_equal(res5[1], "🔥!") + + var in6 = StringSlice("Лорем ипсум долор сит амет") + var res6 = in6.split(" ") + assert_equal(len(res6), 5) + assert_equal(res6[0], "Лорем") + assert_equal(res6[1], "ипсум") + assert_equal(res6[2], "долор") + assert_equal(res6[3], "сит") + assert_equal(res6[4], "амет") + var res7 = in6.split("м") + assert_equal(res7[0], "Лоре") + assert_equal(res7[1], " ипсу") + assert_equal(res7[2], " долор
сит а") + assert_equal(res7[3], "ет") + + assert_equal( + StringSlice("123").split(""), List[String]("", "1", "2", "3", "") + ) + assert_equal(StringSlice("").join(StringSlice("123").split("")), "123") + assert_equal( + StringSlice(",1,2,3,").split(","), StringSlice("123").split("") + ) + assert_equal(StringSlice(",").join(StringSlice("123").split("")), ",1,2,3,") + + fn main() raises: test_string_literal_byte_span() test_string_byte_span() @@ -432,3 +589,4 @@ fn main() raises: test_combination_10_good_utf8_sequences() test_combination_10_good_10_bad_utf8_sequences() test_count_utf8_continuation_bytes() + test_split() From dde9daf50a2bbcdad9391bb0bd0cd1f16e23d51c Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Wed, 16 Oct 2024 16:30:33 -0300 Subject: [PATCH 2/7] add tests and redirect string join to stringslice Signed-off-by: martinvuyk --- stdlib/src/builtin/string_literal.mojo | 22 +++- stdlib/src/collections/string.mojo | 63 +---------- stdlib/src/utils/string_slice.mojo | 76 ++++++++++++- stdlib/test/utils/test_string_slice.mojo | 134 +++++++++++------------ 4 files changed, 166 insertions(+), 129 deletions(-) diff --git a/stdlib/src/builtin/string_literal.mojo b/stdlib/src/builtin/string_literal.mojo index 2926e12d4a..63ec54be05 100644 --- a/stdlib/src/builtin/string_literal.mojo +++ b/stdlib/src/builtin/string_literal.mojo @@ -464,7 +464,9 @@ struct StringLiteral( """ return __mlir_op.`pop.string.replace`(self.value, old.value, new.value) - fn join[T: StringableCollectionElement](self, elems: List[T, *_]) -> String: + fn join[ + T: StringableCollectionElement, // + ](self, elems: List[T, *_]) -> String: """Joins string elements using the current string as a delimiter. Parameters: @@ -476,7 +478,23 @@ struct StringLiteral( Returns: The joined string.
""" - return str(self).join(elems) + return self.as_string_slice.join(elems) + + fn join_bytes[ + T: BytesReadCollectionElement, //, + ](self, elems: List[T, *_]) -> String: + """Joins string elements using the current string as a delimiter. + + Parameters: + T: The types of the elements. + + Args: + elems: The input values. + + Returns: + The joined string. + """ + return self.as_string_slice().join_bytes(elems) fn split(self, sep: String, maxsplit: Int = -1) raises -> List[String]: """Split the string literal by a separator. diff --git a/stdlib/src/collections/string.mojo b/stdlib/src/collections/string.mojo index 46e994ed0f..939b344df2 100644 --- a/stdlib/src/collections/string.mojo +++ b/stdlib/src/collections/string.mojo @@ -1393,7 +1393,9 @@ struct String( _ = is_first return result - fn join[T: StringableCollectionElement](self, elems: List[T, *_]) -> String: + fn join[ + T: StringableCollectionElement, // + ](self, elems: List[T, *_]) -> String: """Joins string elements using the current string as a delimiter. Parameters: @@ -1405,32 +1407,7 @@ struct String( Returns: The joined string. """ - - # TODO(#3403): Simplify this when the linked conditional conformance - # feature is added. Runs a faster algorithm if the concrete types are - # able to be converted to a span of bytes. - - @parameter - if _type_is_eq[T, String](): - return self.join_bytes(rebind[List[String]](elems)) - elif _type_is_eq[T, StringLiteral](): - return self.join_bytes(rebind[List[StringLiteral]](elems)) - elif _type_is_eq[T, StringSlice[__origin_of(elems)]](): - return self.join_bytes( - rebind[List[StringSlice[__origin_of(elems)]]](elems) - ) - else: - var result: String = "" - var is_first = True - - for e in elems: - if is_first: - is_first = False - else: - result += self - result += str(e[]) - - return result + return self.as_string_slice.join(elems) fn join_bytes[ T: BytesReadCollectionElement, //, @@ -1446,37 +1423,7 @@ struct String( Returns: The joined string. 
""" - var n_elems = len(elems) - if n_elems == 0: - return String("") - var len_self = self.byte_length() - var len_elems = 0 - # Calculate the total size of the elements to join beforehand - # to prevent alloc syscalls as we know the buffer size. - # This can hugely improve the performance on large lists - for e_ref in elems: - len_elems += len(e_ref[].as_bytes_read()) - var capacity = len_self * (n_elems - 1) + len_elems - var buf = Self._buffer_type(capacity=capacity) - var self_ptr = self.unsafe_ptr() - var ptr = buf.unsafe_ptr() - var offset = 0 - var i = 0 - var is_first = True - while i < n_elems: - if is_first: - is_first = False - else: - memcpy(dest=ptr + offset, src=self_ptr, count=len_self) - offset += len_self - var e = elems[i].as_bytes_read() - var e_len = len(e) - memcpy(dest=ptr + offset, src=e.unsafe_ptr(), count=e_len) - offset += e_len - i += 1 - buf.size = capacity - buf.unsafe_set(capacity, 0) - return String(buf^) + return self.as_string_slice().join_bytes(elems) fn _strref_dangerous(self) -> StringRef: """ diff --git a/stdlib/src/utils/string_slice.mojo b/stdlib/src/utils/string_slice.mojo index 9d3eb0dfea..e2a3e0587a 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -661,7 +661,9 @@ struct StringSlice[ _ = next_line, unicode_line_sep, unicode_paragraph_sep return True - fn join[T: StringableCollectionElement](self, elems: List[T, *_]) -> String: + fn join[ + T: StringableCollectionElement, // + ](self, elems: List[T, *_]) -> String: """Joins string elements using the current string as a delimiter. Parameters: @@ -673,7 +675,77 @@ struct StringSlice[ Returns: The joined string. """ - return str(self).join(elems) + # TODO(#3403): Simplify this when the linked conditional conformance + # feature is added. Runs a faster algorithm if the concrete types are + # able to be converted to a span of bytes. 
+ + @parameter + if _type_is_eq[T, String](): + return self.join_bytes(rebind[List[String]](elems)) + elif _type_is_eq[T, StringLiteral](): + return self.join_bytes(rebind[List[StringLiteral]](elems)) + elif _type_is_eq[T, StringSlice[__origin_of(elems)]](): + return self.join_bytes( + rebind[List[StringSlice[__origin_of(elems)]]](elems) + ) + else: + var result: String = "" + var is_first = True + + for e in elems: + if is_first: + is_first = False + else: + result += self + result += str(e[]) + + return result + + fn join_bytes[ + T: BytesReadCollectionElement, //, + ](self, elems: List[T, *_]) -> String: + """Joins string elements using the current string as a delimiter. + + Parameters: + T: The types of the elements. + + Args: + elems: The input values. + + Returns: + The joined string. + """ + var n_elems = len(elems) + if n_elems == 0: + return String("") + var len_self = self.byte_length() + var len_elems = 0 + # Calculate the total size of the elements to join beforehand + # to prevent alloc syscalls as we know the buffer size. + # This can hugely improve the performance on large lists + for e_ref in elems: + len_elems += len(e_ref[].as_bytes_read()) + var capacity = len_self * (n_elems - 1) + len_elems + var buf = Self._buffer_type(capacity=capacity) + var self_ptr = self.unsafe_ptr() + var ptr = buf.unsafe_ptr() + var offset = 0 + var i = 0 + var is_first = True + while i < n_elems: + if is_first: + is_first = False + else: + memcpy(dest=ptr + offset, src=self_ptr, count=len_self) + offset += len_self + var e = elems[i].as_bytes_read() + var e_len = len(e) + memcpy(dest=ptr + offset, src=e.unsafe_ptr(), count=e_len) + offset += e_len + i += 1 + buf.size = capacity + buf.unsafe_set(capacity, 0) + return String(buf^) fn splitlines(self, keepends: Bool = False) -> List[String]: """Split the string at line boundaries. 
This corresponds to Python's diff --git a/stdlib/test/utils/test_string_slice.mojo b/stdlib/test/utils/test_string_slice.mojo index b465beec7a..a5a42a5924 100644 --- a/stdlib/test/utils/test_string_slice.mojo +++ b/stdlib/test/utils/test_string_slice.mojo @@ -416,12 +416,14 @@ def test_count_utf8_continuation_bytes(): def test_split(): + alias S = StringSlice + # empty separators default to whitespace - var d = StringSlice("hello world").split() + var d = S("hello world").split() assert_true(len(d) == 2) assert_true(d[0] == "hello") assert_true(d[1] == "world") - d = StringSlice("hello \t\n\n\v\fworld").split("\n") + d = S("hello \t\n\n\v\fworld").split("\n") assert_true(len(d) == 3) assert_true(d[0] == "hello \t" and d[1] == "" and d[2] == "\v\fworld") @@ -437,18 +439,18 @@ def test_split(): # TODO add line and paragraph separator as StringLiteral once unicode # escape secuences are accepted var univ_sep_var = ( - StringSlice(" ") - + StringSlice("\t") - + StringSlice("\n") - + StringSlice("\r") - + StringSlice("\v") - + StringSlice("\f") - + StringSlice("\x1c") - + StringSlice("\x1d") - + StringSlice("\x1e") - + StringSlice(next_line) - + StringSlice(unicode_line_sep) - + StringSlice(unicode_paragraph_sep) + S(" ") + + S("\t") + + S("\n") + + S("\r") + + S("\v") + + S("\f") + + S("\x1c") + + S("\x1d") + + S("\x1e") + + S(next_line) + + S(unicode_line_sep) + + S(unicode_paragraph_sep) ) var s = univ_sep_var + "hello" + univ_sep_var + "world" + univ_sep_var d = s.split() @@ -456,45 +458,45 @@ def test_split(): assert_true(d[0] == "hello" and d[1] == "world") # should split into empty strings between separators - d = StringSlice("1,,,3").split(",") + d = S("1,,,3").split(",") assert_true(len(d) == 4) assert_true(d[0] == "1" and d[1] == "" and d[2] == "" and d[3] == "3") - d = StringSlice(",,,").split(",") + d = S(",,,").split(",") assert_true(len(d) == 4) assert_true(d[0] == "" and d[1] == "" and d[2] == "" and d[3] == "") - d = StringSlice(" a b ").split(" ") + d = 
S(" a b ").split(" ") assert_true(len(d) == 4) assert_true(d[0] == "" and d[1] == "a" and d[2] == "b" and d[3] == "") - d = StringSlice("abababaaba").split("aba") + d = S("abababaaba").split("aba") assert_true(len(d) == 4) assert_true(d[0] == "" and d[1] == "b" and d[2] == "" and d[3] == "") # should split into maxsplit + 1 items - d = StringSlice("1,2,3").split(",", 0) + d = S("1,2,3").split(",", 0) assert_true(len(d) == 1) assert_true(d[0] == "1,2,3") - d = StringSlice("1,2,3").split(",", 1) + d = S("1,2,3").split(",", 1) assert_true(len(d) == 2) assert_true(d[0] == "1" and d[1] == "2,3") - assert_true(len(StringSlice("").split()) == 0) - assert_true(len(StringSlice(" ").split()) == 0) - assert_true(len(StringSlice("").split(" ")) == 1) - assert_true(len(StringSlice(" ").split(" ")) == 2) - assert_true(len(StringSlice(" ").split(" ")) == 3) - assert_true(len(StringSlice(" ").split(" ")) == 4) + assert_true(len(S("").split()) == 0) + assert_true(len(S(" ").split()) == 0) + assert_true(len(S("").split(" ")) == 1) + assert_true(len(S(" ").split(" ")) == 2) + assert_true(len(S(" ").split(" ")) == 3) + assert_true(len(S(" ").split(" ")) == 4) # Split in middle - var d1 = StringSlice("n") - var in1 = StringSlice("faang") + var d1 = S("n") + var in1 = S("faang") var res1 = in1.split(d1) assert_equal(len(res1), 2) assert_equal(res1[0], "faa") assert_equal(res1[1], "g") # Matches should be properly split in multiple case - var d2 = StringSlice(" ") - var in2 = StringSlice("modcon is coming soon") + var d2 = S(" ") + var in2 = S("modcon is coming soon") var res2 = in2.split(d2) assert_equal(len(res2), 4) assert_equal(res2[0], "modcon") @@ -503,15 +505,15 @@ def test_split(): assert_equal(res2[3], "soon") # No match from the delimiter - var d3 = StringSlice("x") - var in3 = StringSlice("hello world") + var d3 = S("x") + var in3 = S("hello world") var res3 = in3.split(d3) assert_equal(len(res3), 1) assert_equal(res3[0], "hello world") # Multiple character delimiter - var d4 
= StringSlice("ll") - var in4 = StringSlice("hello") + var d4 = S("ll") + var in4 = S("hello") var res4 = in4.split(d4) assert_equal(len(res4), 2) assert_equal(res4[0], "he") @@ -519,36 +521,26 @@ def test_split(): # related to #2879 # TODO: replace string comparison when __eq__ is implemented for List - assert_equal( - StringSlice("abbaaaabbba").split("a").__str__(), - "['', 'bb', '', '', '', 'bbb', '']", - ) - assert_equal( - StringSlice("abbaaaabbba").split("a", 8).__str__(), - "['', 'bb', '', '', '', 'bbb', '']", - ) - assert_equal( - StringSlice("abbaaaabbba").split("a", 5).__str__(), - "['', 'bb', '', '', '', 'bbba']", - ) - assert_equal(StringSlice("aaa").split("a", 0).__str__(), "['aaa']") - assert_equal(StringSlice("a").split("a").__str__(), "['', '']") - assert_equal(StringSlice("1,2,3").split("3", 0).__str__(), "['1,2,3']") - assert_equal(StringSlice("1,2,3").split("3", 1).__str__(), "['1,2,', '']") - assert_equal( - StringSlice("1,2,3,3").split("3", 2).__str__(), "['1,2,', ',', '']" - ) - assert_equal( - StringSlice("1,2,3,3,3").split("3", 2).__str__(), "['1,2,', ',', ',3']" - ) - - var in5 = StringSlice("Hello 🔥!") + s = S("abbaaaabbba").split("a").__str__() + assert_equal(s, "['', 'bb', '', '', '', 'bbb', '']") + s = S("abbaaaabbba").split("a", 8).__str__() + assert_equal(s, "['', 'bb', '', '', '', 'bbb', '']") + s = S("abbaaaabbba").split("a", 5).__str__() + assert_equal(s, "['', 'bb', '', '', '', 'bbba']") + assert_equal(S("aaa").split("a", 0).__str__(), "['aaa']") + assert_equal(S("a").split("a").__str__(), "['', '']") + assert_equal(S("1,2,3").split("3", 0).__str__(), "['1,2,3']") + assert_equal(S("1,2,3").split("3", 1).__str__(), "['1,2,', '']") + assert_equal(S("1,2,3,3").split("3", 2).__str__(), "['1,2,', ',', '']") + assert_equal(S("1,2,3,3,3").split("3", 2).__str__(), "['1,2,', ',', ',3']") + + var in5 = S("Hello 🔥!") var res5 = in5.split() assert_equal(len(res5), 2) assert_equal(res5[0], "Hello") assert_equal(res5[1], "🔥!") - var
in6 = StringSlice("Лорем ипсум долор сит амет") + var in6 = S("Лорем ипсум долор сит амет") var res6 = in6.split(" ") assert_equal(len(res6), 5) assert_equal(res6[0], "Лорем") @@ -562,14 +554,21 @@ def test_split(): assert_equal(res7[2], " долор сит а") assert_equal(res7[3], "ет") - assert_equal( - StringSlice("123").split(""), List[String]("", "1", "2", "3", "") - ) - assert_equal(StringSlice("").join(StringSlice("123").split("")), "123") - assert_equal( - StringSlice(",1,2,3,").split(","), StringSlice("123").split("") - ) - assert_equal(StringSlice(",").join(StringSlice("123").split("")), ",1,2,3,") + assert_equal(S("123").split(""), List[String]("", "1", "2", "3", "")) + assert_equal(S("").join(S("123").split("")), "123") + assert_equal(S(",1,2,3,").split(","), S("123").split("")) + assert_equal(S(",").join(S("123").split("")), ",1,2,3,") + + +def test_join(): + alias S = StringSlice + l1 = List[UInt8](1, 2, 3, 4, 5, 6, 7, 8, 9) + assert_equal(S(",").join(l1), "1,2,3,4,5,6,7,8,9") + assert_equal(S(",").join(List[UInt8](1, 2, 3)), "1,2,3") + assert_equal(S(",").join(List[UInt8]()), "") + assert_equal(S(",").join(List[UInt8](1)), "1") + l2 = List[S]("1", "2", "3") + assert_equal(S(",").join(l2), "1,2,3") fn main() raises: @@ -590,3 +589,4 @@ fn main() raises: test_combination_10_good_10_bad_utf8_sequences() test_count_utf8_continuation_bytes() test_split() + test_join() From d0368dc4c4690f30b9cdb69cf38c0a6670210dcb Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Wed, 16 Oct 2024 16:46:23 -0300 Subject: [PATCH 3/7] fix details Signed-off-by: martinvuyk --- stdlib/src/utils/string_slice.mojo | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/stdlib/src/utils/string_slice.mojo b/stdlib/src/utils/string_slice.mojo index e2a3e0587a..88fd0a039a 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -675,6 +675,7 @@ struct StringSlice[ Returns: The
joined string. """ + # TODO(#3403): Simplify this when the linked conditional conformance # feature is added. Runs a faster algorithm if the concrete types are # able to be converted to a span of bytes. @@ -715,36 +716,35 @@ struct StringSlice[ Returns: The joined string. """ + var n_elems = len(elems) if n_elems == 0: return String("") - var len_self = self.byte_length() + var s_len = self.byte_length() var len_elems = 0 # Calculate the total size of the elements to join beforehand # to prevent alloc syscalls as we know the buffer size. # This can hugely improve the performance on large lists for e_ref in elems: len_elems += len(e_ref[].as_bytes_read()) - var capacity = len_self * (n_elems - 1) + len_elems + var capacity = s_len * (n_elems - 1) + len_elems var buf = Self._buffer_type(capacity=capacity) - var self_ptr = self.unsafe_ptr() - var ptr = buf.unsafe_ptr() + var s_ptr = self.unsafe_ptr() + var b_ptr = buf.unsafe_ptr() var offset = 0 var i = 0 - var is_first = True + var not_first = False while i < n_elems: - if is_first: - is_first = False - else: - memcpy(dest=ptr + offset, src=self_ptr, count=len_self) - offset += len_self + memcpy(dest=b_ptr + offset, src=s_ptr, count=s_len * int(not_first)) + offset += s_len * int(not_first) + not_first = True var e = elems[i].as_bytes_read() var e_len = len(e) - memcpy(dest=ptr + offset, src=e.unsafe_ptr(), count=e_len) + memcpy(dest=b_ptr + offset, src=e.unsafe_ptr(), count=e_len) offset += e_len i += 1 buf.size = capacity - buf.unsafe_set(capacity, 0) + b_ptr[capacity] = 0 return String(buf^) fn splitlines(self, keepends: Bool = False) -> List[String]: From 5512ddfc2185e0ece18e84792c40f15175eda6fa Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Mon, 28 Oct 2024 11:55:52 -0300 Subject: [PATCH 4/7] fix details Signed-off-by: martinvuyk --- stdlib/src/builtin/string_literal.mojo | 20 +++++++++- stdlib/src/collections/string.mojo | 20 +++++++++- stdlib/src/utils/span.mojo | 20 ++++++++++ 
stdlib/src/utils/string_slice.mojo | 51 ++++++++++++++++++-------- 4 files changed, 93 insertions(+), 18 deletions(-) diff --git a/stdlib/src/builtin/string_literal.mojo b/stdlib/src/builtin/string_literal.mojo index a24f4feacc..86f0311107 100644 --- a/stdlib/src/builtin/string_literal.mojo +++ b/stdlib/src/builtin/string_literal.mojo @@ -417,6 +417,24 @@ struct StringLiteral( len=self.byte_length(), ) + @always_inline + fn as_bytes_read[O: ImmutableOrigin, //](ref [O]self) -> Span[UInt8, O]: + """Returns an immutable contiguous slice of the bytes. + + Parameters: + O: The Origin of the bytes. + + Returns: + An immutable contiguous slice pointing to the bytes. + + Notes: + This does not include the trailing null terminator. + """ + + return Span[UInt8, O]( + unsafe_ptr=self.unsafe_ptr(), len=self.byte_length() + ) + @always_inline fn format[*Ts: _CurlyEntryFormattable](self, *args: *Ts) raises -> String: """Format a template with `*args`. @@ -509,7 +527,7 @@ struct StringLiteral( Returns: The joined string. """ - return self.as_string_slice.join(elems) + return self.as_string_slice().join(elems) fn join_bytes[ T: BytesReadCollectionElement, //, diff --git a/stdlib/src/collections/string.mojo b/stdlib/src/collections/string.mojo index 8a067bdbf5..fd59cd621b 100644 --- a/stdlib/src/collections/string.mojo +++ b/stdlib/src/collections/string.mojo @@ -1432,7 +1432,7 @@ struct String( Returns: The joined string. """ - return self.as_string_slice.join(elems) + return self.as_string_slice().join(elems) fn join_bytes[ T: BytesReadCollectionElement, //, @@ -1484,6 +1484,24 @@ struct String( unsafe_ptr=self._buffer.unsafe_ptr(), len=self.byte_length() ) + @always_inline + fn as_bytes_read[O: ImmutableOrigin, //](ref [O]self) -> Span[UInt8, O]: + """Returns an immutable contiguous slice of the bytes. + + Parameters: + O: The Origin of the bytes. + + Returns: + An immutable contiguous slice pointing to the bytes. + + Notes: + This does not include the trailing null terminator. 
+ """ + + return Span[UInt8, O]( + unsafe_ptr=self.unsafe_ptr(), len=self.byte_length() + ) + @always_inline fn as_string_slice(ref [_]self) -> StringSlice[__origin_of(self)]: """Returns a string slice of the data owned by this string. diff --git a/stdlib/src/utils/span.mojo b/stdlib/src/utils/span.mojo index 305b4d1381..35155036d9 100644 --- a/stdlib/src/utils/span.mojo +++ b/stdlib/src/utils/span.mojo @@ -43,6 +43,26 @@ trait AsBytes: ... +trait AsBytesRead: + """The `AsBytesRead` trait denotes a type that can be returned as an + immutable byte span. + """ + + fn as_bytes_read[O: ImmutableOrigin, //](ref [O]self) -> Span[Byte, O]: + """Returns an immutable contiguous slice of the bytes. + + Parameters: + O: The Origin of the bytes. + + Returns: + An immutable contiguous slice pointing to the bytes. + + Notes: + This does not include the trailing null terminator. + """ + ... + + @value struct _SpanIter[ is_mutable: Bool, //, diff --git a/stdlib/src/utils/string_slice.mojo b/stdlib/src/utils/string_slice.mojo index 51358af396..ec4cf684e7 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -26,8 +26,9 @@ from bit import count_leading_zeros from utils import Span from collections.string import _isspace, _atol, _atof from collections import List, Optional -from memory import memcmp, UnsafePointer +from memory import memcmp, UnsafePointer, memcpy from sys import simdwidthof, bitwidthof +from sys.intrinsics import _type_is_eq from memory.memory import _memcmp_impl_unconstrained alias StaticString = StringSlice[StaticConstantOrigin] @@ -655,6 +656,24 @@ struct StringSlice[is_mutable: Bool, //, origin: Origin[is_mutable].type,]( """ return self._slice + @always_inline + fn as_bytes_read[O: ImmutableOrigin, //](ref [O]self) -> Span[UInt8, O]: + """Returns an immutable contiguous slice of the bytes. + + Parameters: + O: The Origin of the bytes. + + Returns: + An immutable contiguous slice pointing to the bytes. 
+ + Notes: + This does not include the trailing null terminator. + """ + + return Span[UInt8, O]( + unsafe_ptr=self.unsafe_ptr(), len=self.byte_length() + ) + @always_inline fn unsafe_ptr(self) -> UnsafePointer[UInt8]: """Gets a pointer to the first element of this string slice. @@ -948,34 +967,34 @@ struct StringSlice[is_mutable: Bool, //, origin: Origin[is_mutable].type,]( The joined string. """ - var n_elems = len(elems) + n_elems = len(elems) if n_elems == 0: return String("") - var s_len = self.byte_length() - var len_elems = 0 + s_len = self.byte_length() + len_elems = 0 # Calculate the total size of the elements to join beforehand # to prevent alloc syscalls as we know the buffer size. # This can hugely improve the performance on large lists for e_ref in elems: len_elems += len(e_ref[].as_bytes_read()) - var capacity = s_len * (n_elems - 1) + len_elems - var buf = Self._buffer_type(capacity=capacity) - var s_ptr = self.unsafe_ptr() - var b_ptr = buf.unsafe_ptr() - var offset = 0 - var i = 0 - var not_first = False + capacity = s_len * (n_elems - 1) + len_elems + 1 + buf = String._buffer_type(capacity=capacity) + buf.size = capacity + s_ptr = self.unsafe_ptr() + b_ptr = buf.unsafe_ptr() + offset = 0 + i = 0 + not_first = False while i < n_elems: memcpy(dest=b_ptr + offset, src=s_ptr, count=s_len * int(not_first)) offset += s_len * int(not_first) not_first = True - var e = elems[i].as_bytes_read() - var e_len = len(e) + e = elems[i].as_bytes_read() + e_len = len(e) memcpy(dest=b_ptr + offset, src=e.unsafe_ptr(), count=e_len) offset += e_len i += 1 - buf.size = capacity - b_ptr[capacity] = 0 + b_ptr[capacity - 1] = 0 return String(buf^) fn splitlines(self, keepends: Bool = False) -> List[String]: @@ -1036,7 +1055,7 @@ struct StringSlice[is_mutable: Bool, //, origin: Origin[is_mutable].type,]( # ===----------------------------------------------------------------------===# -trait Stringlike: +trait Stringlike(AsBytesRead): """Trait intended to be used only with 
`String`, `StringLiteral` and `StringSlice`.""" From 3ba6ab7b29b96bc294d20a23e6f2c325ecf6b258 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Mon, 28 Oct 2024 12:11:15 -0300 Subject: [PATCH 5/7] fix details Signed-off-by: martinvuyk --- stdlib/src/utils/string_slice.mojo | 16 +-- stdlib/test/utils/test_string_slice.mojo | 156 +---------------------- 2 files changed, 12 insertions(+), 160 deletions(-) diff --git a/stdlib/src/utils/string_slice.mojo b/stdlib/src/utils/string_slice.mojo index ec4cf684e7..5336ceb665 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -940,17 +940,15 @@ struct StringSlice[is_mutable: Bool, //, origin: Origin[is_mutable].type,]( rebind[List[StringSlice[__origin_of(elems)]]](elems) ) else: - var result: String = "" - var is_first = True + e_len = len(elems) + buf = List[String](capacity=e_len) + buf.size = e_len + b_ptr = buf.unsafe_ptr() - for e in elems: - if is_first: - is_first = False - else: - result += self - result += str(e[]) + for i in range(e_len): + (b_ptr + i).init_pointee_move(str(elems.unsafe_get(i))) - return result + return self.join_bytes(buf^) fn join_bytes[ T: BytesReadCollectionElement, //, diff --git a/stdlib/test/utils/test_string_slice.mojo b/stdlib/test/utils/test_string_slice.mojo index e5350b576b..9400b13822 100644 --- a/stdlib/test/utils/test_string_slice.mojo +++ b/stdlib/test/utils/test_string_slice.mojo @@ -415,158 +415,13 @@ def test_count_utf8_continuation_bytes(): _test(3, List[UInt8](b2, c, b3, c, c)) -def test_split(): - alias S = StringSlice - - # empty separators default to whitespace - var d = S("hello world").split() - assert_true(len(d) == 2) - assert_true(d[0] == "hello") - assert_true(d[1] == "world") - d = S("hello \t\n\n\v\fworld").split("\n") - assert_true(len(d) == 3) - assert_true(d[0] == "hello \t" and d[1] == "" and d[2] == "\v\fworld") - - # Should add all whitespace-like chars as one - # test all unicode separators - # 0 is to build a String with 
null terminator - alias next_line = List[UInt8](0xC2, 0x85, 0) - """TODO: \\x85""" - alias unicode_line_sep = List[UInt8](0xE2, 0x80, 0xA8, 0) - """TODO: \\u2028""" - alias unicode_paragraph_sep = List[UInt8](0xE2, 0x80, 0xA9, 0) - """TODO: \\u2029""" - # TODO add line and paragraph separator as StringLiteral once unicode - # escape secuences are accepted - var univ_sep_var = ( - S(" ") - + S("\t") - + S("\n") - + S("\r") - + S("\v") - + S("\f") - + S("\x1c") - + S("\x1d") - + S("\x1e") - + S(next_line) - + S(unicode_line_sep) - + S(unicode_paragraph_sep) - ) - var s = univ_sep_var + "hello" + univ_sep_var + "world" + univ_sep_var - d = s.split() - assert_true(len(d) == 2) - assert_true(d[0] == "hello" and d[1] == "world") - - # should split into empty strings between separators - d = S("1,,,3").split(",") - assert_true(len(d) == 4) - assert_true(d[0] == "1" and d[1] == "" and d[2] == "" and d[3] == "3") - d = S(",,,").split(",") - assert_true(len(d) == 4) - assert_true(d[0] == "" and d[1] == "" and d[2] == "" and d[3] == "") - d = S(" a b ").split(" ") - assert_true(len(d) == 4) - assert_true(d[0] == "" and d[1] == "a" and d[2] == "b" and d[3] == "") - d = S("abababaaba").split("aba") - assert_true(len(d) == 4) - assert_true(d[0] == "" and d[1] == "b" and d[2] == "" and d[3] == "") - - # should split into maxsplit + 1 items - d = S("1,2,3").split(",", 0) - assert_true(len(d) == 1) - assert_true(d[0] == "1,2,3") - d = S("1,2,3").split(",", 1) - assert_true(len(d) == 2) - assert_true(d[0] == "1" and d[1] == "2,3") - - assert_true(len(S("").split()) == 0) - assert_true(len(S(" ").split()) == 0) - assert_true(len(S("").split(" ")) == 1) - assert_true(len(S(" ").split(" ")) == 2) - assert_true(len(S(" ").split(" ")) == 3) - assert_true(len(S(" ").split(" ")) == 4) - - # Split in middle - var d1 = S("n") - var in1 = S("faang") - var res1 = in1.split(d1) - assert_equal(len(res1), 2) - assert_equal(res1[0], "faa") - assert_equal(res1[1], "g") - - # Matches should be 
properly split in multiple case - var d2 = S(" ") - var in2 = S("modcon is coming soon") - var res2 = in2.split(d2) - assert_equal(len(res2), 4) - assert_equal(res2[0], "modcon") - assert_equal(res2[1], "is") - assert_equal(res2[2], "coming") - assert_equal(res2[3], "soon") - - # No match from the delimiter - var d3 = S("x") - var in3 = S("hello world") - var res3 = in3.split(d3) - assert_equal(len(res3), 1) - assert_equal(res3[0], "hello world") - - # Multiple character delimiter - var d4 = S("ll") - var in4 = S("hello") - var res4 = in4.split(d4) - assert_equal(len(res4), 2) - assert_equal(res4[0], "he") - assert_equal(res4[1], "o") - - # related to #2879 - # TODO: replace string comparison when __eq__ is implemented for List - s = S("abbaaaabbba").split("a").__str__() - assert_equal(s, "['', 'bb', '', '', '', 'bbb', '']") - s = S("abbaaaabbba").split("a", 8).__str__() - assert_equal(s, "['', 'bb', '', '', '', 'bbb', '']") - s = S("abbaaaabbba").split("a", 5).__str__() - assert_equal(s, "['', 'bb', '', '', '', 'bbba']") - assert_equal(S("aaa").split("a", 0).__str__(), "['aaa']") - assert_equal(S("a").split("a").__str__(), "['', '']") - assert_equal(S("1,2,3").split("3", 0).__str__(), "['1,2,3']") - assert_equal(S("1,2,3").split("3", 1).__str__(), "['1,2,', '']") - assert_equal(S("1,2,3,3").split("3", 2).__str__(), "['1,2,', ',', '']") - assert_equal(S("1,2,3,3,3").split("3", 2).__str__(), "['1,2,', ',', ',3']") - - var in5 = S("Hello 🔥!") - var res5 = in5.split() - assert_equal(len(res5), 2) - assert_equal(res5[0], "Hello") - assert_equal(res5[1], "🔥!") - - var in6 = S("Лорем ипсум долор сит амет") - var res6 = in6.split(" ") - assert_equal(len(res6), 5) - assert_equal(res6[0], "Лорем") - assert_equal(res6[1], "ипсум") - assert_equal(res6[2], "долор") - assert_equal(res6[3], "сит") - assert_equal(res6[4], "амет") - var res7 = in6.split("м") - assert_equal(res7[0], "Лоре") - assert_equal(res7[1], " ипсу") -
assert_equal(res7[2], " долор сит а") - assert_equal(res7[3], "ет") - - assert_equal(S("123").split(""), List[String]("", "1", "2", "3", "")) - assert_equal(S("").join(S("123").split("")), "123") - assert_equal(S(",1,2,3,").split(","), S("123").split("")) - assert_equal(S(",").join(S("123").split("")), ",1,2,3,") - - def test_join(): - alias S = StringSlice - l1 = List[UInt8](1, 2, 3, 4, 5, 6, 7, 8, 9) + alias S = StringSlice[StaticConstantOrigin] + l1 = List[Byte](1, 2, 3, 4, 5, 6, 7, 8, 9) assert_equal(S(",").join(l1), "1,2,3,4,5,6,7,8,9") - assert_equal(S(",").join(List[UInt8](1, 2, 3)), "1,2,3") - assert_equal(S(",").join(List[UInt8]()), "") - assert_equal(S(",").join(List[UInt8](1)), "1") + assert_equal(S(",").join(List[Byte](1, 2, 3)), "1,2,3") + assert_equal(S(",").join(List[Byte]()), "") + assert_equal(S(",").join(List[Byte](1)), "1") l2 = List[S]("1", "2", "3") assert_equal(S(",").join(l2), "1,2,3") @@ -588,5 +443,4 @@ fn main() raises: test_combination_10_good_utf8_sequences() test_combination_10_good_10_bad_utf8_sequences() test_count_utf8_continuation_bytes() - test_split() test_join() From 25d6d0c056d44bb39d1c64b64cfe8bff98d5b049 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Mon, 28 Oct 2024 12:17:57 -0300 Subject: [PATCH 6/7] fix details Signed-off-by: martinvuyk --- stdlib/src/utils/string_slice.mojo | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/stdlib/src/utils/string_slice.mojo b/stdlib/src/utils/string_slice.mojo index 5336ceb665..88362f31b7 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -966,22 +966,18 @@ struct StringSlice[is_mutable: Bool, //, origin: Origin[is_mutable].type,]( """ n_elems = len(elems) - if n_elems == 0: - return String("") s_len = self.byte_length() len_elems = 0 # Calculate the total size of the elements to join beforehand # to prevent alloc syscalls as we know the buffer size.
# This can hugely improve the performance on large lists - for e_ref in elems: - len_elems += len(e_ref[].as_bytes_read()) - capacity = s_len * (n_elems - 1) + len_elems + 1 + for e in elems: + len_elems += len(e[].as_bytes_read()) + capacity = s_len * (n_elems - int(n_elems > 0)) + len_elems + 1 buf = String._buffer_type(capacity=capacity) buf.size = capacity - s_ptr = self.unsafe_ptr() - b_ptr = buf.unsafe_ptr() - offset = 0 - i = 0 + s_ptr, b_ptr = self.unsafe_ptr(), buf.unsafe_ptr() + offset, i = 0, 0 not_first = False while i < n_elems: memcpy(dest=b_ptr + offset, src=s_ptr, count=s_len * int(not_first)) From 236187a0dcbfca990422ea8c99c7c238dd7b614c Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Mon, 28 Oct 2024 12:27:13 -0300 Subject: [PATCH 7/7] fix detail Signed-off-by: martinvuyk --- stdlib/src/utils/string_slice.mojo | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stdlib/src/utils/string_slice.mojo b/stdlib/src/utils/string_slice.mojo index 88362f31b7..e4a8f5b028 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -983,7 +983,7 @@ struct StringSlice[is_mutable: Bool, //, origin: Origin[is_mutable].type,]( memcpy(dest=b_ptr + offset, src=s_ptr, count=s_len * int(not_first)) offset += s_len * int(not_first) not_first = True - e = elems[i].as_bytes_read() + e = elems.unsafe_get(i).as_bytes_read() e_len = len(e) memcpy(dest=b_ptr + offset, src=e.unsafe_ptr(), count=e_len) offset += e_len