From 2087d5000d73c453340ab1cfe22c63ee2df925f4 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 15 Oct 2024 23:08:21 +0000 Subject: [PATCH] chore(c/vendor): Update nanoarrow to 0.6.0 (#2220) This PR updates the vendored nanoarrow to 0.6.0! --- c/subprojects/nanoarrow.wrap | 10 +- c/vendor/nanoarrow/nanoarrow.c | 701 +++++++++++++++++++++++++------ c/vendor/nanoarrow/nanoarrow.h | 600 +++++++++++++++++++++++--- c/vendor/nanoarrow/nanoarrow.hpp | 13 +- c/vendor/vendor_nanoarrow.sh | 22 +- 5 files changed, 1146 insertions(+), 200 deletions(-) diff --git a/c/subprojects/nanoarrow.wrap b/c/subprojects/nanoarrow.wrap index 1a7e85693d..612d1118a6 100644 --- a/c/subprojects/nanoarrow.wrap +++ b/c/subprojects/nanoarrow.wrap @@ -1,10 +1,8 @@ [wrap-file] -directory = arrow-nanoarrow-apache-arrow-nanoarrow-0.5.0 -source_url = https://github.com/apache/arrow-nanoarrow/archive/refs/tags/apache-arrow-nanoarrow-0.5.0.tar.gz -source_filename = apache-arrow-nanoarrow-0.5.0.tar.gz -source_hash = 0ceeaa1fb005dbc89c8c7d1b39f2dba07344e40aa9d885ee25fb55b4d57e331a -source_fallback_url = https://github.com/mesonbuild/wrapdb/releases/download/nanoarrow_0.5.0-1/apache-arrow-nanoarrow-0.5.0.tar.gz -wrapdb_version = 0.5.0-1 +directory = arrow-nanoarrow-33d2c8b973d8f8f424e02ac92ddeaace2a92f8dd +source_url = https://github.com/apache/arrow-nanoarrow/archive/33d2c8b973d8f8f424e02ac92ddeaace2a92f8dd.tar.gz +source_filename = arrow-nanoarrow-33d2c8b973d8f8f424e02ac92ddeaace2a92f8dd.tar.gz +source_hash = be4d2a6f1467793fe1b02c6ecf12383ed9ecf29557531715a3b9e11578ab18e8 [provide] nanoarrow = nanoarrow_dep diff --git a/c/vendor/nanoarrow/nanoarrow.c b/c/vendor/nanoarrow/nanoarrow.c index 9677a0e533..8f2659881b 100644 --- a/c/vendor/nanoarrow/nanoarrow.c +++ b/c/vendor/nanoarrow/nanoarrow.c @@ -66,6 +66,7 @@ void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type) { switch (storage_type) { case NANOARROW_TYPE_UNINITIALIZED: case NANOARROW_TYPE_NA: + case NANOARROW_TYPE_RUN_END_ENCODED: layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_NONE; layout->buffer_data_type[0] = NANOARROW_TYPE_UNINITIALIZED; layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; @@ -178,6 +179,16 @@ void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type) { layout->buffer_data_type[2] = NANOARROW_TYPE_BINARY; break; + case NANOARROW_TYPE_BINARY_VIEW: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[1] = NANOARROW_TYPE_BINARY_VIEW; + layout->element_size_bits[1] = 128; + break; + case NANOARROW_TYPE_STRING_VIEW: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[1] = NANOARROW_TYPE_STRING_VIEW; + layout->element_size_bits[1] = 128; + default: break; } @@ -345,6 +356,7 @@ ArrowErrorCode ArrowDecimalSetDigits(struct ArrowDecimal* decimal, // https://github.com/apache/arrow/blob/cd3321b28b0c9703e5d7105d6146c1270bbadd7f/cpp/src/arrow/util/decimal.cc#L365 ArrowErrorCode ArrowDecimalAppendDigitsToBuffer(const struct ArrowDecimal* decimal, struct ArrowBuffer* buffer) { + NANOARROW_DCHECK(decimal->n_words == 2 || decimal->n_words == 4); int is_negative = ArrowDecimalSign(decimal) < 0; uint64_t words_little_endian[4]; @@ -468,6 +480,7 @@ ArrowErrorCode ArrowDecimalAppendDigitsToBuffer(const struct ArrowDecimal* decim // under the License. #include +#include #include #include #include @@ -552,8 +565,12 @@ static const char* ArrowSchemaFormatTemplate(enum ArrowType type) { return "u"; case NANOARROW_TYPE_LARGE_STRING: return "U"; + case NANOARROW_TYPE_STRING_VIEW: + return "vu"; case NANOARROW_TYPE_BINARY: return "z"; + case NANOARROW_TYPE_BINARY_VIEW: + return "vz"; case NANOARROW_TYPE_LARGE_BINARY: return "Z"; @@ -576,6 +593,8 @@ static const char* ArrowSchemaFormatTemplate(enum ArrowType type) { return "+s"; case NANOARROW_TYPE_MAP: return "+m"; + case NANOARROW_TYPE_RUN_END_ENCODED: + return "+r"; default: return NULL; @@ -607,6 +626,13 @@ static int ArrowSchemaInitChildrenIfNeeded(struct ArrowSchema* schema, NANOARROW_RETURN_NOT_OK( ArrowSchemaSetName(schema->children[0]->children[1], "value")); break; + case NANOARROW_TYPE_RUN_END_ENCODED: + NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, 2)); + ArrowSchemaInit(schema->children[0]); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[0], "run_ends")); + schema->children[0]->flags &= ~ARROW_FLAG_NULLABLE; + ArrowSchemaInit(schema->children[1]); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[1], "values")); default: break; } @@ -676,10 +702,10 @@ ArrowErrorCode ArrowSchemaSetTypeFixedSize(struct ArrowSchema* schema, int n_chars; switch (type) { case NANOARROW_TYPE_FIXED_SIZE_BINARY: - n_chars = snprintf(buffer, sizeof(buffer), "w:%d", (int)fixed_size); + n_chars = snprintf(buffer, sizeof(buffer), "w:%" PRId32, fixed_size); break; case NANOARROW_TYPE_FIXED_SIZE_LIST: - n_chars = snprintf(buffer, sizeof(buffer), "+w:%d", (int)fixed_size); + n_chars = snprintf(buffer, sizeof(buffer), "+w:%" PRId32, fixed_size); break; default: return EINVAL; @@ -729,6 +755,28 @@ ArrowErrorCode ArrowSchemaSetTypeDecimal(struct ArrowSchema* schema, enum ArrowT return ArrowSchemaSetFormat(schema, buffer); } +ArrowErrorCode ArrowSchemaSetTypeRunEndEncoded(struct ArrowSchema* schema, + enum ArrowType run_end_type) { + switch (run_end_type) { + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_INT64: + break; + default: + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat( + schema, ArrowSchemaFormatTemplate(NANOARROW_TYPE_RUN_END_ENCODED))); + NANOARROW_RETURN_NOT_OK( + ArrowSchemaInitChildrenIfNeeded(schema, NANOARROW_TYPE_RUN_END_ENCODED)); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema->children[0], run_end_type)); + NANOARROW_RETURN_NOT_OK( + ArrowSchemaSetType(schema->children[1], NANOARROW_TYPE_UNINITIALIZED)); + + return NANOARROW_OK; +} + static const char* ArrowTimeUnitFormatString(enum ArrowTimeUnit time_unit) { switch (time_unit) { case NANOARROW_TIME_UNIT_SECOND: @@ -850,7 +898,7 @@ ArrowErrorCode ArrowSchemaSetTypeUnion(struct ArrowSchema* schema, enum ArrowTyp format_out_size -= n_chars; for (int64_t i = 1; i < n_children; i++) { - n_chars = snprintf(format_cursor, format_out_size, ",%d", (int)i); + n_chars = snprintf(format_cursor, format_out_size, ",%" PRId64, i); format_cursor += n_chars; format_out_size -= n_chars; } @@ -1144,8 +1192,9 @@ static ArrowErrorCode ArrowSchemaViewParse(struct ArrowSchemaView* schema_view, ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_DECIMAL256); return NANOARROW_OK; default: - ArrowErrorSet(error, "Expected decimal bitwidth of 128 or 256 but found %d", - (int)schema_view->decimal_bitwidth); + ArrowErrorSet(error, + "Expected decimal bitwidth of 128 or 256 but found %" PRId32, + schema_view->decimal_bitwidth); return EINVAL; } @@ -1202,6 +1251,13 @@ static ArrowErrorCode ArrowSchemaViewParse(struct ArrowSchemaView* schema_view, *format_end_out = format + 2; return NANOARROW_OK; + // run end encoded has no buffer at all + case 'r': + schema_view->storage_type = NANOARROW_TYPE_RUN_END_ENCODED; + schema_view->type = NANOARROW_TYPE_RUN_END_ENCODED; + *format_end_out = format + 2; + return NANOARROW_OK; + // just validity buffer case 'w': if (format[2] != ':' || format[3] == '\0') { @@ -1249,11 +1305,10 @@ static ArrowErrorCode ArrowSchemaViewParse(struct ArrowSchemaView* schema_view, int64_t n_type_ids = _ArrowParseUnionTypeIds(schema_view->union_type_ids, NULL); if (n_type_ids != schema_view->schema->n_children) { - ArrowErrorSet( - error, - "Expected union type_ids parameter to be a comma-separated list of %ld " - "values between 0 and 127 but found '%s'", - (long)schema_view->schema->n_children, schema_view->union_type_ids); + ArrowErrorSet(error, + "Expected union type_ids parameter to be a comma-separated " + "list of %" PRId64 " values between 0 and 127 but found '%s'", + schema_view->schema->n_children, schema_view->union_type_ids); return EINVAL; } *format_end_out = format + strlen(format); @@ -1432,6 +1487,24 @@ static ArrowErrorCode ArrowSchemaViewParse(struct ArrowSchemaView* schema_view, return EINVAL; } + // view types + case 'v': { + switch (format[1]) { + case 'u': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_STRING_VIEW); + *format_end_out = format + 2; + return NANOARROW_OK; + case 'z': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_BINARY_VIEW); + *format_end_out = format + 2; + return NANOARROW_OK; + default: + ArrowErrorSet(error, "Expected 'u', or 'z' following 'v' but found '%s'", + format + 1); + return EINVAL; + } + } + default: ArrowErrorSet(error, "Unknown format: '%s'", format); return EINVAL; @@ -1441,8 +1514,9 @@ static ArrowErrorCode ArrowSchemaViewParse(struct ArrowSchemaView* schema_view, static ArrowErrorCode ArrowSchemaViewValidateNChildren( struct ArrowSchemaView* schema_view, int64_t n_children, struct ArrowError* error) { if (n_children != -1 && schema_view->schema->n_children != n_children) { - ArrowErrorSet(error, "Expected schema with %d children but found %d children", - (int)n_children, (int)schema_view->schema->n_children); + ArrowErrorSet( + error, "Expected schema with %" PRId64 " children but found %" PRId64 " children", + n_children, schema_view->schema->n_children); return EINVAL; } @@ -1452,15 +1526,15 @@ static ArrowErrorCode ArrowSchemaViewValidateNChildren( for (int64_t i = 0; i < schema_view->schema->n_children; i++) { child = schema_view->schema->children[i]; if (child == NULL) { - ArrowErrorSet(error, - "Expected valid schema at schema->children[%ld] but found NULL", - (long)i); + ArrowErrorSet( + error, "Expected valid schema at schema->children[%" PRId64 "] but found NULL", + i); return EINVAL; } else if (child->release == NULL) { - ArrowErrorSet( - error, - "Expected valid schema at schema->children[%ld] but found a released schema", - (long)i); + ArrowErrorSet(error, + "Expected valid schema at schema->children[%" PRId64 + "] but found a released schema", + i); return EINVAL; } } @@ -1478,8 +1552,9 @@ static ArrowErrorCode ArrowSchemaViewValidateMap(struct ArrowSchemaView* schema_ NANOARROW_RETURN_NOT_OK(ArrowSchemaViewValidateNChildren(schema_view, 1, error)); if (schema_view->schema->children[0]->n_children != 2) { - ArrowErrorSet(error, "Expected child of map type to have 2 children but found %d", - (int)schema_view->schema->children[0]->n_children); + ArrowErrorSet(error, + "Expected child of map type to have 2 children but found %" PRId64, + schema_view->schema->children[0]->n_children); return EINVAL; } @@ -1561,6 +1636,8 @@ static ArrowErrorCode ArrowSchemaViewValidate(struct ArrowSchemaView* schema_vie case NANOARROW_TYPE_TIME32: case NANOARROW_TYPE_TIME64: case NANOARROW_TYPE_DURATION: + case NANOARROW_TYPE_BINARY_VIEW: + case NANOARROW_TYPE_STRING_VIEW: return ArrowSchemaViewValidateNChildren(schema_view, 0, error); case NANOARROW_TYPE_FIXED_SIZE_BINARY: @@ -1576,6 +1653,9 @@ static ArrowErrorCode ArrowSchemaViewValidate(struct ArrowSchemaView* schema_vie case NANOARROW_TYPE_FIXED_SIZE_LIST: return ArrowSchemaViewValidateNChildren(schema_view, 1, error); + case NANOARROW_TYPE_RUN_END_ENCODED: + return ArrowSchemaViewValidateNChildren(schema_view, 2, error); + case NANOARROW_TYPE_STRUCT: return ArrowSchemaViewValidateNChildren(schema_view, -1, error); @@ -1591,7 +1671,7 @@ static ArrowErrorCode ArrowSchemaViewValidate(struct ArrowSchemaView* schema_vie default: ArrowErrorSet(error, "Expected a valid enum ArrowType value but found %d", - (int)schema_view->type); + schema_view->type); return EINVAL; } @@ -1641,8 +1721,8 @@ ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view, } if ((format + format_len) != format_end_out) { - ArrowErrorSet(error, "Error parsing schema->format '%s': parsed %d/%d characters", - format, (int)(format_end_out - format), (int)(format_len)); + ArrowErrorSet(error, "Error parsing schema->format '%s': parsed %d/%zu characters", + format, (int)(format_end_out - format), format_len); return EINVAL; } @@ -1702,9 +1782,8 @@ static int64_t ArrowSchemaTypeToStringInternal(struct ArrowSchemaView* schema_vi switch (schema_view->type) { case NANOARROW_TYPE_DECIMAL128: case NANOARROW_TYPE_DECIMAL256: - return snprintf(out, n, "%s(%d, %d)", type_string, - (int)schema_view->decimal_precision, - (int)schema_view->decimal_scale); + return snprintf(out, n, "%s(%" PRId32 ", %" PRId32 ")", type_string, + schema_view->decimal_precision, schema_view->decimal_scale); case NANOARROW_TYPE_TIMESTAMP: return snprintf(out, n, "%s('%s', '%s')", type_string, ArrowTimeUnitString(schema_view->time_unit), schema_view->timezone); @@ -1715,7 +1794,7 @@ static int64_t ArrowSchemaTypeToStringInternal(struct ArrowSchemaView* schema_vi ArrowTimeUnitString(schema_view->time_unit)); case NANOARROW_TYPE_FIXED_SIZE_BINARY: case NANOARROW_TYPE_FIXED_SIZE_LIST: - return snprintf(out, n, "%s(%ld)", type_string, (long)schema_view->fixed_size); + return snprintf(out, n, "%s(%" PRId32 ")", type_string, schema_view->fixed_size); case NANOARROW_TYPE_SPARSE_UNION: case NANOARROW_TYPE_DENSE_UNION: return snprintf(out, n, "%s([%s])", type_string, schema_view->union_type_ids); @@ -1731,7 +1810,7 @@ static inline void ArrowToStringLogChars(char** out, int64_t n_chars_last, // In the unlikely snprintf() returning a negative value (encoding error), // ensure the result won't cause an out-of-bounds access. if (n_chars_last < 0) { - n_chars = 0; + n_chars_last = 0; } *n_chars += n_chars_last; @@ -2070,6 +2149,10 @@ ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer, // under the License. #include +#include +#include +#include +#include #include #include @@ -2083,6 +2166,12 @@ static void ArrowArrayReleaseInternal(struct ArrowArray* array) { ArrowBitmapReset(&private_data->bitmap); ArrowBufferReset(&private_data->buffers[0]); ArrowBufferReset(&private_data->buffers[1]); + ArrowFree(private_data->buffer_data); + for (int32_t i = 0; i < private_data->n_variadic_buffers; ++i) { + ArrowBufferReset(&private_data->variadic_buffers[i]); + } + ArrowFree(private_data->variadic_buffers); + ArrowFree(private_data->variadic_buffer_sizes); ArrowFree(private_data); } @@ -2123,6 +2212,7 @@ static ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array, switch (storage_type) { case NANOARROW_TYPE_UNINITIALIZED: case NANOARROW_TYPE_NA: + case NANOARROW_TYPE_RUN_END_ENCODED: array->n_buffers = 0; break; @@ -2156,7 +2246,10 @@ static ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array, case NANOARROW_TYPE_DENSE_UNION: array->n_buffers = 2; break; - + case NANOARROW_TYPE_BINARY_VIEW: + case NANOARROW_TYPE_STRING_VIEW: + array->n_buffers = NANOARROW_BINARY_VIEW_FIXED_BUFFERS + 1; + break; case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_LARGE_STRING: case NANOARROW_TYPE_BINARY: @@ -2199,12 +2292,36 @@ ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array, ArrowBitmapInit(&private_data->bitmap); ArrowBufferInit(&private_data->buffers[0]); ArrowBufferInit(&private_data->buffers[1]); - private_data->buffer_data[0] = NULL; - private_data->buffer_data[1] = NULL; - private_data->buffer_data[2] = NULL; + private_data->buffer_data = + (const void**)ArrowMalloc(sizeof(void*) * NANOARROW_MAX_FIXED_BUFFERS); + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; ++i) { + private_data->buffer_data[i] = NULL; + } + private_data->n_variadic_buffers = 0; + private_data->variadic_buffers = NULL; + private_data->variadic_buffer_sizes = NULL; array->private_data = private_data; - array->buffers = (const void**)(&private_data->buffer_data); + array->buffers = (const void**)(private_data->buffer_data); + + // These are not technically "storage" in the sense that they do not appear + // in the ArrowSchemaView's storage_type member; however, allowing them here + // is helpful to maximize the number of types that can avoid going through + // ArrowArrayInitFromSchema(). + switch (storage_type) { + case NANOARROW_TYPE_DURATION: + case NANOARROW_TYPE_TIMESTAMP: + case NANOARROW_TYPE_TIME64: + case NANOARROW_TYPE_DATE64: + storage_type = NANOARROW_TYPE_INT64; + break; + case NANOARROW_TYPE_TIME32: + case NANOARROW_TYPE_DATE32: + storage_type = NANOARROW_TYPE_INT32; + break; + default: + break; + } int result = ArrowArraySetStorageType(array, storage_type); if (result != NANOARROW_OK) { @@ -2488,10 +2605,26 @@ static void ArrowArrayFlushInternalPointers(struct ArrowArray* array) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; - for (int64_t i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + const bool is_binary_view = private_data->storage_type == NANOARROW_TYPE_STRING_VIEW || + private_data->storage_type == NANOARROW_TYPE_BINARY_VIEW; + const int32_t nfixed_buf = is_binary_view ? 2 : NANOARROW_MAX_FIXED_BUFFERS; + + for (int32_t i = 0; i < nfixed_buf; i++) { private_data->buffer_data[i] = ArrowArrayBuffer(array, i)->data; } + if (is_binary_view) { + const int32_t nvirt_buf = private_data->n_variadic_buffers; + private_data->buffer_data = (const void**)ArrowRealloc( + private_data->buffer_data, sizeof(void*) * (nfixed_buf + nvirt_buf + 1)); + for (int32_t i = 0; i < nvirt_buf; i++) { + private_data->buffer_data[nfixed_buf + i] = private_data->variadic_buffers[i].data; + } + private_data->buffer_data[nfixed_buf + nvirt_buf] = + private_data->variadic_buffer_sizes; + array->buffers = (const void**)(private_data->buffer_data); + } + for (int64_t i = 0; i < array->n_children; i++) { ArrowArrayFlushInternalPointers(array->children[i]); } @@ -2547,6 +2680,11 @@ ArrowErrorCode ArrowArrayViewAllocateChildren(struct ArrowArrayView* array_view, return EINVAL; } + if (n_children == 0) { + array_view->n_children = 0; + return NANOARROW_OK; + } + array_view->children = (struct ArrowArrayView**)ArrowMalloc(n_children * sizeof(struct ArrowArrayView*)); if (array_view->children == NULL) { @@ -2695,6 +2833,8 @@ void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length) case NANOARROW_BUFFER_TYPE_UNION_OFFSET: array_view->buffer_views[i].size_bytes = element_size_bytes * length; continue; + case NANOARROW_BUFFER_TYPE_VARIADIC_DATA: + case NANOARROW_BUFFER_TYPE_VARIADIC_SIZE: case NANOARROW_BUFFER_TYPE_NONE: array_view->buffer_views[i].size_bytes = 0; continue; @@ -2727,9 +2867,16 @@ static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view, array_view->offset = array->offset; array_view->length = array->length; array_view->null_count = array->null_count; + array_view->variadic_buffer_sizes = NULL; + array_view->variadic_buffers = NULL; + array_view->n_variadic_buffers = 0; int64_t buffers_required = 0; - for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + const int nfixed_buf = array_view->storage_type == NANOARROW_TYPE_STRING_VIEW || + array_view->storage_type == NANOARROW_TYPE_BINARY_VIEW + ? NANOARROW_BINARY_VIEW_FIXED_BUFFERS + : NANOARROW_MAX_FIXED_BUFFERS; + for (int i = 0; i < nfixed_buf; i++) { if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) { break; } @@ -2747,17 +2894,30 @@ static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view, } } - // Check the number of buffers + if (array_view->storage_type == NANOARROW_TYPE_STRING_VIEW || + array_view->storage_type == NANOARROW_TYPE_BINARY_VIEW) { + const int64_t n_buffers = array->n_buffers; + const int32_t nfixed_buf = NANOARROW_BINARY_VIEW_FIXED_BUFFERS; + + const int32_t nvariadic_buf = (int32_t)(n_buffers - nfixed_buf - 1); + array_view->n_variadic_buffers = nvariadic_buf; + buffers_required += nvariadic_buf + 1; + array_view->variadic_buffers = array->buffers + NANOARROW_BINARY_VIEW_FIXED_BUFFERS; + array_view->variadic_buffer_sizes = (int64_t*)array->buffers[n_buffers - 1]; + } + if (buffers_required != array->n_buffers) { - ArrowErrorSet(error, "Expected array with %d buffer(s) but found %d buffer(s)", - (int)buffers_required, (int)array->n_buffers); + ArrowErrorSet(error, + "Expected array with %" PRId64 " buffer(s) but found %" PRId64 + " buffer(s)", + buffers_required, array->n_buffers); return EINVAL; } // Check number of children if (array_view->n_children != array->n_children) { - ArrowErrorSet(error, "Expected %ld children but found %ld children", - (long)array_view->n_children, (long)array->n_children); + ArrowErrorSet(error, "Expected %" PRId64 " children but found %" PRId64 " children", + array_view->n_children, array->n_children); return EINVAL; } @@ -2789,14 +2949,20 @@ static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view, static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view, struct ArrowError* error) { if (array_view->length < 0) { - ArrowErrorSet(error, "Expected length >= 0 but found length %ld", - (long)array_view->length); + ArrowErrorSet(error, "Expected length >= 0 but found length %" PRId64, + array_view->length); return EINVAL; } if (array_view->offset < 0) { - ArrowErrorSet(error, "Expected offset >= 0 but found offset %ld", - (long)array_view->offset); + ArrowErrorSet(error, "Expected offset >= 0 but found offset %" PRId64, + array_view->offset); + return EINVAL; + } + + // Ensure that offset + length fits within an int64 before a possible overflow + if ((uint64_t)array_view->offset + (uint64_t)array_view->length > (uint64_t)INT64_MAX) { + ArrowErrorSet(error, "Offset + length is > INT64_MAX"); return EINVAL; } @@ -2809,7 +2975,9 @@ static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view, for (int i = 0; i < 2; i++) { int64_t element_size_bytes = array_view->layout.element_size_bits[i] / 8; // Initialize with a value that will cause an error if accidentally used uninitialized - int64_t min_buffer_size_bytes = array_view->buffer_views[i].size_bytes + 1; + // Need to suppress the clang-tidy warning because gcc warns for possible use + int64_t min_buffer_size_bytes = // NOLINT(clang-analyzer-deadcode.DeadStores) + array_view->buffer_views[i].size_bytes + 1; switch (array_view->layout.buffer_type[i]) { case NANOARROW_BUFFER_TYPE_VALIDITY: @@ -2835,6 +3003,8 @@ static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view, case NANOARROW_BUFFER_TYPE_UNION_OFFSET: min_buffer_size_bytes = element_size_bytes * offset_plus_length; break; + case NANOARROW_BUFFER_TYPE_VARIADIC_DATA: + case NANOARROW_BUFFER_TYPE_VARIADIC_SIZE: case NANOARROW_BUFFER_TYPE_NONE: continue; } @@ -2844,11 +3014,11 @@ static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view, array_view->buffer_views[i].size_bytes = min_buffer_size_bytes; } else if (array_view->buffer_views[i].size_bytes < min_buffer_size_bytes) { ArrowErrorSet(error, - "Expected %s array buffer %d to have size >= %ld bytes but found " - "buffer with %ld bytes", - ArrowTypeString(array_view->storage_type), (int)i, - (long)min_buffer_size_bytes, - (long)array_view->buffer_views[i].size_bytes); + "Expected %s array buffer %d to have size >= %" PRId64 + " bytes but found " + "buffer with %" PRId64 " bytes", + ArrowTypeString(array_view->storage_type), i, min_buffer_size_bytes, + array_view->buffer_views[i].size_bytes); return EINVAL; } } @@ -2860,11 +3030,20 @@ static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view, case NANOARROW_TYPE_FIXED_SIZE_LIST: case NANOARROW_TYPE_MAP: if (array_view->n_children != 1) { - ArrowErrorSet(error, "Expected 1 child of %s array but found %ld child arrays", - ArrowTypeString(array_view->storage_type), - (long)array_view->n_children); + ArrowErrorSet(error, + "Expected 1 child of %s array but found %" PRId64 " child arrays", + ArrowTypeString(array_view->storage_type), array_view->n_children); return EINVAL; } + break; + case NANOARROW_TYPE_RUN_END_ENCODED: + if (array_view->n_children != 2) { + ArrowErrorSet( + error, "Expected 2 children for %s array but found %" PRId64 " child arrays", + ArrowTypeString(array_view->storage_type), array_view->n_children); + return EINVAL; + } + break; default: break; } @@ -2878,12 +3057,11 @@ static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view, child_min_length = (array_view->offset + array_view->length); for (int64_t i = 0; i < array_view->n_children; i++) { if (array_view->children[i]->length < child_min_length) { - ArrowErrorSet( - error, - "Expected struct child %d to have length >= %ld but found child with " - "length %ld", - (int)(i + 1), (long)(child_min_length), - (long)array_view->children[i]->length); + ArrowErrorSet(error, + "Expected struct child %" PRId64 " to have length >= %" PRId64 + " but found child with " + "length %" PRId64, + i + 1, child_min_length, array_view->children[i]->length); return EINVAL; } } @@ -2894,12 +3072,78 @@ static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view, array_view->layout.child_size_elements; if (array_view->children[0]->length < child_min_length) { ArrowErrorSet(error, - "Expected child of fixed_size_list array to have length >= %ld but " - "found array with length %ld", - (long)child_min_length, (long)array_view->children[0]->length); + "Expected child of fixed_size_list array to have length >= %" PRId64 + " but " + "found array with length %" PRId64, + child_min_length, array_view->children[0]->length); return EINVAL; } break; + + case NANOARROW_TYPE_RUN_END_ENCODED: { + if (array_view->n_children != 2) { + ArrowErrorSet(error, + "Expected 2 children for run-end encoded array but found %" PRId64, + array_view->n_children); + return EINVAL; + } + struct ArrowArrayView* run_ends_view = array_view->children[0]; + struct ArrowArrayView* values_view = array_view->children[1]; + int64_t max_length; + switch (run_ends_view->storage_type) { + case NANOARROW_TYPE_INT16: + max_length = INT16_MAX; + break; + case NANOARROW_TYPE_INT32: + max_length = INT32_MAX; + break; + case NANOARROW_TYPE_INT64: + max_length = INT64_MAX; + break; + default: + ArrowErrorSet( + error, + "Run-end encoded array only supports INT16, INT32 or INT64 run-ends " + "but found run-ends type %s", + ArrowTypeString(run_ends_view->storage_type)); + return EINVAL; + } + + // There is already a check above that offset_plus_length < INT64_MAX + if (offset_plus_length > max_length) { + ArrowErrorSet(error, + "Offset + length of a run-end encoded array must fit in a value" + " of the run end type %s but is %" PRId64 " + %" PRId64, + ArrowTypeString(run_ends_view->storage_type), array_view->offset, + array_view->length); + return EINVAL; + } + + if (run_ends_view->length > values_view->length) { + ArrowErrorSet(error, + "Length of run_ends is greater than the length of values: %" PRId64 + " > %" PRId64, + run_ends_view->length, values_view->length); + return EINVAL; + } + + if (run_ends_view->length == 0 && values_view->length != 0) { + ArrowErrorSet(error, + "Run-end encoded array has zero length %" PRId64 + ", but values array has " + "non-zero length", + values_view->length); + return EINVAL; + } + + if (run_ends_view->null_count != 0) { + ArrowErrorSet(error, "Null count must be 0 for run ends array, but is %" PRId64, + run_ends_view->null_count); + return EINVAL; + } + break; + } + default: break; } @@ -2935,24 +3179,30 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view, case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_BINARY: if (array_view->buffer_views[1].size_bytes != 0) { - first_offset = array_view->buffer_views[1].data.as_int32[0]; + first_offset = array_view->buffer_views[1].data.as_int32[array_view->offset]; if (first_offset < 0) { - ArrowErrorSet(error, "Expected first offset >= 0 but found %ld", - (long)first_offset); + ArrowErrorSet(error, "Expected first offset >= 0 but found %" PRId64, + first_offset); return EINVAL; } last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length]; + if (last_offset < 0) { + ArrowErrorSet(error, "Expected last offset >= 0 but found %" PRId64, + last_offset); + return EINVAL; + } // If the data buffer size is unknown, assign it; otherwise, check it if (array_view->buffer_views[2].size_bytes == -1) { array_view->buffer_views[2].size_bytes = last_offset; } else if (array_view->buffer_views[2].size_bytes < last_offset) { ArrowErrorSet(error, - "Expected %s array buffer 2 to have size >= %ld bytes but found " - "buffer with %ld bytes", - ArrowTypeString(array_view->storage_type), (long)last_offset, - (long)array_view->buffer_views[2].size_bytes); + "Expected %s array buffer 2 to have size >= %" PRId64 + " bytes but found " + "buffer with %" PRId64 " bytes", + ArrowTypeString(array_view->storage_type), last_offset, + array_view->buffer_views[2].size_bytes); return EINVAL; } } else if (array_view->buffer_views[2].size_bytes == -1) { @@ -2965,24 +3215,30 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view, case NANOARROW_TYPE_LARGE_STRING: case NANOARROW_TYPE_LARGE_BINARY: if (array_view->buffer_views[1].size_bytes != 0) { - first_offset = array_view->buffer_views[1].data.as_int64[0]; + first_offset = array_view->buffer_views[1].data.as_int64[array_view->offset]; if (first_offset < 0) { - ArrowErrorSet(error, "Expected first offset >= 0 but found %ld", - (long)first_offset); + ArrowErrorSet(error, "Expected first offset >= 0 but found %" PRId64, + first_offset); return EINVAL; } last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length]; + if (last_offset < 0) { + ArrowErrorSet(error, "Expected last offset >= 0 but found %" PRId64, + last_offset); + return EINVAL; + } // If the data buffer size is unknown, assign it; otherwise, check it if (array_view->buffer_views[2].size_bytes == -1) { array_view->buffer_views[2].size_bytes = last_offset; } else if (array_view->buffer_views[2].size_bytes < last_offset) { ArrowErrorSet(error, - "Expected %s array buffer 2 to have size >= %ld bytes but found " - "buffer with %ld bytes", - ArrowTypeString(array_view->storage_type), (long)last_offset, - (long)array_view->buffer_views[2].size_bytes); + "Expected %s array buffer 2 to have size >= %" PRId64 + " bytes but found " + "buffer with %" PRId64 " bytes", + ArrowTypeString(array_view->storage_type), last_offset, + array_view->buffer_views[2].size_bytes); return EINVAL; } } else if (array_view->buffer_views[2].size_bytes == -1) { @@ -2995,12 +3251,11 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view, case NANOARROW_TYPE_STRUCT: for (int64_t i = 0; i < array_view->n_children; i++) { if (array_view->children[i]->length < offset_plus_length) { - ArrowErrorSet( - error, - "Expected struct child %d to have length >= %ld but found child with " - "length %ld", - (int)(i + 1), (long)offset_plus_length, - (long)array_view->children[i]->length); + ArrowErrorSet(error, + "Expected struct child %" PRId64 " to have length >= %" PRId64 + " but found child with " + "length %" PRId64, + i + 1, offset_plus_length, array_view->children[i]->length); return EINVAL; } } @@ -3009,21 +3264,27 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view, case NANOARROW_TYPE_LIST: case NANOARROW_TYPE_MAP: if (array_view->buffer_views[1].size_bytes != 0) { - first_offset = array_view->buffer_views[1].data.as_int32[0]; + first_offset = array_view->buffer_views[1].data.as_int32[array_view->offset]; if (first_offset < 0) { - ArrowErrorSet(error, "Expected first offset >= 0 but found %ld", - (long)first_offset); + ArrowErrorSet(error, "Expected first offset >= 0 but found %" PRId64, + first_offset); return EINVAL; } last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length]; + if (last_offset < 0) { + ArrowErrorSet(error, "Expected last offset >= 0 but found %" PRId64, + last_offset); + return EINVAL; + } + if (array_view->children[0]->length < last_offset) { - ArrowErrorSet( - error, - "Expected child of %s array to have length >= %ld but found array with " - "length %ld", - ArrowTypeString(array_view->storage_type), (long)last_offset, - (long)array_view->children[0]->length); + ArrowErrorSet(error, + "Expected child of %s array to have length >= %" PRId64 + " but found array with " + "length %" PRId64, + ArrowTypeString(array_view->storage_type), last_offset, + array_view->children[0]->length); return EINVAL; } } @@ -3031,24 +3292,58 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view, case NANOARROW_TYPE_LARGE_LIST: if (array_view->buffer_views[1].size_bytes != 0) { - first_offset = array_view->buffer_views[1].data.as_int64[0]; + first_offset = array_view->buffer_views[1].data.as_int64[array_view->offset]; if (first_offset < 0) { - ArrowErrorSet(error, "Expected first offset >= 0 but found %ld", - (long)first_offset); + ArrowErrorSet(error, "Expected first offset >= 0 but found %" PRId64, + first_offset); return EINVAL; } last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length]; + if (last_offset < 0) { + ArrowErrorSet(error, "Expected last offset >= 0 but found %" PRId64, + last_offset); + return EINVAL; + } + if (array_view->children[0]->length < last_offset) { - ArrowErrorSet( - error, - "Expected child of large list array to have length >= %ld but found array " - "with length %ld", - (long)last_offset, (long)array_view->children[0]->length); + ArrowErrorSet(error, + "Expected child of large list array to have length >= %" PRId64 + " but found array " + "with length %" PRId64, + last_offset, array_view->children[0]->length); return EINVAL; } } break; + + case NANOARROW_TYPE_RUN_END_ENCODED: { + struct ArrowArrayView* run_ends_view = array_view->children[0]; + if (run_ends_view->length == 0) { + break; + } + + int64_t first_run_end = ArrowArrayViewGetIntUnsafe(run_ends_view, 0); + if (first_run_end < 1) { + ArrowErrorSet( + error, + "All run ends must be greater than 0 but the first run end is %" PRId64, + first_run_end); + return EINVAL; + } + + // offset + length < INT64_MAX is checked in ArrowArrayViewValidateMinimal() + int64_t last_run_end = + ArrowArrayViewGetIntUnsafe(run_ends_view, run_ends_view->length - 1); + if (last_run_end < offset_plus_length) { + ArrowErrorSet(error, + "Last run end is %" PRId64 " but it should be >= (%" PRId64 + " + %" PRId64 ")", + last_run_end, array_view->offset, array_view->length); + return EINVAL; + } + break; + } default: break; } @@ -3101,7 +3396,7 @@ static int ArrowAssertIncreasingInt32(struct ArrowBufferView view, for (int64_t i = 1; i < view.size_bytes / (int64_t)sizeof(int32_t); i++) { if (view.data.as_int32[i] < view.data.as_int32[i - 1]) { - ArrowErrorSet(error, "[%ld] Expected element size >= 0", (long)i); + ArrowErrorSet(error, "[%" PRId64 "] Expected element size >= 0", i); return EINVAL; } } @@ -3117,7 +3412,7 @@ static int ArrowAssertIncreasingInt64(struct ArrowBufferView view, for (int64_t i = 1; i < view.size_bytes / (int64_t)sizeof(int64_t); i++) { if (view.data.as_int64[i] < view.data.as_int64[i - 1]) { - ArrowErrorSet(error, "[%ld] Expected element size >= 0", (long)i); + ArrowErrorSet(error, "[%" PRId64 "] Expected element size >= 0", i); return EINVAL; } } @@ -3130,8 +3425,9 @@ static int ArrowAssertRangeInt8(struct ArrowBufferView view, int8_t min_value, for (int64_t i = 0; i < view.size_bytes; i++) { if (view.data.as_int8[i] < min_value || view.data.as_int8[i] > max_value) { ArrowErrorSet(error, - "[%ld] Expected buffer value between %d and %d but found value %d", - (long)i, (int)min_value, (int)max_value, (int)view.data.as_int8[i]); + "[%" PRId64 "] Expected buffer value between %" PRId8 " and %" PRId8 + " but found value %" PRId8, + i, min_value, max_value, view.data.as_int8[i]); return EINVAL; } } @@ -3151,8 +3447,8 @@ static int ArrowAssertInt8In(struct ArrowBufferView view, const int8_t* values, } if (!item_found) { - ArrowErrorSet(error, "[%ld] Unexpected buffer value %d", (long)i, - (int)view.data.as_int8[i]); + ArrowErrorSet(error, "[%" PRId64 "] Unexpected buffer value %" PRId8, i, + view.data.as_int8[i]); return EINVAL; } } @@ -3164,13 +3460,24 @@ static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, struct ArrowError* error) { for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { switch (array_view->layout.buffer_type[i]) { + // Only validate the portion of the buffer that is strictly required, + // which includes not validating the offset buffer of a zero-length array. case NANOARROW_BUFFER_TYPE_DATA_OFFSET: + if (array_view->length == 0) { + continue; + } if (array_view->layout.element_size_bits[i] == 32) { - NANOARROW_RETURN_NOT_OK( - ArrowAssertIncreasingInt32(array_view->buffer_views[i], error)); + struct ArrowBufferView sliced_offsets; + sliced_offsets.data.as_int32 = + array_view->buffer_views[i].data.as_int32 + array_view->offset; + sliced_offsets.size_bytes = (array_view->length + 1) * sizeof(int32_t); + NANOARROW_RETURN_NOT_OK(ArrowAssertIncreasingInt32(sliced_offsets, error)); } else { - NANOARROW_RETURN_NOT_OK( - ArrowAssertIncreasingInt64(array_view->buffer_views[i], error)); + struct ArrowBufferView sliced_offsets; + sliced_offsets.data.as_int64 = + array_view->buffer_views[i].data.as_int64 + array_view->offset; + sliced_offsets.size_bytes = (array_view->length + 1) * sizeof(int64_t); + NANOARROW_RETURN_NOT_OK(ArrowAssertIncreasingInt64(sliced_offsets, error)); } break; default: @@ -3180,6 +3487,15 @@ static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION || array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION) { + struct ArrowBufferView sliced_type_ids; + sliced_type_ids.size_bytes = array_view->length * sizeof(int8_t); + if (array_view->length > 0) { + sliced_type_ids.data.as_int8 = + array_view->buffer_views[0].data.as_int8 + array_view->offset; + } else { + sliced_type_ids.data.as_int8 = NULL; + } + if (array_view->union_type_id_map == NULL) { // If the union_type_id map is NULL (e.g., when using ArrowArrayInitFromType() + // ArrowArrayAllocateChildren() + ArrowArrayFinishBuilding()), we don't have enough @@ -3191,9 +3507,9 @@ static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, array_view->union_type_id_map, array_view->n_children, array_view->n_children)) { NANOARROW_RETURN_NOT_OK(ArrowAssertRangeInt8( - array_view->buffer_views[0], 0, (int8_t)(array_view->n_children - 1), error)); + sliced_type_ids, 0, (int8_t)(array_view->n_children - 1), error)); } else { - NANOARROW_RETURN_NOT_OK(ArrowAssertInt8In(array_view->buffer_views[0], + NANOARROW_RETURN_NOT_OK(ArrowAssertInt8In(sliced_type_ids, array_view->union_type_id_map + 128, array_view->n_children, error)); } @@ -3207,16 +3523,37 @@ static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, int64_t offset = ArrowArrayViewUnionChildOffset(array_view, i); int64_t child_length = array_view->children[child_id]->length; if (offset < 0 || offset > child_length) { - ArrowErrorSet( - error, - "[%ld] Expected union offset for child id %d to be between 0 and %ld but " - "found offset value %ld", - (long)i, (int)child_id, (long)child_length, (long)offset); + ArrowErrorSet(error, + "[%" PRId64 "] Expected union offset for child id %" PRId8 + " to be between 0 and %" PRId64 + " but " + "found offset value %" PRId64, + i, child_id, child_length, offset); return EINVAL; } } } + if (array_view->storage_type == NANOARROW_TYPE_RUN_END_ENCODED) { + struct ArrowArrayView* run_ends_view = array_view->children[0]; + if (run_ends_view->length > 0) { + int64_t last_run_end = ArrowArrayViewGetIntUnsafe(run_ends_view, 0); + for (int64_t i = 1; i < run_ends_view->length; i++) { + const int64_t run_end = ArrowArrayViewGetIntUnsafe(run_ends_view, i); + if (run_end <= last_run_end) { + ArrowErrorSet( + error, + "Every run end must be strictly greater than the previous run end, " + "but run_ends[%" PRId64 " is %" PRId64 " and run_ends[%" PRId64 + "] is %" PRId64, + i, run_end, i - 1, last_run_end); + return EINVAL; + } + last_run_end = run_end; + } + } + } + // Recurse for children for (int64_t i = 0; i < array_view->n_children; i++) { NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->children[i], error)); @@ -3249,6 +3586,136 @@ ArrowErrorCode ArrowArrayViewValidate(struct ArrowArrayView* array_view, ArrowErrorSet(error, "validation_level not recognized"); return EINVAL; } + +struct ArrowComparisonInternalState { + enum ArrowCompareLevel level; + int is_equal; + struct ArrowError* reason; +}; + +NANOARROW_CHECK_PRINTF_ATTRIBUTE static void ArrowComparePrependPath( + struct ArrowError* out, const char* fmt, ...) { + if (out == NULL) { + return; + } + + char prefix[128]; + prefix[0] = '\0'; + va_list args; + va_start(args, fmt); + int prefix_len = vsnprintf(prefix, sizeof(prefix), fmt, args); + va_end(args); + + if (prefix_len <= 0) { + return; + } + + size_t out_len = strlen(out->message); + size_t out_len_to_move = sizeof(struct ArrowError) - prefix_len - 1; + if (out_len_to_move > out_len) { + out_len_to_move = out_len; + } + + memmove(out->message + prefix_len, out->message, out_len_to_move); + memcpy(out->message, prefix, prefix_len); + out->message[out_len + prefix_len] = '\0'; +} + +#define SET_NOT_EQUAL_AND_RETURN_IF_IMPL(cond_, state_, reason_) \ + do { \ + if (cond_) { \ + ArrowErrorSet(state_->reason, ": %s", reason_); \ + state_->is_equal = 0; \ + return; \ + } \ + } while (0) + +#define SET_NOT_EQUAL_AND_RETURN_IF(condition_, state_) \ + SET_NOT_EQUAL_AND_RETURN_IF_IMPL(condition_, state_, #condition_) + +static void ArrowArrayViewCompareBuffer(const struct ArrowArrayView* actual, + const struct ArrowArrayView* expected, int i, + struct ArrowComparisonInternalState* state) { + SET_NOT_EQUAL_AND_RETURN_IF( + actual->buffer_views[i].size_bytes != expected->buffer_views[i].size_bytes, state); + + int64_t buffer_size = actual->buffer_views[i].size_bytes; + if (buffer_size > 0) { + SET_NOT_EQUAL_AND_RETURN_IF( + memcmp(actual->buffer_views[i].data.data, expected->buffer_views[i].data.data, + buffer_size) != 0, + state); + } +} + +static void ArrowArrayViewCompareIdentical(const struct ArrowArrayView* actual, + const struct ArrowArrayView* expected, + struct ArrowComparisonInternalState* state) { + SET_NOT_EQUAL_AND_RETURN_IF(actual->storage_type != expected->storage_type, state); + SET_NOT_EQUAL_AND_RETURN_IF(actual->n_children != expected->n_children, state); + SET_NOT_EQUAL_AND_RETURN_IF(actual->dictionary == NULL && expected->dictionary != NULL, + state); + SET_NOT_EQUAL_AND_RETURN_IF(actual->dictionary != NULL && expected->dictionary == NULL, + state); + + SET_NOT_EQUAL_AND_RETURN_IF(actual->length != expected->length, state); + SET_NOT_EQUAL_AND_RETURN_IF(actual->offset != expected->offset, state); + SET_NOT_EQUAL_AND_RETURN_IF(actual->null_count != expected->null_count, state); + + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + ArrowArrayViewCompareBuffer(actual, expected, i, state); + if (!state->is_equal) { + ArrowComparePrependPath(state->reason, ".buffers[%d]", i); + return; + } + } + + for (int64_t i = 0; i < actual->n_children; i++) { + ArrowArrayViewCompareIdentical(actual->children[i], expected->children[i], state); + if (!state->is_equal) { + ArrowComparePrependPath(state->reason, ".children[%" PRId64 "]", i); + return; + } + } + + if (actual->dictionary != NULL) { + ArrowArrayViewCompareIdentical(actual->dictionary, expected->dictionary, state); + if (!state->is_equal) { + ArrowComparePrependPath(state->reason, ".dictionary"); + return; + } + } +} + +// Top-level entry point to take care of creating, cleaning up, and +// propagating the ArrowComparisonInternalState to the caller +ArrowErrorCode ArrowArrayViewCompare(const struct ArrowArrayView* actual, + const struct ArrowArrayView* expected, + enum ArrowCompareLevel level, int* out, + struct ArrowError* reason) { + struct ArrowComparisonInternalState state; + state.level = level; + state.is_equal = 1; + state.reason = reason; + + switch (level) { + case NANOARROW_COMPARE_IDENTICAL: + ArrowArrayViewCompareIdentical(actual, expected, &state); + break; + default: + return EINVAL; + } + + *out = state.is_equal; + if (!state.is_equal) { + ArrowComparePrependPath(state.reason, "root"); + } + + return NANOARROW_OK; +} + +#undef SET_NOT_EQUAL_AND_RETURN_IF +#undef SET_NOT_EQUAL_AND_RETURN_IF_IMPL // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information diff --git a/c/vendor/nanoarrow/nanoarrow.h b/c/vendor/nanoarrow/nanoarrow.h index e845d0ad96..264aad5b6e 100644 --- a/c/vendor/nanoarrow/nanoarrow.h +++ b/c/vendor/nanoarrow/nanoarrow.h @@ -19,9 +19,9 @@ #define NANOARROW_BUILD_ID_H_INCLUDED #define NANOARROW_VERSION_MAJOR 0 -#define NANOARROW_VERSION_MINOR 5 +#define NANOARROW_VERSION_MINOR 6 #define NANOARROW_VERSION_PATCH 0 -#define NANOARROW_VERSION "0.5.0" +#define NANOARROW_VERSION "0.6.0" #define NANOARROW_VERSION_INT \ (NANOARROW_VERSION_MAJOR * 10000 + NANOARROW_VERSION_MINOR * 100 + \ @@ -181,14 +181,14 @@ struct ArrowArrayStream { NANOARROW_RETURN_NOT_OK((x_ <= max_) ? NANOARROW_OK : EINVAL) #if defined(NANOARROW_DEBUG) -#define _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL(NAME, EXPR, ERROR_PTR_EXPR, EXPR_STR) \ - do { \ - const int NAME = (EXPR); \ - if (NAME) { \ - ArrowErrorSet((ERROR_PTR_EXPR), "%s failed with errno %d\n* %s:%d", EXPR_STR, \ - NAME, __FILE__, __LINE__); \ - return NAME; \ - } \ +#define _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL(NAME, EXPR, ERROR_PTR_EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) { \ + ArrowErrorSet((ERROR_PTR_EXPR), "%s failed with errno %d(%s)\n* %s:%d", EXPR_STR, \ + NAME, strerror(NAME), __FILE__, __LINE__); \ + return NAME; \ + } \ } while (0) #else #define _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL(NAME, EXPR, ERROR_PTR_EXPR, EXPR_STR) \ @@ -482,7 +482,10 @@ enum ArrowType { NANOARROW_TYPE_LARGE_STRING, NANOARROW_TYPE_LARGE_BINARY, NANOARROW_TYPE_LARGE_LIST, - NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO + NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO, + NANOARROW_TYPE_RUN_END_ENCODED, + NANOARROW_TYPE_BINARY_VIEW, + NANOARROW_TYPE_STRING_VIEW }; /// \brief Get a string value of an enum ArrowType value @@ -569,6 +572,12 @@ static inline const char* ArrowTypeString(enum ArrowType type) { return "large_list"; case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: return "interval_month_day_nano"; + case NANOARROW_TYPE_RUN_END_ENCODED: + return "run_end_encoded"; + case NANOARROW_TYPE_BINARY_VIEW: + return "binary_view"; + case NANOARROW_TYPE_STRING_VIEW: + return "string_view"; default: return NULL; } @@ -605,6 +614,17 @@ enum ArrowValidationLevel { NANOARROW_VALIDATION_LEVEL_FULL = 3 }; +/// \brief Comparison level enumerator +/// \ingroup nanoarrow-utils +enum ArrowCompareLevel { + /// \brief Consider arrays equal if buffers contain identical content + /// and have identical offset, null count, and length. Note that this is + /// a much stricter check than logical equality, which would take into + /// account potentially different content of null slots, arrays with a + /// non-zero offset, and other considerations. + NANOARROW_COMPARE_IDENTICAL, +}; + /// \brief Get a string value of an enum ArrowTimeUnit value /// \ingroup nanoarrow-utils /// @@ -634,15 +654,13 @@ enum ArrowBufferType { NANOARROW_BUFFER_TYPE_TYPE_ID, NANOARROW_BUFFER_TYPE_UNION_OFFSET, NANOARROW_BUFFER_TYPE_DATA_OFFSET, - NANOARROW_BUFFER_TYPE_DATA + NANOARROW_BUFFER_TYPE_DATA, + NANOARROW_BUFFER_TYPE_VARIADIC_DATA, + NANOARROW_BUFFER_TYPE_VARIADIC_SIZE }; -/// \brief The maximum number of buffers in an ArrowArrayView or ArrowLayout +/// \brief The maximum number of fixed buffers in an ArrowArrayView or ArrowLayout /// \ingroup nanoarrow-array-view -/// -/// All currently supported types have 3 buffers or fewer; however, future types -/// may involve a variable number of buffers (e.g., string view). These buffers -/// will be represented by separate members of the ArrowArrayView or ArrowLayout. #define NANOARROW_MAX_FIXED_BUFFERS 3 /// \brief An non-owning view of a string @@ -689,6 +707,7 @@ union ArrowBufferViewData { const double* as_double; const float* as_float; const char* as_char; + const union ArrowBinaryView* as_binary_view; }; /// \brief An non-owning view of a buffer @@ -826,6 +845,15 @@ struct ArrowArrayView { /// type_id == union_type_id_map[128 + child_index]. This value may be /// NULL in the case where child_id == type_id. int8_t* union_type_id_map; + + /// \brief Number of variadic buffers + int32_t n_variadic_buffers; + + /// \brief Pointers to variadic buffers of binary/string_view arrays + const void** variadic_buffers; + + /// \brief Size of each variadic buffer + int64_t* variadic_buffer_sizes; }; // Used as the private data member for ArrowArrays allocated here and accessed @@ -840,8 +868,8 @@ struct ArrowArrayPrivateData { // The array of pointers to buffers. This must be updated after a sequence // of appends to synchronize its values with the actual buffer addresses - // (which may have ben reallocated uring that time) - const void* buffer_data[NANOARROW_MAX_FIXED_BUFFERS]; + // (which may have been reallocated during that time) + const void** buffer_data; // The storage data type, or NANOARROW_TYPE_UNINITIALIZED if unknown enum ArrowType storage_type; @@ -853,6 +881,15 @@ struct ArrowArrayPrivateData { // In the future this could be replaced with a type id<->child mapping // to support constructing unions in append mode where type_id != child_index int8_t union_type_id_is_child_index; + + // Number of variadic buffers for binary view types + int32_t n_variadic_buffers; + + // Variadic buffers for binary view types + struct ArrowBuffer* variadic_buffers; + + // Size of each variadic buffer in bytes + int64_t* variadic_buffer_sizes; }; /// \brief A representation of an interval. @@ -911,7 +948,7 @@ static inline void ArrowDecimalInit(struct ArrowDecimal* decimal, int32_t bitwid memset(decimal->words, 0, sizeof(decimal->words)); decimal->precision = precision; decimal->scale = scale; - decimal->n_words = bitwidth / 8 / sizeof(uint64_t); + decimal->n_words = (int)(bitwidth / 8 / sizeof(uint64_t)); if (_ArrowIsLittleEndian()) { decimal->low_word_index = 0; @@ -1052,6 +1089,8 @@ static inline void ArrowDecimalSetBytes(struct ArrowDecimal* decimal, NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeFixedSize) #define ArrowSchemaSetTypeDecimal \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDecimal) +#define ArrowSchemaSetTypeRunEndEncoded \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeRunEndEncoded) #define ArrowSchemaSetTypeDateTime \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDateTime) #define ArrowSchemaSetTypeUnion \ @@ -1118,6 +1157,7 @@ static inline void ArrowDecimalSetBytes(struct ArrowDecimal* decimal, NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetArrayMinimal) #define ArrowArrayViewValidate \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewValidate) +#define ArrowArrayViewCompare NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewCompare) #define ArrowArrayViewReset NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewReset) #define ArrowBasicArrayStreamInit \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamInit) @@ -1281,6 +1321,12 @@ ArrowErrorCode ArrowDecimalSetDigits(struct ArrowDecimal* decimal, ArrowErrorCode ArrowDecimalAppendDigitsToBuffer(const struct ArrowDecimal* decimal, struct ArrowBuffer* buffer); +/// \brief Get the half float value of a float +static inline uint16_t ArrowFloatToHalfFloat(float value); + +/// \brief Get the float value of a half float +static inline float ArrowHalfFloatToFloat(uint16_t value); + /// \brief Resolve a chunk index from increasing int64_t offsets /// /// Given a buffer of increasing int64_t offsets that begin with 0 (e.g., offset buffer @@ -1358,6 +1404,17 @@ ArrowErrorCode ArrowSchemaSetTypeDecimal(struct ArrowSchema* schema, enum ArrowT int32_t decimal_precision, int32_t decimal_scale); +/// \brief Set the format field of a run-end encoded schema +/// +/// Returns EINVAL for run_end_type that is not +/// NANOARROW_TYPE_INT16, NANOARROW_TYPE_INT32 or NANOARROW_TYPE_INT64. +/// Schema must have been initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy(). +/// The caller must call `ArrowSchemaSetTypeXXX(schema->children[1])` to +/// set the value type. Note that when building arrays using the `ArrowArrayAppendXXX()` +/// functions, the run-end encoded array's logical length must be updated manually. +ArrowErrorCode ArrowSchemaSetTypeRunEndEncoded(struct ArrowSchema* schema, + enum ArrowType run_end_type); + /// \brief Set the format field of a time, timestamp, or duration schema /// /// Returns EINVAL for type that is not @@ -2025,6 +2082,48 @@ ArrowErrorCode ArrowArrayViewSetArrayMinimal(struct ArrowArrayView* array_view, const struct ArrowArray* array, struct ArrowError* error); +/// \brief Get the number of buffers +/// +/// The number of buffers referred to by this ArrowArrayView. In may cases this can also +/// be calculated from the ArrowLayout member of the ArrowArrayView or ArrowSchemaView; +/// however, for binary view and string view types, the number of total buffers depends on +/// the number of variadic buffers. +static inline int64_t ArrowArrayViewGetNumBuffers(struct ArrowArrayView* array_view); + +/// \brief Get a view of a specific buffer from an ArrowArrayView +/// +/// This is the ArrowArrayView equivalent of ArrowArray::buffers[i] that includes +/// size information (if known). +static inline struct ArrowBufferView ArrowArrayViewGetBufferView( + struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get the function of a specific buffer in an ArrowArrayView +/// +/// In may cases this can also be obtained from the ArrowLayout member of the +/// ArrowArrayView or ArrowSchemaView; however, for binary view and string view types, +/// the function of each buffer may be different between two arrays of the same type +/// depending on the number of variadic buffers. +static inline enum ArrowBufferType ArrowArrayViewGetBufferType( + struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get the data type of a specific buffer in an ArrowArrayView +/// +/// In may cases this can also be obtained from the ArrowLayout member of the +/// ArrowArrayView or ArrowSchemaView; however, for binary view and string view types, +/// the data type of each buffer may be different between two arrays of the same type +/// depending on the number of variadic buffers. +static inline enum ArrowType ArrowArrayViewGetBufferDataType( + struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get the element size (in bits) of a specific buffer in an ArrowArrayView +/// +/// In may cases this can also be obtained from the ArrowLayout member of the +/// ArrowArrayView or ArrowSchemaView; however, for binary view and string view types, +/// the element width of each buffer may be different between two arrays of the same type +/// depending on the number of variadic buffers. +static inline int64_t ArrowArrayViewGetBufferElementSizeBits( + struct ArrowArrayView* array_view, int64_t i); + /// \brief Performs checks on the content of an ArrowArrayView /// /// If using ArrowArrayViewSetArray() to back array_view with an ArrowArray, @@ -2037,6 +2136,19 @@ ArrowErrorCode ArrowArrayViewValidate(struct ArrowArrayView* array_view, enum ArrowValidationLevel validation_level, struct ArrowError* error); +/// \brief Compare two ArrowArrayView objects for equality +/// +/// Given two ArrowArrayView instances, place either 0 (not equal) and +/// 1 (equal) at the address pointed to by out. If the comparison determines +/// that actual and expected are not equal, a reason will be communicated via +/// error if error is non-NULL. +/// +/// Returns NANOARROW_OK if the comparison completed successfully. +ArrowErrorCode ArrowArrayViewCompare(const struct ArrowArrayView* actual, + const struct ArrowArrayView* expected, + enum ArrowCompareLevel level, int* out, + struct ArrowError* reason); + /// \brief Reset the contents of an ArrowArrayView and frees resources void ArrowArrayViewReset(struct ArrowArrayView* array_view); @@ -2044,6 +2156,10 @@ void ArrowArrayViewReset(struct ArrowArrayView* array_view); static inline int8_t ArrowArrayViewIsNull(const struct ArrowArrayView* array_view, int64_t i); +/// \brief Compute null count for an ArrowArrayView +static inline int64_t ArrowArrayViewComputeNullCount( + const struct ArrowArrayView* array_view); + /// \brief Get the type id of a union array element static inline int8_t ArrowArrayViewUnionTypeId(const struct ArrowArrayView* array_view, int64_t i); @@ -2233,6 +2349,57 @@ static inline int64_t _ArrowGrowByFactor(int64_t current_capacity, int64_t new_c } } +// float to half float conversion, adapted from Arrow Go +// https://github.com/apache/arrow/blob/main/go/arrow/float16/float16.go +static inline uint16_t ArrowFloatToHalfFloat(float value) { + union { + float f; + uint32_t b; + } u; + u.f = value; + + uint16_t sn = (uint16_t)((u.b >> 31) & 0x1); + uint16_t exp = (u.b >> 23) & 0xff; + int16_t res = (int16_t)(exp - 127 + 15); + uint16_t fc = (uint16_t)(u.b >> 13) & 0x3ff; + + if (exp == 0) { + res = 0; + } else if (exp == 0xff) { + res = 0x1f; + } else if (res > 0x1e) { + res = 0x1f; + fc = 0; + } else if (res < 0x01) { + res = 0; + fc = 0; + } + + return (uint16_t)((sn << 15) | (uint16_t)(res << 10) | fc); +} + +// half float to float conversion, adapted from Arrow Go +// https://github.com/apache/arrow/blob/main/go/arrow/float16/float16.go +static inline float ArrowHalfFloatToFloat(uint16_t value) { + uint32_t sn = (uint32_t)((value >> 15) & 0x1); + uint32_t exp = (value >> 10) & 0x1f; + uint32_t res = exp + 127 - 15; + uint32_t fc = value & 0x3ff; + + if (exp == 0) { + res = 0; + } else if (exp == 0x1f) { + res = 0xff; + } + + union { + float f; + uint32_t b; + } u; + u.b = (uint32_t)(sn << 31) | (uint32_t)(res << 23) | (uint32_t)(fc << 13); + return u.f; +} + static inline void ArrowBufferInit(struct ArrowBuffer* buffer) { buffer->data = NULL; buffer->size_bytes = 0; @@ -2316,6 +2483,7 @@ static inline ArrowErrorCode ArrowBufferReserve(struct ArrowBuffer* buffer, static inline void ArrowBufferAppendUnsafe(struct ArrowBuffer* buffer, const void* data, int64_t size_bytes) { if (size_bytes > 0) { + NANOARROW_DCHECK(buffer->data != NULL); memcpy(buffer->data + buffer->size_bytes, data, size_bytes); buffer->size_bytes += size_bytes; } @@ -2391,10 +2559,16 @@ static inline ArrowErrorCode ArrowBufferAppendBufferView(struct ArrowBuffer* buf static inline ArrowErrorCode ArrowBufferAppendFill(struct ArrowBuffer* buffer, uint8_t value, int64_t size_bytes) { + if (size_bytes == 0) { + return NANOARROW_OK; + } + NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes)); + NANOARROW_DCHECK(buffer->data != NULL); // To help clang-tidy memset(buffer->data + buffer->size_bytes, value, size_bytes); buffer->size_bytes += size_bytes; + return NANOARROW_OK; } @@ -2511,6 +2685,8 @@ static inline void ArrowBitsUnpackInt32(const uint8_t* bits, int64_t start_offse return; } + NANOARROW_DCHECK(bits != NULL && out != NULL); + const int64_t i_begin = start_offset; const int64_t i_end = start_offset + length; const int64_t i_last_valid = i_end - 1; @@ -2553,12 +2729,18 @@ static inline void ArrowBitClear(uint8_t* bits, int64_t i) { } static inline void ArrowBitSetTo(uint8_t* bits, int64_t i, uint8_t bit_is_set) { - bits[i / 8] ^= - ((uint8_t)(-((uint8_t)(bit_is_set != 0)) ^ bits[i / 8])) & _ArrowkBitmask[i % 8]; + bits[i / 8] ^= (uint8_t)(((uint8_t)(-((uint8_t)(bit_is_set != 0)) ^ bits[i / 8])) & + _ArrowkBitmask[i % 8]); } static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t length, uint8_t bits_are_set) { + if (length == 0) { + return; + } + + NANOARROW_DCHECK(bits != NULL); + const int64_t i_begin = start_offset; const int64_t i_end = start_offset + length; const uint8_t fill_byte = (uint8_t)(-bits_are_set); @@ -2602,6 +2784,8 @@ static inline int64_t ArrowBitCountSet(const uint8_t* bits, int64_t start_offset return 0; } + NANOARROW_DCHECK(bits != NULL); + const int64_t i_begin = start_offset; const int64_t i_end = start_offset + length; const int64_t i_last_valid = i_end - 1; @@ -3095,6 +3279,8 @@ static inline ArrowErrorCode _ArrowArrayAppendEmptyInternal(struct ArrowArray* a switch (private_data->layout.buffer_type[i]) { case NANOARROW_BUFFER_TYPE_NONE: + case NANOARROW_BUFFER_TYPE_VARIADIC_DATA: + case NANOARROW_BUFFER_TYPE_VARIADIC_SIZE: case NANOARROW_BUFFER_TYPE_VALIDITY: continue; case NANOARROW_BUFFER_TYPE_DATA_OFFSET: @@ -3173,6 +3359,10 @@ static inline ArrowErrorCode ArrowArrayAppendInt(struct ArrowArray* array, case NANOARROW_TYPE_FLOAT: NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); break; + case NANOARROW_TYPE_HALF_FLOAT: + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendUInt16(data_buffer, ArrowFloatToHalfFloat((float)value))); + break; case NANOARROW_TYPE_BOOL: NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, 1, value != 0, 1)); break; @@ -3223,6 +3413,10 @@ static inline ArrowErrorCode ArrowArrayAppendUInt(struct ArrowArray* array, case NANOARROW_TYPE_FLOAT: NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); break; + case NANOARROW_TYPE_HALF_FLOAT: + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendUInt16(data_buffer, ArrowFloatToHalfFloat((float)value))); + break; case NANOARROW_TYPE_BOOL: NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, 1, value != 0, 1)); break; @@ -3252,6 +3446,10 @@ static inline ArrowErrorCode ArrowArrayAppendDouble(struct ArrowArray* array, case NANOARROW_TYPE_FLOAT: NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); break; + case NANOARROW_TYPE_HALF_FLOAT: + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendUInt16(data_buffer, ArrowFloatToHalfFloat((float)value))); + break; default: return EINVAL; } @@ -3264,52 +3462,151 @@ static inline ArrowErrorCode ArrowArrayAppendDouble(struct ArrowArray* array, return NANOARROW_OK; } +// Binary views only have two fixed buffers, but be aware that they must also +// always have more 1 buffer to store variadic buffer sizes (even if there are none) +#define NANOARROW_BINARY_VIEW_FIXED_BUFFERS 2 +#define NANOARROW_BINARY_VIEW_INLINE_SIZE 12 +#define NANOARROW_BINARY_VIEW_PREFIX_SIZE 4 +#define NANOARROW_BINARY_VIEW_BLOCK_SIZE (32 << 10) // 32KB + +// The Arrow C++ implementation uses anonymous structs as members +// of the ArrowBinaryView. For Cython support in this library, we define +// those structs outside of the ArrowBinaryView +struct ArrowBinaryViewInlined { + int32_t size; + uint8_t data[NANOARROW_BINARY_VIEW_INLINE_SIZE]; +}; + +struct ArrowBinaryViewRef { + int32_t size; + uint8_t prefix[NANOARROW_BINARY_VIEW_PREFIX_SIZE]; + int32_t buffer_index; + int32_t offset; +}; + +union ArrowBinaryView { + struct ArrowBinaryViewInlined inlined; + struct ArrowBinaryViewRef ref; + int64_t alignment_dummy; +}; + +static inline int32_t ArrowArrayVariadicBufferCount(struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + return private_data->n_variadic_buffers; +} + +static inline ArrowErrorCode ArrowArrayAddVariadicBuffers(struct ArrowArray* array, + int32_t nbuffers) { + const int32_t n_current_bufs = ArrowArrayVariadicBufferCount(array); + const int32_t nvariadic_bufs_needed = n_current_bufs + nbuffers; + + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + private_data->variadic_buffers = (struct ArrowBuffer*)ArrowRealloc( + private_data->variadic_buffers, sizeof(struct ArrowBuffer) * nvariadic_bufs_needed); + if (private_data->variadic_buffers == NULL) { + return ENOMEM; + } + private_data->variadic_buffer_sizes = (int64_t*)ArrowRealloc( + private_data->variadic_buffer_sizes, sizeof(int64_t) * nvariadic_bufs_needed); + if (private_data->variadic_buffer_sizes == NULL) { + return ENOMEM; + } + + for (int32_t i = n_current_bufs; i < nvariadic_bufs_needed; i++) { + ArrowBufferInit(&private_data->variadic_buffers[i]); + private_data->variadic_buffer_sizes[i] = 0; + } + private_data->n_variadic_buffers = nvariadic_bufs_needed; + array->n_buffers = NANOARROW_BINARY_VIEW_FIXED_BUFFERS + 1 + nvariadic_bufs_needed; + + return NANOARROW_OK; +} + static inline ArrowErrorCode ArrowArrayAppendBytes(struct ArrowArray* array, struct ArrowBufferView value) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; - struct ArrowBuffer* offset_buffer = ArrowArrayBuffer(array, 1); - struct ArrowBuffer* data_buffer = ArrowArrayBuffer( - array, 1 + (private_data->storage_type != NANOARROW_TYPE_FIXED_SIZE_BINARY)); - int32_t offset; - int64_t large_offset; - int64_t fixed_size_bytes = private_data->layout.element_size_bits[1] / 8; + if (private_data->storage_type == NANOARROW_TYPE_STRING_VIEW || + private_data->storage_type == NANOARROW_TYPE_BINARY_VIEW) { + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + union ArrowBinaryView bvt; + bvt.inlined.size = (int32_t)value.size_bytes; - switch (private_data->storage_type) { - case NANOARROW_TYPE_STRING: - case NANOARROW_TYPE_BINARY: - offset = ((int32_t*)offset_buffer->data)[array->length]; - if ((((int64_t)offset) + value.size_bytes) > INT32_MAX) { - return EOVERFLOW; + if (value.size_bytes <= NANOARROW_BINARY_VIEW_INLINE_SIZE) { + memcpy(bvt.inlined.data, value.data.as_char, value.size_bytes); + memset(bvt.inlined.data + bvt.inlined.size, 0, + NANOARROW_BINARY_VIEW_INLINE_SIZE - bvt.inlined.size); + } else { + int32_t current_n_vbufs = ArrowArrayVariadicBufferCount(array); + if (current_n_vbufs == 0 || + private_data->variadic_buffers[current_n_vbufs - 1].size_bytes + + value.size_bytes > + NANOARROW_BINARY_VIEW_BLOCK_SIZE) { + const int32_t additional_bufs_needed = 1; + NANOARROW_RETURN_NOT_OK( + ArrowArrayAddVariadicBuffers(array, additional_bufs_needed)); + current_n_vbufs += additional_bufs_needed; } - offset += (int32_t)value.size_bytes; - NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(offset_buffer, &offset, sizeof(int32_t))); + const int32_t buf_index = current_n_vbufs - 1; + struct ArrowBuffer* variadic_buf = &private_data->variadic_buffers[buf_index]; + memcpy(bvt.ref.prefix, value.data.as_char, NANOARROW_BINARY_VIEW_PREFIX_SIZE); + bvt.ref.buffer_index = (int32_t)buf_index; + bvt.ref.offset = (int32_t)variadic_buf->size_bytes; NANOARROW_RETURN_NOT_OK( - ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); - break; + ArrowBufferAppend(variadic_buf, value.data.as_char, value.size_bytes)); + private_data->variadic_buffer_sizes[buf_index] = variadic_buf->size_bytes; + } + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &bvt, sizeof(bvt))); + } else { + struct ArrowBuffer* offset_buffer = ArrowArrayBuffer(array, 1); + struct ArrowBuffer* data_buffer = ArrowArrayBuffer( + array, 1 + (private_data->storage_type != NANOARROW_TYPE_FIXED_SIZE_BINARY)); + int32_t offset; + int64_t large_offset; + int64_t fixed_size_bytes = private_data->layout.element_size_bits[1] / 8; + + switch (private_data->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_BINARY: + offset = ((int32_t*)offset_buffer->data)[array->length]; + if ((((int64_t)offset) + value.size_bytes) > INT32_MAX) { + return EOVERFLOW; + } - case NANOARROW_TYPE_LARGE_STRING: - case NANOARROW_TYPE_LARGE_BINARY: - large_offset = ((int64_t*)offset_buffer->data)[array->length]; - large_offset += value.size_bytes; - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppend(offset_buffer, &large_offset, sizeof(int64_t))); - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); - break; + offset += (int32_t)value.size_bytes; + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(offset_buffer, &offset, sizeof(int32_t))); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); + break; - case NANOARROW_TYPE_FIXED_SIZE_BINARY: - if (value.size_bytes != fixed_size_bytes) { - return EINVAL; - } + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_LARGE_BINARY: + large_offset = ((int64_t*)offset_buffer->data)[array->length]; + large_offset += value.size_bytes; + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(offset_buffer, &large_offset, sizeof(int64_t))); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); + break; - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); - break; - default: - return EINVAL; + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + if (value.size_bytes != fixed_size_bytes) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); + break; + default: + return EINVAL; + } } if (private_data->bitmap.buffer.data != NULL) { @@ -3332,8 +3629,10 @@ static inline ArrowErrorCode ArrowArrayAppendString(struct ArrowArray* array, switch (private_data->storage_type) { case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_STRING_VIEW: case NANOARROW_TYPE_BINARY: case NANOARROW_TYPE_LARGE_BINARY: + case NANOARROW_TYPE_BINARY_VIEW: return ArrowArrayAppendBytes(array, buffer_view); default: return EINVAL; @@ -3520,6 +3819,132 @@ static inline void ArrowArrayViewMove(struct ArrowArrayView* src, ArrowArrayViewInitFromType(src, NANOARROW_TYPE_UNINITIALIZED); } +static inline int64_t ArrowArrayViewGetNumBuffers(struct ArrowArrayView* array_view) { + switch (array_view->storage_type) { + case NANOARROW_TYPE_BINARY_VIEW: + case NANOARROW_TYPE_STRING_VIEW: + return NANOARROW_BINARY_VIEW_FIXED_BUFFERS + array_view->n_variadic_buffers + 1; + default: + break; + } + + int64_t n_buffers = 0; + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) { + break; + } + + n_buffers++; + } + + return n_buffers; +} + +static inline struct ArrowBufferView ArrowArrayViewGetBufferView( + struct ArrowArrayView* array_view, int64_t i) { + switch (array_view->storage_type) { + case NANOARROW_TYPE_BINARY_VIEW: + case NANOARROW_TYPE_STRING_VIEW: + if (i < NANOARROW_BINARY_VIEW_FIXED_BUFFERS) { + return array_view->buffer_views[i]; + } else if (i >= + (array_view->n_variadic_buffers + NANOARROW_BINARY_VIEW_FIXED_BUFFERS)) { + struct ArrowBufferView view; + view.data.as_int64 = array_view->variadic_buffer_sizes; + view.size_bytes = array_view->n_variadic_buffers * sizeof(double); + return view; + } else { + struct ArrowBufferView view; + view.data.data = + array_view->variadic_buffers[i - NANOARROW_BINARY_VIEW_FIXED_BUFFERS]; + view.size_bytes = + array_view->variadic_buffer_sizes[i - NANOARROW_BINARY_VIEW_FIXED_BUFFERS]; + return view; + } + default: + // We need this check to avoid -Warray-bounds from complaining + if (i >= NANOARROW_MAX_FIXED_BUFFERS) { + struct ArrowBufferView view; + view.data.data = NULL; + view.size_bytes = 0; + return view; + } else { + return array_view->buffer_views[i]; + } + } +} + +enum ArrowBufferType ArrowArrayViewGetBufferType(struct ArrowArrayView* array_view, + int64_t i) { + switch (array_view->storage_type) { + case NANOARROW_TYPE_BINARY_VIEW: + case NANOARROW_TYPE_STRING_VIEW: + if (i < NANOARROW_BINARY_VIEW_FIXED_BUFFERS) { + return array_view->layout.buffer_type[i]; + } else if (i == + (array_view->n_variadic_buffers + NANOARROW_BINARY_VIEW_FIXED_BUFFERS)) { + return NANOARROW_BUFFER_TYPE_VARIADIC_SIZE; + } else { + return NANOARROW_BUFFER_TYPE_VARIADIC_DATA; + } + default: + // We need this check to avoid -Warray-bounds from complaining + if (i >= NANOARROW_MAX_FIXED_BUFFERS) { + return NANOARROW_BUFFER_TYPE_NONE; + } else { + return array_view->layout.buffer_type[i]; + } + } +} + +static inline enum ArrowType ArrowArrayViewGetBufferDataType( + struct ArrowArrayView* array_view, int64_t i) { + switch (array_view->storage_type) { + case NANOARROW_TYPE_BINARY_VIEW: + case NANOARROW_TYPE_STRING_VIEW: + if (i < NANOARROW_BINARY_VIEW_FIXED_BUFFERS) { + return array_view->layout.buffer_data_type[i]; + } else if (i >= + (array_view->n_variadic_buffers + NANOARROW_BINARY_VIEW_FIXED_BUFFERS)) { + return NANOARROW_TYPE_INT64; + } else if (array_view->storage_type == NANOARROW_TYPE_BINARY_VIEW) { + return NANOARROW_TYPE_BINARY; + } else { + return NANOARROW_TYPE_STRING; + } + default: + // We need this check to avoid -Warray-bounds from complaining + if (i >= NANOARROW_MAX_FIXED_BUFFERS) { + return NANOARROW_TYPE_UNINITIALIZED; + } else { + return array_view->layout.buffer_data_type[i]; + } + } +} + +static inline int64_t ArrowArrayViewGetBufferElementSizeBits( + struct ArrowArrayView* array_view, int64_t i) { + switch (array_view->storage_type) { + case NANOARROW_TYPE_BINARY_VIEW: + case NANOARROW_TYPE_STRING_VIEW: + if (i < NANOARROW_BINARY_VIEW_FIXED_BUFFERS) { + return array_view->layout.element_size_bits[i]; + } else if (i >= + (array_view->n_variadic_buffers + NANOARROW_BINARY_VIEW_FIXED_BUFFERS)) { + return sizeof(int64_t) * 8; + } else { + return 0; + } + default: + // We need this check to avoid -Warray-bounds from complaining + if (i >= NANOARROW_MAX_FIXED_BUFFERS) { + return 0; + } else { + return array_view->layout.element_size_bits[i]; + } + } +} + static inline int8_t ArrowArrayViewIsNull(const struct ArrowArrayView* array_view, int64_t i) { const uint8_t* validity_buffer = array_view->buffer_views[0].data.as_uint8; @@ -3536,12 +3961,37 @@ static inline int8_t ArrowArrayViewIsNull(const struct ArrowArrayView* array_vie } } +static inline int64_t ArrowArrayViewComputeNullCount( + const struct ArrowArrayView* array_view) { + if (array_view->length == 0) { + return 0; + } + + switch (array_view->storage_type) { + case NANOARROW_TYPE_NA: + return array_view->length; + case NANOARROW_TYPE_DENSE_UNION: + case NANOARROW_TYPE_SPARSE_UNION: + // Unions are "never null" in Arrow land + return 0; + default: + break; + } + + const uint8_t* validity_buffer = array_view->buffer_views[0].data.as_uint8; + if (validity_buffer == NULL) { + return 0; + } + return array_view->length - + ArrowBitCountSet(validity_buffer, array_view->offset, array_view->length); +} + static inline int8_t ArrowArrayViewUnionTypeId(const struct ArrowArrayView* array_view, int64_t i) { switch (array_view->storage_type) { case NANOARROW_TYPE_DENSE_UNION: case NANOARROW_TYPE_SPARSE_UNION: - return array_view->buffer_views[0].data.as_int8[i]; + return array_view->buffer_views[0].data.as_int8[array_view->offset + i]; default: return -1; } @@ -3561,9 +4011,9 @@ static inline int64_t ArrowArrayViewUnionChildOffset( const struct ArrowArrayView* array_view, int64_t i) { switch (array_view->storage_type) { case NANOARROW_TYPE_DENSE_UNION: - return array_view->buffer_views[1].data.as_int32[i]; + return array_view->buffer_views[1].data.as_int32[array_view->offset + i]; case NANOARROW_TYPE_SPARSE_UNION: - return i; + return array_view->offset + i; default: return -1; } @@ -3581,6 +4031,20 @@ static inline int64_t ArrowArrayViewListChildOffset( } } +static struct ArrowBufferView ArrowArrayViewGetBytesFromViewArrayUnsafe( + const struct ArrowArrayView* array_view, int64_t i) { + const union ArrowBinaryView* bv = &array_view->buffer_views[1].data.as_binary_view[i]; + struct ArrowBufferView out = {{NULL}, bv->inlined.size}; + if (bv->inlined.size <= NANOARROW_BINARY_VIEW_INLINE_SIZE) { + out.data.as_uint8 = bv->inlined.data; + return out; + } + + out.data.data = array_view->variadic_buffers[bv->ref.buffer_index]; + out.data.as_uint8 += bv->ref.offset; + return out; +} + static inline int64_t ArrowArrayViewGetIntUnsafe(const struct ArrowArrayView* array_view, int64_t i) { const struct ArrowBufferView* data_view = &array_view->buffer_views[1]; @@ -3607,6 +4071,8 @@ static inline int64_t ArrowArrayViewGetIntUnsafe(const struct ArrowArrayView* ar return (int64_t)data_view->data.as_double[i]; case NANOARROW_TYPE_FLOAT: return (int64_t)data_view->data.as_float[i]; + case NANOARROW_TYPE_HALF_FLOAT: + return (int64_t)ArrowHalfFloatToFloat(data_view->data.as_uint16[i]); case NANOARROW_TYPE_BOOL: return ArrowBitGet(data_view->data.as_uint8, i); default: @@ -3640,6 +4106,8 @@ static inline uint64_t ArrowArrayViewGetUIntUnsafe( return (uint64_t)data_view->data.as_double[i]; case NANOARROW_TYPE_FLOAT: return (uint64_t)data_view->data.as_float[i]; + case NANOARROW_TYPE_HALF_FLOAT: + return (uint64_t)ArrowHalfFloatToFloat(data_view->data.as_uint16[i]); case NANOARROW_TYPE_BOOL: return ArrowBitGet(data_view->data.as_uint8, i); default: @@ -3672,6 +4140,8 @@ static inline double ArrowArrayViewGetDoubleUnsafe( return data_view->data.as_double[i]; case NANOARROW_TYPE_FLOAT: return data_view->data.as_float[i]; + case NANOARROW_TYPE_HALF_FLOAT: + return ArrowHalfFloatToFloat(data_view->data.as_uint16[i]); case NANOARROW_TYPE_BOOL: return ArrowBitGet(data_view->data.as_uint8, i); default: @@ -3703,6 +4173,14 @@ static inline struct ArrowStringView ArrowArrayViewGetStringUnsafe( view.size_bytes = array_view->layout.element_size_bits[1] / 8; view.data = array_view->buffer_views[1].data.as_char + (i * view.size_bytes); break; + case NANOARROW_TYPE_STRING_VIEW: + case NANOARROW_TYPE_BINARY_VIEW: { + struct ArrowBufferView buf_view = + ArrowArrayViewGetBytesFromViewArrayUnsafe(array_view, i); + view.data = buf_view.data.as_char; + view.size_bytes = buf_view.size_bytes; + break; + } default: view.data = NULL; view.size_bytes = 0; @@ -3737,6 +4215,10 @@ static inline struct ArrowBufferView ArrowArrayViewGetBytesUnsafe( view.data.as_uint8 = array_view->buffer_views[1].data.as_uint8 + (i * view.size_bytes); break; + case NANOARROW_TYPE_STRING_VIEW: + case NANOARROW_TYPE_BINARY_VIEW: + view = ArrowArrayViewGetBytesFromViewArrayUnsafe(array_view, i); + break; default: view.data.data = NULL; view.size_bytes = 0; diff --git a/c/vendor/nanoarrow/nanoarrow.hpp b/c/vendor/nanoarrow/nanoarrow.hpp index 49ba38f0da..16c2e55b9f 100644 --- a/c/vendor/nanoarrow/nanoarrow.hpp +++ b/c/vendor/nanoarrow/nanoarrow.hpp @@ -15,11 +15,12 @@ // specific language governing permissions and limitations // under the License. +#include #include #include #include -#include "nanoarrow/nanoarrow.h" +#include "nanoarrow.h" #ifndef NANOARROW_HPP_INCLUDED #define NANOARROW_HPP_INCLUDED @@ -216,10 +217,16 @@ template class Unique { public: /// \brief Construct an invalid instance of T holding no resources - Unique() { init_pointer(&data_); } + Unique() { + std::memset(&data_, 0, sizeof(data_)); + init_pointer(&data_); + } /// \brief Move and take ownership of data - Unique(T* data) { move_pointer(data, &data_); } + Unique(T* data) { + std::memset(&data_, 0, sizeof(data_)); + move_pointer(data, &data_); + } /// \brief Move and take ownership of data wrapped by rhs Unique(Unique&& rhs) : Unique(rhs.get()) {} diff --git a/c/vendor/vendor_nanoarrow.sh b/c/vendor/vendor_nanoarrow.sh index f74ebdeda8..9024090fe1 100755 --- a/c/vendor/vendor_nanoarrow.sh +++ b/c/vendor/vendor_nanoarrow.sh @@ -21,7 +21,7 @@ main() { local -r repo_url="https://github.com/apache/arrow-nanoarrow" # Check releases page: https://github.com/apache/arrow-nanoarrow/releases/ - local -r commit_sha=c5fb10035c17b598e6fd688ad9eb7b874c7c631b + local -r commit_sha=33d2c8b973d8f8f424e02ac92ddeaace2a92f8dd echo "Fetching $commit_sha from $repo_url" SCRATCH=$(mktemp -d) @@ -34,21 +34,13 @@ main() { mkdir -p nanoarrow tar --strip-components 1 -C "$SCRATCH" -xf "$tarball" - # Build the bundle using cmake. We could also use the dist/ files - # but this allows us to add the symbol namespace and ensures that the - # resulting bundle is perfectly synchronized with the commit we've pulled. - pushd "$SCRATCH" - mkdir build && cd build - # Do not use "adbc" in the namespace name since our scripts expose all - # such symbols - cmake .. -DNANOARROW_BUNDLE=ON -DNANOARROW_NAMESPACE=Private - cmake --build . - cmake --install . --prefix=../dist-adbc - popd + # Build the bundle + python "$SCRATCH/ci/scripts/bundle.py" \ + --symbol-namespace=Private \ + --include-output-dir=nanoarrow \ + --source-output-dir=nanoarrow \ + --header-namespace= - cp "$SCRATCH/dist-adbc/nanoarrow.c" nanoarrow/ - cp "$SCRATCH/dist-adbc/nanoarrow.h" nanoarrow/ - cp "$SCRATCH/dist-adbc/nanoarrow.hpp" nanoarrow/ mv CMakeLists.nanoarrow.tmp nanoarrow/CMakeLists.txt }