Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-43267: [C#] Correctly import sliced arrays through the C Data interface #44117

Merged
merged 2 commits into from
Sep 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions csharp/src/Apache.Arrow/Apache.Arrow.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,16 @@
<Description>Apache Arrow is a cross-language development platform for in-memory data. It specifies a standardized language-independent columnar memory format for flat and hierarchical data, organized for efficient analytic operations on modern hardware.</Description>
</PropertyGroup>

<PropertyGroup Condition="'$(IsWindows)'=='true'">
<PropertyGroup>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just checking this was intentionally included in the PR? Seems reasonable to now build for net462 on non-Windows too, this built fine for me on Linux and I guess something made sure the reference assemblies / targeting pack was available.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was intentional, though arguably I should have put it into a separate PR. What I didn't realize when adding net462 support back was that the official NuGet packages aren't built on Windows and so they were missing the 4.6.2 assembly.

<TargetFrameworks>netstandard2.0;net6.0;net8.0;net462</TargetFrameworks>
</PropertyGroup>
<PropertyGroup Condition="'$(IsWindows)'!='true'">
<TargetFrameworks>netstandard2.0;net6.0;net8.0</TargetFrameworks>
</PropertyGroup>

<ItemGroup Condition="'$(TargetFrameworkIdentifier)' == '.NETStandard' or '$(TargetFramework)' == 'net462'">
<PackageReference Include="System.Buffers" Version="4.5.1" />
<PackageReference Include="System.Memory" Version="4.5.5" />
<PackageReference Include="System.Runtime.CompilerServices.Unsafe" Version="4.7.1" />
<PackageReference Include="System.Threading.Tasks.Extensions" Version="4.5.4" />
<PackageReference Include="System.ValueTuple" Version="4.5.0" />
</ItemGroup>

<ItemGroup>
Expand Down
22 changes: 11 additions & 11 deletions csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ private ArrayData[] ProcessStructChildren(CArrowArray* cArray, IReadOnlyList<Fie

private ArrowBuffer ImportValidityBuffer(CArrowArray* cArray)
{
int length = checked((int)cArray->length);
int length = checked((int)cArray->offset + (int)cArray->length);
int validityLength = checked((int)BitUtility.RoundUpToMultipleOf8(length) / 8);
return (cArray->buffers[0] == null) ? ArrowBuffer.Empty : new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[0], 0, validityLength));
}
Expand All @@ -285,7 +285,7 @@ private ArrowBuffer[] ImportByteArrayBuffers(CArrowArray* cArray)
throw new InvalidOperationException("Byte arrays are expected to have exactly three buffers");
}

int length = checked((int)cArray->length);
int length = checked((int)cArray->offset + (int)cArray->length);
int offsetsLength = (length + 1) * 4;
int* offsets = (int*)cArray->buffers[1];
Debug.Assert(offsets != null);
Expand All @@ -306,7 +306,7 @@ private ArrowBuffer[] ImportByteArrayViewBuffers(CArrowArray* cArray)
throw new InvalidOperationException("Byte array views are expected to have at least three buffers");
}

int length = checked((int)cArray->length);
int length = checked((int)cArray->offset + (int)cArray->length);
int viewsLength = length * 16;

long* bufferLengths = (long*)cArray->buffers[cArray->n_buffers - 1];
Expand Down Expand Up @@ -336,7 +336,7 @@ private ArrowBuffer[] ImportLargeByteArrayBuffers(CArrowArray* cArray)
$"is greater than the maximum supported large byte array length ({maxLength})");
}

int length = (int)cArray->length;
int length = checked((int)cArray->offset + (int)cArray->length);
int offsetsLength = (length + 1) * 8;
long* offsets = (long*)cArray->buffers[1];
Debug.Assert(offsets != null);
Expand Down Expand Up @@ -364,7 +364,7 @@ private ArrowBuffer[] ImportListBuffers(CArrowArray* cArray)
throw new InvalidOperationException("List arrays are expected to have exactly two buffers");
}

int length = checked((int)cArray->length);
int length = checked((int)cArray->offset + (int)cArray->length);
int offsetsLength = (length + 1) * 4;

ArrowBuffer[] buffers = new ArrowBuffer[2];
Expand All @@ -381,7 +381,7 @@ private ArrowBuffer[] ImportListViewBuffers(CArrowArray* cArray)
throw new InvalidOperationException("List view arrays are expected to have exactly three buffers");
}

int length = checked((int)cArray->length);
int length = checked((int)cArray->offset + (int)cArray->length);
int offsetsLength = length * 4;

ArrowBuffer[] buffers = new ArrowBuffer[3];
Expand All @@ -407,7 +407,7 @@ private ArrowBuffer[] ImportLargeListBuffers(CArrowArray* cArray)
$"is greater than the maximum supported large list array length ({maxLength})");
}

int length = (int)cArray->length;
int length = checked((int)cArray->offset + (int)cArray->length);
int offsetsLength = (length + 1) * 8;

ArrowBuffer[] buffers = new ArrowBuffer[2];
Expand Down Expand Up @@ -436,7 +436,7 @@ private ArrowBuffer[] ImportDenseUnionBuffers(CArrowArray* cArray)
{
throw new InvalidOperationException("Dense union arrays are expected to have exactly two children");
}
int length = checked((int)cArray->length);
int length = checked((int)cArray->offset + (int)cArray->length);
int offsetsLength = length * 4;

ArrowBuffer[] buffers = new ArrowBuffer[2];
Expand All @@ -454,7 +454,7 @@ private ArrowBuffer[] ImportSparseUnionBuffers(CArrowArray* cArray)
}

ArrowBuffer[] buffers = new ArrowBuffer[1];
buffers[0] = ImportCArrayBuffer(cArray, 0, checked((int)cArray->length));
buffers[0] = ImportCArrayBuffer(cArray, 0, checked((int)cArray->offset + (int)cArray->length));

return buffers;
}
Expand All @@ -467,10 +467,10 @@ private ArrowBuffer[] ImportFixedWidthBuffers(CArrowArray* cArray, int bitWidth)
}

// validity, data
int length = checked((int)cArray->length);
int length = checked((int)cArray->offset + (int)cArray->length);
int valuesLength;
if (bitWidth >= 8)
valuesLength = checked((int)(cArray->length * bitWidth / 8));
valuesLength = checked(length * bitWidth / 8);
else
valuesLength = checked((int)BitUtility.RoundUpToMultipleOf8(length) / 8);

Expand Down
11 changes: 11 additions & 0 deletions csharp/src/Apache.Arrow/RecordBatch.cs
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,17 @@ public RecordBatch Clone(MemoryAllocator allocator = default)
return new RecordBatch(Schema, arrays, Length);
}

public RecordBatch Slice(int offset, int length)
{
if (offset > Length)
{
throw new ArgumentException($"Offset {offset} cannot be greater than Length {Length} for RecordBatch.Slice");
}

length = Math.Min(Length - offset, length);
return new RecordBatch(Schema, _arrays.Select(a => ArrowArrayFactory.Slice(a, offset, length)), length);
}

public void Accept(IArrowArrayVisitor visitor)
{
switch (visitor)
Expand Down
2 changes: 0 additions & 2 deletions csharp/src/Apache.Arrow/Utility.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.

using Apache.Arrow.Flatbuf;
using System;
using System.Collections.Generic;
using System.Text;

namespace Apache.Arrow
{
Expand Down
4 changes: 3 additions & 1 deletion csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs
Original file line number Diff line number Diff line change
Expand Up @@ -566,7 +566,9 @@ private void CompareArrays(FixedSizeListArray actualArray)
var listSize = ((FixedSizeListType)expectedArray.Data.DataType).ListSize;
var expectedValuesSlice = ArrowArrayFactory.Slice(
expectedArray.Values, expectedArray.Offset * listSize, expectedArray.Length * listSize);
actualArray.Values.Accept(new ArrayComparer(expectedValuesSlice, _strictCompare));
var actualValuesSlice = ArrowArrayFactory.Slice(
actualArray.Values, actualArray.Offset * listSize, actualArray.Length * listSize);
actualValuesSlice.Accept(new ArrayComparer(expectedValuesSlice, _strictCompare));
}

private void CompareValidityBuffer(int nullCount, int arrayLength, ArrowBuffer expectedValidityBuffer, int expectedBufferOffset, ArrowBuffer actualValidityBuffer, int actualBufferOffset)
Expand Down
18 changes: 18 additions & 0 deletions csharp/test/Apache.Arrow.Tests/CDataInterfaceDataTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -92,5 +92,23 @@ public unsafe void CallsReleaseForInvalid()
GC.KeepAlive(releaseCallback);
}
#endif

[Fact]
public unsafe void RoundTripInt32ArrayWithOffset()
{
Int32Array array = new Int32Array.Builder()
.AppendRange(new[] { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 })
.Build();
IArrowArray sliced = array.Slice(2, 6);
CArrowArray* cArray = CArrowArray.Create();
CArrowArrayExporter.ExportArray(sliced, cArray);
using (var importedSlice = (Int32Array)CArrowArrayImporter.ImportArray(cArray, array.Data.DataType))
{
Assert.Equal(6, importedSlice.Length);
Assert.Equal(2, importedSlice.Offset);
Assert.Equal(2, importedSlice.GetValue(0));
}
CArrowArray.Free(cArray);
}
}
}
43 changes: 43 additions & 0 deletions csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -792,6 +792,49 @@ public unsafe void RoundTripTestBatch()
CArrowSchema.Free(cImportSchema);
}

[SkippableFact]
public unsafe void RoundTripTestSlicedBatch()
{
// TODO: Enable these once this the version of pyarrow referenced during testing supports them
HashSet<ArrowTypeId> unsupported = new HashSet<ArrowTypeId> { ArrowTypeId.ListView, ArrowTypeId.BinaryView, ArrowTypeId.StringView };
RecordBatch batch1 = TestData.CreateSampleRecordBatch(4, excludedTypes: unsupported);
RecordBatch batch1slice = batch1.Slice(1, 2);
RecordBatch batch2 = batch1slice.Clone();

CArrowArray* cExportArray = CArrowArray.Create();
CArrowArrayExporter.ExportRecordBatch(batch1slice, cExportArray);

CArrowSchema* cExportSchema = CArrowSchema.Create();
CArrowSchemaExporter.ExportSchema(batch1.Schema, cExportSchema);

CArrowArray* cImportArray = CArrowArray.Create();
CArrowSchema* cImportSchema = CArrowSchema.Create();

// For Python, we need to provide the pointers
long exportArrayPtr = ((IntPtr)cExportArray).ToInt64();
long exportSchemaPtr = ((IntPtr)cExportSchema).ToInt64();
long importArrayPtr = ((IntPtr)cImportArray).ToInt64();
long importSchemaPtr = ((IntPtr)cImportSchema).ToInt64();

using (Py.GIL())
{
dynamic pa = Py.Import("pyarrow");
dynamic exportedPyArray = pa.RecordBatch._import_from_c(exportArrayPtr, exportSchemaPtr);
exportedPyArray._export_to_c(importArrayPtr, importSchemaPtr);
}

Schema schema = CArrowSchemaImporter.ImportSchema(cImportSchema);
RecordBatch importedBatch = CArrowArrayImporter.ImportRecordBatch(cImportArray, schema);

ArrowReaderVerifier.CompareBatches(batch2, importedBatch, strictCompare: false); // Non-strict because span lengths won't match.

// Since we allocated, we are responsible for freeing the pointer.
CArrowArray.Free(cExportArray);
CArrowSchema.Free(cExportSchema);
CArrowArray.Free(cImportArray);
CArrowSchema.Free(cImportSchema);
}

[SkippableFact]
public unsafe void ExportBatchReader()
{
Expand Down
Loading