Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

convert date/datev2 to sql date and largeint into decimal(38,0) #125

Merged
merged 3 commits into from
Aug 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,12 @@
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.sql.Date;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
Expand All @@ -30,6 +35,7 @@
import org.apache.arrow.vector.BitVector;
import org.apache.arrow.vector.DecimalVector;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.FixedSizeBinaryVector;
import org.apache.arrow.vector.Float4Vector;
import org.apache.arrow.vector.Float8Vector;
import org.apache.arrow.vector.IntVector;
Expand All @@ -41,9 +47,12 @@
import org.apache.arrow.vector.complex.ListVector;
import org.apache.arrow.vector.ipc.ArrowStreamReader;
import org.apache.arrow.vector.types.Types;

import org.apache.doris.sdk.thrift.TScanBatchResult;
import org.apache.doris.spark.exception.DorisException;
import org.apache.doris.spark.rest.models.Schema;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.spark.sql.types.Decimal;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down Expand Up @@ -98,7 +107,7 @@ public RowBatch(TScanBatchResult nextResult, Schema schema) throws DorisExceptio
fieldVectors.size(), schema.size());
throw new DorisException("Load Doris data failed, schema size of fetch data is wrong.");
}
if (fieldVectors.size() == 0 || root.getRowCount() == 0) {
if (fieldVectors.isEmpty() || root.getRowCount() == 0) {
logger.debug("One batch in arrow has no data.");
continue;
}
Expand Down Expand Up @@ -190,6 +199,34 @@ public void convertArrowToRowBatch() throws DorisException {
addValueToRow(rowIndex, fieldValue);
}
break;
case "LARGEINT":
Preconditions.checkArgument(mt.equals(Types.MinorType.FIXEDSIZEBINARY) ||
mt.equals(Types.MinorType.VARCHAR), typeMismatchMessage(currentType, mt));
if (mt.equals(Types.MinorType.FIXEDSIZEBINARY)) {
FixedSizeBinaryVector largeIntVector = (FixedSizeBinaryVector) curFieldVector;
for (int rowIndex = 0; rowIndex < rowCountInOneBatch; rowIndex++) {
if (largeIntVector.isNull(rowIndex)) {
addValueToRow(rowIndex, null);
continue;
}
byte[] bytes = largeIntVector.get(rowIndex);
ArrayUtils.reverse(bytes);
BigInteger largeInt = new BigInteger(bytes);
addValueToRow(rowIndex, Decimal.apply(largeInt));
}
} else {
VarCharVector largeIntVector = (VarCharVector) curFieldVector;
for (int rowIndex = 0; rowIndex < rowCountInOneBatch; rowIndex++) {
if (largeIntVector.isNull(rowIndex)) {
addValueToRow(rowIndex, null);
continue;
}
String stringValue = new String(largeIntVector.get(rowIndex));
BigInteger largeInt = new BigInteger(stringValue);
addValueToRow(rowIndex, Decimal.apply(largeInt));
}
}
break;
case "FLOAT":
Preconditions.checkArgument(mt.equals(Types.MinorType.FLOAT4),
typeMismatchMessage(currentType, mt));
Expand Down Expand Up @@ -257,9 +294,21 @@ public void convertArrowToRowBatch() throws DorisException {
break;
case "DATE":
case "DATEV2":
Preconditions.checkArgument(mt.equals(Types.MinorType.VARCHAR),
typeMismatchMessage(currentType, mt));
VarCharVector date = (VarCharVector) curFieldVector;
for (int rowIndex = 0; rowIndex < rowCountInOneBatch; rowIndex++) {
if (date.isNull(rowIndex)) {
addValueToRow(rowIndex, null);
continue;
}
String stringValue = new String(date.get(rowIndex));
LocalDate localDate = LocalDate.parse(stringValue);
addValueToRow(rowIndex, Date.valueOf(localDate));
}
break;
case "DATETIME":
case "DATETIMEV2":
case "LARGEINT":
case "CHAR":
case "VARCHAR":
case "STRING":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,14 +102,14 @@ private[spark] object SchemaUtils {
case "BIGINT" => DataTypes.LongType
case "FLOAT" => DataTypes.FloatType
case "DOUBLE" => DataTypes.DoubleType
case "DATE" => DataTypes.StringType
case "DATEV2" => DataTypes.StringType
case "DATE" => DataTypes.DateType
case "DATEV2" => DataTypes.DateType
case "DATETIME" => DataTypes.StringType
case "DATETIMEV2" => DataTypes.StringType
case "BINARY" => DataTypes.BinaryType
case "DECIMAL" => DecimalType(precision, scale)
case "CHAR" => DataTypes.StringType
case "LARGEINT" => DataTypes.StringType
case "LARGEINT" => DecimalType(38,0)
case "VARCHAR" => DataTypes.StringType
case "JSONB" => DataTypes.StringType
case "DECIMALV2" => DecimalType(precision, scale)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,20 @@

package org.apache.doris.spark.serialization;

import static org.hamcrest.core.StringStartsWith.startsWith;

import java.io.ByteArrayOutputStream;
import java.math.BigDecimal;
import java.util.Arrays;
import java.util.List;
import java.util.NoSuchElementException;
import org.apache.doris.sdk.thrift.TScanBatchResult;
import org.apache.doris.sdk.thrift.TStatus;
import org.apache.doris.sdk.thrift.TStatusCode;
import org.apache.doris.spark.exception.DorisException;
import org.apache.doris.spark.rest.RestService;
import org.apache.doris.spark.rest.models.Schema;

import com.google.common.collect.ImmutableList;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.BigIntVector;
import org.apache.arrow.vector.BitVector;
import org.apache.arrow.vector.DecimalVector;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.FixedSizeBinaryVector;
import org.apache.arrow.vector.Float4Vector;
import org.apache.arrow.vector.Float8Vector;
import org.apache.arrow.vector.IntVector;
Expand All @@ -44,11 +45,7 @@
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.doris.sdk.thrift.TScanBatchResult;
import org.apache.doris.sdk.thrift.TStatus;
import org.apache.doris.sdk.thrift.TStatusCode;
import org.apache.doris.spark.rest.RestService;
import org.apache.doris.spark.rest.models.Schema;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.spark.sql.types.Decimal;
import org.junit.Assert;
import org.junit.Rule;
Expand All @@ -57,7 +54,16 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.ImmutableList;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.sql.Date;
import java.util.Arrays;
import java.util.List;
import java.util.NoSuchElementException;

import static org.hamcrest.core.StringStartsWith.startsWith;

public class TestRowBatch {
private final static Logger logger = LoggerFactory.getLogger(TestRowBatch.class);
Expand Down Expand Up @@ -250,7 +256,7 @@ public void testRowBatch() throws Exception {
1L,
(float) 1.1,
(double) 1.1,
"2008-08-08",
Date.valueOf("2008-08-08"),
"2008-08-08 00:00:00",
Decimal.apply(1234L, 4, 2),
"char1"
Expand All @@ -264,7 +270,7 @@ public void testRowBatch() throws Exception {
2L,
(float) 2.2,
(double) 2.2,
"1900-08-08",
Date.valueOf("1900-08-08"),
"1900-08-08 00:00:00",
Decimal.apply(8888L, 4, 2),
"char2"
Expand All @@ -278,23 +284,23 @@ public void testRowBatch() throws Exception {
3L,
(float) 3.3,
(double) 3.3,
"2100-08-08",
Date.valueOf("2100-08-08"),
"2100-08-08 00:00:00",
Decimal.apply(10L, 2, 0),
"char3"
);

Assert.assertTrue(rowBatch.hasNext());
List<Object> actualRow1 = rowBatch.next();
Assert.assertEquals(expectedRow1, actualRow1);
Assert.assertArrayEquals(expectedRow1.toArray(), actualRow1.toArray());

Assert.assertTrue(rowBatch.hasNext());
List<Object> actualRow2 = rowBatch.next();
Assert.assertEquals(expectedRow2, actualRow2);
Assert.assertArrayEquals(expectedRow2.toArray(), actualRow2.toArray());

Assert.assertTrue(rowBatch.hasNext());
List<Object> actualRow3 = rowBatch.next();
Assert.assertEquals(expectedRow3, actualRow3);
Assert.assertArrayEquals(expectedRow3.toArray(), actualRow3.toArray());

Assert.assertFalse(rowBatch.hasNext());
thrown.expect(NoSuchElementException.class);
Expand Down Expand Up @@ -437,4 +443,152 @@ public void testDecimalV2() throws Exception {
thrown.expectMessage(startsWith("Get row offset:"));
rowBatch.next();
}

@Test
public void testDate() throws DorisException, IOException {

    // Schema of the arrow batch: two nullable UTF-8 columns that Doris
    // serializes DATE / DATEV2 values into as "yyyy-MM-dd" strings.
    ImmutableList.Builder<Field> fieldBuilder = ImmutableList.builder();
    fieldBuilder.add(new Field("k1", FieldType.nullable(new ArrowType.Utf8()), null));
    fieldBuilder.add(new Field("k2", FieldType.nullable(new ArrowType.Utf8()), null));

    VectorSchemaRoot schemaRoot = VectorSchemaRoot.create(
            new org.apache.arrow.vector.types.pojo.Schema(fieldBuilder.build(), null),
            new RootAllocator(Integer.MAX_VALUE));
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    ArrowStreamWriter writer = new ArrowStreamWriter(
            schemaRoot,
            new DictionaryProvider.MapDictionaryProvider(),
            out);

    writer.start();
    schemaRoot.setRowCount(1);

    // Populate one DATE value in column k1.
    VarCharVector k1Vector = (VarCharVector) schemaRoot.getVector("k1");
    k1Vector.setInitialCapacity(1);
    k1Vector.allocateNew();
    k1Vector.setIndexDefined(0);
    k1Vector.setValueLengthSafe(0, 10);
    k1Vector.setSafe(0, "2023-08-09".getBytes());
    k1Vector.setValueCount(1);

    // Populate one DATEV2 value in column k2.
    VarCharVector k2Vector = (VarCharVector) schemaRoot.getVector("k2");
    k2Vector.setInitialCapacity(1);
    k2Vector.allocateNew();
    k2Vector.setIndexDefined(0);
    k2Vector.setValueLengthSafe(0, 10);
    k2Vector.setSafe(0, "2023-08-10".getBytes());
    k2Vector.setValueCount(1);

    writer.writeBatch();
    writer.end();
    writer.close();

    // Wrap the serialized arrow stream in a successful scan-batch result.
    TStatus status = new TStatus();
    status.setStatusCode(TStatusCode.OK);
    TScanBatchResult scanBatchResult = new TScanBatchResult();
    scanBatchResult.setStatus(status);
    scanBatchResult.setEos(false);
    scanBatchResult.setRows(out.toByteArray());

    String schemaStr = "{\"properties\":[" +
            "{\"type\":\"DATE\",\"name\":\"k1\",\"comment\":\"\"}, " +
            "{\"type\":\"DATEV2\",\"name\":\"k2\",\"comment\":\"\"}" +
            "], \"status\":200}";

    Schema schema = RestService.parseSchema(schemaStr, logger);

    RowBatch rowBatch = new RowBatch(scanBatchResult, schema);

    // Both DATE and DATEV2 columns must be converted to java.sql.Date.
    Assert.assertTrue(rowBatch.hasNext());
    List<Object> actualRow0 = rowBatch.next();
    Assert.assertEquals(Date.valueOf("2023-08-09"), actualRow0.get(0));
    Assert.assertEquals(Date.valueOf("2023-08-10"), actualRow0.get(1));

    // Reading past the last row must raise NoSuchElementException.
    Assert.assertFalse(rowBatch.hasNext());
    thrown.expect(NoSuchElementException.class);
    thrown.expectMessage(startsWith("Get row offset:"));
    rowBatch.next();

}

@Test
public void testLargeInt() throws DorisException, IOException {

    // Schema of the arrow batch: LARGEINT can arrive either as a UTF-8
    // decimal string (k1) or as a 16-byte little-endian two's-complement
    // fixed-size binary (k2); both must decode to Decimal(38, 0).
    ImmutableList.Builder<Field> childrenBuilder = ImmutableList.builder();
    childrenBuilder.add(new Field("k1", FieldType.nullable(new ArrowType.Utf8()), null));
    childrenBuilder.add(new Field("k2", FieldType.nullable(new ArrowType.FixedSizeBinary(16)), null));

    VectorSchemaRoot root = VectorSchemaRoot.create(
            new org.apache.arrow.vector.types.pojo.Schema(childrenBuilder.build(), null),
            new RootAllocator(Integer.MAX_VALUE));
    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
    ArrowStreamWriter arrowStreamWriter = new ArrowStreamWriter(
            root,
            new DictionaryProvider.MapDictionaryProvider(),
            outputStream);

    arrowStreamWriter.start();
    root.setRowCount(1);

    // k1: the value Long.MAX_VALUE + 1 as a decimal string — it cannot fit
    // in a BIGINT, so it exercises the genuine LARGEINT path.
    // (fixed typo: was "lageIntVector")
    VarCharVector largeIntStringVector = (VarCharVector) root.getVector("k1");
    largeIntStringVector.setInitialCapacity(1);
    largeIntStringVector.allocateNew();
    largeIntStringVector.setIndexDefined(0);
    largeIntStringVector.setValueLengthSafe(0, 19);
    largeIntStringVector.setSafe(0, "9223372036854775808".getBytes());
    largeIntStringVector.setValueCount(1);

    // k2: the same magnitude + 1 encoded as Doris does on the wire:
    // big-endian two's complement padded to 16 bytes, then byte-reversed
    // to little-endian (RowBatch reverses it back before BigInteger).
    FixedSizeBinaryVector largeIntBinaryVector = (FixedSizeBinaryVector) root.getVector("k2");
    largeIntBinaryVector.setInitialCapacity(1);
    largeIntBinaryVector.allocateNew();
    largeIntBinaryVector.setIndexDefined(0);
    byte[] bytes = new BigInteger("9223372036854775809").toByteArray();
    byte[] fixedBytes = new byte[16];
    System.arraycopy(bytes, 0, fixedBytes, 16 - bytes.length, bytes.length);
    ArrayUtils.reverse(fixedBytes);
    largeIntBinaryVector.setSafe(0, fixedBytes);
    largeIntBinaryVector.setValueCount(1);

    arrowStreamWriter.writeBatch();
    arrowStreamWriter.end();
    arrowStreamWriter.close();

    // Wrap the serialized arrow stream in a successful scan-batch result.
    TStatus status = new TStatus();
    status.setStatusCode(TStatusCode.OK);
    TScanBatchResult scanBatchResult = new TScanBatchResult();
    scanBatchResult.setStatus(status);
    scanBatchResult.setEos(false);
    scanBatchResult.setRows(outputStream.toByteArray());

    String schemaStr = "{\"properties\":[" +
            "{\"type\":\"LARGEINT\",\"name\":\"k1\",\"comment\":\"\"}, " +
            "{\"type\":\"LARGEINT\",\"name\":\"k2\",\"comment\":\"\"}" +
            "], \"status\":200}";

    Schema schema = RestService.parseSchema(schemaStr, logger);

    RowBatch rowBatch = new RowBatch(scanBatchResult, schema);

    // Both encodings must decode to the same Spark Decimal representation.
    Assert.assertTrue(rowBatch.hasNext());
    List<Object> actualRow0 = rowBatch.next();

    Assert.assertEquals(Decimal.apply(new BigInteger("9223372036854775808")), actualRow0.get(0));
    Assert.assertEquals(Decimal.apply(new BigInteger("9223372036854775809")), actualRow0.get(1));

    // Reading past the last row must raise NoSuchElementException.
    Assert.assertFalse(rowBatch.hasNext());
    thrown.expect(NoSuchElementException.class);
    thrown.expectMessage(startsWith("Get row offset:"));
    rowBatch.next();

}

}