Skip to content

Commit

Permalink
Re-implemented geometry statistics according to the updated spec:
Browse files Browse the repository at this point in the history
1. geometry statistics moved out of the Statistics struct; it is now a field of the column metadata (ColumnMetaData)
2. geometry statistics has been removed from the page index (ColumnIndex)
  • Loading branch information
Kontinuation committed Oct 30, 2024
1 parent fe8a3e5 commit ba80f3e
Show file tree
Hide file tree
Showing 21 changed files with 2,139 additions and 2,268 deletions.
1,445 changes: 1,214 additions & 231 deletions cpp/src/generated/parquet_types.cpp

Large diffs are not rendered by default.

1,303 changes: 202 additions & 1,101 deletions cpp/src/generated/parquet_types.h

Large diffs are not rendered by default.

177 changes: 72 additions & 105 deletions cpp/src/generated/parquet_types.tcc
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Autogenerated by Thrift Compiler (0.20.0)
* Autogenerated by Thrift Compiler (0.21.0)
*
* DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
* @generated
Expand Down Expand Up @@ -469,14 +469,6 @@ uint32_t Statistics::read(Protocol_* iprot) {
xfer += iprot->skip(ftype);
}
break;
case 9:
if (ftype == ::apache::thrift::protocol::T_STRUCT) {
xfer += this->geometry_stats.read(iprot);
this->__isset.geometry_stats = true;
} else {
xfer += iprot->skip(ftype);
}
break;
default:
xfer += iprot->skip(ftype);
break;
Expand Down Expand Up @@ -535,11 +527,6 @@ uint32_t Statistics::write(Protocol_* oprot) const {
xfer += oprot->writeBool(this->is_min_value_exact);
xfer += oprot->writeFieldEnd();
}
if (this->__isset.geometry_stats) {
xfer += oprot->writeFieldBegin("geometry_stats", ::apache::thrift::protocol::T_STRUCT, 9);
xfer += this->geometry_stats.write(oprot);
xfer += oprot->writeFieldEnd();
}
xfer += oprot->writeFieldStop();
xfer += oprot->writeStructEnd();
return xfer;
Expand Down Expand Up @@ -3406,6 +3393,14 @@ uint32_t ColumnMetaData::read(Protocol_* iprot) {
xfer += iprot->skip(ftype);
}
break;
case 17:
if (ftype == ::apache::thrift::protocol::T_STRUCT) {
xfer += this->geometry_stats.read(iprot);
this->__isset.geometry_stats = true;
} else {
xfer += iprot->skip(ftype);
}
break;
default:
xfer += iprot->skip(ftype);
break;
Expand Down Expand Up @@ -3544,6 +3539,11 @@ uint32_t ColumnMetaData::write(Protocol_* oprot) const {
xfer += this->size_statistics.write(oprot);
xfer += oprot->writeFieldEnd();
}
if (this->__isset.geometry_stats) {
xfer += oprot->writeFieldBegin("geometry_stats", ::apache::thrift::protocol::T_STRUCT, 17);
xfer += this->geometry_stats.write(oprot);
xfer += oprot->writeFieldEnd();
}
xfer += oprot->writeFieldStop();
xfer += oprot->writeStructEnd();
return xfer;
Expand Down Expand Up @@ -4556,26 +4556,6 @@ uint32_t ColumnIndex::read(Protocol_* iprot) {
xfer += iprot->skip(ftype);
}
break;
case 8:
if (ftype == ::apache::thrift::protocol::T_LIST) {
{
this->geometry_stats.clear();
uint32_t _size319;
::apache::thrift::protocol::TType _etype322;
xfer += iprot->readListBegin(_etype322, _size319);
this->geometry_stats.resize(_size319);
uint32_t _i323;
for (_i323 = 0; _i323 < _size319; ++_i323)
{
xfer += this->geometry_stats[_i323].read(iprot);
}
xfer += iprot->readListEnd();
}
this->__isset.geometry_stats = true;
} else {
xfer += iprot->skip(ftype);
}
break;
default:
xfer += iprot->skip(ftype);
break;
Expand Down Expand Up @@ -4605,10 +4585,10 @@ uint32_t ColumnIndex::write(Protocol_* oprot) const {
xfer += oprot->writeFieldBegin("null_pages", ::apache::thrift::protocol::T_LIST, 1);
{
xfer += oprot->writeListBegin(::apache::thrift::protocol::T_BOOL, static_cast<uint32_t>(this->null_pages.size()));
std::vector<bool> ::const_iterator _iter324;
for (_iter324 = this->null_pages.begin(); _iter324 != this->null_pages.end(); ++_iter324)
std::vector<bool> ::const_iterator _iter319;
for (_iter319 = this->null_pages.begin(); _iter319 != this->null_pages.end(); ++_iter319)
{
xfer += oprot->writeBool((*_iter324));
xfer += oprot->writeBool((*_iter319));
}
xfer += oprot->writeListEnd();
}
Expand All @@ -4617,10 +4597,10 @@ uint32_t ColumnIndex::write(Protocol_* oprot) const {
xfer += oprot->writeFieldBegin("min_values", ::apache::thrift::protocol::T_LIST, 2);
{
xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast<uint32_t>(this->min_values.size()));
std::vector<std::string> ::const_iterator _iter325;
for (_iter325 = this->min_values.begin(); _iter325 != this->min_values.end(); ++_iter325)
std::vector<std::string> ::const_iterator _iter320;
for (_iter320 = this->min_values.begin(); _iter320 != this->min_values.end(); ++_iter320)
{
xfer += oprot->writeBinary((*_iter325));
xfer += oprot->writeBinary((*_iter320));
}
xfer += oprot->writeListEnd();
}
Expand All @@ -4629,10 +4609,10 @@ uint32_t ColumnIndex::write(Protocol_* oprot) const {
xfer += oprot->writeFieldBegin("max_values", ::apache::thrift::protocol::T_LIST, 3);
{
xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast<uint32_t>(this->max_values.size()));
std::vector<std::string> ::const_iterator _iter326;
for (_iter326 = this->max_values.begin(); _iter326 != this->max_values.end(); ++_iter326)
std::vector<std::string> ::const_iterator _iter321;
for (_iter321 = this->max_values.begin(); _iter321 != this->max_values.end(); ++_iter321)
{
xfer += oprot->writeBinary((*_iter326));
xfer += oprot->writeBinary((*_iter321));
}
xfer += oprot->writeListEnd();
}
Expand All @@ -4646,10 +4626,10 @@ uint32_t ColumnIndex::write(Protocol_* oprot) const {
xfer += oprot->writeFieldBegin("null_counts", ::apache::thrift::protocol::T_LIST, 5);
{
xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I64, static_cast<uint32_t>(this->null_counts.size()));
std::vector<int64_t> ::const_iterator _iter327;
for (_iter327 = this->null_counts.begin(); _iter327 != this->null_counts.end(); ++_iter327)
std::vector<int64_t> ::const_iterator _iter322;
for (_iter322 = this->null_counts.begin(); _iter322 != this->null_counts.end(); ++_iter322)
{
xfer += oprot->writeI64((*_iter327));
xfer += oprot->writeI64((*_iter322));
}
xfer += oprot->writeListEnd();
}
Expand All @@ -4659,10 +4639,10 @@ uint32_t ColumnIndex::write(Protocol_* oprot) const {
xfer += oprot->writeFieldBegin("repetition_level_histograms", ::apache::thrift::protocol::T_LIST, 6);
{
xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I64, static_cast<uint32_t>(this->repetition_level_histograms.size()));
std::vector<int64_t> ::const_iterator _iter328;
for (_iter328 = this->repetition_level_histograms.begin(); _iter328 != this->repetition_level_histograms.end(); ++_iter328)
std::vector<int64_t> ::const_iterator _iter323;
for (_iter323 = this->repetition_level_histograms.begin(); _iter323 != this->repetition_level_histograms.end(); ++_iter323)
{
xfer += oprot->writeI64((*_iter328));
xfer += oprot->writeI64((*_iter323));
}
xfer += oprot->writeListEnd();
}
Expand All @@ -4672,23 +4652,10 @@ uint32_t ColumnIndex::write(Protocol_* oprot) const {
xfer += oprot->writeFieldBegin("definition_level_histograms", ::apache::thrift::protocol::T_LIST, 7);
{
xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I64, static_cast<uint32_t>(this->definition_level_histograms.size()));
std::vector<int64_t> ::const_iterator _iter329;
for (_iter329 = this->definition_level_histograms.begin(); _iter329 != this->definition_level_histograms.end(); ++_iter329)
{
xfer += oprot->writeI64((*_iter329));
}
xfer += oprot->writeListEnd();
}
xfer += oprot->writeFieldEnd();
}
if (this->__isset.geometry_stats) {
xfer += oprot->writeFieldBegin("geometry_stats", ::apache::thrift::protocol::T_LIST, 8);
{
xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->geometry_stats.size()));
std::vector<GeometryStatistics> ::const_iterator _iter330;
for (_iter330 = this->geometry_stats.begin(); _iter330 != this->geometry_stats.end(); ++_iter330)
std::vector<int64_t> ::const_iterator _iter324;
for (_iter324 = this->definition_level_histograms.begin(); _iter324 != this->definition_level_histograms.end(); ++_iter324)
{
xfer += (*_iter330).write(oprot);
xfer += oprot->writeI64((*_iter324));
}
xfer += oprot->writeListEnd();
}
Expand Down Expand Up @@ -4976,14 +4943,14 @@ uint32_t FileMetaData::read(Protocol_* iprot) {
if (ftype == ::apache::thrift::protocol::T_LIST) {
{
this->schema.clear();
uint32_t _size347;
::apache::thrift::protocol::TType _etype350;
xfer += iprot->readListBegin(_etype350, _size347);
this->schema.resize(_size347);
uint32_t _i351;
for (_i351 = 0; _i351 < _size347; ++_i351)
uint32_t _size341;
::apache::thrift::protocol::TType _etype344;
xfer += iprot->readListBegin(_etype344, _size341);
this->schema.resize(_size341);
uint32_t _i345;
for (_i345 = 0; _i345 < _size341; ++_i345)
{
xfer += this->schema[_i351].read(iprot);
xfer += this->schema[_i345].read(iprot);
}
xfer += iprot->readListEnd();
}
Expand All @@ -5004,14 +4971,14 @@ uint32_t FileMetaData::read(Protocol_* iprot) {
if (ftype == ::apache::thrift::protocol::T_LIST) {
{
this->row_groups.clear();
uint32_t _size352;
::apache::thrift::protocol::TType _etype355;
xfer += iprot->readListBegin(_etype355, _size352);
this->row_groups.resize(_size352);
uint32_t _i356;
for (_i356 = 0; _i356 < _size352; ++_i356)
uint32_t _size346;
::apache::thrift::protocol::TType _etype349;
xfer += iprot->readListBegin(_etype349, _size346);
this->row_groups.resize(_size346);
uint32_t _i350;
for (_i350 = 0; _i350 < _size346; ++_i350)
{
xfer += this->row_groups[_i356].read(iprot);
xfer += this->row_groups[_i350].read(iprot);
}
xfer += iprot->readListEnd();
}
Expand All @@ -5024,14 +4991,14 @@ uint32_t FileMetaData::read(Protocol_* iprot) {
if (ftype == ::apache::thrift::protocol::T_LIST) {
{
this->key_value_metadata.clear();
uint32_t _size357;
::apache::thrift::protocol::TType _etype360;
xfer += iprot->readListBegin(_etype360, _size357);
this->key_value_metadata.resize(_size357);
uint32_t _i361;
for (_i361 = 0; _i361 < _size357; ++_i361)
uint32_t _size351;
::apache::thrift::protocol::TType _etype354;
xfer += iprot->readListBegin(_etype354, _size351);
this->key_value_metadata.resize(_size351);
uint32_t _i355;
for (_i355 = 0; _i355 < _size351; ++_i355)
{
xfer += this->key_value_metadata[_i361].read(iprot);
xfer += this->key_value_metadata[_i355].read(iprot);
}
xfer += iprot->readListEnd();
}
Expand All @@ -5052,14 +5019,14 @@ uint32_t FileMetaData::read(Protocol_* iprot) {
if (ftype == ::apache::thrift::protocol::T_LIST) {
{
this->column_orders.clear();
uint32_t _size362;
::apache::thrift::protocol::TType _etype365;
xfer += iprot->readListBegin(_etype365, _size362);
this->column_orders.resize(_size362);
uint32_t _i366;
for (_i366 = 0; _i366 < _size362; ++_i366)
uint32_t _size356;
::apache::thrift::protocol::TType _etype359;
xfer += iprot->readListBegin(_etype359, _size356);
this->column_orders.resize(_size356);
uint32_t _i360;
for (_i360 = 0; _i360 < _size356; ++_i360)
{
xfer += this->column_orders[_i366].read(iprot);
xfer += this->column_orders[_i360].read(iprot);
}
xfer += iprot->readListEnd();
}
Expand Down Expand Up @@ -5117,10 +5084,10 @@ uint32_t FileMetaData::write(Protocol_* oprot) const {
xfer += oprot->writeFieldBegin("schema", ::apache::thrift::protocol::T_LIST, 2);
{
xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->schema.size()));
std::vector<SchemaElement> ::const_iterator _iter367;
for (_iter367 = this->schema.begin(); _iter367 != this->schema.end(); ++_iter367)
std::vector<SchemaElement> ::const_iterator _iter361;
for (_iter361 = this->schema.begin(); _iter361 != this->schema.end(); ++_iter361)
{
xfer += (*_iter367).write(oprot);
xfer += (*_iter361).write(oprot);
}
xfer += oprot->writeListEnd();
}
Expand All @@ -5133,10 +5100,10 @@ uint32_t FileMetaData::write(Protocol_* oprot) const {
xfer += oprot->writeFieldBegin("row_groups", ::apache::thrift::protocol::T_LIST, 4);
{
xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->row_groups.size()));
std::vector<RowGroup> ::const_iterator _iter368;
for (_iter368 = this->row_groups.begin(); _iter368 != this->row_groups.end(); ++_iter368)
std::vector<RowGroup> ::const_iterator _iter362;
for (_iter362 = this->row_groups.begin(); _iter362 != this->row_groups.end(); ++_iter362)
{
xfer += (*_iter368).write(oprot);
xfer += (*_iter362).write(oprot);
}
xfer += oprot->writeListEnd();
}
Expand All @@ -5146,10 +5113,10 @@ uint32_t FileMetaData::write(Protocol_* oprot) const {
xfer += oprot->writeFieldBegin("key_value_metadata", ::apache::thrift::protocol::T_LIST, 5);
{
xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->key_value_metadata.size()));
std::vector<KeyValue> ::const_iterator _iter369;
for (_iter369 = this->key_value_metadata.begin(); _iter369 != this->key_value_metadata.end(); ++_iter369)
std::vector<KeyValue> ::const_iterator _iter363;
for (_iter363 = this->key_value_metadata.begin(); _iter363 != this->key_value_metadata.end(); ++_iter363)
{
xfer += (*_iter369).write(oprot);
xfer += (*_iter363).write(oprot);
}
xfer += oprot->writeListEnd();
}
Expand All @@ -5164,10 +5131,10 @@ uint32_t FileMetaData::write(Protocol_* oprot) const {
xfer += oprot->writeFieldBegin("column_orders", ::apache::thrift::protocol::T_LIST, 7);
{
xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->column_orders.size()));
std::vector<ColumnOrder> ::const_iterator _iter370;
for (_iter370 = this->column_orders.begin(); _iter370 != this->column_orders.end(); ++_iter370)
std::vector<ColumnOrder> ::const_iterator _iter364;
for (_iter364 = this->column_orders.begin(); _iter364 != this->column_orders.end(); ++_iter364)
{
xfer += (*_iter370).write(oprot);
xfer += (*_iter364).write(oprot);
}
xfer += oprot->writeListEnd();
}
Expand Down
1 change: 1 addition & 0 deletions cpp/src/parquet/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ set(PARQUET_SRCS
exception.cc
file_reader.cc
file_writer.cc
geometry_statistics.cc
level_comparison.cc
level_conversion.cc
metadata.cc
Expand Down
3 changes: 0 additions & 3 deletions cpp/src/parquet/column_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -243,9 +243,6 @@ EncodedStatistics ExtractStatsFromHeader(const H& header) {
if (stats.__isset.distinct_count) {
page_statistics.set_distinct_count(stats.distinct_count);
}
if (stats.__isset.geometry_stats) {
page_statistics.set_geometry(FromThrift(stats.geometry_stats));
}
return page_statistics;
}

Expand Down
Loading

0 comments on commit ba80f3e

Please sign in to comment.