GH-44101: [C++][Parquet] Tools: Debug Print for Json should be valid JSON #44532

Merged: 4 commits, Oct 31, 2024
2 changes: 1 addition & 1 deletion cpp/src/parquet/CMakeLists.txt
@@ -320,7 +320,7 @@ if(ARROW_TESTING)
# "link" our dependencies so that include paths are configured
# correctly
target_link_libraries(parquet_testing PUBLIC ${ARROW_GTEST_GMOCK})
- list(APPEND PARQUET_TEST_LINK_LIBS parquet_testing)
+ list(APPEND PARQUET_TEST_LINK_LIBS parquet_testing RapidJSON)
Member Author commented:
Add RapidJSON to PARQUET_TEST_LINK_LIBS; hopefully this will not affect too much.

endif()

if(NOT ARROW_BUILD_SHARED)
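A note on the build change above: RapidJSON is a header-only library, so appending its target to PARQUET_TEST_LINK_LIBS mainly propagates include paths into the test executables rather than pulling in compiled code. A minimal stand-alone CMake sketch of that pattern (the interface-target wiring and the RAPIDJSON_INCLUDE_DIR variable are assumptions for illustration, not copied from Arrow's toolchain files):

# Hypothetical header-only dependency: an INTERFACE target carries
# include directories but contributes no object files.
add_library(RapidJSON INTERFACE)
target_include_directories(RapidJSON INTERFACE ${RAPIDJSON_INCLUDE_DIR})

# "Linking" the INTERFACE target only inherits its usage requirements.
add_executable(my_test my_test.cc)
target_link_libraries(my_test PRIVATE RapidJSON)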
56 changes: 30 additions & 26 deletions cpp/src/parquet/printer.cc
@@ -220,7 +220,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
bool hasRow;
do {
hasRow = false;
- for (auto scanner : scanners) {
+ for (const auto& scanner : scanners) {
if (scanner->HasNext()) {
hasRow = true;
scanner->PrintNext(stream, COL_WIDTH);
@@ -246,7 +246,7 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected
<< file_metadata->schema()->group_node()->field_count() << "\",\n";
stream << " \"NumberOfColumns\": \"" << file_metadata->num_columns() << "\",\n";

- if (selected_columns.size() == 0) {
+ if (selected_columns.empty()) {
for (int i = 0; i < file_metadata->num_columns(); i++) {
selected_columns.push_back(i);
}
@@ -299,70 +299,74 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected
<< column_chunk->num_values() << "\", "
<< "\"StatsSet\": ";
if (column_chunk->is_stats_set()) {
stream << "\"True\", \"Stats\": {";
stream << R"("True", "Stats": {)";
if (stats->HasNullCount()) {
stream << "\"NumNulls\": \"" << stats->null_count();
stream << R"("NumNulls": ")" << stats->null_count() << "\"";
}
if (stats->HasDistinctCount()) {
stream << "\", "
<< "\"DistinctValues\": \"" << stats->distinct_count();
stream << ", "
<< R"("DistinctValues": ")" << stats->distinct_count() << "\"";
}
if (stats->HasMinMax()) {
std::string min = stats->EncodeMin(), max = stats->EncodeMax();
stream << "\", "
<< "\"Max\": \"" << FormatStatValue(descr->physical_type(), max)
stream << ", "
<< R"("Max": ")" << FormatStatValue(descr->physical_type(), max)
<< "\", "
<< "\"Min\": \"" << FormatStatValue(descr->physical_type(), min);
<< R"("Min": ")" << FormatStatValue(descr->physical_type(), min) << "\"";
}
stream << "\" },";
stream << " },";
} else {
stream << "\"False\",";
}
stream << "\n \"Compression\": \""
<< ::arrow::internal::AsciiToUpper(
Codec::GetCodecAsString(column_chunk->compression()))
<< "\", \"Encodings\": \"";
<< R"(", "Encodings": )";
stream << "\"";
if (column_chunk->encoding_stats().empty()) {
for (auto encoding : column_chunk->encodings()) {
stream << EncodingToString(encoding) << " ";
}
} else {
PrintPageEncodingStats(stream, column_chunk->encoding_stats());
}
stream << "\", "
<< "\"UncompressedSize\": \"" << column_chunk->total_uncompressed_size()
<< "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size();
stream << "\"";
stream << ", "
<< R"("UncompressedSize": ")" << column_chunk->total_uncompressed_size()
<< R"(", "CompressedSize": ")" << column_chunk->total_compressed_size()
<< "\"";

if (column_chunk->bloom_filter_offset()) {
// Output BloomFilter {offset, length}
stream << "\", BloomFilter {"
<< "\"offset\": \"" << column_chunk->bloom_filter_offset().value();
stream << ", \"BloomFilter\": {"
<< R"("offset": ")" << column_chunk->bloom_filter_offset().value() << "\"";
if (column_chunk->bloom_filter_length()) {
stream << "\", \"length\": \"" << column_chunk->bloom_filter_length().value();
stream << R"(, "length": ")" << column_chunk->bloom_filter_length().value()
<< "\"";
}
stream << "\"}";
stream << "}";
}

if (column_chunk->GetColumnIndexLocation()) {
auto location = column_chunk->GetColumnIndexLocation().value();
// Output ColumnIndex {offset, length}
stream << "\", ColumnIndex {"
<< "\"offset\": \"" << location.offset;
stream << "\", \"length\": \"" << location.length;
stream << ", \"ColumnIndex\": {"
<< R"("offset": ")" << location.offset;
stream << R"(", "length": ")" << location.length;
stream << "\"}";
}

if (column_chunk->GetOffsetIndexLocation()) {
auto location = column_chunk->GetOffsetIndexLocation().value();
// Output OffsetIndex {offset, length}
stream << "\", OffsetIndex {"
<< "\"offset\": \"" << location.offset;
stream << "\", \"length\": \"" << location.length;
stream << "\"}";
stream << ", \"OffsetIndex\": {"
<< R"("offset": ")" << location.offset << "\"";
stream << R"(, "length": ")" << location.length << "\"";
stream << "}";
}

// end of a ColumnChunk
stream << "\" }";
stream << " }";
c1++;
if (c1 != static_cast<int>(selected_columns.size())) {
stream << ",\n";
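The printer rewrite above leans on C++ raw string literals, R"(...)", so that JSON fragments full of double quotes stay readable while emitting exactly the same bytes as the escaped form. A small self-contained sketch of the equivalence (the values are made up for illustration):

#include <iostream>

int main() {
  // Both statements print the same fragment:  "NumNulls": "0"
  std::cout << "\"NumNulls\": \"" << 0 << "\"\n";  // escape-heavy quoting
  std::cout << R"("NumNulls": ")" << 0 << "\"\n";  // raw string literal
  return 0;
}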
2 changes: 1 addition & 1 deletion cpp/src/parquet/printer.h
@@ -32,7 +32,7 @@ class PARQUET_EXPORT ParquetFilePrinter {

public:
explicit ParquetFilePrinter(ParquetFileReader* reader) : fileReader(reader) {}
- ~ParquetFilePrinter() {}
+ ~ParquetFilePrinter() = default;

void DebugPrint(std::ostream& stream, std::list<int> selected_columns,
bool print_values = false, bool format_dump = false,
32 changes: 32 additions & 0 deletions cpp/src/parquet/reader_test.cc
@@ -27,6 +27,12 @@
#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep

#include <rapidjson/document.h>
#include <rapidjson/error/en.h>
#include <rapidjson/stringbuffer.h>

#include "arrow/array.h"
#include "arrow/buffer.h"
#include "arrow/io/file.h"
@@ -47,6 +53,8 @@
#include "parquet/printer.h"
#include "parquet/test_util.h"

+ namespace rj = arrow::rapidjson;
+
using arrow::internal::checked_pointer_cast;
using arrow::internal::Zip;

@@ -1172,6 +1180,30 @@ TEST_F(TestJSONWithLocalFile, JSONOutputFLBA) {
EXPECT_THAT(json_content, testing::HasSubstr(json_contains));
}

+ // GH-44101: Test that JSON output is valid JSON
+ TEST_F(TestJSONWithLocalFile, ValidJsonOutput) {
+ auto check_json_valid = [](std::string_view json_string) -> ::arrow::Status {
+ rj::Document json_doc;
+ constexpr auto kParseFlags = rj::kParseFullPrecisionFlag | rj::kParseNanAndInfFlag;
+ json_doc.Parse<kParseFlags>(json_string.data(), json_string.length());
+ if (json_doc.HasParseError()) {
+ return ::arrow::Status::Invalid("JSON parse error at offset ",
+ json_doc.GetErrorOffset(), ": ",
+ rj::GetParseError_En(json_doc.GetParseError()));
+ }
+ return ::arrow::Status::OK();
+ };
+ std::vector<std::string_view> check_file_lists = {
+ "data_index_bloom_encoding_with_length.parquet",
+ "data_index_bloom_encoding_stats.parquet", "alltypes_tiny_pages_plain.parquet",
+ "concatenated_gzip_members.parquet", "nulls.snappy.parquet"};
+ for (const auto& file : check_file_lists) {
+ std::string json_content = ReadFromLocalFile(file);
+ ASSERT_OK(check_json_valid(json_content))
+ << "Invalid JSON output for file: " << file << ", content:" << json_content;
+ }
+ }
+
TEST(TestFileReader, BufferedReadsWithDictionary) {
const int num_rows = 1000;

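Beyond the GTest fixture, the same round trip can be driven directly through the public printer API. The sketch below is illustrative only: the helper name is hypothetical, and JSONPrint's trailing filename parameter is assumed from the printer interface rather than taken from this diff.

#include <list>
#include <memory>
#include <sstream>
#include <string>

#include <rapidjson/document.h>

#include "parquet/file_reader.h"
#include "parquet/printer.h"

// Hypothetical helper: dump a Parquet file's metadata as JSON and
// verify that the output parses cleanly with RapidJSON.
bool DumpIsValidJson(const std::string& path) {
  std::unique_ptr<parquet::ParquetFileReader> reader =
      parquet::ParquetFileReader::OpenFile(path);
  parquet::ParquetFilePrinter printer(reader.get());

  std::stringstream out;
  // An empty column list selects every column (see JSONPrint above); the
  // final argument is assumed to label the dump with the file name.
  printer.JSONPrint(out, std::list<int>{}, path.c_str());

  rapidjson::Document doc;
  doc.Parse(out.str().c_str());
  return !doc.HasParseError();
}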