/** * Statistics per row group and per page * All fields are optional. */ structStatistics { /** count of null value in the column */ 3: optional i64 null_count; /** count of distinct values occurring */ 4: optional i64 distinct_count; /** * Min and max values for the column, determined by its ColumnOrder. * * Values are encoded using PLAIN encoding, except that variable-length byte * arrays do not include a length prefix. */ 5: optional binary max_value; 6: optional binary min_value; }
/** * Description for column metadata */ structColumnMetaData { /** Type of this column **/ 1: required Type type
/** Set of all encodings used for this column. The purpose is to validate * whether we can decode those pages. **/ 2: required list<Encoding> encodings
/** Path in schema **/ 3: required list<string> path_in_schema
/** optional statistics for this column chunk */ 12: optional Statistics statistics;
/** Set of all encodings used for pages in this column chunk. * This information can be used to determine if all data pages are * dictionary encoded for example **/ 13: optional list<PageEncodingStats> encoding_stats; }
/** * Description for file metadata */ structFileMetaData { /** Parquet schema for this file. This schema contains metadata for all the columns. * The schema is represented as a tree with a single root. The nodes of the tree * are flattened to a list by doing a depth-first traversal. * The column metadata contains the path in the schema for that column which can be * used to map columns to nodes in the schema. * The first element is the root **/ 2: required list<SchemaElement> schema;
/** String for application that wrote this file. This should be in the format * <Application> version <App Version> (build <App Build Hash>). * e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55) **/ 6: optional string created_by
/** * Sort order used for the min_value and max_value fields in the Statistics * objects and the min_values and max_values fields in the ColumnIndex * objects of each column in this file. Sort orders are listed in the order * matching the columns in the schema. The indexes are not necessary the same * though, because only leaf nodes of the schema are represented in the list * of sort orders. * * Without column_orders, the meaning of the min_value and max_value fields * in the Statistics object and the ColumnIndex object is undefined. To ensure * well-defined behaviour, if these fields are written to a Parquet file, * column_orders must be written as well. * * The obsolete min and max fields in the Statistics object are always sorted * by signed comparison regardless of column_orders. */ 7: optional list<ColumnOrder> column_orders; }
/** Empty struct to signal the order defined by the physical or logical type */ structTypeDefinedOrder {}
/** * Union to specify the order used for the min_value and max_value fields for a * column. This union takes the role of an enhanced enum that allows rich * elements (which will be needed for a collation-based ordering in the future). * * Possible values are: * * TypeDefinedOrder - the column uses the order defined by its logical or * physical type (if there is no logical type). * * If the reader does not support the value of this union, min and max stats * for this column should be ignored. */ unionColumnOrder {
/** * The sort orders for logical types are: * UTF8 - unsigned byte-wise comparison * INT8 - signed comparison * INT16 - signed comparison * INT32 - signed comparison * INT64 - signed comparison * UINT8 - unsigned comparison * UINT16 - unsigned comparison * UINT32 - unsigned comparison * UINT64 - unsigned comparison * DECIMAL - signed comparison of the represented value * DATE - signed comparison * TIME_MILLIS - signed comparison * TIME_MICROS - signed comparison * TIMESTAMP_MILLIS - signed comparison * TIMESTAMP_MICROS - signed comparison * INTERVAL - unsigned comparison * JSON - unsigned byte-wise comparison * BSON - unsigned byte-wise comparison * ENUM - unsigned byte-wise comparison * LIST - undefined * MAP - undefined * * In the absence of logical types, the sort order is determined by the physical type: * BOOLEAN - false, true * INT32 - signed comparison * INT64 - signed comparison * INT96 (only used for legacy timestamps) - undefined * FLOAT - signed comparison of the represented value (*) * DOUBLE - signed comparison of the represented value (*) * BYTE_ARRAY - unsigned byte-wise comparison * FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison * * (*) Because the sorting order is not specified properly for floating * point values (relations vs. total ordering) the following * compatibility rules should be applied when reading statistics: * - If the min is a NaN, it should be ignored. * - If the max is a NaN, it should be ignored. * - If the min is +0, the row group may contain -0 values as well. * - If the max is -0, the row group may contain +0 values as well. * - When looking for NaN values, min and max should be ignored. * * When writing statistics the following rules should be followed: * - NaNs should not be written to min or max statistics fields. * - If the computed max value is zero (whether negative or positive), * `+0.0` should be written into the max statistics field. * - If the computed min value is zero (whether negative or positive), * `-0.0` should be written into the min statistics field. */ 1: TypeDefinedOrder TYPE_ORDER; }
template <typename DType> structUnsignedCompareHelperBase { using T = typename DType::c_type; using UCType = typename std::make_unsigned<T>::type;
static_assert(!std::is_same<T, UCType>::value, "T is unsigned"); static_assert(sizeof(T) == sizeof(UCType), "T and UCType not the same size");
// NOTE: according to the C++ spec, unsigned-to-signed conversion is // implementation-defined if the original value does not fit in the signed type // (i.e., two's complement cannot be assumed even on mainstream machines, // because the compiler may decide otherwise). Hence the use of `SafeCopy` // below for deterministic bit-casting. // (see "Integer conversions" in // https://en.cppreference.com/w/cpp/language/implicit_conversion)
staticconst T DefaultMin(){ returnSafeCopy<T>(std::numeric_limits<UCType>::max()); } staticconst T DefaultMax(){ return0; }
staticboolCompare(int type_length, T a, T b){ returnSafeCopy<UCType>(a) < SafeCopy<UCType>(b); }
static T Min(int type_length, T a, T b){ returnCompare(type_length, a, b) ? a : b; } static T Max(int type_length, T a, T b){ returnCompare(type_length, a, b) ? b : a; } };
template <typename DType, bool is_signed> structCompareHelper { using T = typename DType::c_type;
static_assert(!std::is_unsigned<T>::value || std::is_same<T, bool>::value, "T is an unsigned numeric");
constexprstatic T DefaultMin(){ return std::numeric_limits<T>::max(); } constexprstatic T DefaultMax(){ return std::numeric_limits<T>::lowest(); }
// MSVC17 fix, isnan is not overloaded for IntegralType as per C++11 // standard requirements. template <typename T1 = T> static ::arrow::enable_if_t<std::is_floating_point<T1>::value, T> Coalesce(T val, T fallback) { return std::isnan(val) ? fallback : val; }
template <typename T1 = T> static ::arrow::enable_if_t<!std::is_floating_point<T1>::value, T> Coalesce( T val, T fallback) { return val; }
staticinlineboolCompare(int type_length, const T& a, const T& b){ return a < b; }
static T Min(int type_length, T a, T b){ return a < b ? a : b; } static T Max(int type_length, T a, T b){ return a < b ? b : a; } };
T min = Helper::DefaultMin(); T max = Helper::DefaultMax();
for (int64_t i = 0; i < length; i++) { constauto val = SafeLoad(values + i); min = Helper::Min(type_length_, min, Helper::Coalesce(val, Helper::DefaultMin())); max = Helper::Max(type_length_, max, Helper::Coalesce(val, Helper::DefaultMax())); }
// In case of floating point types, the following rules are applied (as per // upstream parquet-mr): // - If any of min/max is NaN, return nothing. // - If min is 0.0f, replace with -0.0f // - If max is -0.0f, replace with 0.0f template <typename T> ::arrow::enable_if_t<std::is_floating_point<T>::value, optional<std::pair<T, T>>> CleanStatistic(std::pair<T, T> min_max) { T min = min_max.first; T max = min_max.second;
// Ignore if one of the value is nan. if (std::isnan(min) || std::isnan(max)) { return ::std::nullopt; }
if (min == std::numeric_limits<T>::max() && max == std::numeric_limits<T>::lowest()) { return ::std::nullopt; }
T zero{};
if (min == zero && !std::signbit(min)) { min = -min; }
if (max == zero && std::signbit(max)) { max = -max; }
return {{min, max}}; }
voidSetMinMaxPair(std::pair<T, T> min_max){ // CleanStatistic can return a nullopt in case of erroneous values, e.g. NaN auto maybe_min_max = CleanStatistic(min_max); if (!maybe_min_max) return;
auto min = maybe_min_max.value().first; auto max = maybe_min_max.value().second;
我们上节介绍 Schema 的时候没介绍 SortOrder,因为 Parquet Standard 里面没有这个东西(有叫 sort_order 的别的东西)。这个是 C++ Parquet 内部实现 order 用的:
1 2 3 4 5
SortOrder::type sort_order()const{ auto la = logical_type(); auto pt = physical_type(); return la ? GetSortOrder(la, pt) : GetSortOrder(converted_type(), pt); }
staticinline format::Statistics ToThrift(const EncodedStatistics& stats){ format::Statistics statistics; if (stats.has_min) { statistics.__set_min_value(stats.min()); // If the order is SIGNED, then the old min value must be set too. // This for backward compatibility if (stats.is_signed()) { statistics.__set_min(stats.min()); } } if (stats.has_max) { statistics.__set_max_value(stats.max()); // If the order is SIGNED, then the old max value must be set too. // This for backward compatibility if (stats.is_signed()) { statistics.__set_max(stats.max()); } } if (stats.has_null_count) { statistics.__set_null_count(stats.null_count); } if (stats.has_distinct_count) { statistics.__set_distinct_count(stats.distinct_count); }
return statistics; }
上面是 Page 层,ColumnChunk 层会 Merge Page 层的,直到完成:
1 2 3 4 5 6
voidResetPageStatistics()override{ if (chunk_statistics_ != nullptr) { chunk_statistics_->Merge(*page_statistics_); page_statistics_->Reset(); } }
// Extracts encoded statistics from V1 and V2 data page headers template <typename H> EncodedStatistics ExtractStatsFromHeader(const H& header){ EncodedStatistics page_statistics; if (!header.__isset.statistics) { return page_statistics; } const format::Statistics& stats = header.statistics; // Use the new V2 min-max statistics over the former one if it is filled if (stats.__isset.max_value || stats.__isset.min_value) { // TODO: check if the column_order is TYPE_DEFINED_ORDER. if (stats.__isset.max_value) { page_statistics.set_max(stats.max_value); } if (stats.__isset.min_value) { page_statistics.set_min(stats.min_value); } } elseif (stats.__isset.max || stats.__isset.min) { // TODO: check created_by to see if it is corrupted for some types. // TODO: check if the sort_order is SIGNED. if (stats.__isset.max) { page_statistics.set_max(stats.max); } if (stats.__isset.min) { page_statistics.set_min(stats.min); } } if (stats.__isset.null_count) { page_statistics.set_null_count(stats.null_count); } if (stats.__isset.distinct_count) { page_statistics.set_distinct_count(stats.distinct_count); } return page_statistics; }
/** Set of all encodings used for pages in this column chunk. * This information can be used to determine if all data pages are * dictionary encoded for example **/ 13: optional list<PageEncodingStats> encoding_stats;
/** * Description for ColumnIndex. * Each <array-field>[i] refers to the page at OffsetIndex.page_locations[i] */ structColumnIndex { /** * A list of Boolean values to determine the validity of the corresponding * min and max values. If true, a page contains only null values, and writers * have to set the corresponding entries in min_values and max_values to * byte[0], so that all lists have the same length. If false, the * corresponding entries in min_values and max_values must be valid. */ 1: required list<bool> null_pages
/** * Two lists containing lower and upper bounds for the values of each page * determined by the ColumnOrder of the column. These may be the actual * minimum and maximum values found on a page, but can also be (more compact) * values that do not exist on a page. For example, instead of storing ""Blart * Versenwald III", a writer may set min_values[i]="B", max_values[i]="C". * Such more compact values must still be valid values within the column's * logical type. Readers must make sure that list entries are populated before * using them by inspecting null_pages. */ 2: required list<binary> min_values 3: required list<binary> max_values
/** * Stores whether both min_values and max_values are ordered and if so, in * which direction. This allows readers to perform binary searches in both * lists. Readers cannot assume that max_values[i] <= min_values[i+1], even * if the lists are ordered. */ 4: required BoundaryOrder boundary_order
/** A list containing the number of null values for each page **/ 5: optional list<i64> null_counts }