// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #include "parquet/arrow/reader.h" #include #include #include #include #include #include #include #include #include #include "arrow/api.h" #include "arrow/util/bit-util.h" #include "arrow/util/decimal.h" #include "arrow/util/logging.h" #include "arrow/util/parallel.h" #include "parquet/arrow/record_reader.h" #include "parquet/arrow/schema.h" #include "parquet/column_reader.h" #include "parquet/schema.h" #include "parquet/util/schema-util.h" using arrow::Array; using arrow::BooleanArray; using arrow::Column; using arrow::Field; using arrow::Int32Array; using arrow::ListArray; using arrow::MemoryPool; using arrow::PoolBuffer; using arrow::Status; using arrow::StructArray; using arrow::Table; using arrow::TimestampArray; using parquet::schema::Node; // Help reduce verbosity using ParquetReader = parquet::ParquetFileReader; using arrow::ParallelFor; using parquet::internal::RecordReader; namespace parquet { namespace arrow { using ::arrow::BitUtil::BytesForBits; constexpr int64_t kJulianToUnixEpochDays = 2440588LL; constexpr int64_t kMillisecondsInADay = 86400000LL; constexpr int64_t kNanosecondsInADay = kMillisecondsInADay * 1000LL * 1000LL; static inline int64_t impala_timestamp_to_nanoseconds(const Int96& impala_timestamp) { int64_t days_since_epoch = impala_timestamp.value[2] - kJulianToUnixEpochDays; int64_t nanoseconds = *(reinterpret_cast(&(impala_timestamp.value))); return days_since_epoch * kNanosecondsInADay + nanoseconds; } template using ArrayType = typename ::arrow::TypeTraits::ArrayType; // ---------------------------------------------------------------------- // Iteration utilities // Abstraction to decouple row group iteration details from the ColumnReader, // so we can read only a single row group if we want class FileColumnIterator { public: explicit FileColumnIterator(int column_index, ParquetFileReader* reader) : column_index_(column_index), reader_(reader), schema_(reader->metadata()->schema()) {} virtual ~FileColumnIterator() {} virtual std::unique_ptr<::parquet::PageReader> NextChunk() = 0; const SchemaDescriptor* schema() const { return schema_; } const ColumnDescriptor* descr() const { return schema_->Column(column_index_); } std::shared_ptr metadata() const { return reader_->metadata(); } int column_index() const { return column_index_; } protected: int column_index_; ParquetFileReader* reader_; const SchemaDescriptor* schema_; }; class AllRowGroupsIterator : public FileColumnIterator { public: explicit AllRowGroupsIterator(int column_index, ParquetFileReader* reader) : FileColumnIterator(column_index, reader), next_row_group_(0) {} std::unique_ptr<::parquet::PageReader> NextChunk() override { std::unique_ptr<::parquet::PageReader> result; if (next_row_group_ < reader_->metadata()->num_row_groups()) { result = reader_->RowGroup(next_row_group_)->GetColumnPageReader(column_index_); next_row_group_++; } else { result = nullptr; } return result; } private: int next_row_group_; }; class SingleRowGroupIterator : public FileColumnIterator { public: explicit SingleRowGroupIterator(int column_index, int row_group_number, ParquetFileReader* reader) : FileColumnIterator(column_index, reader), row_group_number_(row_group_number), done_(false) {} std::unique_ptr<::parquet::PageReader> NextChunk() override { if (done_) { return nullptr; } auto result = reader_->RowGroup(row_group_number_)->GetColumnPageReader(column_index_); done_ = true; return result; } private: int row_group_number_; bool done_; }; // ---------------------------------------------------------------------- // File reader implementation class FileReader::Impl { public: Impl(MemoryPool* pool, std::unique_ptr reader) : pool_(pool), reader_(std::move(reader)), num_threads_(1) {} virtual ~Impl() {} Status GetColumn(int i, std::unique_ptr* out); Status ReadSchemaField(int i, std::shared_ptr* out); Status ReadSchemaField(int i, const std::vector& indices, std::shared_ptr* out); Status GetReaderForNode(int index, const Node* node, const std::vector& indices, int16_t def_level, std::unique_ptr* out); Status ReadColumn(int i, std::shared_ptr* out); Status ReadColumnChunk(int column_index, int row_group_index, std::shared_ptr* out); Status GetSchema(std::shared_ptr<::arrow::Schema>* out); Status GetSchema(const std::vector& indices, std::shared_ptr<::arrow::Schema>* out); Status ReadRowGroup(int row_group_index, const std::vector& indices, std::shared_ptr<::arrow::Table>* out); Status ReadTable(const std::vector& indices, std::shared_ptr* table); Status ReadTable(std::shared_ptr

* table); Status ReadRowGroup(int i, std::shared_ptr

* table); bool CheckForFlatColumn(const ColumnDescriptor* descr); bool CheckForFlatListColumn(const ColumnDescriptor* descr); const ParquetFileReader* parquet_reader() const { return reader_.get(); } int num_row_groups() const { return reader_->metadata()->num_row_groups(); } void set_num_threads(int num_threads) { num_threads_ = num_threads; } ParquetFileReader* reader() { return reader_.get(); } private: MemoryPool* pool_; std::unique_ptr reader_; int num_threads_; }; class ColumnReader::ColumnReaderImpl { public: virtual ~ColumnReaderImpl() {} virtual Status NextBatch(int64_t records_to_read, std::shared_ptr* out) = 0; virtual Status GetDefLevels(const int16_t** data, size_t* length) = 0; virtual Status GetRepLevels(const int16_t** data, size_t* length) = 0; virtual const std::shared_ptr field() = 0; }; // Reader implementation for primitive arrays class PARQUET_NO_EXPORT PrimitiveImpl : public ColumnReader::ColumnReaderImpl { public: PrimitiveImpl(MemoryPool* pool, std::unique_ptr input) : pool_(pool), input_(std::move(input)), descr_(input_->descr()) { record_reader_ = RecordReader::Make(descr_, pool_); DCHECK(NodeToField(*input_->descr()->schema_node(), &field_).ok()); NextRowGroup(); } Status NextBatch(int64_t records_to_read, std::shared_ptr* out) override; template Status WrapIntoListArray(std::shared_ptr* array); Status GetDefLevels(const int16_t** data, size_t* length) override; Status GetRepLevels(const int16_t** data, size_t* length) override; const std::shared_ptr field() override { return field_; } private: void NextRowGroup(); MemoryPool* pool_; std::unique_ptr input_; const ColumnDescriptor* descr_; std::shared_ptr record_reader_; std::shared_ptr field_; }; // Reader implementation for struct array class PARQUET_NO_EXPORT StructImpl : public ColumnReader::ColumnReaderImpl { public: explicit StructImpl(const std::vector>& children, int16_t struct_def_level, MemoryPool* pool, const Node* node) : children_(children), struct_def_level_(struct_def_level), pool_(pool), def_levels_buffer_(pool) { InitField(node, children); } Status NextBatch(int64_t records_to_read, std::shared_ptr* out) override; Status GetDefLevels(const int16_t** data, size_t* length) override; Status GetRepLevels(const int16_t** data, size_t* length) override; const std::shared_ptr field() override { return field_; } private: std::vector> children_; int16_t struct_def_level_; MemoryPool* pool_; std::shared_ptr field_; PoolBuffer def_levels_buffer_; Status DefLevelsToNullArray(std::shared_ptr* null_bitmap, int64_t* null_count); void InitField(const Node* node, const std::vector>& children); }; FileReader::FileReader(MemoryPool* pool, std::unique_ptr reader) : impl_(new FileReader::Impl(pool, std::move(reader))) {} FileReader::~FileReader() {} Status FileReader::Impl::GetColumn(int i, std::unique_ptr* out) { std::unique_ptr input(new AllRowGroupsIterator(i, reader_.get())); std::unique_ptr impl( new PrimitiveImpl(pool_, std::move(input))); *out = std::unique_ptr(new ColumnReader(std::move(impl))); return Status::OK(); } Status FileReader::Impl::GetReaderForNode( int index, const Node* node, const std::vector& indices, int16_t def_level, std::unique_ptr* out) { *out = nullptr; if (IsSimpleStruct(node)) { const schema::GroupNode* group = static_cast(node); std::vector> children; for (int i = 0; i < group->field_count(); i++) { std::unique_ptr child_reader; // TODO(itaiin): Remove the -1 index hack when all types of nested reads // are supported. This currently just signals the lower level reader resolution // to abort RETURN_NOT_OK(GetReaderForNode(index, group->field(i).get(), indices, static_cast(def_level + 1), &child_reader)); if (child_reader != nullptr) { children.push_back(std::move(child_reader)); } } if (children.size() > 0) { *out = std::unique_ptr( new StructImpl(children, def_level, pool_, node)); } } else { // This should be a flat field case - translate the field index to // the correct column index by walking down to the leaf node const Node* walker = node; while (!walker->is_primitive()) { DCHECK(walker->is_group()); auto group = static_cast(walker); if (group->field_count() != 1) { return Status::NotImplemented("lists with structs are not supported."); } walker = group->field(0).get(); } auto column_index = reader_->metadata()->schema()->ColumnIndex(*walker); // If the index of the column is found then a reader for the coliumn is needed. // Otherwise *out keeps the nullptr value. if (std::find(indices.begin(), indices.end(), column_index) != indices.end()) { std::unique_ptr reader; RETURN_NOT_OK(GetColumn(column_index, &reader)); *out = std::move(reader->impl_); } } return Status::OK(); } Status FileReader::Impl::ReadSchemaField(int i, std::shared_ptr* out) { std::vector indices(reader_->metadata()->num_columns()); for (size_t j = 0; j < indices.size(); ++j) { indices[j] = static_cast(j); } return ReadSchemaField(i, indices, out); } Status FileReader::Impl::ReadSchemaField(int i, const std::vector& indices, std::shared_ptr* out) { auto parquet_schema = reader_->metadata()->schema(); auto node = parquet_schema->group_node()->field(i).get(); std::unique_ptr reader_impl; RETURN_NOT_OK(GetReaderForNode(i, node, indices, 1, &reader_impl)); if (reader_impl == nullptr) { *out = nullptr; return Status::OK(); } std::unique_ptr reader(new ColumnReader(std::move(reader_impl))); // TODO(wesm): This calculation doesn't make much sense when we have repeated // schema nodes int64_t records_to_read = 0; const FileMetaData& metadata = *reader_->metadata(); for (int j = 0; j < metadata.num_row_groups(); j++) { records_to_read += metadata.RowGroup(j)->ColumnChunk(i)->num_values(); } return reader->NextBatch(records_to_read, out); } Status FileReader::Impl::ReadColumn(int i, std::shared_ptr* out) { std::unique_ptr flat_column_reader; RETURN_NOT_OK(GetColumn(i, &flat_column_reader)); int64_t records_to_read = 0; for (int j = 0; j < reader_->metadata()->num_row_groups(); j++) { records_to_read += reader_->metadata()->RowGroup(j)->ColumnChunk(i)->num_values(); } return flat_column_reader->NextBatch(records_to_read, out); } Status FileReader::Impl::GetSchema(const std::vector& indices, std::shared_ptr<::arrow::Schema>* out) { auto descr = reader_->metadata()->schema(); auto parquet_key_value_metadata = reader_->metadata()->key_value_metadata(); return FromParquetSchema(descr, indices, parquet_key_value_metadata, out); } Status FileReader::Impl::ReadColumnChunk(int column_index, int row_group_index, std::shared_ptr* out) { auto rg_metadata = reader_->metadata()->RowGroup(row_group_index); int64_t records_to_read = rg_metadata->ColumnChunk(column_index)->num_values(); std::unique_ptr input( new SingleRowGroupIterator(column_index, row_group_index, reader_.get())); std::unique_ptr impl( new PrimitiveImpl(pool_, std::move(input))); ColumnReader flat_column_reader(std::move(impl)); std::shared_ptr array; RETURN_NOT_OK(flat_column_reader.NextBatch(records_to_read, &array)); *out = array; return Status::OK(); } Status FileReader::Impl::ReadRowGroup(int row_group_index, const std::vector& indices, std::shared_ptr<::arrow::Table>* out) { std::shared_ptr<::arrow::Schema> schema; RETURN_NOT_OK(GetSchema(indices, &schema)); auto rg_metadata = reader_->metadata()->RowGroup(row_group_index); int num_columns = static_cast(indices.size()); int nthreads = std::min(num_threads_, num_columns); std::vector> columns(num_columns); // TODO(wesm): Refactor to share more code with ReadTable auto ReadColumnFunc = [&indices, &row_group_index, &schema, &columns, this](int i) { int column_index = indices[i]; std::shared_ptr array; RETURN_NOT_OK(ReadColumnChunk(column_index, row_group_index, &array)); columns[i] = std::make_shared(schema->field(i), array); return Status::OK(); }; if (nthreads == 1) { for (int i = 0; i < num_columns; i++) { RETURN_NOT_OK(ReadColumnFunc(i)); } } else { RETURN_NOT_OK(ParallelFor(nthreads, num_columns, ReadColumnFunc)); } *out = Table::Make(schema, columns); return Status::OK(); } Status FileReader::Impl::ReadTable(const std::vector& indices, std::shared_ptr

* table) { std::shared_ptr<::arrow::Schema> schema; RETURN_NOT_OK(GetSchema(indices, &schema)); // We only need to read schema fields which have columns indicated // in the indices vector std::vector field_indices; if (!ColumnIndicesToFieldIndices(*reader_->metadata()->schema(), indices, &field_indices)) { return Status::Invalid("Invalid column index"); } std::vector> columns(field_indices.size()); auto ReadColumnFunc = [&indices, &field_indices, &schema, &columns, this](int i) { std::shared_ptr array; RETURN_NOT_OK(ReadSchemaField(field_indices[i], indices, &array)); columns[i] = std::make_shared(schema->field(i), array); return Status::OK(); }; int num_fields = static_cast(field_indices.size()); int nthreads = std::min(num_threads_, num_fields); if (nthreads == 1) { for (int i = 0; i < num_fields; i++) { RETURN_NOT_OK(ReadColumnFunc(i)); } } else { RETURN_NOT_OK(ParallelFor(nthreads, num_fields, ReadColumnFunc)); } *table = Table::Make(schema, columns); return Status::OK(); } Status FileReader::Impl::ReadTable(std::shared_ptr

* table) { std::vector indices(reader_->metadata()->num_columns()); for (size_t i = 0; i < indices.size(); ++i) { indices[i] = static_cast(i); } return ReadTable(indices, table); } Status FileReader::Impl::ReadRowGroup(int i, std::shared_ptr

* table) { std::vector indices(reader_->metadata()->num_columns()); for (size_t i = 0; i < indices.size(); ++i) { indices[i] = static_cast(i); } return ReadRowGroup(i, indices, table); } // Static ctor Status OpenFile(const std::shared_ptr<::arrow::io::ReadableFileInterface>& file, MemoryPool* allocator, const ReaderProperties& props, const std::shared_ptr& metadata, std::unique_ptr* reader) { std::unique_ptr io_wrapper(new ArrowInputFile(file)); std::unique_ptr pq_reader; PARQUET_CATCH_NOT_OK(pq_reader = ParquetReader::Open(std::move(io_wrapper), props, metadata)); reader->reset(new FileReader(allocator, std::move(pq_reader))); return Status::OK(); } Status OpenFile(const std::shared_ptr<::arrow::io::ReadableFileInterface>& file, MemoryPool* allocator, std::unique_ptr* reader) { return OpenFile(file, allocator, ::parquet::default_reader_properties(), nullptr, reader); } Status FileReader::GetColumn(int i, std::unique_ptr* out) { return impl_->GetColumn(i, out); } Status FileReader::ReadColumn(int i, std::shared_ptr* out) { try { return impl_->ReadColumn(i, out); } catch (const ::parquet::ParquetException& e) { return ::arrow::Status::IOError(e.what()); } } Status FileReader::ReadSchemaField(int i, std::shared_ptr* out) { try { return impl_->ReadSchemaField(i, out); } catch (const ::parquet::ParquetException& e) { return ::arrow::Status::IOError(e.what()); } } Status FileReader::ReadTable(std::shared_ptr

* out) { try { return impl_->ReadTable(out); } catch (const ::parquet::ParquetException& e) { return ::arrow::Status::IOError(e.what()); } } Status FileReader::ReadTable(const std::vector& indices, std::shared_ptr

* out) { try { return impl_->ReadTable(indices, out); } catch (const ::parquet::ParquetException& e) { return ::arrow::Status::IOError(e.what()); } } Status FileReader::ReadRowGroup(int i, std::shared_ptr

* out) { try { return impl_->ReadRowGroup(i, out); } catch (const ::parquet::ParquetException& e) { return ::arrow::Status::IOError(e.what()); } } Status FileReader::ReadRowGroup(int i, const std::vector& indices, std::shared_ptr

* out) { try { return impl_->ReadRowGroup(i, indices, out); } catch (const ::parquet::ParquetException& e) { return ::arrow::Status::IOError(e.what()); } } std::shared_ptr FileReader::RowGroup(int row_group_index) { return std::shared_ptr( new RowGroupReader(impl_.get(), row_group_index)); } int FileReader::num_row_groups() const { return impl_->num_row_groups(); } void FileReader::set_num_threads(int num_threads) { impl_->set_num_threads(num_threads); } Status FileReader::ScanContents(std::vector columns, const int32_t column_batch_size, int64_t* num_rows) { try { *num_rows = ScanFileContents(columns, column_batch_size, impl_->reader()); return Status::OK(); } catch (const ::parquet::ParquetException& e) { return Status::IOError(e.what()); } } const ParquetFileReader* FileReader::parquet_reader() const { return impl_->parquet_reader(); } template Status PrimitiveImpl::WrapIntoListArray(std::shared_ptr* array) { const int16_t* def_levels = record_reader_->def_levels(); const int16_t* rep_levels = record_reader_->rep_levels(); const int64_t total_levels_read = record_reader_->levels_position(); std::shared_ptr<::arrow::Schema> arrow_schema; RETURN_NOT_OK(FromParquetSchema(input_->schema(), {input_->column_index()}, input_->metadata()->key_value_metadata(), &arrow_schema)); std::shared_ptr current_field = arrow_schema->field(0); if (descr_->max_repetition_level() > 0) { // Walk downwards to extract nullability std::vector nullable; std::vector> offset_builders; std::vector> valid_bits_builders; nullable.push_back(current_field->nullable()); while (current_field->type()->num_children() > 0) { if (current_field->type()->num_children() > 1) { return Status::NotImplemented( "Fields with more than one child are not supported."); } else { if (current_field->type()->id() != ::arrow::Type::LIST) { return Status::NotImplemented( "Currently only nesting with Lists is supported."); } current_field = current_field->type()->child(0); } offset_builders.emplace_back( std::make_shared<::arrow::Int32Builder>(::arrow::int32(), pool_)); valid_bits_builders.emplace_back( std::make_shared<::arrow::BooleanBuilder>(::arrow::boolean(), pool_)); nullable.push_back(current_field->nullable()); } int64_t list_depth = offset_builders.size(); // This describes the minimal definition that describes a level that // reflects a value in the primitive values array. int16_t values_def_level = descr_->max_definition_level(); if (nullable[nullable.size() - 1]) { values_def_level--; } // The definition levels that are needed so that a list is declared // as empty and not null. std::vector empty_def_level(list_depth); int def_level = 0; for (int i = 0; i < list_depth; i++) { if (nullable[i]) { def_level++; } empty_def_level[i] = static_cast(def_level); def_level++; } int32_t values_offset = 0; std::vector null_counts(list_depth, 0); for (int64_t i = 0; i < total_levels_read; i++) { int16_t rep_level = rep_levels[i]; if (rep_level < descr_->max_repetition_level()) { for (int64_t j = rep_level; j < list_depth; j++) { if (j == (list_depth - 1)) { RETURN_NOT_OK(offset_builders[j]->Append(values_offset)); } else { RETURN_NOT_OK(offset_builders[j]->Append( static_cast(offset_builders[j + 1]->length()))); } if (((empty_def_level[j] - 1) == def_levels[i]) && (nullable[j])) { RETURN_NOT_OK(valid_bits_builders[j]->Append(false)); null_counts[j]++; break; } else { RETURN_NOT_OK(valid_bits_builders[j]->Append(true)); if (empty_def_level[j] == def_levels[i]) { break; } } } } if (def_levels[i] >= values_def_level) { values_offset++; } } // Add the final offset to all lists for (int64_t j = 0; j < list_depth; j++) { if (j == (list_depth - 1)) { RETURN_NOT_OK(offset_builders[j]->Append(values_offset)); } else { RETURN_NOT_OK(offset_builders[j]->Append( static_cast(offset_builders[j + 1]->length()))); } } std::vector> offsets; std::vector> valid_bits; std::vector list_lengths; for (int64_t j = 0; j < list_depth; j++) { list_lengths.push_back(offset_builders[j]->length() - 1); std::shared_ptr array; RETURN_NOT_OK(offset_builders[j]->Finish(&array)); offsets.emplace_back(std::static_pointer_cast(array)->values()); RETURN_NOT_OK(valid_bits_builders[j]->Finish(&array)); valid_bits.emplace_back(std::static_pointer_cast(array)->values()); } std::shared_ptr output(*array); for (int64_t j = list_depth - 1; j >= 0; j--) { auto list_type = ::arrow::list(::arrow::field("item", output->type(), nullable[j + 1])); output = std::make_shared<::arrow::ListArray>( list_type, list_lengths[j], offsets[j], output, valid_bits[j], null_counts[j]); } *array = output; } return Status::OK(); } template struct supports_fast_path_impl { using ArrowCType = typename ArrowType::c_type; using ParquetCType = typename ParquetType::c_type; static constexpr bool value = std::is_same::value; }; template struct supports_fast_path_impl { static constexpr bool value = false; }; template struct supports_fast_path_impl { static constexpr bool value = false; }; template using supports_fast_path = typename std::enable_if::value>::type; template struct TransferFunctor { using ArrowCType = typename ArrowType::c_type; using ParquetCType = typename ParquetType::c_type; Status operator()(RecordReader* reader, MemoryPool* pool, const std::shared_ptr<::arrow::DataType>& type, std::shared_ptr* out) { static_assert(!std::is_same::value, "The fast path transfer functor should be used " "for primitive values"); int64_t length = reader->values_written(); std::shared_ptr data; RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * sizeof(ArrowCType), &data)); auto values = reinterpret_cast(reader->values()); auto out_ptr = reinterpret_cast(data->mutable_data()); std::copy(values, values + length, out_ptr); if (reader->nullable_values()) { std::shared_ptr is_valid = reader->ReleaseIsValid(); *out = std::make_shared>(type, length, data, is_valid, reader->null_count()); } else { *out = std::make_shared>(type, length, data); } return Status::OK(); } }; template struct TransferFunctor> { Status operator()(RecordReader* reader, MemoryPool* pool, const std::shared_ptr<::arrow::DataType>& type, std::shared_ptr* out) { int64_t length = reader->values_written(); std::shared_ptr values = reader->ReleaseValues(); if (reader->nullable_values()) { std::shared_ptr is_valid = reader->ReleaseIsValid(); *out = std::make_shared>(type, length, values, is_valid, reader->null_count()); } else { *out = std::make_shared>(type, length, values); } return Status::OK(); } }; template <> struct TransferFunctor<::arrow::BooleanType, BooleanType> { Status operator()(RecordReader* reader, MemoryPool* pool, const std::shared_ptr<::arrow::DataType>& type, std::shared_ptr* out) { int64_t length = reader->values_written(); std::shared_ptr data; const int64_t buffer_size = BytesForBits(length); RETURN_NOT_OK(::arrow::AllocateBuffer(pool, buffer_size, &data)); // Transfer boolean values to packed bitmap auto values = reinterpret_cast(reader->values()); uint8_t* data_ptr = data->mutable_data(); memset(data_ptr, 0, buffer_size); for (int64_t i = 0; i < length; i++) { if (values[i]) { ::arrow::BitUtil::SetBit(data_ptr, i); } } if (reader->nullable_values()) { std::shared_ptr is_valid = reader->ReleaseIsValid(); RETURN_NOT_OK(is_valid->Resize(BytesForBits(length), false)); *out = std::make_shared(type, length, data, is_valid, reader->null_count()); } else { *out = std::make_shared(type, length, data); } return Status::OK(); } }; template <> struct TransferFunctor<::arrow::TimestampType, Int96Type> { Status operator()(RecordReader* reader, MemoryPool* pool, const std::shared_ptr<::arrow::DataType>& type, std::shared_ptr* out) { int64_t length = reader->values_written(); auto values = reinterpret_cast(reader->values()); std::shared_ptr data; RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * sizeof(int64_t), &data)); auto data_ptr = reinterpret_cast(data->mutable_data()); for (int64_t i = 0; i < length; i++) { *data_ptr++ = impala_timestamp_to_nanoseconds(values[i]); } if (reader->nullable_values()) { std::shared_ptr is_valid = reader->ReleaseIsValid(); *out = std::make_shared(type, length, data, is_valid, reader->null_count()); } else { *out = std::make_shared(type, length, data); } return Status::OK(); } }; template <> struct TransferFunctor<::arrow::Date64Type, Int32Type> { Status operator()(RecordReader* reader, MemoryPool* pool, const std::shared_ptr<::arrow::DataType>& type, std::shared_ptr* out) { int64_t length = reader->values_written(); auto values = reinterpret_cast(reader->values()); std::shared_ptr data; RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * sizeof(int64_t), &data)); auto out_ptr = reinterpret_cast(data->mutable_data()); for (int64_t i = 0; i < length; i++) { *out_ptr++ = static_cast(values[i]) * kMillisecondsInADay; } if (reader->nullable_values()) { std::shared_ptr is_valid = reader->ReleaseIsValid(); *out = std::make_shared<::arrow::Date64Array>(type, length, data, is_valid, reader->null_count()); } else { *out = std::make_shared<::arrow::Date64Array>(type, length, data); } return Status::OK(); } }; template struct TransferFunctor< ArrowType, ParquetType, typename std::enable_if::value || std::is_same::value>::type> { Status operator()(RecordReader* reader, MemoryPool* pool, const std::shared_ptr<::arrow::DataType>& type, std::shared_ptr* out) { RETURN_NOT_OK(reader->builder()->Finish(out)); if (type->id() == ::arrow::Type::STRING) { // Convert from BINARY type to STRING auto new_data = (*out)->data()->Copy(); new_data->type = type; *out = ::arrow::MakeArray(new_data); } return Status::OK(); } }; static uint64_t BytesToInteger(const uint8_t* bytes, int32_t start, int32_t stop) { using ::arrow::BitUtil::FromBigEndian; const int32_t length = stop - start; DCHECK_GE(length, 0); DCHECK_LE(length, 8); switch (length) { case 0: return 0; case 1: return bytes[start]; case 2: return FromBigEndian(*reinterpret_cast(bytes + start)); case 3: { const uint64_t first_two_bytes = FromBigEndian(*reinterpret_cast(bytes + start)); const uint64_t last_byte = bytes[stop - 1]; return first_two_bytes << 8 | last_byte; } case 4: return FromBigEndian(*reinterpret_cast(bytes + start)); case 5: { const uint64_t first_four_bytes = FromBigEndian(*reinterpret_cast(bytes + start)); const uint64_t last_byte = bytes[stop - 1]; return first_four_bytes << 8 | last_byte; } case 6: { const uint64_t first_four_bytes = FromBigEndian(*reinterpret_cast(bytes + start)); const uint64_t last_two_bytes = FromBigEndian(*reinterpret_cast(bytes + start + 4)); return first_four_bytes << 16 | last_two_bytes; } case 7: { const uint64_t first_four_bytes = FromBigEndian(*reinterpret_cast(bytes + start)); const uint64_t second_two_bytes = FromBigEndian(*reinterpret_cast(bytes + start + 4)); const uint64_t last_byte = bytes[stop - 1]; return first_four_bytes << 24 | second_two_bytes << 8 | last_byte; } case 8: return FromBigEndian(*reinterpret_cast(bytes + start)); default: { DCHECK(false); return UINT64_MAX; } } } static constexpr int32_t kMinDecimalBytes = 1; static constexpr int32_t kMaxDecimalBytes = 16; /// \brief Convert a sequence of big-endian bytes to one int64_t (high bits) and one /// uint64_t (low bits). static void BytesToIntegerPair(const uint8_t* bytes, const int32_t total_number_of_bytes_used, int64_t* high, uint64_t* low) { DCHECK_GE(total_number_of_bytes_used, kMinDecimalBytes); DCHECK_LE(total_number_of_bytes_used, kMaxDecimalBytes); /// Bytes are coming in big-endian, so the first byte is the MSB and therefore holds the /// sign bit. const bool is_negative = static_cast(bytes[0]) < 0; /// Sign extend the low bits if necessary *low = UINT64_MAX * (is_negative && total_number_of_bytes_used < 8); *high = -1 * (is_negative && total_number_of_bytes_used < kMaxDecimalBytes); /// Stop byte of the high bytes const int32_t high_bits_offset = std::max(0, total_number_of_bytes_used - 8); /// Shift left enough bits to make room for the incoming int64_t *high <<= high_bits_offset * CHAR_BIT; /// Preserve the upper bits by inplace OR-ing the int64_t *high |= BytesToInteger(bytes, 0, high_bits_offset); /// Stop byte of the low bytes const int32_t low_bits_offset = std::min(total_number_of_bytes_used, 8); /// Shift left enough bits to make room for the incoming uint64_t *low <<= low_bits_offset * CHAR_BIT; /// Preserve the upper bits by inplace OR-ing the uint64_t *low |= BytesToInteger(bytes, high_bits_offset, total_number_of_bytes_used); } static inline void RawBytesToDecimalBytes(const uint8_t* value, int32_t byte_width, uint8_t* out_buf) { // view the first 8 bytes as an unsigned 64-bit integer auto low = reinterpret_cast(out_buf); // view the second 8 bytes as a signed 64-bit integer auto high = reinterpret_cast(out_buf + sizeof(uint64_t)); // Convert the fixed size binary array bytes into a Decimal128 compatible layout BytesToIntegerPair(value, byte_width, high, low); } /// \brief Convert an array of FixedLenByteArrays to an arrow::Decimal128Array /// We do this by: /// 1. Creating a arrow::FixedSizeBinaryArray from the RecordReader's builder /// 2. Allocating a buffer for the arrow::Decimal128Array /// 3. Converting the big-endian bytes in the FixedSizeBinaryArray to two integers /// representing the high and low bits of each decimal value. template <> struct TransferFunctor<::arrow::Decimal128Type, FLBAType> { Status operator()(RecordReader* reader, MemoryPool* pool, const std::shared_ptr<::arrow::DataType>& type, std::shared_ptr* out) { DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL); // Finish the built data into a temporary array std::shared_ptr array; RETURN_NOT_OK(reader->builder()->Finish(&array)); const auto& fixed_size_binary_array = static_cast(*array); // Get the byte width of the values in the FixedSizeBinaryArray. Most of the time // this will be different from the decimal array width because we write the minimum // number of bytes necessary to represent a given precision const int32_t byte_width = static_cast(*fixed_size_binary_array.type()) .byte_width(); // The byte width of each decimal value const int32_t type_length = static_cast(*type).byte_width(); // number of elements in the entire array const int64_t length = fixed_size_binary_array.length(); // allocate memory for the decimal array std::shared_ptr data; RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * type_length, &data)); // raw bytes that we can write to uint8_t* out_ptr = data->mutable_data(); // convert each FixedSizeBinary value to valid decimal bytes const int64_t null_count = fixed_size_binary_array.null_count(); if (null_count > 0) { for (int64_t i = 0; i < length; ++i, out_ptr += type_length) { if (!fixed_size_binary_array.IsNull(i)) { RawBytesToDecimalBytes(fixed_size_binary_array.GetValue(i), byte_width, out_ptr); } } } else { for (int64_t i = 0; i < length; ++i, out_ptr += type_length) { RawBytesToDecimalBytes(fixed_size_binary_array.GetValue(i), byte_width, out_ptr); } } *out = std::make_shared<::arrow::Decimal128Array>( type, length, data, fixed_size_binary_array.null_bitmap(), null_count); return Status::OK(); } }; /// \brief Convert an Int32 or Int64 array into a Decimal128Array /// The parquet spec allows systems to write decimals in int32, int64 if the values are /// small enough to fit in less 4 bytes or less than 8 bytes, respectively. /// This function implements the conversion from int32 and int64 arrays to decimal arrays. template ::value || std::is_same::value>::type> static Status DecimalIntegerTransfer(RecordReader* reader, MemoryPool* pool, const std::shared_ptr<::arrow::DataType>& type, std::shared_ptr* out) { DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL); const int64_t length = reader->values_written(); using ElementType = typename ParquetIntegerType::c_type; static_assert(std::is_same::value || std::is_same::value, "ElementType must be int32_t or int64_t"); const auto values = reinterpret_cast(reader->values()); const auto& decimal_type = static_cast(*type); const int64_t type_length = decimal_type.byte_width(); std::shared_ptr data; RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * type_length, &data)); uint8_t* out_ptr = data->mutable_data(); using ::arrow::BitUtil::FromLittleEndian; for (int64_t i = 0; i < length; ++i, out_ptr += type_length) { // sign/zero extend int32_t values, otherwise a no-op const auto value = static_cast(values[i]); auto out_ptr_view = reinterpret_cast(out_ptr); // No-op on little endian machines, byteswap on big endian out_ptr_view[0] = FromLittleEndian(static_cast(value)); // no need to byteswap here because we're sign/zero extending exactly 8 bytes out_ptr_view[1] = static_cast(value < 0 ? -1 : 0); } if (reader->nullable_values()) { std::shared_ptr is_valid = reader->ReleaseIsValid(); *out = std::make_shared<::arrow::Decimal128Array>(type, length, data, is_valid, reader->null_count()); } else { *out = std::make_shared<::arrow::Decimal128Array>(type, length, data); } return Status::OK(); } template <> struct TransferFunctor<::arrow::Decimal128Type, Int32Type> { Status operator()(RecordReader* reader, MemoryPool* pool, const std::shared_ptr<::arrow::DataType>& type, std::shared_ptr* out) { return DecimalIntegerTransfer(reader, pool, type, out); } }; template <> struct TransferFunctor<::arrow::Decimal128Type, Int64Type> { Status operator()(RecordReader* reader, MemoryPool* pool, const std::shared_ptr<::arrow::DataType>& type, std::shared_ptr* out) { return DecimalIntegerTransfer(reader, pool, type, out); } }; #define TRANSFER_DATA(ArrowType, ParquetType) \ TransferFunctor func; \ RETURN_NOT_OK(func(record_reader_.get(), pool_, field_->type(), out)); \ RETURN_NOT_OK(WrapIntoListArray(out)) #define TRANSFER_CASE(ENUM, ArrowType, ParquetType) \ case ::arrow::Type::ENUM: { \ TRANSFER_DATA(ArrowType, ParquetType); \ } break; Status PrimitiveImpl::NextBatch(int64_t records_to_read, std::shared_ptr* out) { if (!record_reader_->HasMoreData()) { // Exhausted all row groups. *out = nullptr; return Status::OK(); } if (field_->type()->id() == ::arrow::Type::NA) { *out = std::make_shared<::arrow::NullArray>(records_to_read); return Status::OK(); } try { // Pre-allocation gives much better performance for flat columns record_reader_->Reserve(records_to_read); record_reader_->Reset(); while (records_to_read > 0) { if (!record_reader_->HasMoreData()) { break; } int64_t records_read = record_reader_->ReadRecords(records_to_read); records_to_read -= records_read; if (records_read == 0) { NextRowGroup(); } } } catch (const ::parquet::ParquetException& e) { return ::arrow::Status::IOError(e.what()); } switch (field_->type()->id()) { TRANSFER_CASE(BOOL, ::arrow::BooleanType, BooleanType) TRANSFER_CASE(UINT8, ::arrow::UInt8Type, Int32Type) TRANSFER_CASE(INT8, ::arrow::Int8Type, Int32Type) TRANSFER_CASE(UINT16, ::arrow::UInt16Type, Int32Type) TRANSFER_CASE(INT16, ::arrow::Int16Type, Int32Type) TRANSFER_CASE(UINT32, ::arrow::UInt32Type, Int32Type) TRANSFER_CASE(INT32, ::arrow::Int32Type, Int32Type) TRANSFER_CASE(UINT64, ::arrow::UInt64Type, Int64Type) TRANSFER_CASE(INT64, ::arrow::Int64Type, Int64Type) TRANSFER_CASE(FLOAT, ::arrow::FloatType, FloatType) TRANSFER_CASE(DOUBLE, ::arrow::DoubleType, DoubleType) TRANSFER_CASE(STRING, ::arrow::StringType, ByteArrayType) TRANSFER_CASE(BINARY, ::arrow::BinaryType, ByteArrayType) TRANSFER_CASE(DATE32, ::arrow::Date32Type, Int32Type) TRANSFER_CASE(DATE64, ::arrow::Date64Type, Int32Type) TRANSFER_CASE(FIXED_SIZE_BINARY, ::arrow::FixedSizeBinaryType, FLBAType) case ::arrow::Type::DECIMAL: { switch (descr_->physical_type()) { case ::parquet::Type::INT32: { TRANSFER_DATA(::arrow::Decimal128Type, Int32Type); } break; case ::parquet::Type::INT64: { TRANSFER_DATA(::arrow::Decimal128Type, Int64Type); } break; case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: { TRANSFER_DATA(::arrow::Decimal128Type, FLBAType); } break; default: return Status::Invalid( "Physical type for decimal must be int32, int64, or fixed length binary"); } } break; case ::arrow::Type::TIMESTAMP: { ::arrow::TimestampType* timestamp_type = static_cast<::arrow::TimestampType*>(field_->type().get()); switch (timestamp_type->unit()) { case ::arrow::TimeUnit::MILLI: case ::arrow::TimeUnit::MICRO: { TRANSFER_DATA(::arrow::TimestampType, Int64Type); } break; case ::arrow::TimeUnit::NANO: { TRANSFER_DATA(::arrow::TimestampType, Int96Type); } break; default: return Status::NotImplemented("TimeUnit not supported"); } } break; TRANSFER_CASE(TIME32, ::arrow::Time32Type, Int32Type) TRANSFER_CASE(TIME64, ::arrow::Time64Type, Int64Type) default: std::stringstream ss; ss << "No support for reading columns of type " << field_->type()->ToString(); return Status::NotImplemented(ss.str()); } return Status::OK(); } void PrimitiveImpl::NextRowGroup() { std::unique_ptr page_reader = input_->NextChunk(); record_reader_->SetPageReader(std::move(page_reader)); } Status PrimitiveImpl::GetDefLevels(const int16_t** data, size_t* length) { *data = record_reader_->def_levels(); *length = record_reader_->levels_written(); return Status::OK(); } Status PrimitiveImpl::GetRepLevels(const int16_t** data, size_t* length) { *data = record_reader_->rep_levels(); *length = record_reader_->levels_written(); return Status::OK(); } ColumnReader::ColumnReader(std::unique_ptr impl) : impl_(std::move(impl)) {} ColumnReader::~ColumnReader() {} Status ColumnReader::NextBatch(int64_t records_to_read, std::shared_ptr* out) { return impl_->NextBatch(records_to_read, out); } // StructImpl methods Status StructImpl::DefLevelsToNullArray(std::shared_ptr* null_bitmap_out, int64_t* null_count_out) { std::shared_ptr null_bitmap; auto null_count = 0; const int16_t* def_levels_data; size_t def_levels_length; RETURN_NOT_OK(GetDefLevels(&def_levels_data, &def_levels_length)); RETURN_NOT_OK(GetEmptyBitmap(pool_, def_levels_length, &null_bitmap)); uint8_t* null_bitmap_ptr = null_bitmap->mutable_data(); for (size_t i = 0; i < def_levels_length; i++) { if (def_levels_data[i] < struct_def_level_) { // Mark null null_count += 1; } else { DCHECK_EQ(def_levels_data[i], struct_def_level_); ::arrow::BitUtil::SetBit(null_bitmap_ptr, i); } } *null_count_out = null_count; *null_bitmap_out = (null_count == 0) ? nullptr : null_bitmap; return Status::OK(); } // TODO(itaiin): Consider caching the results of this calculation - // note that this is only used once for each read for now Status StructImpl::GetDefLevels(const int16_t** data, size_t* length) { *data = nullptr; if (children_.size() == 0) { // Empty struct *length = 0; return Status::OK(); } // We have at least one child const int16_t* child_def_levels; size_t child_length; RETURN_NOT_OK(children_[0]->GetDefLevels(&child_def_levels, &child_length)); auto size = child_length * sizeof(int16_t); RETURN_NOT_OK(def_levels_buffer_.Resize(size)); // Initialize with the minimal def level std::memset(def_levels_buffer_.mutable_data(), -1, size); auto result_levels = reinterpret_cast(def_levels_buffer_.mutable_data()); // When a struct is defined, all of its children def levels are at least at // nesting level, and def level equals nesting level. // When a struct is not defined, all of its children def levels are less than // the nesting level, and the def level equals max(children def levels) // All other possibilities are malformed definition data. for (auto& child : children_) { size_t current_child_length; RETURN_NOT_OK(child->GetDefLevels(&child_def_levels, ¤t_child_length)); DCHECK_EQ(child_length, current_child_length); for (size_t i = 0; i < child_length; i++) { // Check that value is either uninitialized, or current // and previous children def levels agree on the struct level DCHECK((result_levels[i] == -1) || ((result_levels[i] >= struct_def_level_) == (child_def_levels[i] >= struct_def_level_))); result_levels[i] = std::max(result_levels[i], std::min(child_def_levels[i], struct_def_level_)); } } *data = reinterpret_cast(def_levels_buffer_.data()); *length = child_length; return Status::OK(); } void StructImpl::InitField( const Node* node, const std::vector>& children) { // Make a shallow node to field conversion from the children fields std::vector> fields(children.size()); for (size_t i = 0; i < children.size(); i++) { fields[i] = children[i]->field(); } auto type = ::arrow::struct_(fields); field_ = ::arrow::field(node->name(), type); } Status StructImpl::GetRepLevels(const int16_t** data, size_t* length) { return Status::NotImplemented("GetRepLevels is not implemented for struct"); } Status StructImpl::NextBatch(int64_t records_to_read, std::shared_ptr* out) { std::vector> children_arrays; std::shared_ptr null_bitmap; int64_t null_count; // Gather children arrays and def levels for (auto& child : children_) { std::shared_ptr child_array; RETURN_NOT_OK(child->NextBatch(records_to_read, &child_array)); children_arrays.push_back(child_array); } RETURN_NOT_OK(DefLevelsToNullArray(&null_bitmap, &null_count)); int64_t struct_length = children_arrays[0]->length(); for (size_t i = 1; i < children_arrays.size(); ++i) { if (children_arrays[i]->length() != struct_length) { // TODO(wesm): This should really only occur if the Parquet file is // malformed. Should this be a DCHECK? return Status::Invalid("Struct children had different lengths"); } } *out = std::make_shared(field()->type(), struct_length, children_arrays, null_bitmap, null_count); return Status::OK(); } std::shared_ptr RowGroupReader::Column(int column_index) { return std::shared_ptr( new ColumnChunkReader(impl_, row_group_index_, column_index)); } Status RowGroupReader::ReadTable(const std::vector& column_indices, std::shared_ptr<::arrow::Table>* out) { return impl_->ReadRowGroup(row_group_index_, column_indices, out); } Status RowGroupReader::ReadTable(std::shared_ptr<::arrow::Table>* out) { return impl_->ReadRowGroup(row_group_index_, out); } RowGroupReader::~RowGroupReader() {} RowGroupReader::RowGroupReader(FileReader::Impl* impl, int row_group_index) : impl_(impl), row_group_index_(row_group_index) {} Status ColumnChunkReader::Read(std::shared_ptr<::arrow::Array>* out) { return impl_->ReadColumnChunk(column_index_, row_group_index_, out); } ColumnChunkReader::~ColumnChunkReader() {} ColumnChunkReader::ColumnChunkReader(FileReader::Impl* impl, int row_group_index, int column_index) : impl_(impl), column_index_(column_index), row_group_index_(row_group_index) {} } // namespace arrow } // namespace parquet