/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef ORC_VECTOR_HH #define ORC_VECTOR_HH #include "Int128.hh" #include "MemoryPool.hh" #include "orc/orc-config.hh" #include #include #include #include #include #include #include namespace orc { /** * The base class for each of the column vectors. This class handles * the generic attributes such as number of elements, capacity, and * notNull vector. * Note: If hasNull is false, the values in the notNull buffer are not required. * On the writer side, it does not read values from notNull buffer so users are * not expected to write notNull buffer if hasNull is false. On the reader side, * it does not set notNull buffer if hasNull is false, meaning that it is undefined * behavior to consume values from notNull buffer in this case by downstream users. */ struct ColumnVectorBatch { ColumnVectorBatch(uint64_t capacity, MemoryPool& pool); virtual ~ColumnVectorBatch(); // the number of slots available uint64_t capacity; // the number of current occupied slots uint64_t numElements; // an array of capacity length marking non-null values DataBuffer notNull; // whether there are any null values bool hasNulls; // whether the vector batch is encoded bool isEncoded; // whether the dictionary is decoded into vector batch bool dictionaryDecoded; // custom memory pool MemoryPool& memoryPool; /** * Generate a description of this vector as a string. */ virtual std::string toString() const = 0; /** * Change the number of slots to at least the given capacity. * This function is not recursive into subtypes. */ virtual void resize(uint64_t capacity); /** * Empties the vector from all its elements, recursively. * Do not alter the current capacity. */ virtual void clear(); /** * Heap memory used by the batch. */ virtual uint64_t getMemoryUsage(); /** * Check whether the batch length varies depending on data. */ virtual bool hasVariableLength(); /** * Decode possible dictionary into vector batch. */ void decodeDictionary(); protected: virtual void decodeDictionaryImpl() {} private: ColumnVectorBatch(const ColumnVectorBatch&); ColumnVectorBatch& operator=(const ColumnVectorBatch&); }; template struct IntegerVectorBatch : public ColumnVectorBatch { IntegerVectorBatch(uint64_t cap, MemoryPool& pool) : ColumnVectorBatch(cap, pool), data(pool, cap) { // PASS } ~IntegerVectorBatch() override = default; inline std::string toString() const override; void resize(uint64_t cap) override { if (capacity < cap) { ColumnVectorBatch::resize(cap); data.resize(cap); } } void clear() override { numElements = 0; } uint64_t getMemoryUsage() override { return ColumnVectorBatch::getMemoryUsage() + static_cast(data.capacity() * sizeof(ValueType)); } DataBuffer data; }; using LongVectorBatch = IntegerVectorBatch; using IntVectorBatch = IntegerVectorBatch; using ShortVectorBatch = IntegerVectorBatch; using ByteVectorBatch = IntegerVectorBatch; template <> inline std::string LongVectorBatch::toString() const { std::ostringstream buffer; buffer << "Long vector <" << numElements << " of " << capacity << ">"; return buffer.str(); } template <> inline std::string IntVectorBatch::toString() const { std::ostringstream buffer; buffer << "Int vector <" << numElements << " of " << capacity << ">"; return buffer.str(); } template <> inline std::string ShortVectorBatch::toString() const { std::ostringstream buffer; buffer << "Short vector <" << numElements << " of " << capacity << ">"; return buffer.str(); } template <> inline std::string ByteVectorBatch::toString() const { std::ostringstream buffer; buffer << "Byte vector <" << numElements << " of " << capacity << ">"; return buffer.str(); } template struct FloatingVectorBatch : public ColumnVectorBatch { FloatingVectorBatch(uint64_t cap, MemoryPool& pool) : ColumnVectorBatch(cap, pool), data(pool, cap) { // PASS } ~FloatingVectorBatch() override = default; inline std::string toString() const override; void resize(uint64_t cap) override { if (capacity < cap) { ColumnVectorBatch::resize(cap); data.resize(cap); } } void clear() override { numElements = 0; } uint64_t getMemoryUsage() override { return ColumnVectorBatch::getMemoryUsage() + static_cast(data.capacity() * sizeof(FloatType)); } DataBuffer data; }; using DoubleVectorBatch = FloatingVectorBatch; using FloatVectorBatch = FloatingVectorBatch; template <> inline std::string DoubleVectorBatch::toString() const { std::ostringstream buffer; buffer << "Double vector <" << numElements << " of " << capacity << ">"; return buffer.str(); } template <> inline std::string FloatVectorBatch::toString() const { std::ostringstream buffer; buffer << "Float vector <" << numElements << " of " << capacity << ">"; return buffer.str(); } struct StringVectorBatch : public ColumnVectorBatch { StringVectorBatch(uint64_t capacity, MemoryPool& pool); ~StringVectorBatch() override; std::string toString() const override; void resize(uint64_t capacity) override; void clear() override; uint64_t getMemoryUsage() override; // pointers to the start of each string DataBuffer data; // the length of each string DataBuffer length; // string blob DataBuffer blob; }; struct StringDictionary { StringDictionary(MemoryPool& pool); DataBuffer dictionaryBlob; // Offset for each dictionary key entry. DataBuffer dictionaryOffset; void getValueByIndex(int64_t index, char*& valPtr, int64_t& length) { if (index < 0 || static_cast(index) + 1 >= dictionaryOffset.size()) { throw std::out_of_range("index out of range."); } int64_t* offsetPtr = dictionaryOffset.data(); valPtr = dictionaryBlob.data() + offsetPtr[index]; length = offsetPtr[index + 1] - offsetPtr[index]; } }; /** * Include a index array with reference to corresponding dictionary. * User first obtain index from index array and retrieve string pointer * and length by calling getValueByIndex() from dictionary. */ struct EncodedStringVectorBatch : public StringVectorBatch { EncodedStringVectorBatch(uint64_t capacity, MemoryPool& pool); ~EncodedStringVectorBatch() override; std::string toString() const override; void resize(uint64_t capacity) override; // Calculate data and length in StringVectorBatch from dictionary and index void decodeDictionaryImpl() override; std::shared_ptr dictionary; // index for dictionary entry DataBuffer index; }; struct StructVectorBatch : public ColumnVectorBatch { StructVectorBatch(uint64_t capacity, MemoryPool& pool); ~StructVectorBatch() override; std::string toString() const override; void resize(uint64_t capacity) override; void clear() override; uint64_t getMemoryUsage() override; bool hasVariableLength() override; std::vector fields; protected: void decodeDictionaryImpl() override; }; struct ListVectorBatch : public ColumnVectorBatch { ListVectorBatch(uint64_t capacity, MemoryPool& pool); ~ListVectorBatch() override; std::string toString() const override; void resize(uint64_t capacity) override; void clear() override; uint64_t getMemoryUsage() override; bool hasVariableLength() override; /** * The offset of the first element of each list. * The length of list i is offsets[i+1] - offsets[i]. */ DataBuffer offsets; // the concatenated elements std::unique_ptr elements; protected: void decodeDictionaryImpl() override; }; struct MapVectorBatch : public ColumnVectorBatch { MapVectorBatch(uint64_t capacity, MemoryPool& pool); ~MapVectorBatch() override; std::string toString() const override; void resize(uint64_t capacity) override; void clear() override; uint64_t getMemoryUsage() override; bool hasVariableLength() override; /** * The offset of the first element of each map. * The size of map i is offsets[i+1] - offsets[i]. */ DataBuffer offsets; // the concatenated keys std::unique_ptr keys; // the concatenated elements std::unique_ptr elements; protected: void decodeDictionaryImpl() override; }; struct UnionVectorBatch : public ColumnVectorBatch { UnionVectorBatch(uint64_t capacity, MemoryPool& pool); ~UnionVectorBatch() override; std::string toString() const override; void resize(uint64_t capacity) override; void clear() override; uint64_t getMemoryUsage() override; bool hasVariableLength() override; /** * For each value, which element of children has the value. */ DataBuffer tags; /** * For each value, the index inside of the child ColumnVectorBatch. */ DataBuffer offsets; // the sub-columns std::vector children; protected: void decodeDictionaryImpl() override; }; struct Decimal { Decimal(const Int128& value, int32_t scale); explicit Decimal(const std::string& value); Decimal(); std::string toString(bool trimTrailingZeros = false) const; Int128 value; int32_t scale; }; struct Decimal64VectorBatch : public ColumnVectorBatch { Decimal64VectorBatch(uint64_t capacity, MemoryPool& pool); ~Decimal64VectorBatch() override; std::string toString() const override; void resize(uint64_t capacity) override; void clear() override; uint64_t getMemoryUsage() override; // total number of digits int32_t precision; // the number of places after the decimal int32_t scale; // the numeric values DataBuffer values; protected: /** * Contains the scales that were read from the file. Should NOT be * used. */ DataBuffer readScales; friend class Decimal64ColumnReader; friend class Decimal64ColumnWriter; }; struct Decimal128VectorBatch : public ColumnVectorBatch { Decimal128VectorBatch(uint64_t capacity, MemoryPool& pool); ~Decimal128VectorBatch() override; std::string toString() const override; void resize(uint64_t capacity) override; void clear() override; uint64_t getMemoryUsage() override; // total number of digits int32_t precision; // the number of places after the decimal int32_t scale; // the numeric values DataBuffer values; protected: /** * Contains the scales that were read from the file. Should NOT be * used. */ DataBuffer readScales; friend class Decimal128ColumnReader; friend class DecimalHive11ColumnReader; friend class Decimal128ColumnWriter; }; /** * A column vector batch for storing timestamp values. * The timestamps are stored split into the time_t value (seconds since * 1 Jan 1970 00:00:00) and the nanoseconds within the time_t value. */ struct TimestampVectorBatch : public ColumnVectorBatch { TimestampVectorBatch(uint64_t capacity, MemoryPool& pool); ~TimestampVectorBatch() override; std::string toString() const override; void resize(uint64_t capacity) override; void clear() override; uint64_t getMemoryUsage() override; // the number of seconds past 1 Jan 1970 00:00 UTC (aka time_t) // Note that we always assume data is in GMT timezone; therefore it is // user's responsibility to convert wall clock time in local timezone // to GMT. DataBuffer data; // the nanoseconds of each value DataBuffer nanoseconds; }; } // namespace orc #endif