/Users/deen/code/yugabyte-db/src/yb/rocksdb/table/plain_table_index.h

Source (jump to first uncovered line)
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under the BSD-style license found in the
//  LICENSE file in the root directory of this source tree. An additional grant
//  of patent rights can be found in the PATENTS file in the same directory.
//
// The following only applies to changes made to this file as part of YugaByte development.
//
// Portions Copyright (c) YugaByte, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
// in compliance with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied.  See the License for the specific language governing permissions and limitations
// under the License.
//

#ifndef YB_ROCKSDB_TABLE_PLAIN_TABLE_INDEX_H
#define YB_ROCKSDB_TABLE_PLAIN_TABLE_INDEX_H

#pragma once

#ifndef ROCKSDB_LITE

#include <string>
#include <vector>

#include "yb/rocksdb/db/dbformat.h"
#include "yb/rocksdb/immutable_options.h"
#include "yb/rocksdb/options.h"
#include "yb/rocksdb/util/murmurhash.h"
#include "yb/rocksdb/util/hash.h"
#include "yb/rocksdb/util/arena.h"
#include "yb/rocksdb/util/histogram.h"

#include "yb/util/status_log.h"

namespace rocksdb {

// PlainTableIndex contains buckets size of index_size_, each is a
// 32-bit integer. The lower 31 bits contain an offset value (explained below)
// and the first bit of the integer indicates type of the offset.
//
// +--------------+------------------------------------------------------+
// | Flag (1 bit) | Offset to binary search buffer or file (31 bits)     +
// +--------------+------------------------------------------------------+
//
// Explanation for the "flag bit":
//
// 0 indicates that the bucket contains only one prefix (no conflict when
//   hashing this prefix), whose first row starts from this offset of the
// file.
// 1 indicates that the bucket contains more than one prefixes, or there
//   are too many rows for one prefix so we need a binary search for it. In
//   this case, the offset indicates the offset of sub_index_ holding the
//   binary search indexes of keys for those rows. Those binary search indexes
//   are organized in this way:
//
// The first 4 bytes, indicate how many indexes (N) are stored after it. After
// it, there are N 32-bit integers, each points of an offset of the file,
// which
// points to starting of a row. Those offsets need to be guaranteed to be in
// ascending order so the keys they are pointing to are also in ascending
// order
// to make sure we can use them to do binary searches. Below is visual
// presentation of a bucket.
//
// <begin>
//   number_of_records:  varint32
//   record 1 file offset:  fixedint32
//   record 2 file offset:  fixedint32
//    ....
//   record N file offset:  fixedint32
// <end>
class PlainTableIndex {
 public:
  enum IndexSearchResult {
    kNoPrefixForBucket = 0,
    kDirectToFile = 1,
    kSubindex = 2
  };

  explicit PlainTableIndex(Slice data) { CHECK_OK(InitFromRawData(data)); }

  PlainTableIndex()
      : index_size_(0),
        sub_index_size_(0),
        num_prefixes_(0),
        index_(nullptr),
        sub_index_(nullptr) {}

  IndexSearchResult GetOffset(uint32_t prefix_hash,
                              uint32_t* bucket_value) const;

  Status InitFromRawData(Slice data);

  const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset,
                                              uint32_t* upper_bound) const {
    const char* index_ptr = &sub_index_[offset];
    return GetVarint32Ptr(index_ptr, index_ptr + 4, upper_bound);
  }

  uint32_t GetIndexSize() const { return index_size_; }

  uint32_t GetSubIndexSize() const { return sub_index_size_; }

  uint32_t GetNumPrefixes() const { return num_prefixes_; }

  static const uint64_t kMaxFileSize = (1u << 31) - 1;
  static const uint32_t kSubIndexMask = 0x80000000;
  static const size_t kOffsetLen = sizeof(uint32_t);

 private:
  uint32_t index_size_;
  uint32_t sub_index_size_;
  uint32_t num_prefixes_;

  uint32_t* index_;
  char* sub_index_;
};

// PlainTableIndexBuilder is used to create plain table index.
// After calling Finish(), it returns Slice, which is usually
// used either to initialize PlainTableIndex or
// to save index to sst file.
// For more details about the  index, please refer to:
// https://github.com/facebook/rocksdb/wiki/PlainTable-Format
// #wiki-in-memory-index-format
class PlainTableIndexBuilder {
 public:
  PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions,
                         size_t index_sparseness, double hash_table_ratio,
                         size_t huge_page_tlb_size)
      : arena_(arena),
        ioptions_(ioptions),
        record_list_(kRecordsPerGroup),
        is_first_record_(true),
        due_index_(false),
        num_prefixes_(0),
        num_keys_per_prefix_(0),
        prev_key_prefix_hash_(0),
        index_sparseness_(index_sparseness),
        prefix_extractor_(ioptions.prefix_extractor),
        hash_table_ratio_(hash_table_ratio),
        huge_page_tlb_size_(huge_page_tlb_size) {}

  void AddKeyPrefix(Slice key_prefix_slice, uint32_t key_offset);

  Slice Finish();

  uint32_t GetTotalSize() const {
    return VarintLength(index_size_) + VarintLength(num_prefixes_) +
           PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_;
  }

  static const std::string kPlainTableIndexBlock;

 private:
  struct IndexRecord {
    uint32_t hash;    // hash of the prefix
    uint32_t offset;  // offset of a row
    IndexRecord* next;
  };

  // Helper class to track all the index records
  class IndexRecordList {
   public:
    explicit IndexRecordList(size_t num_records_per_group)
        : kNumRecordsPerGroup(num_records_per_group),
          current_group_(nullptr),
          num_records_in_current_group_(num_records_per_group) {}

    ~IndexRecordList() {
      for (size_t i = 0; i < groups_.size(); i++5.18k) {
        delete[] groups_[i];
      }
    }

    void AddRecord(uint32_t hash, uint32_t offset);

    size_t GetNumRecords() const {
      return (groups_.size() - 1) * kNumRecordsPerGroup +
             num_records_in_current_group_;
    }
    IndexRecord* At(size_t index) {
      return &(groups_[index / kNumRecordsPerGroup]
                      [index % kNumRecordsPerGroup]);
    }

   private:
    IndexRecord* AllocateNewGroup() {
      IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
      groups_.push_back(result);
      return result;
    }

    // Each group in `groups_` contains fix-sized records (determined by
    // kNumRecordsPerGroup). Which can help us minimize the cost if resizing
    // occurs.
    const size_t kNumRecordsPerGroup;
    IndexRecord* current_group_;
    // List of arrays allocated
    std::vector<IndexRecord*> groups_;
    size_t num_records_in_current_group_;
  };

  void AllocateIndex();

  // Internal helper function to bucket index record list to hash buckets.
  void BucketizeIndexes(std::vector<IndexRecord*>* hash_to_offsets,
                        std::vector<uint32_t>* entries_per_bucket);

  // Internal helper class to fill the indexes and bloom filters to internal
  // data structures.
  Slice FillIndexes(const std::vector<IndexRecord*>& hash_to_offsets,
                    const std::vector<uint32_t>& entries_per_bucket);

  Arena* arena_;
  const ImmutableCFOptions ioptions_;
  HistogramImpl keys_per_prefix_hist_;
  IndexRecordList record_list_;
  bool is_first_record_;
  bool due_index_;
  uint32_t num_prefixes_;
  uint32_t num_keys_per_prefix_;

  uint32_t prev_key_prefix_hash_;
  size_t index_sparseness_;
  uint32_t index_size_;
  uint32_t sub_index_size_;

  const SliceTransform* prefix_extractor_;
  double hash_table_ratio_;
  size_t huge_page_tlb_size_;

  std::string prev_key_prefix_;

  static const size_t kRecordsPerGroup = 256;
};

};  // namespace rocksdb

#endif  // ROCKSDB_LITE

#endif // YB_ROCKSDB_TABLE_PLAIN_TABLE_INDEX_H

YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

Line	Count	Source (jump to first uncovered line)
1		// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2		// This source code is licensed under the BSD-style license found in the
3		// LICENSE file in the root directory of this source tree. An additional grant
4		// of patent rights can be found in the PATENTS file in the same directory.
5		//
6		// The following only applies to changes made to this file as part of YugaByte development.
7		//
8		// Portions Copyright (c) YugaByte, Inc.
9		//
10		// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
11		// in compliance with the License. You may obtain a copy of the License at
12		//
13		// http://www.apache.org/licenses/LICENSE-2.0
14		//
15		// Unless required by applicable law or agreed to in writing, software distributed under the License
16		// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
17		// or implied. See the License for the specific language governing permissions and limitations
18		// under the License.
19		//
20
21		#ifndef YB_ROCKSDB_TABLE_PLAIN_TABLE_INDEX_H
22		#define YB_ROCKSDB_TABLE_PLAIN_TABLE_INDEX_H
23
24		#pragma once
25
26		#ifndef ROCKSDB_LITE
27
28		#include <string>
29		#include <vector>
30
31		#include "yb/rocksdb/db/dbformat.h"
32		#include "yb/rocksdb/immutable_options.h"
33		#include "yb/rocksdb/options.h"
34		#include "yb/rocksdb/util/murmurhash.h"
35		#include "yb/rocksdb/util/hash.h"
36		#include "yb/rocksdb/util/arena.h"
37		#include "yb/rocksdb/util/histogram.h"
38
39		#include "yb/util/status_log.h"
40
41		namespace rocksdb {
42
43		// PlainTableIndex contains buckets size of index_size_, each is a
44		// 32-bit integer. The lower 31 bits contain an offset value (explained below)
45		// and the first bit of the integer indicates type of the offset.
46		//
47		// +--------------+------------------------------------------------------+
48		// \| Flag (1 bit) \| Offset to binary search buffer or file (31 bits) +
49		// +--------------+------------------------------------------------------+
50		//
51		// Explanation for the "flag bit":
52		//
53		// 0 indicates that the bucket contains only one prefix (no conflict when
54		// hashing this prefix), whose first row starts from this offset of the
55		// file.
56		// 1 indicates that the bucket contains more than one prefixes, or there
57		// are too many rows for one prefix so we need a binary search for it. In
58		// this case, the offset indicates the offset of sub_index_ holding the
59		// binary search indexes of keys for those rows. Those binary search indexes
60		// are organized in this way:
61		//
62		// The first 4 bytes, indicate how many indexes (N) are stored after it. After
63		// it, there are N 32-bit integers, each points of an offset of the file,
64		// which
65		// points to starting of a row. Those offsets need to be guaranteed to be in
66		// ascending order so the keys they are pointing to are also in ascending
67		// order
68		// to make sure we can use them to do binary searches. Below is visual
69		// presentation of a bucket.
70		//
71		// <begin>
72		// number_of_records: varint32
73		// record 1 file offset: fixedint32
74		// record 2 file offset: fixedint32
75		// ....
76		// record N file offset: fixedint32
77		// <end>
78		class PlainTableIndex {
79		public:
80		enum IndexSearchResult {
81		kNoPrefixForBucket = 0,
82		kDirectToFile = 1,
83		kSubindex = 2
84		};
85
86	0	explicit PlainTableIndex(Slice data) { CHECK_OK(InitFromRawData(data)); }
87
88		PlainTableIndex()
89		: index_size_(0),
90		sub_index_size_(0),
91		num_prefixes_(0),
92		index_(nullptr),
93	3.10k	sub_index_(nullptr) {}
94
95		IndexSearchResult GetOffset(uint32_t prefix_hash,
96		uint32_t* bucket_value) const;
97
98		Status InitFromRawData(Slice data);
99
100		const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset,
101	87.0k	uint32_t* upper_bound) const {
102	87.0k	const char* index_ptr = &sub_index_[offset];
103	87.0k	return GetVarint32Ptr(index_ptr, index_ptr + 4, upper_bound);
104	87.0k	}
105
106	20.0k	uint32_t GetIndexSize() const { return index_size_; }
107
108	3.03k	uint32_t GetSubIndexSize() const { return sub_index_size_; }
109
110	3.03k	uint32_t GetNumPrefixes() const { return num_prefixes_; }
111
112		static const uint64_t kMaxFileSize = (1u << 31) - 1;
113		static const uint32_t kSubIndexMask = 0x80000000;
114		static const size_t kOffsetLen = sizeof(uint32_t);
115
116		private:
117		uint32_t index_size_;
118		uint32_t sub_index_size_;
119		uint32_t num_prefixes_;
120
121		uint32_t* index_;
122		char* sub_index_;
123		};
124
125		// PlainTableIndexBuilder is used to create plain table index.
126		// After calling Finish(), it returns Slice, which is usually
127		// used either to initialize PlainTableIndex or
128		// to save index to sst file.
129		// For more details about the index, please refer to:
130		// https://github.com/facebook/rocksdb/wiki/PlainTable-Format
131		// #wiki-in-memory-index-format
132		class PlainTableIndexBuilder {
133		public:
134		PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions,
135		size_t index_sparseness, double hash_table_ratio,
136		size_t huge_page_tlb_size)
137		: arena_(arena),
138		ioptions_(ioptions),
139		record_list_(kRecordsPerGroup),
140		is_first_record_(true),
141		due_index_(false),
142		num_prefixes_(0),
143		num_keys_per_prefix_(0),
144		prev_key_prefix_hash_(0),
145		index_sparseness_(index_sparseness),
146		prefix_extractor_(ioptions.prefix_extractor),
147		hash_table_ratio_(hash_table_ratio),
148	3.16k	huge_page_tlb_size_(huge_page_tlb_size) {}
149
150		void AddKeyPrefix(Slice key_prefix_slice, uint32_t key_offset);
151
152		Slice Finish();
153
154	6.20k	uint32_t GetTotalSize() const {
155	6.20k	return VarintLength(index_size_) + VarintLength(num_prefixes_) +
156	6.20k	PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_;
157	6.20k	}
158
159		static const std::string kPlainTableIndexBlock;
160
161		private:
162		struct IndexRecord {
163		uint32_t hash; // hash of the prefix
164		uint32_t offset; // offset of a row
165		IndexRecord* next;
166		};
167
168		// Helper class to track all the index records
169		class IndexRecordList {
170		public:
171		explicit IndexRecordList(size_t num_records_per_group)
172		: kNumRecordsPerGroup(num_records_per_group),
173		current_group_(nullptr),
174	3.16k	num_records_in_current_group_(num_records_per_group) {}
175
176	3.16k	~IndexRecordList() {
177	8.35k	for (size_t i = 0; i < groups_.size(); i++5.18k ) {
178	5.18k	delete[] groups_[i];
179	5.18k	}
180	3.16k	}
181
182		void AddRecord(uint32_t hash, uint32_t offset);
183
184	3.10k	size_t GetNumRecords() const {
185	3.10k	return (groups_.size() - 1) * kNumRecordsPerGroup +
186	3.10k	num_records_in_current_group_;
187	3.10k	}
188	690k	IndexRecord* At(size_t index) {
189	690k	return &(groups_[index / kNumRecordsPerGroup]
190	690k	[index % kNumRecordsPerGroup]);
191	690k	}
192
193		private:
194	5.18k	IndexRecord* AllocateNewGroup() {
195	5.18k	IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
196	5.18k	groups_.push_back(result);
197	5.18k	return result;
198	5.18k	}
199
200		// Each group in `groups_` contains fix-sized records (determined by
201		// kNumRecordsPerGroup). Which can help us minimize the cost if resizing
202		// occurs.
203		const size_t kNumRecordsPerGroup;
204		IndexRecord* current_group_;
205		// List of arrays allocated
206		std::vector<IndexRecord*> groups_;
207		size_t num_records_in_current_group_;
208		};
209
210		void AllocateIndex();
211
212		// Internal helper function to bucket index record list to hash buckets.
213		void BucketizeIndexes(std::vector<IndexRecord> hash_to_offsets,
214		std::vector<uint32_t>* entries_per_bucket);
215
216		// Internal helper class to fill the indexes and bloom filters to internal
217		// data structures.
218		Slice FillIndexes(const std::vector<IndexRecord*>& hash_to_offsets,
219		const std::vector<uint32_t>& entries_per_bucket);
220
221		Arena* arena_;
222		const ImmutableCFOptions ioptions_;
223		HistogramImpl keys_per_prefix_hist_;
224		IndexRecordList record_list_;
225		bool is_first_record_;
226		bool due_index_;
227		uint32_t num_prefixes_;
228		uint32_t num_keys_per_prefix_;
229
230		uint32_t prev_key_prefix_hash_;
231		size_t index_sparseness_;
232		uint32_t index_size_;
233		uint32_t sub_index_size_;
234
235		const SliceTransform* prefix_extractor_;
236		double hash_table_ratio_;
237		size_t huge_page_tlb_size_;
238
239		std::string prev_key_prefix_;
240
241		static const size_t kRecordsPerGroup = 256;
242		};
243
244		}; // namespace rocksdb
245
246		#endif // ROCKSDB_LITE
247
248		#endif // YB_ROCKSDB_TABLE_PLAIN_TABLE_INDEX_H