/Users/deen/code/yugabyte-db/src/yb/rocksdb/table/plain_table_index.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under the BSD-style license found in the |
3 | | // LICENSE file in the root directory of this source tree. An additional grant |
4 | | // of patent rights can be found in the PATENTS file in the same directory. |
5 | | // |
6 | | // The following only applies to changes made to this file as part of YugaByte development. |
7 | | // |
8 | | // Portions Copyright (c) YugaByte, Inc. |
9 | | // |
10 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
11 | | // in compliance with the License. You may obtain a copy of the License at |
12 | | // |
13 | | // http://www.apache.org/licenses/LICENSE-2.0 |
14 | | // |
15 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
16 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
17 | | // or implied. See the License for the specific language governing permissions and limitations |
18 | | // under the License. |
19 | | // |
20 | | |
21 | | #ifndef ROCKSDB_LITE |
22 | | |
23 | | #ifndef __STDC_FORMAT_MACROS |
24 | | #define __STDC_FORMAT_MACROS |
25 | | #endif |
26 | | |
27 | | #include <inttypes.h> |
28 | | |
29 | | #include "yb/rocksdb/table/plain_table_index.h" |
30 | | #include "yb/rocksdb/env.h" |
31 | | #include "yb/rocksdb/util/coding.h" |
32 | | #include "yb/rocksdb/util/hash.h" |
33 | | |
34 | | namespace rocksdb { |
35 | | |
36 | | namespace { |
37 | 814k | inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) { |
38 | 814k | assert(num_buckets > 0); |
39 | 0 | return hash % num_buckets; |
40 | 814k | } |
41 | | } |
42 | | |
43 | 3.10k | Status PlainTableIndex::InitFromRawData(Slice data) { |
44 | 3.10k | if (!GetVarint32(&data, &index_size_)) { |
45 | 0 | return STATUS(Corruption, "Couldn't read the index size!"); |
46 | 0 | } |
47 | 3.10k | assert(index_size_ > 0); |
48 | 3.10k | if (!GetVarint32(&data, &num_prefixes_)) { |
49 | 0 | return STATUS(Corruption, "Couldn't read the index size!"); |
50 | 0 | } |
51 | 3.10k | sub_index_size_ = |
52 | 3.10k | static_cast<uint32_t>(data.size()) - index_size_ * kOffsetLen; |
53 | | |
54 | 3.10k | char* index_data_begin = const_cast<char*>(data.cdata()); |
55 | 3.10k | index_ = reinterpret_cast<uint32_t*>(index_data_begin); |
56 | 3.10k | sub_index_ = reinterpret_cast<char*>(index_ + index_size_); |
57 | 3.10k | return Status::OK(); |
58 | 3.10k | } |
59 | | |
60 | | PlainTableIndex::IndexSearchResult PlainTableIndex::GetOffset( |
61 | 123k | uint32_t prefix_hash, uint32_t* bucket_value) const { |
62 | 123k | int bucket = GetBucketIdFromHash(prefix_hash, index_size_); |
63 | 123k | *bucket_value = index_[bucket]; |
64 | 123k | if ((*bucket_value & kSubIndexMask) == kSubIndexMask) { |
65 | 87.0k | *bucket_value ^= kSubIndexMask; |
66 | 87.0k | return kSubindex; |
67 | 87.0k | } |
68 | 36.2k | if (*bucket_value >= kMaxFileSize) { |
69 | 876 | return kNoPrefixForBucket; |
70 | 35.4k | } else { |
71 | | // point directly to the file |
72 | 35.4k | return kDirectToFile; |
73 | 35.4k | } |
74 | 36.2k | } |
75 | | |
76 | | void PlainTableIndexBuilder::IndexRecordList::AddRecord(uint32_t hash, |
77 | 691k | uint32_t offset) { |
78 | 691k | if (num_records_in_current_group_ == kNumRecordsPerGroup) { |
79 | 5.18k | current_group_ = AllocateNewGroup(); |
80 | 5.18k | num_records_in_current_group_ = 0; |
81 | 5.18k | } |
82 | 691k | auto& new_record = current_group_[num_records_in_current_group_++]; |
83 | 691k | new_record.hash = hash; |
84 | 691k | new_record.offset = offset; |
85 | 691k | new_record.next = nullptr; |
86 | 691k | } |
87 | | |
88 | | void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice, |
89 | 975k | uint32_t key_offset) { |
90 | 975k | if (is_first_record_ || prev_key_prefix_ != key_prefix_slice.ToString()972k ) { |
91 | 674k | ++num_prefixes_; |
92 | 674k | if (!is_first_record_) { |
93 | 671k | keys_per_prefix_hist_.Add(num_keys_per_prefix_); |
94 | 671k | } |
95 | 674k | num_keys_per_prefix_ = 0; |
96 | 674k | prev_key_prefix_ = key_prefix_slice.ToString(); |
97 | 674k | prev_key_prefix_hash_ = GetSliceHash(key_prefix_slice); |
98 | 674k | due_index_ = true; |
99 | 674k | } |
100 | | |
101 | 975k | if (due_index_) { |
102 | | // Add an index key for every kIndexIntervalForSamePrefixKeys keys |
103 | 691k | record_list_.AddRecord(prev_key_prefix_hash_, key_offset); |
104 | 691k | due_index_ = false; |
105 | 691k | } |
106 | | |
107 | 975k | num_keys_per_prefix_++; |
108 | 975k | if (index_sparseness_ == 0 || num_keys_per_prefix_ % index_sparseness_ == 0975k ) { |
109 | 17.0k | due_index_ = true; |
110 | 17.0k | } |
111 | 975k | is_first_record_ = false; |
112 | 975k | } |
113 | | |
114 | 3.10k | Slice PlainTableIndexBuilder::Finish() { |
115 | 3.10k | AllocateIndex(); |
116 | 3.10k | std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr); |
117 | 3.10k | std::vector<uint32_t> entries_per_bucket(index_size_, 0); |
118 | 3.10k | BucketizeIndexes(&hash_to_offsets, &entries_per_bucket); |
119 | | |
120 | 3.10k | keys_per_prefix_hist_.Add(num_keys_per_prefix_); |
121 | 3.10k | RLOG(InfoLogLevel::INFO_LEVEL, ioptions_.info_log, |
122 | 3.10k | "Number of Keys per prefix Histogram: %s", |
123 | 3.10k | keys_per_prefix_hist_.ToString().c_str()); |
124 | | |
125 | | // From the temp data structure, populate indexes. |
126 | 3.10k | return FillIndexes(hash_to_offsets, entries_per_bucket); |
127 | 3.10k | } |
128 | | |
129 | 3.10k | void PlainTableIndexBuilder::AllocateIndex() { |
130 | 3.10k | if (prefix_extractor_ == nullptr || hash_table_ratio_ <= 02.75k ) { |
131 | | // Fall back to pure binary search if the user fails to specify a prefix |
132 | | // extractor. |
133 | 370 | index_size_ = 1; |
134 | 2.73k | } else { |
135 | 2.73k | double hash_table_size_multipier = 1.0 / hash_table_ratio_; |
136 | 2.73k | index_size_ = |
137 | 2.73k | static_cast<uint32_t>(num_prefixes_ * hash_table_size_multipier) + 1; |
138 | 2.73k | assert(index_size_ > 0); |
139 | 2.73k | } |
140 | 3.10k | } |
141 | | |
142 | | void PlainTableIndexBuilder::BucketizeIndexes( |
143 | | std::vector<IndexRecord*>* hash_to_offsets, |
144 | 3.10k | std::vector<uint32_t>* entries_per_bucket) { |
145 | 3.10k | bool first = true; |
146 | 3.10k | uint32_t prev_hash = 0; |
147 | 3.10k | size_t num_records = record_list_.GetNumRecords(); |
148 | 694k | for (size_t i = 0; i < num_records; i++690k ) { |
149 | 690k | IndexRecord* index_record = record_list_.At(i); |
150 | 690k | uint32_t cur_hash = index_record->hash; |
151 | 690k | if (first || prev_hash != cur_hash687k ) { |
152 | 674k | prev_hash = cur_hash; |
153 | 674k | first = false; |
154 | 674k | } |
155 | 690k | uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_); |
156 | 690k | IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket]; |
157 | 690k | index_record->next = prev_bucket_head; |
158 | 690k | (*hash_to_offsets)[bucket] = index_record; |
159 | 690k | (*entries_per_bucket)[bucket]++; |
160 | 690k | } |
161 | | |
162 | 3.10k | sub_index_size_ = 0; |
163 | 900k | for (auto entry_count : *entries_per_bucket) { |
164 | 900k | if (entry_count <= 1) { |
165 | 744k | continue; |
166 | 744k | } |
167 | | // Only buckets with more than 1 entry will have subindex. |
168 | 156k | sub_index_size_ += VarintLength(entry_count); |
169 | | // total bytes needed to store these entries' in-file offsets. |
170 | 156k | sub_index_size_ += entry_count * PlainTableIndex::kOffsetLen; |
171 | 156k | } |
172 | 3.10k | } |
173 | | |
174 | | Slice PlainTableIndexBuilder::FillIndexes( |
175 | | const std::vector<IndexRecord*>& hash_to_offsets, |
176 | 3.10k | const std::vector<uint32_t>& entries_per_bucket) { |
177 | 3.10k | RLOG(InfoLogLevel::DEBUG_LEVEL, ioptions_.info_log, |
178 | 3.10k | "Reserving %" PRIu32 " bytes for plain table's sub_index", |
179 | 3.10k | sub_index_size_); |
180 | 3.10k | auto total_allocate_size = GetTotalSize(); |
181 | 3.10k | char* allocated = arena_->AllocateAligned( |
182 | 3.10k | total_allocate_size, huge_page_tlb_size_, ioptions_.info_log); |
183 | | |
184 | 3.10k | auto temp_ptr = EncodeVarint32(allocated, index_size_); |
185 | 3.10k | uint32_t* index = |
186 | 3.10k | reinterpret_cast<uint32_t*>(EncodeVarint32(temp_ptr, num_prefixes_)); |
187 | 3.10k | char* sub_index = reinterpret_cast<char*>(index + index_size_); |
188 | | |
189 | 3.10k | uint32_t sub_index_offset = 0; |
190 | 903k | for (uint32_t i = 0; i < index_size_; i++900k ) { |
191 | 900k | uint32_t num_keys_for_bucket = entries_per_bucket[i]; |
192 | 900k | switch (num_keys_for_bucket) { |
193 | 428k | case 0: |
194 | | // No key for bucket |
195 | 428k | index[i] = PlainTableIndex::kMaxFileSize; |
196 | 428k | break; |
197 | 315k | case 1: |
198 | | // point directly to the file offset |
199 | 315k | index[i] = hash_to_offsets[i]->offset; |
200 | 315k | break; |
201 | 156k | default: |
202 | | // point to second level indexes. |
203 | 156k | index[i] = sub_index_offset | PlainTableIndex::kSubIndexMask; |
204 | 156k | char* prev_ptr = &sub_index[sub_index_offset]; |
205 | 156k | char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket); |
206 | 156k | sub_index_offset += static_cast<uint32_t>(cur_ptr - prev_ptr); |
207 | 156k | char* sub_index_pos = &sub_index[sub_index_offset]; |
208 | 156k | IndexRecord* record = hash_to_offsets[i]; |
209 | 156k | int j; |
210 | 531k | for (j = num_keys_for_bucket - 1; j >= 0 && record375k ; |
211 | 375k | j--, record = record->next) { |
212 | 375k | EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset); |
213 | 375k | } |
214 | 156k | assert(j == -1 && record == nullptr); |
215 | 0 | sub_index_offset += PlainTableIndex::kOffsetLen * num_keys_for_bucket; |
216 | 156k | assert(sub_index_offset <= sub_index_size_); |
217 | 0 | break; |
218 | 900k | } |
219 | 900k | } |
220 | 3.09k | assert(sub_index_offset == sub_index_size_); |
221 | | |
222 | 3.09k | RLOG(InfoLogLevel::DEBUG_LEVEL, ioptions_.info_log, |
223 | 3.09k | "hash table size: %d, suffix_map length %" ROCKSDB_PRIszt, index_size_, |
224 | 3.09k | sub_index_size_); |
225 | 3.09k | return Slice(allocated, GetTotalSize()); |
226 | 3.10k | } |
227 | | |
228 | | const std::string PlainTableIndexBuilder::kPlainTableIndexBlock = |
229 | | "PlainTableIndexBlock"; |
230 | | }; // namespace rocksdb |
231 | | |
232 | | #endif // ROCKSDB_LITE |